From 70b251d8861c69c6e64f8dfeb4a0348c3a140be6 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 9 Feb 2026 17:47:09 +0100 Subject: [PATCH 01/43] implement racing for strong branching --- cpp/src/dual_simplex/pseudo_costs.cpp | 186 +++++++++++++++++++++++--- cpp/src/linear_programming/pdlp.cu | 3 +- 2 files changed, 168 insertions(+), 21 deletions(-) diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp index 682bdaa6f9..13a0308b92 100644 --- a/cpp/src/dual_simplex/pseudo_costs.cpp +++ b/cpp/src/dual_simplex/pseudo_costs.cpp @@ -31,10 +31,17 @@ void strong_branch_helper(i_t start, const std::vector& root_soln, const std::vector& root_vstatus, const std::vector& edge_norms, - pseudo_costs_t& pc) + pseudo_costs_t& pc, + std::vector& ds_obj_down, + std::vector& ds_obj_up, + std::vector& ds_status_down, + std::vector& ds_status_up, + std::atomic* concurrent_halt) { lp_problem_t child_problem = original_lp; + assert(concurrent_halt != nullptr && "Concurrent halt pointer cannot be nullptr"); + constexpr bool verbose = false; f_t last_log = tic(); i_t thread_id = omp_get_thread_num(); @@ -55,7 +62,7 @@ void strong_branch_helper(i_t start, child_settings.set_log(false); f_t lp_start_time = tic(); f_t elapsed_time = toc(start_time); - if (elapsed_time > settings.time_limit) { break; } + if (elapsed_time > settings.time_limit || *concurrent_halt == 1) { break; } child_settings.time_limit = std::max(0.0, settings.time_limit - elapsed_time); child_settings.iteration_limit = 200; lp_solution_t solution(original_lp.num_rows, original_lp.num_cols); @@ -89,6 +96,7 @@ void strong_branch_helper(i_t start, if (branch == 0) { pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0); + ds_status_down[k] = status; if (verbose) { settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n", thread_id, @@ -100,6 +108,7 @@ void strong_branch_helper(i_t start, } } else { pc.strong_branch_up[k] = std::max(obj - root_obj, 
0.0); + ds_status_up[k] = status; if (verbose) { settings.log.printf( "Thread id %2d remaining %d variable %d branch %d obj %e change down %e change up %e " @@ -109,14 +118,18 @@ void strong_branch_helper(i_t start, j, branch, obj, - pc.strong_branch_down[k], - pc.strong_branch_up[k], + ds_obj_down[k], + ds_obj_up[k], toc(start_time)); } } - if (toc(start_time) > settings.time_limit) { break; } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { + break; + } + } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { + break; } - if (toc(start_time) > settings.time_limit) { break; } const i_t completed = pc.num_strong_branches_completed++; @@ -131,7 +144,12 @@ void strong_branch_helper(i_t start, child_problem.lower[j] = original_lp.lower[j]; child_problem.upper[j] = original_lp.upper[j]; - if (toc(start_time) > settings.time_limit) { break; } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { + if (*concurrent_halt == 1) { + std::cout << "Concurrent halt reached in Dual Simplex" << std::endl; + } + break; + } } } @@ -292,6 +310,40 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data return mps_model; } +// Merge a single strong branching result from Dual Simplex and PDLP. +// Rules: +// 1. If both found optimal -> keep DS (higher quality vertex solution) +// 2. Else if Dual Simplex found infeasible -> declare infeasible +// 3. Else if one is optimal -> keep the optimal one +// 4. Else if Dual Simplex hit iteration limit -> keep DS +// 5. 
Else if none converged -> NaN (original objective) +// Return {value, source} where source is 0 if Dual Simplex, 1 if PDLP, 2 if both +template +static std::pair merge_sb_result(f_t ds_val, + dual::status_t ds_status, + f_t pdlp_dual_obj, + bool pdlp_optimal) +{ + // Dual simplex always maintains dual feasibility, so OPTIMAL and ITERATION_LIMIT both qualify + + // Rule 1: Both optimal -> keep DS + if (ds_status == dual::status_t::OPTIMAL && pdlp_optimal) { return {ds_val, 0}; } + + // Rule 2: Dual Simplex found infeasible -> declare infeasible + if (ds_status == dual::status_t::DUAL_UNBOUNDED) { return {std::numeric_limits::infinity(), 0}; } + + // Rule 3: Only one converged -> keep that + if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { return {ds_val, 0}; } + if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { return {pdlp_dual_obj, 1}; } + + // Rule 4: Dual Simplex hit iteration limit -> keep DS + if (ds_status == dual::status_t::ITERATION_LIMIT) { return {ds_val, 0}; } + + // Rule 5: None converged -> NaN + return {std::numeric_limits::quiet_NaN(), 2}; +} + + template void strong_branching(const user_problem_t& original_problem, const lp_problem_t& original_lp, @@ -310,10 +362,26 @@ void strong_branching(const user_problem_t& original_problem, pc.strong_branch_up.assign(fractional.size(), 0); pc.num_strong_branches_completed = 0; - if (settings.mip_batch_pdlp_strong_branching) { - settings.log.printf("Batch PDLP strong branching enabled\n"); + settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", + settings.num_threads, + fractional.size()); + + // Race both batch PDLP and parallel Dual Simplex + std::atomic concurrent_halt{0}; + + std::vector pdlp_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); + std::vector pdlp_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); + + auto pdlp_thread = std::thread([&]() { + + if (settings.mip_batch_pdlp_strong_branching == 0) + return; + + 
settings.log.printf("Racing batch PDLP and Dual Simplex for strong branching\n"); f_t start_batch = tic(); + pdlp_solver_settings_t pdlp_settings; + pdlp_settings.concurrent_halt = &concurrent_halt; // Use original_problem to create the BatchLP problem csr_matrix_t A_row(original_problem.A.m, original_problem.A.n, 0); @@ -332,7 +400,7 @@ void strong_branching(const user_problem_t& original_problem, const auto mps_model = simplex_problem_to_mps_data_model(original_problem); const auto solutions = - batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values); + batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch @@ -377,14 +445,19 @@ void strong_branching(const user_problem_t& original_problem, ? solutions.get_dual_objective_value(k + fractional.size()) : std::numeric_limits::quiet_NaN(); - pc.strong_branch_down[k] = obj_down - root_obj; - pc.strong_branch_up[k] = obj_up - root_obj; + pdlp_obj_down[k] = obj_down - root_obj; + pdlp_obj_up[k] = obj_up - root_obj; } - } else { - settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", - settings.num_threads, - fractional.size()); - f_t strong_branching_start_time = tic(); + + // Batch PDLP finished – tell Dual Simplex to stop + concurrent_halt.store(1); + }); + + std::vector ds_status_down(fractional.size(), dual::status_t::UNSET); + std::vector ds_status_up(fractional.size(), dual::status_t::UNSET); + std::vector ds_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); + std::vector ds_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); + f_t dual_simplex_strong_branching_time = tic(); #pragma omp parallel num_threads(settings.num_threads) { @@ -418,10 +491,85 @@ void strong_branching(const user_problem_t& original_problem, root_soln, root_vstatus, edge_norms, - pc); + pc, + 
ds_obj_down, + ds_obj_up, + ds_status_down, + ds_status_up, + &concurrent_halt); } } - settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time)); + + if (settings.mip_batch_pdlp_strong_branching == 1) { + if (concurrent_halt.load() == 1) { + settings.log.printf("Batch PDLP finished before Dual Simplex\n"); + } + else { + settings.log.printf("Dual Simplex finished before Batch PDLP\n"); + } + } + + // Dual Simplex finished all subproblems – tell Batch PDLP to stop + concurrent_halt.store(1); + + pdlp_thread.join(); + + settings.log.printf("Strong branching took %.2fs\n", toc(dual_simplex_strong_branching_time)); + + + // Collect Dual Simplex statistics + i_t ds_optimal_count = 0; + i_t ds_dual_feasible_only_count = 0; + for (i_t k = 0; k < fractional.size(); k++) { + if (ds_status_down[k] == dual::status_t::OPTIMAL) ds_optimal_count++; + if (ds_status_up[k] == dual::status_t::OPTIMAL) ds_optimal_count++; + if (ds_status_down[k] == dual::status_t::ITERATION_LIMIT) ds_dual_feasible_only_count++; + if (ds_status_up[k] == dual::status_t::ITERATION_LIMIT) ds_dual_feasible_only_count++; + } + + settings.log.printf( + "Dual Simplex found %d/%d optimal solutions and %d/%d dual feasible only solutions\n", + ds_optimal_count, + fractional.size() * 2, + ds_dual_feasible_only_count, + fractional.size() * 2); + + if (settings.mip_batch_pdlp_strong_branching == 1) { + // Collect Batch PDLP statistics + i_t pdlp_optimal_count = 0; + for (i_t k = 0; k < fractional.size(); k++) { + if (!std::isnan(pdlp_obj_down[k])) pdlp_optimal_count++; + if (!std::isnan(pdlp_obj_up[k])) pdlp_optimal_count++; + } + + settings.log.printf( + "Batch PDLP found %d/%d optimal solutions\n", + pdlp_optimal_count, + fractional.size() * 2); + } + + i_t merged_from_ds = 0; + i_t merged_from_pdlp = 0; + i_t merged_nan = 0; + for (i_t k = 0; k < fractional.size(); k++) { + const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], 
pdlp_obj_down[k], !std::isnan(pdlp_obj_down[k])); + pc.strong_branch_down[k] = value_down; + if (source_down == 0) merged_from_ds++; + else if (source_down == 1) merged_from_pdlp++; + else merged_nan++; + const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], !std::isnan(pdlp_obj_up[k])); + pc.strong_branch_up[k] = value_up; + if (source_up == 0) merged_from_ds++; + else if (source_up == 1) merged_from_pdlp++; + else merged_nan++; + } + + if (settings.mip_batch_pdlp_strong_branching == 1) { + settings.log.printf( + "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN)\n", + merged_from_ds, + merged_from_pdlp, + merged_nan); } pc.update_pseudo_costs_from_strong_branching(fractional, root_soln); diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 8a05f1b2a1..69e822a1f0 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -444,8 +444,7 @@ std::optional> pdlp_solver_t } // Check for concurrent limit - if (settings_.method == method_t::Concurrent && settings_.concurrent_halt != nullptr && - *settings_.concurrent_halt == 1) { + if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Concurrent Limit reached, returning current solution" << std::endl; From 6848df1fb82940a7f0fad7f30a401c4220eea59c Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 9 Feb 2026 17:55:22 +0100 Subject: [PATCH 02/43] race on by default --- cpp/include/cuopt/linear_programming/mip/solver_settings.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index c5c26884f5..1034af41b2 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -96,7 
+96,7 @@ class mip_solver_settings_t { i_t reduced_cost_strengthening = -1; f_t cut_change_threshold = 1e-3; f_t cut_min_orthogonality = 0.5; - i_t mip_batch_pdlp_strong_branching = 0; + i_t mip_batch_pdlp_strong_branching = 1; i_t num_gpus = 1; bool log_to_console = true; From fc7aa0468c3e54c124c7932b2468e2e06024240d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 17 Feb 2026 14:11:54 +0100 Subject: [PATCH 03/43] tmp --- .../linear_programming/cuopt/run_pdlp.cu | 2 +- compile.sh | 2 + .../initial_scaling.cu | 2 +- cpp/src/pdlp/pdhg.cu | 35 ++++- cpp/src/pdlp/pdlp.cu | 142 +++++++++++++++++- .../restart_strategy/pdlp_restart_strategy.cu | 6 +- cpp/src/pdlp/solve.cu | 3 + .../adaptive_step_size_strategy.cu | 108 +++++++++---- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 2 +- .../solver_settings/solver_settings.py | 38 +++++ .../linear_programming/data_definition.py | 5 + run_multiple.sh | 3 + test.py | 12 ++ 13 files changed, 318 insertions(+), 42 deletions(-) create mode 100755 compile.sh create mode 100755 run_multiple.sh create mode 100755 test.py diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index 229c72a49b..c3d6ad42f4 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -107,7 +107,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - settings.presolve = program.get("--presolve"); + //settings.presolve = program.get("--presolve"); return settings; } diff --git a/compile.sh b/compile.sh new file mode 100755 index 0000000000..bedf3a7506 --- /dev/null +++ b/compile.sh @@ -0,0 +1,2 @@ +./build.sh libcuopt libmps_parser --cache-tool=ccache --skip-tests-build -a -l=OFF +./build.sh cuopt cuopt_mps_parser \ No newline at end of file diff --git 
a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index 031cd9c3b6..5a08c3bb53 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -545,7 +545,7 @@ void pdlp_initial_scaling_strategy_t::scale_problem() #ifdef CUPDLP_DEBUG_MODE print("constraint_lower_bound", op_problem_scaled_.constraint_lower_bounds); print("constraint_upper_bound", op_problem_scaled_.constraint_upper_bounds); - std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds); + std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds, stream_view_); std::vector lower_bounds; std::vector upper_bounds; for (const auto& variable_bound : variable_bounds) { diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 9a44bd31e3..c5efdcd722 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -567,6 +567,12 @@ struct primal_reflected_major_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_major_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_major_projection"); + const f_t next = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; @@ -576,6 +582,9 @@ struct primal_reflected_major_projection_bulk_op { potential_next_primal[idx] = next_clamped; dual_slack[idx] = (next_clamped - next) / step_size; reflected_primal[idx] = f_t(2.0) * next_clamped - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN 
after primal_reflected_major_projection"); } }; @@ -599,6 +608,12 @@ struct dual_reflected_major_projection_bulk_op { const f_t current_dual = dual_solution[idx]; const f_t Ax = dual_gradient[idx]; + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_major_projection"); + cuopt_assert(!isnan(Ax), "dual_gradient is NaN in dual_reflected_major_projection"); + const f_t tmp = current_dual / step_size - Ax; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], @@ -607,6 +622,9 @@ struct dual_reflected_major_projection_bulk_op { potential_next_dual[idx] = next_dual; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_major_projection"); } }; @@ -631,12 +649,19 @@ struct primal_reflected_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + f_t reflected = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; reflected = cuda::std::max(cuda::std::min(reflected, get_upper(bounds)), get_lower(bounds)); reflected_primal[idx] = f_t(2.0) * reflected - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_projection"); } }; @@ -659,13 +684,21 @@ struct dual_reflected_projection_bulk_op { const f_t step_size = dual_step_size[batch_idx]; const f_t current_dual = 
dual_solution[idx]; - const f_t tmp = current_dual / step_size - dual_gradient[idx]; + + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + + const f_t tmp = current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], cuda::std::min(tmp, -constraint_lower_bounds[constraint_idx])); const f_t next_dual = (tmp - tmp_proj) * step_size; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_projection"); } }; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 67e001db29..e1ab866b5b 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -34,8 +34,10 @@ #include #include +#include #include +#include #include #include @@ -1406,10 +1408,27 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, const f_t interaction, f_t* fixed_point_error) { + cuopt_assert(!isnan(norm_squared_delta_primal), "norm_squared_delta_primal must not be NaN"); + cuopt_assert(!isnan(norm_squared_delta_dual), "norm_squared_delta_dual must not be NaN"); + cuopt_assert(!isnan(primal_weight), "primal_weight must not be NaN"); + cuopt_assert(!isnan(step_size), "step_size must not be NaN"); + cuopt_assert(!isnan(interaction), "interaction must not be NaN"); + cuopt_assert(norm_squared_delta_primal >= f_t(0.0), "norm_squared_delta_primal must be >= 0"); + cuopt_assert(norm_squared_delta_dual >= f_t(0.0), "norm_squared_delta_dual must be >= 0"); + cuopt_assert(primal_weight > f_t(0.0), "primal_weight must be > 0"); + cuopt_assert(step_size > f_t(0.0), "step_size must be > 0"); + const f_t movement = norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const 
f_t computed_interaction = f_t(2.0) * interaction * step_size; + //printf("movement %lf\n", movement); + //printf("computed_interaction %lf\n", computed_interaction); + + cuopt_assert( + movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); + *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); #ifdef CUPDLP_DEBUG_MODE @@ -1790,6 +1809,68 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Sync to make sure all previous cuSparse operations are finished before setting the // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + + // Validate reflected solutions have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_reflected_primal().data(), + pdhg_solver_.get_reflected_primal().data() + + pdhg_solver_.get_reflected_primal().size(), + is_nan_or_inf{}), + "reflected_primal contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_reflected_dual().data(), + pdhg_solver_.get_reflected_dual().data() + + pdhg_solver_.get_reflected_dual().size(), + is_nan_or_inf{}), + "reflected_dual contains NaN or Inf in compute_fixed_error"); + + // Validate primal/dual solutions have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().data() + + pdhg_solver_.get_primal_solution().size(), + is_nan_or_inf{}), + "primal_solution contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_dual_solution().data(), + pdhg_solver_.get_dual_solution().data() + + pdhg_solver_.get_dual_solution().size(), + is_nan_or_inf{}), + "dual_solution contains NaN or Inf in compute_fixed_error"); + + // Validate deltas have no NaN/Inf + cuopt_assert( + 
!thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data() + + pdhg_solver_.get_saddle_point_state().get_delta_primal().size(), + is_nan_or_inf{}), + "delta_primal contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data() + + pdhg_solver_.get_saddle_point_state().get_delta_dual().size(), + is_nan_or_inf{}), + "delta_dual contains NaN or Inf in compute_fixed_error"); + + // Validate primal_weight and step_size have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + primal_weight_.data(), + primal_weight_.data() + primal_weight_.size(), + is_nan_or_inf{}), + "primal_weight_ contains NaN or Inf in compute_fixed_error"); + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + step_size_.data(), + step_size_.data() + step_size_.size(), + is_nan_or_inf{}), + "step_size_ contains NaN or Inf in compute_fixed_error"); + // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1813,6 +1894,49 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte RAFT_CUDA_TRY(cudaStreamSynchronize( stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Host-side diagnostic: copy small device arrays and verify movement + interaction >= 0 + { + const auto bs = climber_strategies_.size(); + std::vector h_nsq_dp(bs), h_nsq_dd(bs), h_pw(bs), h_ss(bs), h_inter(bs); + RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dp.data(), + step_size_strategy_.get_norm_squared_delta_primal().data(), + bs * sizeof(f_t), + 
cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dd.data(), + step_size_strategy_.get_norm_squared_delta_dual().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy( + h_pw.data(), primal_weight_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY( + cudaMemcpy(h_ss.data(), step_size_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(h_inter.data(), + step_size_strategy_.get_interaction().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < bs; ++i) { + const f_t movement = h_nsq_dp[i] * h_pw[i] + h_nsq_dd[i] / h_pw[i]; + const f_t comp_inter = f_t(2.0) * h_inter[i] * h_ss[i]; + if (movement + comp_inter < f_t(0.0)) { + fprintf(stderr, + "DIAGNOSTIC [%zu]: movement=%.17e comp_inter=%.17e sum=%.17e " + "norm_sq_dx=%.17e norm_sq_dy=%.17e pw=%.17e ss=%.17e interaction=%.17e\n", + i, + (double)movement, + (double)comp_inter, + (double)(movement + comp_inter), + (double)h_nsq_dp[i], + (double)h_nsq_dd[i], + (double)h_pw[i], + (double)h_ss[i], + (double)h_inter[i]); + } + cuopt_assert(movement + comp_inter >= f_t(0.0), + "Host check: movement + computed_interaction must be >= 0"); + } + } + #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif @@ -1847,9 +1971,15 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte #endif for (size_t i = 0; i < climber_strategies_.size(); ++i) { + cuopt_assert(!std::isnan(restart_strategy_.fixed_point_error_[i]), + "fixed_point_error_ must not be NaN after compute_fixed_error"); + cuopt_assert(restart_strategy_.fixed_point_error_[i] >= f_t(0.0), + "fixed_point_error_ must be >= 0 after compute_fixed_error"); if (has_restarted[i]) { restart_strategy_.initial_fixed_point_error_[i] = restart_strategy_.fixed_point_error_[i]; - has_restarted[i] = false; + cuopt_assert(!std::isnan(restart_strategy_.initial_fixed_point_error_[i]), + "initial_fixed_point_error_ must not be NaN after assignment"); + 
has_restarted[i] = false; } } } @@ -1869,6 +1999,7 @@ void pdlp_solver_t::transpose_primal_dual_to_row( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -1945,6 +2076,7 @@ void pdlp_solver_t::transpose_primal_dual_back_to_col( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -2632,7 +2764,7 @@ void pdlp_solver_t::compute_initial_step_size() rmm::device_uvector d_atq(n, stream_view_); std::mt19937 gen(1); - std::normal_distribution dist(0.0, 1.0); + std::normal_distribution dist(f_t(0.0), f_t(1.0)); for (int i = 0; i < m; ++i) z[i] = dist(gen); @@ -2684,7 +2816,7 @@ void pdlp_solver_t::compute_initial_step_size() vecATQ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); // z = A @ A_t_q RAFT_CUSPARSE_TRY( @@ -2697,7 +2829,7 @@ void pdlp_solver_t::compute_initial_step_size() vecZ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), - stream_view_)); + stream_view_.value())); // sigma_max_sq = dot(q, z) RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), m, @@ -2706,7 +2838,7 @@ void pdlp_solver_t::compute_initial_step_size() d_z.data(), primal_stride, sigma_max_sq.data(), - stream_view_)); + stream_view_.value())); cub::DeviceTransform::Transform( cuda::std::make_tuple(d_q.data(), d_z.data()), diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index bc12fb360f..a6304a8568 100644 --- 
a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -1995,14 +1995,14 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; auto highest_negInf_primal = thrust::find(handle_ptr_->get_thrust_policy(), - thrust::make_reverse_iterator(thrust::device_ptr(end)), - thrust::make_reverse_iterator(thrust::device_ptr(start)), + thrust::device_ptr(end), + thrust::device_ptr(start), -std::numeric_limits::infinity()); // Set ranges accordingly i_t index_start_primal = 0; i_t index_end_primal = primal_size_h_ + dual_size_h_; - if (highest_negInf_primal != thrust::make_reverse_iterator(thrust::device_ptr(start))) { + if (highest_negInf_primal != thrust::device_ptr(start)) { cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*highest_negInf_primal)) == -std::numeric_limits::infinity(), "Incorrect primal reverse iterator"); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index a2766be98a..374c9ff513 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -793,6 +793,7 @@ optimization_problem_solution_t run_batch_pdlp( // If need warm start, solve the LP alone if (primal_dual_init || primal_weight_init) { + std::cout << "Solving LP for warm start" << std::endl; pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -841,6 +842,8 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } + std::cout << "Solving batch PDLP" << std::endl; + for (int i = 0; i < max_batch_size; i += optimal_batch_size) { const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) diff --git 
a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 3c1b85aeac..47ba16a297 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,6 +28,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -80,7 +82,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -90,7 +92,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -100,10 +102,10 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); - dot_product_storage.resize(dot_product_bytes, stream_view_); + dot_product_storage.resize(dot_product_bytes, stream_view_.value()); } } @@ -143,7 +145,7 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), + <<>>(thrust::raw_pointer_cast(swap_pairs.data()), static_cast(swap_pairs.size()), make_span(interaction_), make_span(norm_squared_delta_primal_), @@ -159,9 +161,9 @@ void adaptive_step_size_strategy_t::resize_context(i_t new_size) cuopt_assert(new_size > 0, "New size must be greater than 0"); cuopt_assert(new_size < 
batch_size, "New size must be less than batch size"); - interaction_.resize(new_size, stream_view_); - norm_squared_delta_primal_.resize(new_size, stream_view_); - norm_squared_delta_dual_.resize(new_size, stream_view_); + interaction_.resize(new_size, stream_view_.value()); + norm_squared_delta_primal_.resize(new_size, stream_view_.value()); + norm_squared_delta_dual_.resize(new_size, stream_view_.value()); } template @@ -276,19 +278,19 @@ i_t adaptive_step_size_strategy_t::get_valid_step_size() const template f_t adaptive_step_size_strategy_t::get_interaction(i_t i) const { - return interaction_.element(i, stream_view_); + return interaction_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_primal(i_t i) const { - return norm_squared_delta_primal_.element(i, stream_view_); + return norm_squared_delta_primal_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_dual(i_t i) const { - return norm_squared_delta_dual_.element(i, stream_view_); + return norm_squared_delta_dual_.element(i, stream_view_.value()); } template @@ -337,7 +339,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction - <<<1, 1, 0, stream_view_>>>(this->view(), + <<<1, 1, 0, stream_view_.value()>>>(this->view(), primal_step_size.data(), dual_step_size.data(), pdhg_solver.get_d_total_pdhg_iterations().data()); @@ -345,7 +347,27 @@ void adaptive_step_size_strategy_t::compute_step_sizes( } graph.launch(total_pdlp_iterations); // Steam sync so that next call can see modification made to host var valid_step_size - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); +} + +template +__global__ void validate_interaction_and_movement_outputs( + raft::device_span norm_squared_delta_primal, + 
raft::device_span norm_squared_delta_dual, + raft::device_span interaction) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= norm_squared_delta_primal.size()) { return; } + cuopt_assert(!isnan(norm_squared_delta_primal[idx]), + "norm_squared_delta_primal is NaN after reduction"); + cuopt_assert(!isnan(norm_squared_delta_dual[idx]), + "norm_squared_delta_dual is NaN after reduction"); + cuopt_assert(!isnan(interaction[idx]), + "interaction is NaN after reduction"); + cuopt_assert(norm_squared_delta_primal[idx] >= f_t(0.0), + "norm_squared_delta_primal must be >= 0 after reduction"); + cuopt_assert(norm_squared_delta_dual[idx] >= f_t(0.0), + "norm_squared_delta_dual must be >= 0 after reduction"); } template @@ -382,7 +404,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // We need to make sure both dot products happens after previous operations (next_primal/dual) // Thus, we add another node in the main stream before starting the SpMVs - if (!batch_mode_) deltas_are_done_.record(stream_view_); + if (!batch_mode_) deltas_are_done_.record(stream_view_.value()); // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . 
(A @ x' - // A@x)) to save one SpMV @@ -406,7 +428,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.next_AtY, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); } else { // TODO later batch mode: handle if not all restart RAFT_CUSPARSE_TRY( @@ -420,7 +442,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.batch_next_AtYs, CUSPARSE_SPMM_CSR_ALG3, (f_t*)cusparse_view.buffer_transpose_batch.data(), - stream_view_)); + stream_view_.value())); } // Compute Ay' - Ay = next_Aty - current_Aty @@ -433,6 +455,31 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::minus<>{}, stream_view_.value()); + // Validate tmp_primal (A^T @ delta_y) has no NaN/Inf + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + tmp_primal.data(), + tmp_primal.data() + tmp_primal.size(), + is_nan_or_inf{}), + "tmp_primal (A^T @ delta_y) contains NaN or Inf in compute_interaction_and_movement"); + + // Validate delta_primal and delta_dual inputs have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + current_saddle_point_state.get_delta_primal().data(), + current_saddle_point_state.get_delta_primal().data() + + current_saddle_point_state.get_delta_primal().size(), + is_nan_or_inf{}), + "delta_primal contains NaN or Inf in compute_interaction_and_movement"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + current_saddle_point_state.get_delta_dual().data(), + current_saddle_point_state.get_delta_dual().data() + + current_saddle_point_state.get_delta_dual().size(), + is_nan_or_inf{}), + "delta_dual contains NaN or Inf in compute_interaction_and_movement"); + if (!batch_mode_) { // compute interaction (x'-x) . 
(A(y'-y)) RAFT_CUBLAS_TRY( @@ -443,7 +490,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, interaction_.data(), - stream_view_)); + stream_view_.value())); // Compute movement // compute euclidean norm squared which is @@ -453,7 +500,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); + // All dot products run on stream_view_ to avoid concurrent cuBLAS workspace access + // (cuBLAS uses a single internal workspace shared across all streams for the same handle) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -462,10 +510,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, norm_squared_delta_primal_.data(), - stream_pool_.get_stream(0))); - dot_delta_X_.record(stream_pool_.get_stream(0)); + stream_view_.value())); - deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_dual_size(), @@ -474,12 +520,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_dual().data(), dual_stride, norm_squared_delta_dual_.data(), - stream_pool_.get_stream(1))); - dot_delta_Y_.record(stream_pool_.get_stream(1)); - - // Wait on main stream for both dot to be done before launching the next kernel - dot_delta_X_.stream_wait(stream_view_); - dot_delta_Y_.stream_wait(stream_view_); + stream_view_.value())); } else { // TODO later batch mode: remove this once you want to do per climber restart cub::DeviceSegmentedReduce::Sum( @@ -492,7 +533,7 @@ void 
adaptive_step_size_strategy_t::compute_interaction_and_movement( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -502,7 +543,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -512,7 +553,14 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_); + stream_view_.value()); + + validate_interaction_and_movement_outputs + <<<1, climber_strategies_.size(), 0, stream_view_>>>( + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_), + make_span(interaction_)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 16e2d64957..14180b5bfd 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -21,7 +21,7 @@ template class ping_pong_graph_t { public: ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(true) { } diff --git a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py index 19db315349..4ec4a9aaf2 100644 --- a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py +++ b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py @@ -207,6 +207,44 @@ def set_pdlp_warm_start_data(self, pdlp_warm_start_data): """ self.pdlp_warm_start_data = pdlp_warm_start_data + def 
set_mip_batch_pdlp_strong_branching(self, enable): + """ + Note: Only supported for MILP + + Toggle batch PDLP strong branching in the MIP solver. + + Parameters + ---------- + enable : bool + If True, enable batch PDLP strong branching (value 1). + If False, disable it (value 0). + + Examples + -------- + >>> settings.set_mip_batch_pdlp_strong_branching(True) + """ + self.set_parameter( + "mip_batch_pdlp_strong_branching", 1 if enable else 0 + ) + + def get_mip_batch_pdlp_strong_branching(self): + """ + Note: Only supported for MILP + + Get the current value of the batch PDLP strong branching setting. + + Returns + ------- + bool + True if batch PDLP strong branching is enabled, False otherwise. + + Examples + -------- + >>> settings.get_mip_batch_pdlp_strong_branching() + False + """ + return bool(self.get_parameter("mip_batch_pdlp_strong_branching")) + def set_mip_callback(self, callback, user_data): """ Note: Only supported for MILP diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 8412c745b5..59ea62089d 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -450,6 +450,11 @@ class SolverConfig(BaseModel): description="Set True to run heuristics only, False to run " "heuristics and branch and bound for MILP", ) + mip_batch_pdlp_strong_branching: Optional[int] = Field( + default=0, + description="Set 1 to enable batch PDLP strong branching " + "in the MIP solver, 0 to disable.", + ) num_cpu_threads: Optional[int] = Field( default=None, description="Set the number of CPU threads to use for branch and bound.", # noqa diff --git a/run_multiple.sh b/run_multiple.sh new file mode 100755 index 0000000000..183b25b46e --- /dev/null +++ b/run_multiple.sh @@ -0,0 +1,3 @@ +for i in {1..5}; do + python test.py +done \ No newline at 
end of file diff --git a/test.py b/test.py new file mode 100755 index 0000000000..6cb236dae2 --- /dev/null +++ b/test.py @@ -0,0 +1,12 @@ +import cuopt_mps_parser +from cuopt.linear_programming import Solve, SolverSettings + +data_model = cuopt_mps_parser.ParseMps("batch_instances/neos8.mps") + +settings = SolverSettings() +settings.set_mip_batch_pdlp_strong_branching(True) + +solution = Solve(data_model, settings) + +print(solution.get_termination_reason()) +print(solution.get_primal_objective()) From 1614bc14836050bbf30308c7c7edf140f352770e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 17 Feb 2026 17:42:10 +0100 Subject: [PATCH 04/43] fix --- cpp/src/pdlp/pdlp.cu | 89 +++++++------------ .../restart_strategy/pdlp_restart_strategy.cu | 1 - .../convergence_information.cu | 66 ++++++++------ cpp/src/pdlp/utils.cuh | 20 +++-- 4 files changed, 82 insertions(+), 94 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index e1ab866b5b..082299902d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1188,24 +1188,42 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) { + auto min_nonzero = [] __host__ __device__(f_t x) -> f_t { return x == 0 ? 
std::numeric_limits::max() : abs(x); }; - smallest = thrust::transform_reduce(rmm::exec_policy(vec.stream()), - vec.begin(), - vec.end(), - min_nonzero, - std::numeric_limits::max(), - thrust::minimum()); - - largest = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::maximum()); - - f_t sum = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::plus()); - - avg = sum / vec.size(); + auto stream = vec.stream(); + auto n = static_cast(vec.size()); + + rmm::device_scalar d_smallest(stream); + rmm::device_scalar d_largest(stream); + rmm::device_scalar d_sum(stream); + + auto min_nz_iter = thrust::make_transform_iterator(vec.cbegin(), min_nonzero); + auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); + + void* d_temp = nullptr; + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 1; + cub::DeviceReduce::Reduce( + d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); + cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); + cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + + size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); + rmm::device_buffer temp_buf(max_bytes, stream); + + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + + smallest = d_smallest.value(stream); + largest = d_largest.value(stream); + avg = d_sum.value(stream) / vec.size(); }; template @@ -1895,47 +1913,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& 
has_restarte stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); - // Host-side diagnostic: copy small device arrays and verify movement + interaction >= 0 - { - const auto bs = climber_strategies_.size(); - std::vector h_nsq_dp(bs), h_nsq_dd(bs), h_pw(bs), h_ss(bs), h_inter(bs); - RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dp.data(), - step_size_strategy_.get_norm_squared_delta_primal().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dd.data(), - step_size_strategy_.get_norm_squared_delta_dual().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy( - h_pw.data(), primal_weight_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY( - cudaMemcpy(h_ss.data(), step_size_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(h_inter.data(), - step_size_strategy_.get_interaction().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < bs; ++i) { - const f_t movement = h_nsq_dp[i] * h_pw[i] + h_nsq_dd[i] / h_pw[i]; - const f_t comp_inter = f_t(2.0) * h_inter[i] * h_ss[i]; - if (movement + comp_inter < f_t(0.0)) { - fprintf(stderr, - "DIAGNOSTIC [%zu]: movement=%.17e comp_inter=%.17e sum=%.17e " - "norm_sq_dx=%.17e norm_sq_dy=%.17e pw=%.17e ss=%.17e interaction=%.17e\n", - i, - (double)movement, - (double)comp_inter, - (double)(movement + comp_inter), - (double)h_nsq_dp[i], - (double)h_nsq_dd[i], - (double)h_pw[i], - (double)h_ss[i], - (double)h_inter[i]); - } - cuopt_assert(movement + comp_inter >= f_t(0.0), - "Host check: movement + computed_interaction must be >= 0"); - } - } #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index a6304a8568..b2ed166a2d 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ 
b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -38,7 +38,6 @@ #include #include #include -#include #include diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 97e0e9c0e9..eec078f8d7 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -395,28 +395,28 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) - thrust::device_ptr result_ptr(linf_primal_residual_.data()); - const f_t neutral = f_t(0.0); - if (settings.save_best_primal_so_far) { const i_t zero_int = 0; nb_violated_constraints_.set_value_async(zero_int, handle_ptr_->get_stream()); - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); - } else { - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); } + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), + relative_residual_t{settings.tolerances.relative_primal_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_); + 
rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_); } compute_dual_residual(op_problem_cusparse_view_, @@ -458,16 +458,26 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) - thrust::device_ptr result_ptr(linf_dual_residual_.data()); - const f_t neutral = f_t(0.0); - - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), - thrust::make_zip_iterator(dual_residual_.cend(), objective_coefficients.cend()), - relative_residual_t{settings.tolerances.relative_dual_tolerance}, - neutral, - thrust::maximum()); + { + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), + relative_residual_t{settings.tolerances.relative_dual_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + } } const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index d48ae21c1a..9150ab8c51 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -604,15 +604,17 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, f_t* result, raft::handle_t const* handle_ptr) { - const f_t neutral = f_t(0.0); - 
thrust::device_ptr result_ptr(result); - - *result_ptr = thrust::transform_reduce(handle_ptr->get_thrust_policy(), - input_vector.data(), - input_vector.data() + input_vector.size(), - abs_t{}, - neutral, - thrust::maximum()); + auto stream = handle_ptr->get_stream(); + auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); + auto n = static_cast(input_vector.size()); + + void* d_temp = nullptr; + size_t temp_bytes = 0; + cub::DeviceReduce::Max( + d_temp, temp_bytes, abs_iter, result, n, stream); + rmm::device_buffer temp_buf(temp_bytes, stream); + cub::DeviceReduce::Max( + temp_buf.data(), temp_bytes, abs_iter, result, n, stream); } template From 4f3353191f6178ce02ffb3449daa9891ee8eb38e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 13:18:41 +0100 Subject: [PATCH 05/43] fix --- .../adaptive_step_size_strategy.cu | 25 ---- cpp/src/pdlp/utilities/ping_pong_graph.cu | 123 ++++++++++++++++++ cpp/src/pdlp/utilities/ping_pong_graph.cuh | 87 ++----------- 3 files changed, 137 insertions(+), 98 deletions(-) create mode 100644 cpp/src/pdlp/utilities/ping_pong_graph.cu diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 47ba16a297..32e21cfbf6 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -455,31 +455,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::minus<>{}, stream_view_.value()); - // Validate tmp_primal (A^T @ delta_y) has no NaN/Inf - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - tmp_primal.data(), - tmp_primal.data() + tmp_primal.size(), - is_nan_or_inf{}), - "tmp_primal (A^T @ delta_y) contains NaN or Inf in compute_interaction_and_movement"); - - // Validate delta_primal and delta_dual inputs have no NaN/Inf - 
cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - current_saddle_point_state.get_delta_primal().data(), - current_saddle_point_state.get_delta_primal().data() + - current_saddle_point_state.get_delta_primal().size(), - is_nan_or_inf{}), - "delta_primal contains NaN or Inf in compute_interaction_and_movement"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - current_saddle_point_state.get_delta_dual().data(), - current_saddle_point_state.get_delta_dual().data() + - current_saddle_point_state.get_delta_dual().size(), - is_nan_or_inf{}), - "delta_dual contains NaN or Inf in compute_interaction_and_movement"); - if (!batch_mode_) { // compute interaction (x'-x) . (A(y'-y)) RAFT_CUBLAS_TRY( diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu new file mode 100644 index 0000000000..08045b47a1 --- /dev/null +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -0,0 +1,123 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include + +#include + +namespace cuopt::linear_programming::detail { + +template +ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, + bool is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) +{ +} + +template +void ping_pong_graph_t::cancel_active_capture() +{ + CUOPT_LOG_ERROR( + "Canceling active capture in ping_pong_graph_t"); + if (capture_even_active_) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + capture_even_active_ = false; + } + if (capture_odd_active_) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + capture_odd_active_ = false; + } +} + +template +ping_pong_graph_t::~ping_pong_graph_t() +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + // This should not happen, but in case a graph was capturing while destroying the object + if (capture_even_active_ || capture_odd_active_) { + cancel_active_capture(); + } + if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } + if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } + } +#endif +} + +template +void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY( + cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_even_active_ = true; + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY( + cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_odd_active_ = true; + } + } +#endif +} + +template +void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) +{ +#ifndef 
CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + capture_even_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + even_initialized = true; + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + capture_odd_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + odd_initialized = true; + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + } + } +#endif +} + +template +void ping_pong_graph_t::launch(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && even_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + } + } +#endif +} + +template +bool ping_pong_graph_t::is_initialized(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + return (total_pdlp_iterations % 2 == 0 && even_initialized) || + (total_pdlp_iterations % 2 == 1 && odd_initialized); + } +#endif + return false; +} + +template class ping_pong_graph_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 14180b5bfd..9d6ead8cf7 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -9,6 +9,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -17,83 +19,20 @@ namespace cuopt::linear_programming::detail { // No additional checks for safe usage (calling launch() before initializing the graph) use with // 
caution Binary part is because in pdlp we swap pointers instead of copying vectors to accept a // valid pdhg step So every odd pdlp step it's one graph, every even step it's another graph -template + template class ping_pong_graph_t { public: - ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(true) - { - } - - ~ping_pong_graph_t() - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } - if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } - } -#endif - } - - void start_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } - } -#endif - } - - void end_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); - even_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); - odd_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); - } - } -#endif - } + ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false); + ~ping_pong_graph_t(); - void launch(i_t total_pdlp_iterations) 
- { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); - } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); - } - } -#endif - } - - bool is_initialized(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - return (total_pdlp_iterations % 2 == 0 && even_initialized) || - (total_pdlp_iterations % 2 == 1 && odd_initialized); - } -#endif - return false; - } + void start_capture(i_t total_pdlp_iterations); + void end_capture(i_t total_pdlp_iterations); + void launch(i_t total_pdlp_iterations); + bool is_initialized(i_t total_pdlp_iterations); private: + void cancel_active_capture(); + cudaGraph_t even_graph; cudaGraph_t odd_graph; cudaGraphExec_t even_instance; @@ -101,7 +40,9 @@ class ping_pong_graph_t { rmm::cuda_stream_view stream_view_; bool even_initialized{false}; bool odd_initialized{false}; - // Temporary fix to disable cuda graph in legacy batch mode + bool capture_even_active_{false}; + bool capture_odd_active_{false}; bool is_legacy_batch_mode_{false}; }; + } // namespace cuopt::linear_programming::detail From e0a530ed449f7c904d651701aa066dd90a5edd54 Mon Sep 17 00:00:00 2001 From: Trevor McKay Date: Tue, 17 Feb 2026 13:49:33 -0500 Subject: [PATCH 06/43] workaround for thrust reverse iterator build error --- .../restart_strategy/pdlp_restart_strategy.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index b2ed166a2d..e42a05e1e6 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1990,18 +1991,18 @@ 
void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( threshold_.end(), std::numeric_limits::infinity()); // Easier / Cleaner than to do reverse iterator arithmetic - f_t* start = threshold_.data(); - f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; - auto highest_negInf_primal = - thrust::find(handle_ptr_->get_thrust_policy(), - thrust::device_ptr(end), - thrust::device_ptr(start), - -std::numeric_limits::infinity()); + f_t* start = threshold_.data(); + f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; + using rev_iter_t = thrust::reverse_iterator>; + auto highest_negInf_primal = thrust::find(handle_ptr_->get_thrust_policy(), + rev_iter_t(thrust::device_ptr(end)), + rev_iter_t(thrust::device_ptr(start)), + -std::numeric_limits::infinity()); // Set ranges accordingly i_t index_start_primal = 0; i_t index_end_primal = primal_size_h_ + dual_size_h_; - if (highest_negInf_primal != thrust::device_ptr(start)) { + if (highest_negInf_primal != rev_iter_t(thrust::device_ptr(start))) { cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*highest_negInf_primal)) == -std::numeric_limits::infinity(), "Incorrect primal reverse iterator"); From e330718a2f40799de9a22615d1e99954bd821922 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 13:55:20 +0100 Subject: [PATCH 07/43] remove compile file --- compile.sh | 2 -- 1 file changed, 2 deletions(-) delete mode 100755 compile.sh diff --git a/compile.sh b/compile.sh deleted file mode 100755 index bedf3a7506..0000000000 --- a/compile.sh +++ /dev/null @@ -1,2 +0,0 @@ -./build.sh libcuopt libmps_parser --cache-tool=ccache --skip-tests-build -a -l=OFF -./build.sh cuopt cuopt_mps_parser \ No newline at end of file From dce6d4fec3d5a2ae6d3313e4f1af2db4ce55ccea Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 15:56:09 +0100 Subject: [PATCH 08/43] fix --- .../linear_programming/cuopt/run_pdlp.cu | 20 ++++++--- cpp/src/branch_and_bound/pseudo_costs.cpp | 3 +- 
cpp/src/pdlp/CMakeLists.txt | 1 + cpp/src/pdlp/pdlp.cu | 3 -- .../adaptive_step_size_strategy.cu | 44 +------------------ .../adaptive_step_size_strategy.hpp | 9 ---- cpp/src/pdlp/utilities/ping_pong_graph.cu | 16 +++---- .../solver_settings/solver_settings.py | 38 ---------------- 8 files changed, 26 insertions(+), 108 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index c3d6ad42f4..64897264c9 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -70,15 +70,23 @@ static void parse_arguments(argparse::ArgumentParser& program) "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver " "modes."); - program.add_argument("--presolve") - .help("enable/disable presolve (default: true for MIP problems, false for LP problems)") - .default_value(0) - .scan<'i', int>() - .choices(0, 1); + program.add_argument("--presolver") + .help("Presolver to use. 
Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } +static cuopt::linear_programming::presolver_t string_to_presolver(const std::string& presolver) +{ + if (presolver == "None") return cuopt::linear_programming::presolver_t::None; + if (presolver == "Papilo") return cuopt::linear_programming::presolver_t::Papilo; + if (presolver == "PSLP") return cuopt::linear_programming::presolver_t::PSLP; + if (presolver == "Default") return cuopt::linear_programming::presolver_t::Default; + return cuopt::linear_programming::presolver_t::Default; +} + static cuopt::linear_programming::pdlp_solver_mode_t string_to_pdlp_solver_mode( const std::string& mode) { @@ -107,7 +115,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - //settings.presolve = program.get("--presolve"); + settings.presolver = string_to_presolver(program.get("--presolver")); return settings; } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 2ddc672750..e06f497bdc 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -335,8 +335,9 @@ void strong_branching(const user_problem_t& original_problem, } const auto mps_model = simplex_problem_to_mps_data_model(original_problem); + const raft::handle_t batch_pdlp_handle; const auto solutions = - batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values); + batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch diff --git 
a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index ced9da8edc..2071bdfdef 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -24,6 +24,7 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/infeasibility_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu + ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu ) # C and Python adapter files diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 082299902d..eaafd1293e 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1440,9 +1440,6 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - //printf("movement %lf\n", movement); - //printf("computed_interaction %lf\n", computed_interaction); - cuopt_assert( movement + computed_interaction >= f_t(0.0), "Movement + computed interaction must be >= 0"); diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 32e21cfbf6..d491106aaf 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,14 +28,10 @@ #include -#include - #include namespace cuopt::linear_programming::detail { -constexpr int parallel_stream_computation = 2; - template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, @@ -47,10 +43,6 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( const std::vector& climber_strategies, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params) : batch_mode_(climber_strategies.size() > 1), - stream_pool_(parallel_stream_computation), - 
dot_delta_X_(cudaEventDisableTiming), - dot_delta_Y_(cudaEventDisableTiming), - deltas_are_done_(cudaEventDisableTiming), handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_(primal_size), @@ -350,26 +342,6 @@ void adaptive_step_size_strategy_t::compute_step_sizes( RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); } -template -__global__ void validate_interaction_and_movement_outputs( - raft::device_span norm_squared_delta_primal, - raft::device_span norm_squared_delta_dual, - raft::device_span interaction) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= norm_squared_delta_primal.size()) { return; } - cuopt_assert(!isnan(norm_squared_delta_primal[idx]), - "norm_squared_delta_primal is NaN after reduction"); - cuopt_assert(!isnan(norm_squared_delta_dual[idx]), - "norm_squared_delta_dual is NaN after reduction"); - cuopt_assert(!isnan(interaction[idx]), - "interaction is NaN after reduction"); - cuopt_assert(norm_squared_delta_primal[idx] >= f_t(0.0), - "norm_squared_delta_primal must be >= 0 after reduction"); - cuopt_assert(norm_squared_delta_dual[idx] >= f_t(0.0), - "norm_squared_delta_dual must be >= 0 after reduction"); -} - template void adaptive_step_size_strategy_t::compute_interaction_and_movement( rmm::device_uvector& tmp_primal, @@ -393,7 +365,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( Deltas x & y were computed during pdhg step - We will compute in parallel (parallel cuda graph): + We will compute: ||(x' - x)|| ||(y' - y)|| (y' - y)_t . 
A @ (x' - x) @@ -401,11 +373,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( And finally merge the results */ - // We need to make sure both dot products happens after previous operations (next_primal/dual) - // Thus, we add another node in the main stream before starting the SpMVs - - if (!batch_mode_) deltas_are_done_.record(stream_view_.value()); - // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . (A @ x' - // A@x)) to save one SpMV // Instead we do: delta_x . (A_t @ y' - A_t @ y) @@ -475,8 +442,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - // All dot products run on stream_view_ to avoid concurrent cuBLAS workspace access - // (cuBLAS uses a single internal workspace shared across all streams for the same handle) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -529,13 +494,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( climber_strategies_.size(), dual_size_, stream_view_.value()); - - validate_interaction_and_movement_outputs - <<<1, climber_strategies_.size(), 0, stream_view_>>>( - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_), - make_span(interaction_)); - RAFT_CUDA_TRY(cudaPeekAtLastError()); } } diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp index 8e7e048b18..1e969150e7 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp @@ -91,15 +91,6 @@ class adaptive_step_size_strategy_t { private: const bool batch_mode_; - // Stream pool to run different step size computation in parallel - // Because we already have the main stream, we just need 2 extra streams from this - 
rmm::cuda_stream_pool stream_pool_; - - // Events to record when dot product of both delta_x and y are done and when to start them - event_handler_t deltas_are_done_; - event_handler_t dot_delta_X_; - event_handler_t dot_delta_Y_; - raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 08045b47a1..647672e535 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -58,11 +58,11 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( + RAFT_CUDA_TRY_NO_THROW( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( + RAFT_CUDA_TRY_NO_THROW( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } @@ -76,15 +76,15 @@ void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &even_graph)); capture_even_active_ = false; - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&even_instance, even_graph)); even_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); capture_odd_active_ = false; - 
RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&odd_instance, odd_graph)); odd_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); } @@ -98,9 +98,9 @@ void ping_pong_graph_t::launch(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(even_instance, stream_view_.value())); } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(odd_instance, stream_view_.value())); } } #endif diff --git a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py index 4ec4a9aaf2..19db315349 100644 --- a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py +++ b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py @@ -207,44 +207,6 @@ def set_pdlp_warm_start_data(self, pdlp_warm_start_data): """ self.pdlp_warm_start_data = pdlp_warm_start_data - def set_mip_batch_pdlp_strong_branching(self, enable): - """ - Note: Only supported for MILP - - Toggle batch PDLP strong branching in the MIP solver. - - Parameters - ---------- - enable : bool - If True, enable batch PDLP strong branching (value 1). - If False, disable it (value 0). - - Examples - -------- - >>> settings.set_mip_batch_pdlp_strong_branching(True) - """ - self.set_parameter( - "mip_batch_pdlp_strong_branching", 1 if enable else 0 - ) - - def get_mip_batch_pdlp_strong_branching(self): - """ - Note: Only supported for MILP - - Get the current value of the batch PDLP strong branching setting. - - Returns - ------- - bool - True if batch PDLP strong branching is enabled, False otherwise. 
- - Examples - -------- - >>> settings.get_mip_batch_pdlp_strong_branching() - False - """ - return bool(self.get_parameter("mip_batch_pdlp_strong_branching")) - def set_mip_callback(self, callback, user_data): """ Note: Only supported for MILP From 9c03faf0f2e5e5d44670507c8ac348ca3f142659 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 17:28:11 +0100 Subject: [PATCH 09/43] final cleanup --- cpp/src/pdlp/pdlp.cu | 60 ------------------------------------------- cpp/src/pdlp/solve.cu | 3 --- run_multiple.sh | 3 --- test.py | 12 --------- 4 files changed, 78 deletions(-) delete mode 100755 run_multiple.sh delete mode 100755 test.py diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index eaafd1293e..4b4eed1f32 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1825,66 +1825,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Validate reflected solutions have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_reflected_primal().data(), - pdhg_solver_.get_reflected_primal().data() + - pdhg_solver_.get_reflected_primal().size(), - is_nan_or_inf{}), - "reflected_primal contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_reflected_dual().data(), - pdhg_solver_.get_reflected_dual().data() + - pdhg_solver_.get_reflected_dual().size(), - is_nan_or_inf{}), - "reflected_dual contains NaN or Inf in compute_fixed_error"); - - // Validate primal/dual solutions have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().data() + - pdhg_solver_.get_primal_solution().size(), - is_nan_or_inf{}), - "primal_solution contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - 
!thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_dual_solution().data(), - pdhg_solver_.get_dual_solution().data() + - pdhg_solver_.get_dual_solution().size(), - is_nan_or_inf{}), - "dual_solution contains NaN or Inf in compute_fixed_error"); - - // Validate deltas have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data() + - pdhg_solver_.get_saddle_point_state().get_delta_primal().size(), - is_nan_or_inf{}), - "delta_primal contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data() + - pdhg_solver_.get_saddle_point_state().get_delta_dual().size(), - is_nan_or_inf{}), - "delta_dual contains NaN or Inf in compute_fixed_error"); - - // Validate primal_weight and step_size have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - primal_weight_.data(), - primal_weight_.data() + primal_weight_.size(), - is_nan_or_inf{}), - "primal_weight_ contains NaN or Inf in compute_fixed_error"); - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - step_size_.data(), - step_size_.data() + step_size_.size(), - is_nan_or_inf{}), - "step_size_ contains NaN or Inf in compute_fixed_error"); // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 374c9ff513..a2766be98a 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -793,7 +793,6 @@ optimization_problem_solution_t run_batch_pdlp( // If need warm start, solve the LP alone if (primal_dual_init || primal_weight_init) { - std::cout << "Solving 
LP for warm start" << std::endl; pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -842,8 +841,6 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } - std::cout << "Solving batch PDLP" << std::endl; - for (int i = 0; i < max_batch_size; i += optimal_batch_size) { const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) diff --git a/run_multiple.sh b/run_multiple.sh deleted file mode 100755 index 183b25b46e..0000000000 --- a/run_multiple.sh +++ /dev/null @@ -1,3 +0,0 @@ -for i in {1..5}; do - python test.py -done \ No newline at end of file diff --git a/test.py b/test.py deleted file mode 100755 index 6cb236dae2..0000000000 --- a/test.py +++ /dev/null @@ -1,12 +0,0 @@ -import cuopt_mps_parser -from cuopt.linear_programming import Solve, SolverSettings - -data_model = cuopt_mps_parser.ParseMps("batch_instances/neos8.mps") - -settings = SolverSettings() -settings.set_mip_batch_pdlp_strong_branching(True) - -solution = Solve(data_model, settings) - -print(solution.get_termination_reason()) -print(solution.get_primal_objective()) From 6c2fe356a9ffc0994f0e7b177233504aa73c1c7d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 17:39:07 +0100 Subject: [PATCH 10/43] additional cleanup --- cpp/src/pdlp/pdhg.cu | 4 ++++ cpp/src/pdlp/utils.cuh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index c5efdcd722..286d6de5b5 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -652,6 +652,8 @@ struct primal_reflected_projection_bulk_op { cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); cuopt_assert(!isnan(primal_val), "primal_solution is NaN in
primal_reflected_projection"); cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); f_t reflected = primal_val - step_size * (obj_coef - aty_val); @@ -688,6 +690,8 @@ struct dual_reflected_projection_bulk_op { cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); const f_t tmp = current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 9150ab8c51..0f2ed44c42 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -606,7 +606,7 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, { auto stream = handle_ptr->get_stream(); auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); - auto n = static_cast(input_vector.size()); + auto n = input_vector.size(); void* d_temp = nullptr; size_t temp_bytes = 0; From a43dc0c78b0d8bd833e8c0b05d8e6e46b0483703 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:43:06 +0100 Subject: [PATCH 11/43] address PR comments, add tests, update doc --- cpp/src/pdlp/pdlp.cu | 33 ++++++++------- .../convergence_information.cu | 8 ++-- cpp/src/pdlp/utilities/ping_pong_graph.cu | 41 +++++-------------- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 1 - docs/cuopt/source/lp-qp-milp-settings.rst | 10 +++++ .../linear_programming/test_python_API.py | 33 +++++++++++++++ 6 files changed, 75 insertions(+), 51 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu 
b/cpp/src/pdlp/pdlp.cu index 4b4eed1f32..eebfede7f9 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1192,8 +1192,10 @@ static void compute_stats(const rmm::device_uvector& vec, return x == 0 ? std::numeric_limits::max() : abs(x); }; + cuopt_assert(vec.size() > 0, "Vector must not be empty"); + auto stream = vec.stream(); - auto n = static_cast(vec.size()); + size_t n = vec.size(); rmm::device_scalar d_smallest(stream); rmm::device_scalar d_largest(stream); @@ -1203,23 +1205,23 @@ static void compute_stats(const rmm::device_uvector& vec, auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); void* d_temp = nullptr; - size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 1; - cub::DeviceReduce::Reduce( - d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); - cub::DeviceReduce::Reduce( - d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); - cub::DeviceReduce::Reduce( - d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); rmm::device_buffer temp_buf(max_bytes, stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, 
cuda::std::plus<>{}, f_t(0), stream); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); smallest = d_smallest.value(stream); largest = d_largest.value(stream); @@ -1444,7 +1446,8 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, movement + computed_interaction >= f_t(0.0), "Movement + computed interaction must be >= 0"); - *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); + // Clamp to 0 to avoid NaN + *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); #ifdef CUPDLP_DEBUG_MODE printf("movement %lf\n", movement); diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index eec078f8d7..269cb58f5d 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -404,19 +404,19 @@ void convergence_information_t::compute_convergence_information( relative_residual_t{settings.tolerances.relative_primal_tolerance}); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - cub::DeviceReduce::Max(d_temp_storage, + RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, transform_iter, linf_primal_residual_.data(), primal_residual_.size(), - stream_view_); + stream_view_)); rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); - cub::DeviceReduce::Max(temp_buf.data(), + RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), temp_storage_bytes, transform_iter, linf_primal_residual_.data(), primal_residual_.size(), - 
stream_view_); + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 647672e535..5effbcdc48 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -20,32 +20,11 @@ ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, { } -template -void ping_pong_graph_t::cancel_active_capture() -{ - CUOPT_LOG_ERROR( - "Canceling active capture in ping_pong_graph_t"); - if (capture_even_active_) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); - capture_even_active_ = false; - } - if (capture_odd_active_) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); - capture_odd_active_ = false; - } -} - template ping_pong_graph_t::~ping_pong_graph_t() { #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { - // This should not happen, but in case a graph was capturing while destroying the object - if (capture_even_active_ || capture_odd_active_) { - cancel_active_capture(); - } if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } } @@ -58,11 +37,11 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY_NO_THROW( + RAFT_CUDA_TRY( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY_NO_THROW( + RAFT_CUDA_TRY( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } @@ -76,17 +55,17 @@ void 
ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); capture_even_active_ = false; - RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&even_instance, even_graph)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); even_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + RAFT_CUDA_TRY(cudaGraphDestroy(even_graph)); } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); capture_odd_active_ = false; - RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&odd_instance, odd_graph)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); odd_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + RAFT_CUDA_TRY(cudaGraphDestroy(odd_graph)); } } #endif @@ -98,9 +77,9 @@ void ping_pong_graph_t::launch(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(even_instance, stream_view_.value())); + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(odd_instance, stream_view_.value())); + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); } } #endif diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 9d6ead8cf7..5113f804d6 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -31,7 +31,6 @@ class ping_pong_graph_t { 
bool is_initialized(i_t total_pdlp_iterations); private: - void cancel_active_capture(); cudaGraph_t even_graph; cudaGraph_t odd_graph; diff --git a/docs/cuopt/source/lp-qp-milp-settings.rst b/docs/cuopt/source/lp-qp-milp-settings.rst index 51c6142c2b..bd1372f70e 100644 --- a/docs/cuopt/source/lp-qp-milp-settings.rst +++ b/docs/cuopt/source/lp-qp-milp-settings.rst @@ -513,3 +513,13 @@ Set this value to 0 to disable reliability branching. Set this value to k > 0, to enable reliability branching. A variable will be considered reliable if it has been branched on k times. .. note:: The default value is ``-1`` (automatic). + +Batch PDLP Strong Branching +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING`` controls whether to use batched PDLP over Dual Simplex during strong branching at the root. +When enabled, the solver evaluates multiple branching candidates simultaneously in a single batched PDLP solve rather than solving them in parallel using Dual Simplex. This can significantly reduce the time spent in strong branching if Dual Simplex is struggling. +Set this value to 0 to disable batched PDLP strong branching. +Set this value to 1 to enable batched PDLP strong branching. + +.. note:: The default value is ``0`` (disabled). This setting is ignored if the problem is not a MIP problem. 
diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py index dc470f3828..0eca50ba9b 100644 --- a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py +++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py @@ -30,6 +30,7 @@ CUOPT_ELIMINATE_DENSE_COLUMNS, CUOPT_FOLDING, CUOPT_INFEASIBILITY_DETECTION, + CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, CUOPT_MIP_CUT_PASSES, CUOPT_METHOD, CUOPT_ORDERING, @@ -997,3 +998,35 @@ def test_cuts(): assert problem.Status.name == "Optimal" assert problem.ObjValue == pytest.approx(-126, abs=1e-3) assert problem.SolutionStats.num_nodes == 0 + + +def test_batch_pdlp_strong_branching(): + # Minimize - 86*y1 - 4*y2 - 40*y3 + # subject to 774*y1 + 76*y2 + 42*y3 <= 875 + # 67*y1 + 27*y2 + 53*y3 <= 875 + # y1, y2, y3 in {0, 1} + + problem = Problem() + y1 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y1") + y2 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y2") + y3 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y3") + + problem.addConstraint(774 * y1 + 76 * y2 + 42 * y3 <= 875) + problem.addConstraint(67 * y1 + 27 * y2 + 53 * y3 <= 875) + + problem.setObjective(-86 * y1 - 4 * y2 - 40 * y3) + + settings = SolverSettings() + settings.set_parameter(CUOPT_PRESOLVE, 0) + settings.set_parameter(CUOPT_TIME_LIMIT, 10) + settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 0) + + problem.solve(settings) + assert problem.Status.name == "Optimal" + assert problem.ObjValue == pytest.approx(-126, abs=1e-3) + + settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 1) + + problem.solve(settings) + assert problem.Status.name == "Optimal" + assert problem.ObjValue == pytest.approx(-126, abs=1e-3) From c8b8b74bc6cc5f65df94f868a0334464e360c374 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:44:54 +0100 Subject: [PATCH 12/43] format --- cpp/src/pdlp/pdlp.cu | 36 ++++++++++------- 
.../adaptive_step_size_strategy.cu | 17 ++++---- .../convergence_information.cu | 40 +++++++++---------- cpp/src/pdlp/utilities/ping_pong_graph.cu | 6 +-- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 3 +- cpp/src/pdlp/utils.cuh | 10 ++--- 6 files changed, 58 insertions(+), 54 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index eebfede7f9..72aead03d0 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -34,8 +34,8 @@ #include #include -#include #include +#include #include #include @@ -1188,14 +1188,13 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) -> f_t { - return x == 0 ? std::numeric_limits::max() : abs(x); - }; + auto min_nonzero = [] __host__ __device__(f_t x) + -> f_t { return x == 0 ? std::numeric_limits::max() : abs(x); }; cuopt_assert(vec.size() > 0, "Vector must not be empty"); auto stream = vec.stream(); - size_t n = vec.size(); + size_t n = vec.size(); rmm::device_scalar d_smallest(stream); rmm::device_scalar d_largest(stream); @@ -1206,8 +1205,14 @@ static void compute_stats(const rmm::device_uvector& vec, void* d_temp = nullptr; size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( - d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(d_temp, + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( @@ -1216,8 +1221,14 @@ static void compute_stats(const rmm::device_uvector& vec, size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); rmm::device_buffer temp_buf(max_bytes, stream); - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( - 
temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(temp_buf.data(), + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( @@ -1442,9 +1453,8 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - cuopt_assert( - movement + computed_interaction >= f_t(0.0), - "Movement + computed interaction must be >= 0"); + cuopt_assert(movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); // Clamp to 0 to avoid NaN *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); @@ -1828,7 +1838,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1853,7 +1862,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); - #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index d491106aaf..47e9a78a5e 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ 
b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -137,11 +137,12 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), - static_cast(swap_pairs.size()), - make_span(interaction_), - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_)); + <<>>( + thrust::raw_pointer_cast(swap_pairs.data()), + static_cast(swap_pairs.size()), + make_span(interaction_), + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -332,9 +333,9 @@ void adaptive_step_size_strategy_t::compute_step_sizes( // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction <<<1, 1, 0, stream_view_.value()>>>(this->view(), - primal_step_size.data(), - dual_step_size.data(), - pdhg_solver.get_d_total_pdhg_iterations().data()); + primal_step_size.data(), + dual_step_size.data(), + pdhg_solver.get_d_total_pdhg_iterations().data()); graph.end_capture(total_pdlp_iterations); } graph.launch(total_pdlp_iterations); diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 269cb58f5d..1e9a69d130 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -405,18 +405,18 @@ void convergence_information_t::compute_convergence_information( void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); 
rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, @@ -465,18 +465,18 @@ void convergence_information_t::compute_convergence_information( void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 5effbcdc48..4ec5bff8c1 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -37,12 +37,10 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + 
RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 5113f804d6..dafecdd06e 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -19,7 +19,7 @@ namespace cuopt::linear_programming::detail { // No additional checks for safe usage (calling launch() before initializing the graph) use with // caution Binary part is because in pdlp we swap pointers instead of copying vectors to accept a // valid pdhg step So every odd pdlp step it's one graph, every even step it's another graph - template +template class ping_pong_graph_t { public: ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false); @@ -31,7 +31,6 @@ class ping_pong_graph_t { bool is_initialized(i_t total_pdlp_iterations); private: - cudaGraph_t even_graph; cudaGraph_t odd_graph; cudaGraphExec_t even_instance; diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 0f2ed44c42..33625f7680 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -608,13 +608,11 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); auto n = input_vector.size(); - void* d_temp = nullptr; - size_t temp_bytes = 0; - cub::DeviceReduce::Max( - d_temp, temp_bytes, abs_iter, result, n, stream); + void* d_temp = nullptr; + size_t temp_bytes = 0; + cub::DeviceReduce::Max(d_temp, temp_bytes, abs_iter, result, n, stream); rmm::device_buffer temp_buf(temp_bytes, stream); - cub::DeviceReduce::Max( - temp_buf.data(), temp_bytes, abs_iter, result, n, stream); + cub::DeviceReduce::Max(temp_buf.data(), temp_bytes, abs_iter, result, n, stream); } template From b1be5bb2d0b729ddb685b722ad50c41f5a32aa58 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:45:01 
+0100 Subject: [PATCH 13/43] format --- benchmarks/linear_programming/cuopt/run_pdlp.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index 64897264c9..e9b4f8296c 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -71,9 +71,9 @@ static void parse_arguments(argparse::ArgumentParser& program) "modes."); program.add_argument("--presolver") - .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") - .default_value("Default") - .choices("None", "Papilo", "PSLP", "Default"); + .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } From d89af961b3d8af72d22f027114ac1921809c3ccc Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:53:59 +0100 Subject: [PATCH 14/43] style --- .../restart_strategy/pdlp_restart_strategy.cu | 827 +++++++++--------- 1 file changed, 415 insertions(+), 412 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index cf715e8a1d..0b1c109185 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -2008,462 +2008,465 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( "Incorrect primal reverse iterator"); index_start_primal = thrust::raw_pointer_cast(&*highest_negInf_primal) - threshold_.data() + 1; // + 1 to go after last negInf - if (lowest_inf != end) { + if (lowest_inf != end) { std::numeric_limits::infinity(), "Incorrect primal iterator"); - index_end_primal = - thrust::raw_pointer_cast(lowest_inf) - - threshold_.data(); // no - 1 to go before the first inf because 
end is not included - testing_range_high_.set_value_async(index_end_primal, stream_view_); - } else // No inf found, end is primal_size_h_ - testing_range_high_.set_value_async(index_end_primal, stream_view_); - cuopt_assert(index_start_primal <= index_end_primal, - "Start should be strictly smaller than end"); - - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - threshold_.data() + index_start_primal, - threshold_.data() + index_end_primal, - is_nan_or_inf()), - "Threshold vector should not contain inf or NaN values"); - - // Init parameters for live kernel - // Has to do this to pass lvalues (and not rvalue) to void* kernel_args - auto restart_view = this->view(); - auto op_view = problem_ptr->view(); - i_t* testing_range_low = testing_range_low_.data(); - i_t* testing_range_high = testing_range_high_.data(); - f_t* test_radius_squared = test_radius_squared_.data(); - f_t* low_radius_squared = low_radius_squared_.data(); - f_t* high_radius_squared = high_radius_squared_.data(); - f_t* distance_traveled = duality_gap.distance_traveled_.data(); - - void* kernel_args[] = { - &restart_view, - &op_view, - &testing_range_low, - &testing_range_high, - &test_radius_squared, - &low_radius_squared, - &high_radius_squared, - &distance_traveled, - }; - constexpr int numThreads = 128; - dim3 dimBlock(numThreads, 1, 1); - // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * - // numBlocksPerSm - dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); - // Compute the median for the join problem, while loop is inside the live kernel - RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( - (void*)solve_bound_constrained_trust_region_kernel, - dimGrid, - dimBlock, - kernel_args, - 0, - stream_view_)); - - // Find max threshold for the join problem - const f_t* max_threshold = - thrust::max_element(handle_ptr_->get_thrust_policy(), - threshold_.data(), - threshold_.data() + primal_size_h_ + dual_size_h_); - - // we have now determined the 
test_threshold that should minimize the objective value of the - // solution. - - // if no component got fixed by their upper bound we can pick the maximum threshold to be the - // target_threshold which was computed before the loop in the direction_and_threshold_kernel - // Otherwise use the test_threshold determined in the loop - // { - target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( - this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - // } + index_end_primal = + thrust::raw_pointer_cast(lowest_inf) - + threshold_ + .data(); // no - 1 to go before the first inf because end is not included + testing_range_high_.set_value_async(index_end_primal, stream_view_); + } else // No inf found, end is primal_size_h_ + testing_range_high_.set_value_async(index_end_primal, stream_view_); + cuopt_assert(index_start_primal <= index_end_primal, + "Start should be strictly smaller than end"); + + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + threshold_.data() + index_start_primal, + threshold_.data() + index_end_primal, + is_nan_or_inf()), + "Threshold vector should not contain inf or NaN values"); + + // Init parameters for live kernel + // Has to do this to pass lvalues (and not rvalue) to void* kernel_args + auto restart_view = this->view(); + auto op_view = problem_ptr->view(); + i_t* testing_range_low = testing_range_low_.data(); + i_t* testing_range_high = testing_range_high_.data(); + f_t* test_radius_squared = test_radius_squared_.data(); + f_t* low_radius_squared = low_radius_squared_.data(); + f_t* high_radius_squared = high_radius_squared_.data(); + f_t* distance_traveled = duality_gap.distance_traveled_.data(); + + void* kernel_args[] = { + &restart_view, + &op_view, + &testing_range_low, + &testing_range_high, + &test_radius_squared, + &low_radius_squared, + &high_radius_squared, + &distance_traveled, + }; + constexpr int numThreads = 128; + dim3 
dimBlock(numThreads, 1, 1); + // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * + // numBlocksPerSm + dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); + // Compute the median for the join problem, while loop is inside the live kernel + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( + (void*)solve_bound_constrained_trust_region_kernel, + dimGrid, + dimBlock, + kernel_args, + 0, + stream_view_)); + + // Find max threshold for the join problem + const f_t* max_threshold = + thrust::max_element(handle_ptr_->get_thrust_policy(), + threshold_.data(), + threshold_.data() + primal_size_h_ + dual_size_h_); + + // we have now determined the test_threshold that should minimize the objective value of the + // solution. + + // if no component got fixed by their upper bound we can pick the maximum threshold to be the + // target_threshold which was computed before the loop in the direction_and_threshold_kernel + // Otherwise use the test_threshold determined in the loop + // { + target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( + this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // } + + // Compute x (the solution which is defined by moving each component test_threshold * + // direction[component]) clamp on upper and lower bounds. 
+ // Used unsorted_direction_full_ as the other one got sorted + // { + raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), + duality_gap.primal_solution_.data(), + unsorted_direction_full_.data(), + primal_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_.data(), + unsorted_direction_full_.data() + primal_size_h_, + dual_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part + using f_t2 = typename type_2::type; + cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), + problem_ptr->variable_bounds.data()), + duality_gap.primal_solution_tr_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + + // project by max(min(y[i], upperbound[i]),lowerbound[i]) + raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_tr_.data(), + transformed_constraint_lower_bounds_.data(), + transformed_constraint_upper_bounds_.data(), + dual_size_h_, + constraint_clamp(), + stream_view_); + // } + } - // Compute x (the solution which is defined by moving each component test_threshold * - // direction[component]) clamp on upper and lower bounds. 
- // Used unsorted_direction_full_ as the other one got sorted + // Compute the current lower bound for the objective value using the primal solution_tr and + // upper bound for the objective value using the dual solution_tr // { - raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), - duality_gap.primal_solution_.data(), - unsorted_direction_full_.data(), - primal_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_.data(), - unsorted_direction_full_.data() + primal_size_h_, - dual_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part - using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), - problem_ptr->variable_bounds.data()), - duality_gap.primal_solution_tr_.data(), - primal_size_h_, - clamp(), - stream_view_.value()); - - // project by max(min(y[i], upperbound[i]),lowerbound[i]) - raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_tr_.data(), - transformed_constraint_lower_bounds_.data(), - transformed_constraint_upper_bounds_.data(), - dual_size_h_, - constraint_clamp(), - stream_view_); + // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, + // primal_gradient)) + compute_bound(duality_gap.primal_solution_tr_, + duality_gap.primal_solution_, + duality_gap.primal_gradient_, + duality_gap.lagrangian_value_, + primal_size_h_, + primal_stride, + tmp_primal, + duality_gap.lower_bound_value_); + + // compute 'upper bound' using dual + compute_bound(duality_gap.dual_solution_tr_, + duality_gap.dual_solution_, + duality_gap.dual_gradient_, + duality_gap.lagrangian_value_, + dual_size_h_, + dual_stride, + tmp_dual, + duality_gap.upper_bound_value_); + // } } - // Compute the current lower 
bound for the objective value using the primal solution_tr and - // upper bound for the objective value using the dual solution_tr - // { - // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, - // primal_gradient)) - compute_bound(duality_gap.primal_solution_tr_, - duality_gap.primal_solution_, - duality_gap.primal_gradient_, - duality_gap.lagrangian_value_, - primal_size_h_, - primal_stride, - tmp_primal, - duality_gap.lower_bound_value_); - - // compute 'upper bound' using dual - compute_bound(duality_gap.dual_solution_tr_, - duality_gap.dual_solution_, - duality_gap.dual_gradient_, - duality_gap.lagrangian_value_, - dual_size_h_, - dual_stride, - tmp_dual, - duality_gap.upper_bound_value_); - - // } -} - -template -void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( - localized_duality_gap_container_t& duality_gap, - rmm::device_uvector& primal_weight, - rmm::device_uvector& tmp_primal, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); - // norm( - // new_primal_solution - last_restart.primal_solution, - // )^2 - - // Julia / Paper use a weighted norm using primal weight for primal / dual distance - // We simply use L2 norm of diff - distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, - last_restart_duality_gap_.primal_solution_, - tmp_primal, - primal_size_h_, - primal_stride, - duality_gap.primal_distance_traveled_); - - // compute similarly for dual - distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, - last_restart_duality_gap_.dual_solution_, - tmp_dual, - dual_size_h_, - dual_stride, - duality_gap.dual_distance_traveled_); - - // distance_traveled = primal_distance * 0.5 * primal_weight - // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), 
duality_gap.distance_traveled_.data()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} + template + void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( + localized_duality_gap_container_t & duality_gap, + rmm::device_uvector & primal_weight, + rmm::device_uvector & tmp_primal, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); + // norm( + // new_primal_solution - last_restart.primal_solution, + // )^2 + + // Julia / Paper use a weighted norm using primal weight for primal / dual distance + // We simply use L2 norm of diff + distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, + last_restart_duality_gap_.primal_solution_, + tmp_primal, + primal_size_h_, + primal_stride, + duality_gap.primal_distance_traveled_); + + // compute similarly for dual + distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, + last_restart_duality_gap_.dual_solution_, + tmp_dual, + dual_size_h_, + dual_stride, + duality_gap.dual_distance_traveled_); + + // distance_traveled = primal_distance * 0.5 * primal_weight + // + dual_distance * 0.5 / primal_weight + compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( + duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } -template -void pdlp_restart_strategy_t::compute_primal_gradient( - localized_duality_gap_container_t& duality_gap, - cusparse_view_t& cusparse_view) -{ - raft::common::nvtx::range fun_scope("compute_primal_gradient"); + template + void pdlp_restart_strategy_t::compute_primal_gradient( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view) + { + raft::common::nvtx::range fun_scope("compute_primal_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute primal gradient:" << std::endl; + std::cout << " Compute primal gradient:" << std::endl; #endif - // for QP add 
problem.objective_matrix * primal_solution as well - // c - A^T*y (copy c to primal_gradient for correct writing of result) - raft::copy(duality_gap.primal_gradient_.data(), - problem_ptr->objective_coefficients.data(), - primal_size_h_, - stream_view_); + // for QP add problem.objective_matrix * primal_solution as well + // c - A^T*y (copy c to primal_gradient for correct writing of result) + raft::copy(duality_gap.primal_gradient_.data(), + problem_ptr->objective_coefficients.data(), + primal_size_h_, + stream_view_); - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.primal_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); -} + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.primal_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + } -template -__global__ void compute_subgradient_kernel( - const typename pdlp_restart_strategy_t::view_t restart_strategy_view, - const typename problem_t::view_t op_problem_view, - const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* subgradient) -{ - i_t id = threadIdx.x + blockIdx.x * blockDim.x; - if (id >= duality_gap_view.dual_size) { return; } - - f_t lower = op_problem_view.constraint_lower_bounds[id]; - f_t upper = op_problem_view.constraint_upper_bounds[id]; - f_t primal_product = duality_gap_view.dual_gradient[id]; - f_t dual_solution = duality_gap_view.dual_solution[id]; - - f_t subgradient_coefficient; - - if (dual_solution < 
f_t(0)) { - subgradient_coefficient = upper; - } else if (dual_solution > f_t(0)) { - subgradient_coefficient = lower; - } else if (!isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = f_t(0); - } else if (!isfinite(upper) && isfinite(lower)) { - subgradient_coefficient = lower; - } else if (isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = upper; - } else { - if (primal_product < lower) { + template + __global__ void compute_subgradient_kernel( + const typename pdlp_restart_strategy_t::view_t restart_strategy_view, + const typename problem_t::view_t op_problem_view, + const typename localized_duality_gap_container_t::view_t duality_gap_view, + f_t* subgradient) + { + i_t id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= duality_gap_view.dual_size) { return; } + + f_t lower = op_problem_view.constraint_lower_bounds[id]; + f_t upper = op_problem_view.constraint_upper_bounds[id]; + f_t primal_product = duality_gap_view.dual_gradient[id]; + f_t dual_solution = duality_gap_view.dual_solution[id]; + + f_t subgradient_coefficient; + + if (dual_solution < f_t(0)) { + subgradient_coefficient = upper; + } else if (dual_solution > f_t(0)) { subgradient_coefficient = lower; - } else if (primal_product > upper) { + } else if (!isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = f_t(0); + } else if (!isfinite(upper) && isfinite(lower)) { + subgradient_coefficient = lower; + } else if (isfinite(upper) && !isfinite(lower)) { subgradient_coefficient = upper; } else { - subgradient_coefficient = primal_product; + if (primal_product < lower) { + subgradient_coefficient = lower; + } else if (primal_product > upper) { + subgradient_coefficient = upper; + } else { + subgradient_coefficient = primal_product; + } } - } - subgradient[id] = subgradient_coefficient; -} + subgradient[id] = subgradient_coefficient; + } -template -void pdlp_restart_strategy_t::compute_dual_gradient( - localized_duality_gap_container_t& duality_gap, - 
cusparse_view_t& cusparse_view, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_dual_gradient"); + template + void pdlp_restart_strategy_t::compute_dual_gradient( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_dual_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute dual gradient:" << std::endl; + std::cout << " Compute dual gradient:" << std::endl; #endif - // b - A*x - // is changed with the introduction of constraint upper and lower bounds - - // gradient constains primal_product - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A, - cusparse_view.primal_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.dual_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), - stream_view_)); - - // tmp_dual will contain the subgradient - i_t number_of_blocks = dual_size_h_ / block_size; - if (dual_size_h_ % block_size) number_of_blocks++; - i_t number_of_threads = std::min(dual_size_h_, block_size); - compute_subgradient_kernel<<>>( - this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); - - // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) - raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), - tmp_dual.data(), - duality_gap.dual_gradient_.data(), - dual_size_h_, - stream_view_); -} + // b - A*x + // is changed with the introduction of constraint upper and lower bounds + + // gradient constains primal_product + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + 
cusparse_view.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + + // tmp_dual will contain the subgradient + i_t number_of_blocks = dual_size_h_ / block_size; + if (dual_size_h_ % block_size) number_of_blocks++; + i_t number_of_threads = std::min(dual_size_h_, block_size); + compute_subgradient_kernel<<>>( + this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); + + // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) + raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), + tmp_dual.data(), + duality_gap.dual_gradient_.data(), + dual_size_h_, + stream_view_); + } -template -void pdlp_restart_strategy_t::compute_lagrangian_value( - localized_duality_gap_container_t& duality_gap, - cusparse_view_t& cusparse_view, - rmm::device_uvector& tmp_primal, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_lagrangian_value"); + template + void pdlp_restart_strategy_t::compute_lagrangian_value( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view, + rmm::device_uvector & tmp_primal, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_lagrangian_value"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute lagrangian value:" << std::endl; + std::cout << " Compute lagrangian value:" << std::endl; #endif - // if QP - // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + - // dot(primal_solution, problem.objective_vector) - - // dot(primal_solution, problem.constraint_matrix' * dual_solution) + - // dot(dual_solution, dual_gradient+primal_product) + - // problem.objective_constant + // if QP + // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + + // dot(primal_solution, problem.objective_vector) - + // dot(primal_solution, problem.constraint_matrix' * dual_solution) + + // dot(dual_solution, dual_gradient+primal_product) + + // 
problem.objective_constant - // when lp first term is irrelevant + // when lp first term is irrelevant - // second term - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - problem_ptr->objective_coefficients.data(), - primal_stride, - reusable_device_scalar_1_.data(), - stream_view_)); + // second term + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + reusable_device_scalar_1_.data(), + stream_view_)); - // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - tmp_primal.data(), - primal_stride, - reusable_device_scalar_2_.data(), - stream_view_)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + 
duality_gap.primal_solution_.data(), + primal_stride, + tmp_primal.data(), + primal_stride, + reusable_device_scalar_2_.data(), + stream_view_)); - // fourth term //tmp_dual still contains subgradient from the dual_gradient computation - reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - dual_size_h_, - duality_gap.dual_solution_.data(), - dual_stride, - tmp_dual.data(), - dual_stride, - reusable_device_scalar_3_.data(), - stream_view_)); + // fourth term //tmp_dual still contains subgradient from the dual_gradient computation + reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + dual_size_h_, + duality_gap.dual_solution_.data(), + dual_stride, + tmp_dual.data(), + dual_stride, + reusable_device_scalar_3_.data(), + stream_view_)); - // subtract third term from second up - raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_2_.data(), - 1, - stream_view_); - raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_3_.data(), - 1, - stream_view_); -} + // subtract third term from second up + raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_2_.data(), + 1, + stream_view_); + raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_3_.data(), + 1, + stream_view_); + } -template -void pdlp_restart_strategy_t::reset_internal() -{ - candidate_is_avg_.set_value_to_zero_async(stream_view_); - restart_triggered_.set_value_to_zero_async(stream_view_); -} + template + void pdlp_restart_strategy_t::reset_internal() + { + candidate_is_avg_.set_value_to_zero_async(stream_view_); + 
restart_triggered_.set_value_to_zero_async(stream_view_); + } -template -typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() -{ - pdlp_restart_strategy_t::view_t v{}; - v.primal_size = primal_size_h_; - v.dual_size = dual_size_h_; - v.transformed_constraint_lower_bounds = raft::device_span{ - transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; - v.transformed_constraint_upper_bounds = raft::device_span{ - transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; + template + typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() + { + pdlp_restart_strategy_t::view_t v{}; + v.primal_size = primal_size_h_; + v.dual_size = dual_size_h_; + v.transformed_constraint_lower_bounds = raft::device_span{ + transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; + v.transformed_constraint_upper_bounds = raft::device_span{ + transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; + v.last_restart_length = last_restart_length_; - v.weights = raft::device_span{weights_.data(), weights_.size()}; + v.weights = raft::device_span{weights_.data(), weights_.size()}; - v.candidate_is_avg = candidate_is_avg_.data(); - v.restart_triggered = restart_triggered_.data(); + v.candidate_is_avg = candidate_is_avg_.data(); + v.restart_triggered = restart_triggered_.data(); - v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); + v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); - v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; - v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; - v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; - v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; - v.lower_bound = 
raft::device_span{lower_bound_.data(), lower_bound_.size()}; - v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; - v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; + v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; + v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; + v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; + v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; + v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; + v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; + v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; - v.target_threshold = target_threshold_.data(); - v.low_radius_squared = low_radius_squared_.data(); - v.high_radius_squared = high_radius_squared_.data(); - v.test_radius_squared = test_radius_squared_.data(); + v.target_threshold = target_threshold_.data(); + v.low_radius_squared = low_radius_squared_.data(); + v.high_radius_squared = high_radius_squared_.data(); + v.test_radius_squared = test_radius_squared_.data(); - v.testing_range_low = testing_range_low_.data(); - v.testing_range_high = testing_range_high_.data(); + v.testing_range_low = testing_range_low_.data(); + v.testing_range_high = testing_range_high_.data(); - v.shared_live_kernel_accumulator = raft::device_span{shared_live_kernel_accumulator_.data(), - shared_live_kernel_accumulator_.size()}; + v.shared_live_kernel_accumulator = raft::device_span{ + shared_live_kernel_accumulator_.data(), shared_live_kernel_accumulator_.size()}; - v.hyper_params = hyper_params_; + v.hyper_params = hyper_params_; - return v; -} + return v; + } -template -typename pdlp_restart_strategy_t::cupdlpx_restart_view_t -pdlp_restart_strategy_t::make_cupdlpx_restart_view( - const rmm::device_uvector& primal_distance, - const rmm::device_uvector& 
dual_distance, - const convergence_information_t& current_convergence_information, - const rmm::device_uvector& step_size, - rmm::device_uvector& primal_weight, - rmm::device_uvector& best_primal_weight, - rmm::device_uvector& primal_step_size, - rmm::device_uvector& dual_step_size) -{ - cupdlpx_restart_view_t v{}; - v.primal_distance = make_span(primal_distance); - v.dual_distance = make_span(dual_distance); - v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); - v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); - v.l2_norm_primal_linear_objective = - current_convergence_information.get_relative_dual_tolerance_factor(); - v.l2_norm_primal_right_hand_side = - current_convergence_information.get_relative_primal_tolerance_factor(); - v.step_size = make_span(step_size); - v.primal_weight = make_span(primal_weight); - v.primal_weight_error_sum = make_span(primal_weight_error_sum_); - v.primal_weight_last_error = make_span(primal_weight_last_error_); - v.best_primal_weight = make_span(best_primal_weight); - v.new_primal_step_size = make_span(primal_step_size); - v.new_dual_step_size = make_span(dual_step_size); - v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); - v.hyper_params = hyper_params_; - return v; -} + template + typename pdlp_restart_strategy_t::cupdlpx_restart_view_t + pdlp_restart_strategy_t::make_cupdlpx_restart_view( + const rmm::device_uvector& primal_distance, + const rmm::device_uvector& dual_distance, + const convergence_information_t& current_convergence_information, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_weight, + rmm::device_uvector& best_primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) + { + cupdlpx_restart_view_t v{}; + v.primal_distance = make_span(primal_distance); + v.dual_distance = make_span(dual_distance); + v.l2_dual_residual = 
make_span(current_convergence_information.get_l2_dual_residual()); + v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); + v.l2_norm_primal_linear_objective = + current_convergence_information.get_relative_dual_tolerance_factor(); + v.l2_norm_primal_right_hand_side = + current_convergence_information.get_relative_primal_tolerance_factor(); + v.step_size = make_span(step_size); + v.primal_weight = make_span(primal_weight); + v.primal_weight_error_sum = make_span(primal_weight_error_sum_); + v.primal_weight_last_error = make_span(primal_weight_last_error_); + v.best_primal_weight = make_span(best_primal_weight); + v.new_primal_step_size = make_span(primal_step_size); + v.new_dual_step_size = make_span(dual_step_size); + v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); + v.hyper_params = hyper_params_; + return v; + } -template -i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const -{ - return weighted_average_solution_.get_iterations_since_last_restart(); -} + template + i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const + { + return weighted_average_solution_.get_iterations_since_last_restart(); + } -template -void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) -{ - last_restart_was_average_ = value; -} + template + void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) + { + last_restart_was_average_ = value; + } -template -bool pdlp_restart_strategy_t::get_last_restart_was_average() const -{ - return last_restart_was_average_; -} + template + bool pdlp_restart_strategy_t::get_last_restart_was_average() const + { + return last_restart_was_average_; + } #define INSTANTIATE(F_TYPE) \ template class pdlp_restart_strategy_t; \ @@ -2520,11 +2523,11 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const F_TYPE* primal_product); #if MIP_INSTANTIATE_FLOAT -INSTANTIATE(float) + INSTANTIATE(float) #endif #if 
MIP_INSTANTIATE_DOUBLE -INSTANTIATE(double) + INSTANTIATE(double) #endif } // namespace cuopt::linear_programming::detail From c7e3e222d063b98907efa2f5f81e31d84f231f10 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 15:03:34 +0100 Subject: [PATCH 15/43] put back changes in restart --- .../restart_strategy/pdlp_restart_strategy.cu | 831 +++++++++--------- 1 file changed, 416 insertions(+), 415 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 0b1c109185..8eacd4d246 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -2008,465 +2008,466 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( "Incorrect primal reverse iterator"); index_start_primal = thrust::raw_pointer_cast(&*highest_negInf_primal) - threshold_.data() + 1; // + 1 to go after last negInf - if (lowest_inf != end) { + testing_range_low_.set_value_async(index_start_primal, stream_view_); + } else // No negInf found, start is 0 + testing_range_low_.set_value_async(index_start_primal, stream_view_); + if (lowest_inf != end) { + cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*lowest_inf)) == std::numeric_limits::infinity(), "Incorrect primal iterator"); - index_end_primal = - thrust::raw_pointer_cast(lowest_inf) - - threshold_ - .data(); // no - 1 to go before the first inf because end is not included - testing_range_high_.set_value_async(index_end_primal, stream_view_); - } else // No inf found, end is primal_size_h_ - testing_range_high_.set_value_async(index_end_primal, stream_view_); - cuopt_assert(index_start_primal <= index_end_primal, - "Start should be strictly smaller than end"); - - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - threshold_.data() + index_start_primal, - threshold_.data() + index_end_primal, - is_nan_or_inf()), - "Threshold vector should not 
contain inf or NaN values"); - - // Init parameters for live kernel - // Has to do this to pass lvalues (and not rvalue) to void* kernel_args - auto restart_view = this->view(); - auto op_view = problem_ptr->view(); - i_t* testing_range_low = testing_range_low_.data(); - i_t* testing_range_high = testing_range_high_.data(); - f_t* test_radius_squared = test_radius_squared_.data(); - f_t* low_radius_squared = low_radius_squared_.data(); - f_t* high_radius_squared = high_radius_squared_.data(); - f_t* distance_traveled = duality_gap.distance_traveled_.data(); - - void* kernel_args[] = { - &restart_view, - &op_view, - &testing_range_low, - &testing_range_high, - &test_radius_squared, - &low_radius_squared, - &high_radius_squared, - &distance_traveled, - }; - constexpr int numThreads = 128; - dim3 dimBlock(numThreads, 1, 1); - // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * - // numBlocksPerSm - dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); - // Compute the median for the join problem, while loop is inside the live kernel - RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( - (void*)solve_bound_constrained_trust_region_kernel, - dimGrid, - dimBlock, - kernel_args, - 0, - stream_view_)); - - // Find max threshold for the join problem - const f_t* max_threshold = - thrust::max_element(handle_ptr_->get_thrust_policy(), - threshold_.data(), - threshold_.data() + primal_size_h_ + dual_size_h_); - - // we have now determined the test_threshold that should minimize the objective value of the - // solution. 
- - // if no component got fixed by their upper bound we can pick the maximum threshold to be the - // target_threshold which was computed before the loop in the direction_and_threshold_kernel - // Otherwise use the test_threshold determined in the loop - // { - target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( - this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - // } - - // Compute x (the solution which is defined by moving each component test_threshold * - // direction[component]) clamp on upper and lower bounds. - // Used unsorted_direction_full_ as the other one got sorted - // { - raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), - duality_gap.primal_solution_.data(), - unsorted_direction_full_.data(), - primal_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_.data(), - unsorted_direction_full_.data() + primal_size_h_, - dual_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part - using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), - problem_ptr->variable_bounds.data()), - duality_gap.primal_solution_tr_.data(), - primal_size_h_, - clamp(), - stream_view_.value()); - - // project by max(min(y[i], upperbound[i]),lowerbound[i]) - raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_tr_.data(), - transformed_constraint_lower_bounds_.data(), - transformed_constraint_upper_bounds_.data(), - dual_size_h_, - constraint_clamp(), - stream_view_); - // } - } - - // Compute the current lower bound for the objective value using the primal solution_tr and - // upper bound for the objective value using the dual solution_tr + index_end_primal = + 
thrust::raw_pointer_cast(lowest_inf) - + threshold_.data(); // no - 1 to go before the first inf because end is not included + testing_range_high_.set_value_async(index_end_primal, stream_view_); + } else // No inf found, end is primal_size_h_ + testing_range_high_.set_value_async(index_end_primal, stream_view_); + cuopt_assert(index_start_primal <= index_end_primal, + "Start should be strictly smaller than end"); + + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + threshold_.data() + index_start_primal, + threshold_.data() + index_end_primal, + is_nan_or_inf()), + "Threshold vector should not contain inf or NaN values"); + + // Init parameters for live kernel + // Has to do this to pass lvalues (and not rvalue) to void* kernel_args + auto restart_view = this->view(); + auto op_view = problem_ptr->view(); + i_t* testing_range_low = testing_range_low_.data(); + i_t* testing_range_high = testing_range_high_.data(); + f_t* test_radius_squared = test_radius_squared_.data(); + f_t* low_radius_squared = low_radius_squared_.data(); + f_t* high_radius_squared = high_radius_squared_.data(); + f_t* distance_traveled = duality_gap.distance_traveled_.data(); + + void* kernel_args[] = { + &restart_view, + &op_view, + &testing_range_low, + &testing_range_high, + &test_radius_squared, + &low_radius_squared, + &high_radius_squared, + &distance_traveled, + }; + constexpr int numThreads = 128; + dim3 dimBlock(numThreads, 1, 1); + // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * + // numBlocksPerSm + dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); + // Compute the median for the join problem, while loop is inside the live kernel + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( + (void*)solve_bound_constrained_trust_region_kernel, + dimGrid, + dimBlock, + kernel_args, + 0, + stream_view_)); + + // Find max threshold for the join problem + const f_t* max_threshold = + thrust::max_element(handle_ptr_->get_thrust_policy(), + 
threshold_.data(), + threshold_.data() + primal_size_h_ + dual_size_h_); + + // we have now determined the test_threshold that should minimize the objective value of the + // solution. + + // if no component got fixed by their upper bound we can pick the maximum threshold to be the + // target_threshold which was computed before the loop in the direction_and_threshold_kernel + // Otherwise use the test_threshold determined in the loop // { - // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, - // primal_gradient)) - compute_bound(duality_gap.primal_solution_tr_, - duality_gap.primal_solution_, - duality_gap.primal_gradient_, - duality_gap.lagrangian_value_, - primal_size_h_, - primal_stride, - tmp_primal, - duality_gap.lower_bound_value_); - - // compute 'upper bound' using dual - compute_bound(duality_gap.dual_solution_tr_, - duality_gap.dual_solution_, - duality_gap.dual_gradient_, - duality_gap.lagrangian_value_, - dual_size_h_, - dual_stride, - tmp_dual, - duality_gap.upper_bound_value_); + target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( + this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // } + // Compute x (the solution which is defined by moving each component test_threshold * + // direction[component]) clamp on upper and lower bounds. 
+ // Used unsorted_direction_full_ as the other one got sorted + // { + raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), + duality_gap.primal_solution_.data(), + unsorted_direction_full_.data(), + primal_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_.data(), + unsorted_direction_full_.data() + primal_size_h_, + dual_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part + using f_t2 = typename type_2::type; + cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), + problem_ptr->variable_bounds.data()), + duality_gap.primal_solution_tr_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + + // project by max(min(y[i], upperbound[i]),lowerbound[i]) + raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_tr_.data(), + transformed_constraint_lower_bounds_.data(), + transformed_constraint_upper_bounds_.data(), + dual_size_h_, + constraint_clamp(), + stream_view_); // } } - template - void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( - localized_duality_gap_container_t & duality_gap, - rmm::device_uvector & primal_weight, - rmm::device_uvector & tmp_primal, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); - // norm( - // new_primal_solution - last_restart.primal_solution, - // )^2 - - // Julia / Paper use a weighted norm using primal weight for primal / dual distance - // We simply use L2 norm of diff - distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, - last_restart_duality_gap_.primal_solution_, - tmp_primal, - primal_size_h_, - primal_stride, - duality_gap.primal_distance_traveled_); - - // compute similarly for dual - 
distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, - last_restart_duality_gap_.dual_solution_, - tmp_dual, - dual_size_h_, - dual_stride, - duality_gap.dual_distance_traveled_); - - // distance_traveled = primal_distance * 0.5 * primal_weight - // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } + // Compute the current lower bound for the objective value using the primal solution_tr and + // upper bound for the objective value using the dual solution_tr + // { + // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, + // primal_gradient)) + compute_bound(duality_gap.primal_solution_tr_, + duality_gap.primal_solution_, + duality_gap.primal_gradient_, + duality_gap.lagrangian_value_, + primal_size_h_, + primal_stride, + tmp_primal, + duality_gap.lower_bound_value_); + + // compute 'upper bound' using dual + compute_bound(duality_gap.dual_solution_tr_, + duality_gap.dual_solution_, + duality_gap.dual_gradient_, + duality_gap.lagrangian_value_, + dual_size_h_, + dual_stride, + tmp_dual, + duality_gap.upper_bound_value_); + + // } +} - template - void pdlp_restart_strategy_t::compute_primal_gradient( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view) - { - raft::common::nvtx::range fun_scope("compute_primal_gradient"); +template +void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( + localized_duality_gap_container_t& duality_gap, + rmm::device_uvector& primal_weight, + rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); + // norm( + // new_primal_solution - last_restart.primal_solution, + // )^2 + + // Julia / Paper use a weighted norm using primal weight for 
primal / dual distance + // We simply use L2 norm of diff + distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, + last_restart_duality_gap_.primal_solution_, + tmp_primal, + primal_size_h_, + primal_stride, + duality_gap.primal_distance_traveled_); + + // compute similarly for dual + distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, + last_restart_duality_gap_.dual_solution_, + tmp_dual, + dual_size_h_, + dual_stride, + duality_gap.dual_distance_traveled_); + + // distance_traveled = primal_distance * 0.5 * primal_weight + // + dual_distance * 0.5 / primal_weight + compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( + duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void pdlp_restart_strategy_t::compute_primal_gradient( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view) +{ + raft::common::nvtx::range fun_scope("compute_primal_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute primal gradient:" << std::endl; + std::cout << " Compute primal gradient:" << std::endl; #endif - // for QP add problem.objective_matrix * primal_solution as well - // c - A^T*y (copy c to primal_gradient for correct writing of result) - raft::copy(duality_gap.primal_gradient_.data(), - problem_ptr->objective_coefficients.data(), - primal_size_h_, - stream_view_); - - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.primal_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); - } - - template - __global__ void compute_subgradient_kernel( - const typename pdlp_restart_strategy_t::view_t restart_strategy_view, - const 
typename problem_t::view_t op_problem_view, - const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* subgradient) - { - i_t id = threadIdx.x + blockIdx.x * blockDim.x; - if (id >= duality_gap_view.dual_size) { return; } - - f_t lower = op_problem_view.constraint_lower_bounds[id]; - f_t upper = op_problem_view.constraint_upper_bounds[id]; - f_t primal_product = duality_gap_view.dual_gradient[id]; - f_t dual_solution = duality_gap_view.dual_solution[id]; + // for QP add problem.objective_matrix * primal_solution as well + // c - A^T*y (copy c to primal_gradient for correct writing of result) + raft::copy(duality_gap.primal_gradient_.data(), + problem_ptr->objective_coefficients.data(), + primal_size_h_, + stream_view_); - f_t subgradient_coefficient; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.primal_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); +} - if (dual_solution < f_t(0)) { - subgradient_coefficient = upper; - } else if (dual_solution > f_t(0)) { - subgradient_coefficient = lower; - } else if (!isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = f_t(0); - } else if (!isfinite(upper) && isfinite(lower)) { +template +__global__ void compute_subgradient_kernel( + const typename pdlp_restart_strategy_t::view_t restart_strategy_view, + const typename problem_t::view_t op_problem_view, + const typename localized_duality_gap_container_t::view_t duality_gap_view, + f_t* subgradient) +{ + i_t id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= duality_gap_view.dual_size) { return; } + + f_t lower = op_problem_view.constraint_lower_bounds[id]; + f_t upper = op_problem_view.constraint_upper_bounds[id]; + f_t primal_product = 
duality_gap_view.dual_gradient[id]; + f_t dual_solution = duality_gap_view.dual_solution[id]; + + f_t subgradient_coefficient; + + if (dual_solution < f_t(0)) { + subgradient_coefficient = upper; + } else if (dual_solution > f_t(0)) { + subgradient_coefficient = lower; + } else if (!isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = f_t(0); + } else if (!isfinite(upper) && isfinite(lower)) { + subgradient_coefficient = lower; + } else if (isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = upper; + } else { + if (primal_product < lower) { subgradient_coefficient = lower; - } else if (isfinite(upper) && !isfinite(lower)) { + } else if (primal_product > upper) { subgradient_coefficient = upper; } else { - if (primal_product < lower) { - subgradient_coefficient = lower; - } else if (primal_product > upper) { - subgradient_coefficient = upper; - } else { - subgradient_coefficient = primal_product; - } + subgradient_coefficient = primal_product; } - - subgradient[id] = subgradient_coefficient; } - template - void pdlp_restart_strategy_t::compute_dual_gradient( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_dual_gradient"); + subgradient[id] = subgradient_coefficient; +} + +template +void pdlp_restart_strategy_t::compute_dual_gradient( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_dual_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute dual gradient:" << std::endl; + std::cout << " Compute dual gradient:" << std::endl; #endif - // b - A*x - // is changed with the introduction of constraint upper and lower bounds - - // gradient constains primal_product - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - 
reusable_device_scalar_value_1_.data(), - cusparse_view.A, - cusparse_view.primal_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.dual_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), - stream_view_)); - - // tmp_dual will contain the subgradient - i_t number_of_blocks = dual_size_h_ / block_size; - if (dual_size_h_ % block_size) number_of_blocks++; - i_t number_of_threads = std::min(dual_size_h_, block_size); - compute_subgradient_kernel<<>>( - this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); - - // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) - raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), - tmp_dual.data(), - duality_gap.dual_gradient_.data(), - dual_size_h_, - stream_view_); - } + // b - A*x + // is changed with the introduction of constraint upper and lower bounds - template - void pdlp_restart_strategy_t::compute_lagrangian_value( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view, - rmm::device_uvector & tmp_primal, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_lagrangian_value"); + // gradient constains primal_product + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + + // tmp_dual will contain the subgradient + i_t number_of_blocks = dual_size_h_ / block_size; + if (dual_size_h_ % block_size) number_of_blocks++; + i_t number_of_threads = std::min(dual_size_h_, block_size); + compute_subgradient_kernel<<>>( + this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); + + // dual gradient = subgradient - 
primal_product (tmp_dual-dual_gradient) + raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), + tmp_dual.data(), + duality_gap.dual_gradient_.data(), + dual_size_h_, + stream_view_); +} + +template +void pdlp_restart_strategy_t::compute_lagrangian_value( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view, + rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_lagrangian_value"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute lagrangian value:" << std::endl; + std::cout << " Compute lagrangian value:" << std::endl; #endif - // if QP - // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + - // dot(primal_solution, problem.objective_vector) - - // dot(primal_solution, problem.constraint_matrix' * dual_solution) + - // dot(dual_solution, dual_gradient+primal_product) + - // problem.objective_constant + // if QP + // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + + // dot(primal_solution, problem.objective_vector) - + // dot(primal_solution, problem.constraint_matrix' * dual_solution) + + // dot(dual_solution, dual_gradient+primal_product) + + // problem.objective_constant - // when lp first term is irrelevant + // when lp first term is irrelevant - // second term - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - problem_ptr->objective_coefficients.data(), - primal_stride, - reusable_device_scalar_1_.data(), - stream_view_)); + // second term + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + reusable_device_scalar_1_.data(), + stream_view_)); - // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot - 
RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - tmp_primal.data(), - primal_stride, - reusable_device_scalar_2_.data(), - stream_view_)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + tmp_primal.data(), + primal_stride, + reusable_device_scalar_2_.data(), + stream_view_)); - // fourth term //tmp_dual still contains subgradient from the dual_gradient computation - reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - dual_size_h_, - duality_gap.dual_solution_.data(), - dual_stride, - tmp_dual.data(), - dual_stride, - reusable_device_scalar_3_.data(), - stream_view_)); + // fourth term //tmp_dual still contains subgradient from the dual_gradient computation + reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + 
dual_size_h_, + duality_gap.dual_solution_.data(), + dual_stride, + tmp_dual.data(), + dual_stride, + reusable_device_scalar_3_.data(), + stream_view_)); - // subtract third term from second up - raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_2_.data(), - 1, - stream_view_); - raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_3_.data(), - 1, - stream_view_); - } + // subtract third term from second up + raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_2_.data(), + 1, + stream_view_); + raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_3_.data(), + 1, + stream_view_); +} - template - void pdlp_restart_strategy_t::reset_internal() - { - candidate_is_avg_.set_value_to_zero_async(stream_view_); - restart_triggered_.set_value_to_zero_async(stream_view_); - } +template +void pdlp_restart_strategy_t::reset_internal() +{ + candidate_is_avg_.set_value_to_zero_async(stream_view_); + restart_triggered_.set_value_to_zero_async(stream_view_); +} - template - typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() - { - pdlp_restart_strategy_t::view_t v{}; - v.primal_size = primal_size_h_; - v.dual_size = dual_size_h_; - v.transformed_constraint_lower_bounds = raft::device_span{ - transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; - v.transformed_constraint_upper_bounds = raft::device_span{ - transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; +template +typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() +{ + pdlp_restart_strategy_t::view_t v{}; + v.primal_size = primal_size_h_; + v.dual_size = dual_size_h_; + 
v.transformed_constraint_lower_bounds = raft::device_span{ + transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; + v.transformed_constraint_upper_bounds = raft::device_span{ + transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; + v.last_restart_length = last_restart_length_; - v.weights = raft::device_span{weights_.data(), weights_.size()}; + v.weights = raft::device_span{weights_.data(), weights_.size()}; - v.candidate_is_avg = candidate_is_avg_.data(); - v.restart_triggered = restart_triggered_.data(); + v.candidate_is_avg = candidate_is_avg_.data(); + v.restart_triggered = restart_triggered_.data(); - v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); + v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); - v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; - v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; - v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; - v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; - v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; - v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; - v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; + v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; + v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; + v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; + v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; + v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; + v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; + v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; - 
v.target_threshold = target_threshold_.data(); - v.low_radius_squared = low_radius_squared_.data(); - v.high_radius_squared = high_radius_squared_.data(); - v.test_radius_squared = test_radius_squared_.data(); + v.target_threshold = target_threshold_.data(); + v.low_radius_squared = low_radius_squared_.data(); + v.high_radius_squared = high_radius_squared_.data(); + v.test_radius_squared = test_radius_squared_.data(); - v.testing_range_low = testing_range_low_.data(); - v.testing_range_high = testing_range_high_.data(); + v.testing_range_low = testing_range_low_.data(); + v.testing_range_high = testing_range_high_.data(); - v.shared_live_kernel_accumulator = raft::device_span{ - shared_live_kernel_accumulator_.data(), shared_live_kernel_accumulator_.size()}; + v.shared_live_kernel_accumulator = raft::device_span{shared_live_kernel_accumulator_.data(), + shared_live_kernel_accumulator_.size()}; - v.hyper_params = hyper_params_; + v.hyper_params = hyper_params_; - return v; - } + return v; +} - template - typename pdlp_restart_strategy_t::cupdlpx_restart_view_t - pdlp_restart_strategy_t::make_cupdlpx_restart_view( - const rmm::device_uvector& primal_distance, - const rmm::device_uvector& dual_distance, - const convergence_information_t& current_convergence_information, - const rmm::device_uvector& step_size, - rmm::device_uvector& primal_weight, - rmm::device_uvector& best_primal_weight, - rmm::device_uvector& primal_step_size, - rmm::device_uvector& dual_step_size) - { - cupdlpx_restart_view_t v{}; - v.primal_distance = make_span(primal_distance); - v.dual_distance = make_span(dual_distance); - v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); - v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); - v.l2_norm_primal_linear_objective = - current_convergence_information.get_relative_dual_tolerance_factor(); - v.l2_norm_primal_right_hand_side = - 
current_convergence_information.get_relative_primal_tolerance_factor(); - v.step_size = make_span(step_size); - v.primal_weight = make_span(primal_weight); - v.primal_weight_error_sum = make_span(primal_weight_error_sum_); - v.primal_weight_last_error = make_span(primal_weight_last_error_); - v.best_primal_weight = make_span(best_primal_weight); - v.new_primal_step_size = make_span(primal_step_size); - v.new_dual_step_size = make_span(dual_step_size); - v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); - v.hyper_params = hyper_params_; - return v; - } +template +typename pdlp_restart_strategy_t::cupdlpx_restart_view_t +pdlp_restart_strategy_t::make_cupdlpx_restart_view( + const rmm::device_uvector& primal_distance, + const rmm::device_uvector& dual_distance, + const convergence_information_t& current_convergence_information, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_weight, + rmm::device_uvector& best_primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) +{ + cupdlpx_restart_view_t v{}; + v.primal_distance = make_span(primal_distance); + v.dual_distance = make_span(dual_distance); + v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); + v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); + v.l2_norm_primal_linear_objective = + current_convergence_information.get_relative_dual_tolerance_factor(); + v.l2_norm_primal_right_hand_side = + current_convergence_information.get_relative_primal_tolerance_factor(); + v.step_size = make_span(step_size); + v.primal_weight = make_span(primal_weight); + v.primal_weight_error_sum = make_span(primal_weight_error_sum_); + v.primal_weight_last_error = make_span(primal_weight_last_error_); + v.best_primal_weight = make_span(best_primal_weight); + v.new_primal_step_size = make_span(primal_step_size); + v.new_dual_step_size = make_span(dual_step_size); + 
v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); + v.hyper_params = hyper_params_; + return v; +} - template - i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const - { - return weighted_average_solution_.get_iterations_since_last_restart(); - } +template +i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const +{ + return weighted_average_solution_.get_iterations_since_last_restart(); +} - template - void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) - { - last_restart_was_average_ = value; - } +template +void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) +{ + last_restart_was_average_ = value; +} - template - bool pdlp_restart_strategy_t::get_last_restart_was_average() const - { - return last_restart_was_average_; - } +template +bool pdlp_restart_strategy_t::get_last_restart_was_average() const +{ + return last_restart_was_average_; +} #define INSTANTIATE(F_TYPE) \ template class pdlp_restart_strategy_t; \ @@ -2523,11 +2524,11 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( F_TYPE* primal_product); #if MIP_INSTANTIATE_FLOAT - INSTANTIATE(float) +INSTANTIATE(float) #endif #if MIP_INSTANTIATE_DOUBLE - INSTANTIATE(double) +INSTANTIATE(double) #endif } // namespace cuopt::linear_programming::detail From 73a52b1adb0fd44f598844eb0244de2d1bacefa9 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 20 Feb 2026 15:50:26 +0000 Subject: [PATCH 16/43] fix use overall time limit, reduce memory consumtion and add a bigger buffer --- cpp/src/branch_and_bound/pseudo_costs.cpp | 5 +++- cpp/src/pdlp/pdlp.cu | 6 ++--- cpp/src/pdlp/solve.cu | 33 +++++++---------------- cpp/src/pdlp/translate.hpp | 2 ++ 4 files changed, 19 insertions(+), 27 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 1a745865e8..926b25cd89 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ 
b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -402,10 +402,13 @@ void strong_branching(const user_problem_t& original_problem, fraction_values.push_back(original_root_soln_x[j]); } + f_t elapsed_time = toc(start_time); + pdlp_settings.time_limit = std::max(0.0, settings.time_limit - elapsed_time); + const auto mps_model = simplex_problem_to_mps_data_model(original_problem); const raft::handle_t batch_pdlp_handle; const auto solutions = - batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values); + batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index aab9ffdd5d..c3e1e7ab8f 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -446,7 +446,7 @@ std::optional> pdlp_solver_t } // Check for concurrent limit - if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { + if (settings_.concurrent_halt != nullptr && settings_.concurrent_halt->load() == 1) { #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Concurrent Limit reached, returning current solution" << std::endl; @@ -2295,8 +2295,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co if (is_major_iteration || artificial_restart_check_main_loop || error_occured || is_conditional_major) { if (verbose) { - std::cout << "-------------------------------" << std::endl; - std::cout << internal_solver_iterations_ << std::endl; + std::cout << "-------------------------------" << std::endl; + std::cout << internal_solver_iterations_ << std::endl; raft::print_device_vector("step_size", step_size_.data(), step_size_.size(), std::cout); raft::print_device_vector( "primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index fa0c79e391..3592798545 
100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -727,13 +727,13 @@ static size_t batch_pdlp_memory_estimator(const optimization_problem_t total_memory += trial_batch_size * problem.get_n_constraints() * sizeof(f_t); // Data for the solution - total_memory += problem.get_n_variables() * max_batch_size * sizeof(f_t); - total_memory += problem.get_n_constraints() * max_batch_size * sizeof(f_t); - total_memory += problem.get_n_variables() * max_batch_size * sizeof(f_t); + total_memory += problem.get_n_variables() * trial_batch_size * sizeof(f_t); + total_memory += problem.get_n_constraints() * trial_batch_size * sizeof(f_t); + total_memory += problem.get_n_variables() * trial_batch_size * sizeof(f_t); - // Add a 50% overhead to make sure we have enough memory considering other parts of the solver may - // allocate at the same time - total_memory *= 1.5; + // Add a 70% overhead to make sure we have enough memory considering other parts of the solver may + // need memory later while the batch PDLP is running + total_memory *= 1.7; // Data from saddle point state return total_memory; @@ -815,9 +815,10 @@ optimization_problem_solution_t run_batch_pdlp( } } - rmm::device_uvector full_primal_solution(problem.get_n_variables() * max_batch_size, stream); - rmm::device_uvector full_dual_solution(problem.get_n_constraints() * max_batch_size, stream); - rmm::device_uvector full_reduced_cost(problem.get_n_variables() * max_batch_size, stream); + // We don't use the solutions vectors for now + rmm::device_uvector full_primal_solution(0, stream); + rmm::device_uvector full_dual_solution(0, stream); + rmm::device_uvector full_reduced_cost(0, stream); std::vector< typename optimization_problem_solution_t::additional_termination_information_t> @@ -849,20 +850,6 @@ optimization_problem_solution_t run_batch_pdlp( auto sol = solve_lp(problem, batch_settings); - // Copy results - raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), - 
sol.get_primal_solution().data(), - problem.get_n_variables() * current_batch_size, - stream); - raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), - sol.get_dual_solution().data(), - problem.get_n_constraints() * current_batch_size, - stream); - raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), - sol.get_reduced_cost().data(), - problem.get_n_variables() * current_batch_size, - stream); - auto info = sol.get_additional_termination_informations(); full_info.insert(full_info.end(), info.begin(), info.end()); diff --git a/cpp/src/pdlp/translate.hpp b/cpp/src/pdlp/translate.hpp index aebe87b140..cbef54b97f 100644 --- a/cpp/src/pdlp/translate.hpp +++ b/cpp/src/pdlp/translate.hpp @@ -9,6 +9,8 @@ #include +#include + #include #include From dbc94fd35b9898a15a355ff670e999f21f58e817 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 24 Feb 2026 15:32:57 +0000 Subject: [PATCH 17/43] switch to double for memory estimator as size_t was hitting overflow + fail safe if batch pdlp fails --- cpp/src/branch_and_bound/pseudo_costs.cpp | 6 +++ cpp/src/pdlp/solve.cu | 50 +++++++++++++++-------- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 926b25cd89..0c66053c50 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -411,6 +411,12 @@ void strong_branching(const user_problem_t& original_problem, batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); + // Fail safe in case the batch PDLP failed and produced no solutions + if (solutions.get_additional_termination_informations().size() != fractional.size() * 2) { + settings.log.printf("Batch PDLP failed and produced no solutions\n"); + return; + } + // Find max iteration on how many are done accross the batch i_t max_iterations = 0; i_t 
amount_done = 0; diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 3592798545..33d8895e2a 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -678,17 +678,17 @@ optimization_problem_solution_t run_pdlp(detail::problem_t& return sol; } +// Compute in double as some cases overflow when using size_t template -static size_t batch_pdlp_memory_estimator(const optimization_problem_t& problem, - int trial_batch_size, - int max_batch_size) +static double batch_pdlp_memory_estimator(const optimization_problem_t& problem, + double trial_batch_size) { - size_t total_memory = 0; + double total_memory = 0.0; // In PDLP we store the scaled version of the problem which contains all of those total_memory += problem.get_constraint_matrix_indices().size() * sizeof(i_t); total_memory += problem.get_constraint_matrix_offsets().size() * sizeof(i_t); total_memory += problem.get_constraint_matrix_values().size() * sizeof(f_t); - total_memory *= 2; // To account for the A_t matrix + total_memory *= 2.0; // To account for the A_t matrix total_memory += problem.get_objective_coefficients().size() * sizeof(f_t); total_memory += problem.get_constraint_bounds().size() * sizeof(f_t); total_memory += problem.get_variable_lower_bounds().size() * sizeof(f_t); @@ -759,32 +759,46 @@ optimization_problem_solution_t run_batch_pdlp( f_t initial_primal_weight = std::numeric_limits::signaling_NaN(); cuopt_assert(settings.new_bounds.size() > 0, "Batch size should be greater than 0"); - const int max_batch_size = settings.new_bounds.size(); - int memory_max_batch_size = max_batch_size; + const size_t max_batch_size = settings.new_bounds.size(); + size_t memory_max_batch_size = max_batch_size; // Check if we don't hit the limit using max_batch_size - const size_t memory_estimate = - batch_pdlp_memory_estimator(problem, max_batch_size, max_batch_size); - size_t free_mem, total_mem; - RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); + const double memory_estimate = 
batch_pdlp_memory_estimator(problem, max_batch_size); + size_t st_free_mem, st_total_mem; + RAFT_CUDA_TRY(cudaMemGetInfo(&st_free_mem, &st_total_mem)); + const double free_mem = static_cast(st_free_mem); + const double total_mem = static_cast(st_total_mem); + + #ifdef BATCH_VERBOSE_MODE + std::cout << "Memory estimate: " << memory_estimate << std::endl; + std::cout << "Free memory: " << free_mem << std::endl; + std::cout << "Total memory: " << total_mem << std::endl; + #endif if (memory_estimate > free_mem) { use_optimal_batch_size = true; // Decrement batch size iteratively until we find a batch size that fits while (memory_max_batch_size > 1) { - const size_t memory_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size, max_batch_size); + const double memory_estimate = + batch_pdlp_memory_estimator(problem, memory_max_batch_size); if (memory_estimate <= free_mem) { break; } + #ifdef BATCH_VERBOSE_MODE + std::cout << "Memory estimate: " << memory_estimate << std::endl; + std::cout << "Memory max batch size: " << memory_max_batch_size << std::endl; + std::cout << "Free memory: " << free_mem << std::endl; + std::cout << "Total memory: " << total_mem << std::endl; + std::cout << "--------------------------------" << std::endl; + #endif memory_max_batch_size--; } - const size_t min_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size, max_batch_size); + const double min_estimate = + batch_pdlp_memory_estimator(problem, memory_max_batch_size); cuopt_expects(min_estimate <= free_mem, error_type_t::OutOfMemoryError, "Insufficient GPU memory for batch PDLP (min batch size still too large)"); } - int optimal_batch_size = use_optimal_batch_size + size_t optimal_batch_size = use_optimal_batch_size ? 
detail::optimal_batch_size_handler(problem, memory_max_batch_size) : max_batch_size; cuopt_assert(optimal_batch_size != 0 && optimal_batch_size <= max_batch_size, @@ -842,8 +856,8 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } - for (int i = 0; i < max_batch_size; i += optimal_batch_size) { - const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); + for (size_t i = 0; i < max_batch_size; i += optimal_batch_size) { + const size_t current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) batch_settings.new_bounds = std::vector>( original_new_bounds.begin() + i, original_new_bounds.begin() + i + current_batch_size); From 8b1ec9373299d9a73e9e3dcddf0508804899007e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 20 Feb 2026 15:50:26 +0000 Subject: [PATCH 18/43] add support for dual simplex warm start --- cpp/src/branch_and_bound/branch_and_bound.cpp | 2 ++ cpp/src/branch_and_bound/pseudo_costs.cpp | 32 ++++++++++++++++--- cpp/src/branch_and_bound/pseudo_costs.hpp | 4 ++- cpp/src/pdlp/pdlp.cu | 21 ++++-------- .../restart_strategy/pdlp_restart_strategy.cu | 6 ++++ cpp/src/pdlp/solve.cu | 23 ++++++++----- 6 files changed, 60 insertions(+), 28 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index c46f09258c..ea2c160e1b 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2357,6 +2357,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut exploration_stats_.start_time, var_types_, root_relax_soln_.x, + root_relax_soln_.y, + root_relax_soln_.z, fractional, root_objective_, root_vstatus_, diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 0c66053c50..c3268427c3 100644 --- 
a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -14,7 +14,9 @@ #include -#include +#include + +#include #include @@ -276,9 +278,11 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data } else if (user_problem.row_sense[i] == 'G') { constraint_lower[i] = user_problem.rhs[i]; constraint_upper[i] = std::numeric_limits::infinity(); - } else { + } else if (user_problem.row_sense[i] == 'E') { constraint_lower[i] = user_problem.rhs[i]; constraint_upper[i] = user_problem.rhs[i]; + } else { + throw std::runtime_error("Invalid row sense: " + std::string(1, user_problem.row_sense[i])); } } @@ -354,7 +358,9 @@ void strong_branching(const user_problem_t& original_problem, const simplex_solver_settings_t& settings, f_t start_time, const std::vector& var_types, - const std::vector root_soln, + const std::vector& root_soln, + const std::vector& root_soln_y, + const std::vector& root_soln_z, const std::vector& fractional, f_t root_obj, const std::vector& root_vstatus, @@ -397,6 +403,10 @@ void strong_branching(const user_problem_t& original_problem, std::vector fraction_values; + std::vector original_root_soln_y, original_root_soln_z; + uncrush_dual_solution( + original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, original_root_soln_z); + for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; fraction_values.push_back(original_root_soln_x[j]); @@ -404,9 +414,19 @@ void strong_branching(const user_problem_t& original_problem, f_t elapsed_time = toc(start_time); pdlp_settings.time_limit = std::max(0.0, settings.time_limit - elapsed_time); - + const auto mps_model = simplex_problem_to_mps_data_model(original_problem); const raft::handle_t batch_pdlp_handle; + + + constexpr bool dual_simplex_primal_dual = false; + if (dual_simplex_primal_dual) { + pdlp_settings.set_initial_primal_solution( + original_root_soln_x.data(), original_root_soln_x.size(), 
batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution( + original_root_soln_y.data(), original_root_soln_y.size(), batch_pdlp_handle.get_stream()); + } + const auto solutions = batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); @@ -929,7 +949,9 @@ template void strong_branching(const user_problem_t& o const simplex_solver_settings_t& settings, double start_time, const std::vector& var_types, - const std::vector root_soln, + const std::vector& root_soln, + const std::vector& root_soln_y, + const std::vector& root_soln_z, const std::vector& fractional, double root_obj, const std::vector& root_vstatus, diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 6b6c6917b6..e8aea11428 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -522,7 +522,9 @@ void strong_branching(const user_problem_t& original_problem, const simplex_solver_settings_t& settings, f_t start_time, const std::vector& var_types, - const std::vector root_soln, + const std::vector& root_soln, + const std::vector& root_soln_y, + const std::vector& root_soln_z, const std::vector& fractional, f_t root_obj, const std::vector& root_vstatus, diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index c3e1e7ab8f..08d2ef3cd2 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2263,13 +2263,6 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co bool warm_start_was_given = settings_.get_pdlp_warm_start_data().last_restart_duality_gap_dual_solution_.size() != 0; - // In batch mode, before running the solver, we need to transpose the primal and dual solution to - // row format - if (batch_mode_) - transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(), - pdhg_solver_.get_potential_next_dual_solution(), - pdhg_solver_.get_dual_slack()); - if 
(!inside_mip_) { CUOPT_LOG_INFO( " Iter Primal Obj. Dual Obj. Gap Primal Res. Dual Res. Time"); @@ -2332,13 +2325,6 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co } } -#ifdef CUPDLP_DEBUG_MODE - print("before scale slack", pdhg_solver_.get_dual_slack()); - print("before scale potential next primal", - pdhg_solver_.get_potential_next_primal_solution()); - print("before scale potential next dual", pdhg_solver_.get_potential_next_dual_solution()); -#endif - // In case of batch mode, primal and dual matrices are in row format // We need to transpose them to column format before doing any checks if (batch_mode_) { @@ -2354,6 +2340,13 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); } +#ifdef CUPDLP_DEBUG_MODE + print("before scale slack", pdhg_solver_.get_dual_slack()); + print("before scale potential next primal", + pdhg_solver_.get_potential_next_primal_solution()); + print("before scale potential next dual", pdhg_solver_.get_potential_next_dual_solution()); +#endif + // We go back to the unscaled problem here. 
It ensures that we do not terminate 'too early' // because of the error margin being evaluated on the scaled problem diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 8eacd4d246..5adcb74439 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -691,6 +691,12 @@ void pdlp_restart_strategy_t::should_cupdlpx_restart(i_t total_number_ { std::fill(should_restart.begin(), should_restart.end(), 0); + #ifdef CUPDLP_DEBUG_MODE + // Print the current stats of initial fixed point error and fixed point error + print("initial_fixed_point_error", initial_fixed_point_error_); + print("fixed_point_error", fixed_point_error_); + #endif + if (total_number_of_iterations == hyper_params_.major_iteration) { #ifdef CUPDLP_DEBUG_MODE printf("forced restart at first major\n"); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 33d8895e2a..3a277b4f85 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -744,12 +744,10 @@ optimization_problem_solution_t run_batch_pdlp( optimization_problem_t& problem, pdlp_solver_settings_t const& settings) { // Hyper parameter than can be changed, I have put what I believe to be the best - bool primal_dual_init = true; + bool pdlp_primal_dual_init = true; bool primal_weight_init = true; bool use_optimal_batch_size = false; constexpr int iteration_limit = 100000; - // Shouldn't we work on the unpresolved and/or unscaled problem for PDLP? - // Shouldn't we put an iteration limit? If yes what should we do with the partial solutions? 
rmm::cuda_stream_view stream = problem.get_handle_ptr()->get_stream(); @@ -805,8 +803,15 @@ optimization_problem_solution_t run_batch_pdlp( "Optimal batch size should be between 1 and max batch size"); using f_t2 = typename type_2::type; - // If need warm start, solve the LP alone - if (primal_dual_init || primal_weight_init) { + // In case Dual Simplex already provided the initial primal and dual solution + if (settings.has_initial_primal_solution() && settings.has_initial_dual_solution()) { + initial_primal = rmm::device_uvector( + settings.get_initial_primal_solution(), settings.get_initial_primal_solution().stream()); + initial_dual = rmm::device_uvector( + settings.get_initial_dual_solution(), settings.get_initial_dual_solution().stream()); + } + + if (pdlp_primal_dual_init || primal_weight_init) { pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -817,7 +822,7 @@ optimization_problem_solution_t run_batch_pdlp( warm_start_settings.inside_mip = true; optimization_problem_solution_t original_solution = solve_lp(problem, warm_start_settings); - if (primal_dual_init) { + if (pdlp_primal_dual_init) { initial_primal = rmm::device_uvector(original_solution.get_primal_solution(), original_solution.get_primal_solution().stream()); initial_dual = rmm::device_uvector(original_solution.get_dual_solution(), @@ -847,12 +852,14 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.detect_infeasibility = false; batch_settings.iteration_limit = iteration_limit; batch_settings.inside_mip = true; - if (primal_dual_init) { + if (initial_primal.size() > 0) { batch_settings.set_initial_primal_solution( initial_primal.data(), initial_primal.size(), initial_primal.stream()); batch_settings.set_initial_dual_solution( initial_dual.data(), initial_dual.size(), initial_dual.stream()); - batch_settings.set_initial_step_size(initial_step_size); + if 
(!std::isnan(initial_step_size)) { + batch_settings.set_initial_step_size(initial_step_size); + } } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } From 71e47ebbfeb41b3a65d76fd559b59175a53a0c4a Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 5 Mar 2026 15:30:54 +0100 Subject: [PATCH 19/43] handle batch pdlp being out of memory not as termination error --- cpp/src/pdlp/solve.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index e821e50f07..de538a1351 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -797,9 +797,10 @@ optimization_problem_solution_t run_batch_pdlp( } const double min_estimate = batch_pdlp_memory_estimator(problem, memory_max_batch_size); - cuopt_expects(min_estimate <= free_mem, - error_type_t::OutOfMemoryError, - "Insufficient GPU memory for batch PDLP (min batch size still too large)"); + if (min_estimate > free_mem) { + return optimization_problem_solution_t( + pdlp_termination_status_t::NumericalError, stream); + } } size_t optimal_batch_size = use_optimal_batch_size From d02544181b97fbd36ba915970c7362e57422f742 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 5 Mar 2026 15:34:00 +0100 Subject: [PATCH 20/43] add a basic batch pdlp race strategy in strong branching --- cpp/src/branch_and_bound/branch_and_bound.cpp | 3 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 196 +++++++++++++++++- 2 files changed, 195 insertions(+), 4 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 3d5cbcc64f..0d0ac23e92 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -806,7 +806,8 @@ branch_variable_t branch_and_bound_t::variable_selection( branch_and_bound_worker_t* worker) { logger_t log; - log.log = false; + // TODO put back false + log.log = true; i_t branch_var = -1; 
rounding_direction_t round_dir = rounding_direction_t::NONE; std::vector current_incumbent; diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 2757f7f680..1adfb2355b 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -315,6 +315,36 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data return mps_model; } +template +static cuopt::mps_parser::mps_data_model_t lp_problem_to_mps_data_model( + const lp_problem_t& lp_problem) +{ + cuopt::mps_parser::mps_data_model_t mps_model; + int m = lp_problem.num_rows; + int n = lp_problem.num_cols; + + csr_matrix_t csr_A(m, n, 0); + lp_problem.A.to_compressed_row(csr_A); + + int nz = csr_A.row_start[m]; + + mps_model.set_csr_constraint_matrix( + csr_A.x.data(), nz, csr_A.j.data(), nz, csr_A.row_start.data(), m + 1); + + mps_model.set_objective_coefficients(lp_problem.objective.data(), n); + mps_model.set_objective_scaling_factor(lp_problem.obj_scale); + mps_model.set_objective_offset(lp_problem.obj_constant); + + mps_model.set_variable_lower_bounds(lp_problem.lower.data(), n); + mps_model.set_variable_upper_bounds(lp_problem.upper.data(), n); + + mps_model.set_constraint_lower_bounds(lp_problem.rhs.data(), m); + mps_model.set_constraint_upper_bounds(lp_problem.rhs.data(), m); + mps_model.set_maximize(lp_problem.obj_scale < 0); + + return mps_model; +} + // Merge a single strong branching result from Dual Simplex and PDLP. // Rules: // 1. If both found optimal -> keep DS (higher quality vertex solution) @@ -793,13 +823,97 @@ i_t pseudo_costs_t::reliable_variable_selection( // Shuffle the unreliable list so every variable has the same chance to be selected. 
if (unreliable_list.size() > max_num_candidates) { worker->rng.shuffle(unreliable_list); } + // Variables beyond num_candidates are solved by batch PDLP instead of Dual Simplex + std::vector pdlp_overflow_list; + bool use_pdlp = settings.mip_batch_pdlp_strong_branching == 1 && + static_cast(unreliable_list.size()) > num_candidates; + if (use_pdlp) { + pdlp_overflow_list.assign(unreliable_list.begin() + num_candidates, unreliable_list.end()); + } + + const i_t num_pdlp_vars = pdlp_overflow_list.size(); + std::vector pdlp_obj_down(num_pdlp_vars, std::numeric_limits::quiet_NaN()); + std::vector pdlp_obj_up(num_pdlp_vars, std::numeric_limits::quiet_NaN()); + + // DS can halt PDLP via concurrent_halt, but not the other way around + std::atomic concurrent_halt{0}; + std::thread pdlp_thread; + + if (use_pdlp) { + pdlp_thread = std::thread([&]() { + log.printf("RB batch PDLP: solving %d overflow unreliable variables\n", num_pdlp_vars); + + f_t start_batch = tic(); + + const auto mps_model = lp_problem_to_mps_data_model(worker->leaf_problem); + + std::vector fraction_values; + fraction_values.reserve(num_pdlp_vars); + for (i_t j : pdlp_overflow_list) { + fraction_values.push_back(solution[j]); + } + + const f_t batch_elapsed_time = toc(start_time); + const f_t batch_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (batch_remaining_time <= 0.0) { return; } + + pdlp_solver_settings_t pdlp_settings; + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.time_limit = batch_remaining_time; + + const raft::handle_t batch_pdlp_handle; + const auto solutions = batch_pdlp_solve( + &batch_pdlp_handle, mps_model, pdlp_overflow_list, fraction_values, pdlp_settings); + + f_t batch_pdlp_time = toc(start_batch); + + if (solutions.get_additional_termination_informations().size() != + static_cast(num_pdlp_vars) * 2) { + log.printf("RB batch PDLP failed and produced no solutions\n"); + return; + } + + i_t amount_done = 0; + for (i_t k 
= 0; k < num_pdlp_vars * 2; k++) { + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + amount_done++; + } + } + + log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", + batch_pdlp_time, + amount_done, + num_pdlp_vars * 2); + + for (i_t k = 0; k < num_pdlp_vars; k++) { + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + pdlp_obj_down[k] = solutions.get_dual_objective_value(k); + } + if (solutions.get_termination_status(k + num_pdlp_vars) == + pdlp_termination_status_t::Optimal) { + pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_pdlp_vars); + } + } + }); + } + if (toc(start_time) > settings.time_limit) { log.printf("Time limit reached"); + if (use_pdlp) { + concurrent_halt.store(1); + pdlp_thread.join(); + } return branch_var; } + omp_atomic_t ds_optimal{0}; + omp_atomic_t ds_infeasible{0}; + omp_atomic_t ds_failed{0}; + f_t ds_start_time = tic(); + #pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ - shared(score_mutex) + shared(score_mutex, ds_optimal, ds_infeasible, ds_failed) for (i_t i = 0; i < num_candidates; ++i) { const i_t j = unreliable_list[i]; @@ -826,7 +940,16 @@ i_t pseudo_costs_t::reliable_variable_selection( reliability_branching_settings.lower_max_lp_iter, strong_branching_lp_iter); - if (!std::isnan(obj)) { + if (std::isnan(obj)) { + ds_failed++; + } else if (std::isinf(obj)) { + ds_infeasible++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + } else { + ds_optimal++; f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); f_t change_in_x = solution[j] - std::floor(solution[j]); pseudo_cost_sum_down[j] += change_in_obj / change_in_x; @@ -857,7 +980,17 @@ i_t pseudo_costs_t::reliable_variable_selection( reliability_branching_settings.lower_max_lp_iter, 
strong_branching_lp_iter); - if (!std::isnan(obj)) { + if (std::isnan(obj)) { + ds_failed++; + } else if (std::isinf(obj)) { + // Is it ok to process infinity obj like this? + ds_infeasible++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + } else { + ds_optimal++; f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); f_t change_in_x = std::ceil(solution[j]) - solution[j]; pseudo_cost_sum_up[j] += change_in_obj / change_in_x; @@ -878,6 +1011,63 @@ i_t pseudo_costs_t::reliable_variable_selection( score_mutex.unlock(); } + f_t ds_elapsed = toc(ds_start_time); + log.printf( + "RB Dual Simplex: %d candidates, %d/%d optimal/dual-feasible, %d/%d infeasible, " + "%d/%d failed in %.2fs\n", + num_candidates, + ds_optimal.load(), + num_candidates * 2, + ds_infeasible.load(), + num_candidates * 2, + ds_failed.load(), + num_candidates * 2, + ds_elapsed); + + if (use_pdlp) { + // Dual Simplex is done on the main thread, telling Batch PDLP to stop + concurrent_halt.store(1); + pdlp_thread.join(); + + i_t pdlp_optimal = 0; + for (i_t k = 0; k < num_pdlp_vars; k++) { + const i_t j = pdlp_overflow_list[k]; + + pseudo_cost_mutex_down[j].lock(); + if (!std::isnan(pdlp_obj_down[k])) { + f_t change_in_obj = std::max(pdlp_obj_down[k] - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + pdlp_optimal++; + } + pseudo_cost_mutex_down[j].unlock(); + + pseudo_cost_mutex_up[j].lock(); + if (!std::isnan(pdlp_obj_up[k])) { + f_t change_in_obj = std::max(pdlp_obj_up[k] - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + pdlp_optimal++; + } + pseudo_cost_mutex_up[j].unlock(); + + f_t 
score = + calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); + if (score > max_score) { + max_score = score; + branch_var = j; + } + } + + log.printf( + "RB batch PDLP: %d candidates, %d/%d optimal\n", + num_pdlp_vars, + pdlp_optimal, + num_pdlp_vars * 2); + } + log.printf( "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score); From 3044887041e3624b35d0f16266ece7a9946f9227 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 6 Mar 2026 16:01:16 +0100 Subject: [PATCH 21/43] fix compilation issue --- cpp/src/branch_and_bound/pseudo_costs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 1adfb2355b..1a0a1f260b 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -16,7 +16,7 @@ #include -#include +#include #include From 0108de47c456d7c1168103b88929776b75bb7e97 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 9 Mar 2026 13:19:34 +0000 Subject: [PATCH 22/43] separate the two batch pdlp settings --- cpp/include/cuopt/linear_programming/constants.h | 1 + .../cuopt/linear_programming/mip/solver_settings.hpp | 1 + cpp/src/branch_and_bound/pseudo_costs.cpp | 7 ++++--- cpp/src/dual_simplex/simplex_solver_settings.hpp | 1 + cpp/src/math_optimization/solver_settings.cu | 1 + cpp/src/mip_heuristics/solver.cu | 2 ++ 6 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 7eb0aa07d6..551d9e6319 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -69,6 +69,7 @@ #define CUOPT_MIP_CUT_CHANGE_THRESHOLD "mip_cut_change_threshold" #define CUOPT_MIP_CUT_MIN_ORTHOGONALITY "mip_cut_min_orthogonality" #define CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING 
"mip_batch_pdlp_strong_branching" +#define CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING "mip_batch_pdlp_reliability_branching" #define CUOPT_SOLUTION_FILE "solution_file" #define CUOPT_NUM_CPU_THREADS "num_cpu_threads" #define CUOPT_NUM_GPUS "num_gpus" diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 95b2dffc46..f9735e1994 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -98,6 +98,7 @@ class mip_solver_settings_t { f_t cut_change_threshold = 1e-3; f_t cut_min_orthogonality = 0.5; i_t mip_batch_pdlp_strong_branching = 1; + i_t mip_batch_pdlp_reliability_branching = 1; i_t num_gpus = 1; bool log_to_console = true; diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 1a0a1f260b..2798058d55 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -825,7 +825,7 @@ i_t pseudo_costs_t::reliable_variable_selection( // Variables beyond num_candidates are solved by batch PDLP instead of Dual Simplex std::vector pdlp_overflow_list; - bool use_pdlp = settings.mip_batch_pdlp_strong_branching == 1 && + bool use_pdlp = settings.mip_batch_pdlp_reliability_branching == 1 && static_cast(unreliable_list.size()) > num_candidates; if (use_pdlp) { pdlp_overflow_list.assign(unreliable_list.begin() + num_candidates, unreliable_list.end()); @@ -881,10 +881,11 @@ i_t pseudo_costs_t::reliable_variable_selection( } } - log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", + log.printf("RB batch PDLP completed in %.2fs. 
Solved %d/%d in %.2fs\n", batch_pdlp_time, amount_done, - num_pdlp_vars * 2); + num_pdlp_vars * 2, + toc(start_batch)); for (i_t k = 0; k < num_pdlp_vars; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 815e229232..2e38117a75 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -186,6 +186,7 @@ struct simplex_solver_settings_t { f_t cut_min_orthogonality; // minimum orthogonality for cuts i_t mip_batch_pdlp_strong_branching{0}; // 0 if not using batch PDLP for strong branching, 1 if // using batch PDLP for strong branching + i_t mip_batch_pdlp_reliability_branching{0}; // 0 if not using batch PDLP for reliability branching, 1 if using it diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index f1350ca432..18e4d1b1e5 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -99,6 +99,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 1, 0}, + {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 1, 0}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, diff --git 
a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index e6f6d50b62..226d3c4b23 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -228,6 +228,8 @@ solution_t mip_solver_t::run_solver() branch_and_bound_settings.cut_min_orthogonality = context.settings.cut_min_orthogonality; branch_and_bound_settings.mip_batch_pdlp_strong_branching = context.settings.mip_batch_pdlp_strong_branching; + branch_and_bound_settings.mip_batch_pdlp_reliability_branching = + context.settings.mip_batch_pdlp_reliability_branching; if (context.settings.num_cpu_threads < 0) { branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1); From 721a56a65f2e5f12b53349bba1587f43b0a7815a Mon Sep 17 00:00:00 2001 From: Christopher Maes Date: Wed, 11 Mar 2026 09:59:38 -0700 Subject: [PATCH 23/43] Fix bug where batch PDLP for strong branching was running on problem without cuts --- cpp/src/branch_and_bound/branch_and_bound.cpp | 4 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 138 +++++++++++------- cpp/src/branch_and_bound/pseudo_costs.hpp | 4 +- 3 files changed, 88 insertions(+), 58 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 41d23bc0ff..3fc12705fd 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2407,10 +2407,10 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut pc_.resize(original_lp_.num_cols); { raft::common::nvtx::range scope_sb("BB::strong_branching"); - strong_branching(original_problem_, - original_lp_, + strong_branching(original_lp_, settings_, exploration_stats_.start_time, + new_slacks_, var_types_, root_relax_soln_.x, fractional, diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index ee7e2f7803..3fd240a1e4 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp 
@@ -220,15 +220,46 @@ f_t trial_branching(const lp_problem_t& original_lp, template static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data_model( - const dual_simplex::user_problem_t& user_problem) + const dual_simplex::lp_problem_t& lp, + const std::vector& new_slacks, + const std::vector& root_soln, + std::vector& original_root_soln_x) { + + // Branch and bound has a problem of the form: + // minimize c^T x + // subject to A*x + Es = b + // l <= x <= u + // E_{jj} = sigma_j, where sigma_j is +1 or -1 + + // We need to convert this into a problem that is better for PDLP + // to solve. PDLP perfers inequality constraints. Thus, we want + // to convert the above into the problem: + // minimize c^T x + // subject to lb <= A*x <= ub + // l <= x <= u + + cuopt::mps_parser::mps_data_model_t mps_model; - int m = user_problem.num_rows; - int n = user_problem.num_cols; + int m = lp.num_rows; + int n = lp.num_cols - new_slacks.size(); + original_root_soln_x.resize(n); + + // Remove slacks from A + dual_simplex::csc_matrix_t A_no_slacks = lp.A; + std::vector cols_to_remove(lp.A.n, 0); + for (i_t j : new_slacks) { + cols_to_remove[j] = 1; + } + A_no_slacks.remove_columns(cols_to_remove); + + for (i_t j = 0; j < n; j++) { + original_root_soln_x[j] = root_soln[j]; + } // Convert CSC to CSR using built-in method dual_simplex::csr_matrix_t csr_A(m, n, 0); - user_problem.A.to_compressed_row(csr_A); + A_no_slacks.to_compressed_row(csr_A); int nz = csr_A.row_start[m]; @@ -237,70 +268,74 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data csr_A.x.data(), nz, csr_A.j.data(), nz, csr_A.row_start.data(), m + 1); // Set objective coefficients - mps_model.set_objective_coefficients(user_problem.objective.data(), n); + mps_model.set_objective_coefficients(lp.objective.data(), n); // Set objective scaling and offset - mps_model.set_objective_scaling_factor(user_problem.obj_scale); - mps_model.set_objective_offset(user_problem.obj_constant); + 
mps_model.set_objective_scaling_factor(lp.obj_scale); + mps_model.set_objective_offset(lp.obj_constant); // Set variable bounds - mps_model.set_variable_lower_bounds(user_problem.lower.data(), n); - mps_model.set_variable_upper_bounds(user_problem.upper.data(), n); + mps_model.set_variable_lower_bounds(lp.lower.data(), n); + mps_model.set_variable_upper_bounds(lp.upper.data(), n); // Convert row sense and RHS to constraint bounds std::vector constraint_lower(m); std::vector constraint_upper(m); - for (i_t i = 0; i < m; ++i) { - if (user_problem.row_sense[i] == 'L') { - constraint_lower[i] = -std::numeric_limits::infinity(); - constraint_upper[i] = user_problem.rhs[i]; - } else if (user_problem.row_sense[i] == 'G') { - constraint_lower[i] = user_problem.rhs[i]; - constraint_upper[i] = std::numeric_limits::infinity(); - } else { - constraint_lower[i] = user_problem.rhs[i]; - constraint_upper[i] = user_problem.rhs[i]; - } + std::vector slack_map(m, -1); + for (i_t j : new_slacks) { + const i_t col_start = lp.A.col_start[j]; + const i_t i = lp.A.i[col_start]; + slack_map[i] = j; } - for (i_t k = 0; k < user_problem.num_range_rows; ++k) { - i_t i = user_problem.range_rows[k]; - f_t r = user_problem.range_value[k]; - f_t b = user_problem.rhs[i]; - f_t h = -std::numeric_limits::infinity(); - f_t u = std::numeric_limits::infinity(); - if (user_problem.row_sense[i] == 'L') { - h = b - std::abs(r); - u = b; - } else if (user_problem.row_sense[i] == 'G') { - h = b; - u = b + std::abs(r); - } else if (user_problem.row_sense[i] == 'E') { - if (r > 0) { - h = b; - u = b + std::abs(r); - } else { - h = b - std::abs(r); - u = b; - } + for (i_t i = 0; i < m; ++i) { + // Each row is of the form a_i^T x + sigma * s_i = b_i + // with sigma = +1 or -1 + // and l_i <= s_i <= u_i + // We have that a_i^T x - b_i = -sigma * s_i + // If sigma = -1, then we have + // a_i^T x - b_i = s_i + // l_i <= a_i^T x - b_i <= u_i + // l_i + b_i <= a_i^T x <= u_i + b_i + // + // If sigma = +1, then we 
have + // a_i^T x - b_i = -s_i + // -a_i^T x + b_i = s_i + // l_i <= -a_i^T x + b_i <= u_i + // l_i - b_i <= -a_i^T x <= u_i - b_i + // -u_i + b_i <= a_i^T x <= -l_i + b_i + + const i_t slack = slack_map[i]; + assert(slack != -1); + const i_t col_start = lp.A.col_start[slack]; + const f_t sigma = lp.A.x[col_start]; + const f_t slack_lower = lp.lower[slack]; + const f_t slack_upper = lp.upper[slack]; + + if (sigma == -1) { + constraint_lower[i] = slack_lower + lp.rhs[i]; + constraint_upper[i] = slack_upper + lp.rhs[i]; + } else if (sigma == 1) { + constraint_lower[i] = -slack_upper + lp.rhs[i]; + constraint_upper[i] = -slack_lower + lp.rhs[i]; + } else { + assert(sigma == 1.0 || sigma == -1.0); } - constraint_lower[i] = h; - constraint_upper[i] = u; } mps_model.set_constraint_lower_bounds(constraint_lower.data(), m); mps_model.set_constraint_upper_bounds(constraint_upper.data(), m); - mps_model.set_maximize(user_problem.obj_scale < 0); + mps_model.set_maximize(lp.obj_scale < 0); return mps_model; } template -void strong_branching(const user_problem_t& original_problem, - const lp_problem_t& original_lp, +void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, f_t start_time, + const std::vector& new_slacks, const std::vector& var_types, const std::vector root_soln, const std::vector& fractional, @@ -321,14 +356,10 @@ void strong_branching(const user_problem_t& original_problem, settings.log.printf("Batch PDLP strong branching enabled\n"); f_t start_batch = tic(); + std::vector original_root_soln_x; - // Use original_problem to create the BatchLP problem - csr_matrix_t A_row(original_problem.A.m, original_problem.A.n, 0); - original_problem.A.to_compressed_row(A_row); + const auto mps_model = simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); - // Convert the root_soln to the original problem space - std::vector original_root_soln_x; - uncrush_primal_solution(original_problem, 
original_lp, root_soln, original_root_soln_x); std::vector fraction_values; @@ -337,7 +368,6 @@ void strong_branching(const user_problem_t& original_problem, fraction_values.push_back(original_root_soln_x[j]); } - const auto mps_model = simplex_problem_to_mps_data_model(original_problem); const f_t batch_elapsed_time = toc(start_time); const f_t batch_remaining_time = std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); @@ -776,10 +806,10 @@ void pseudo_costs_t::update_pseudo_costs_from_strong_branching( template class pseudo_costs_t; -template void strong_branching(const user_problem_t& original_problem, - const lp_problem_t& original_lp, +template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, double start_time, + const std::vector& new_slacks, const std::vector& var_types, const std::vector root_soln, const std::vector& fractional, diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 6b6c6917b6..3323f8bd6f 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -517,10 +517,10 @@ class pseudo_costs_t { }; template -void strong_branching(const user_problem_t& original_problem, - const lp_problem_t& original_lp, +void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, f_t start_time, + const std::vector& new_slacks, const std::vector& var_types, const std::vector root_soln, const std::vector& fractional, From ba1e4bd72744f023ff62690b471db092e8596afe Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 13 Mar 2026 11:45:44 +0100 Subject: [PATCH 24/43] pass slack and correct problem conversion also in reliability branching, correctly fill the ds_obj objective before merging results at the root, correctly clamp the PDLP objective, remove the unnecessary cuopt_assert regarding fixed point error --- cpp/src/branch_and_bound/branch_and_bound.cpp | 4 +- 
cpp/src/branch_and_bound/pseudo_costs.cpp | 55 +++++++------------ cpp/src/branch_and_bound/pseudo_costs.hpp | 4 +- cpp/src/pdlp/pdlp.cu | 3 - 4 files changed, 26 insertions(+), 40 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 1dac28ae46..de448a18d5 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -829,7 +829,9 @@ branch_variable_t branch_and_bound_t::variable_selection( exploration_stats_, upper_bound_, worker_pool_.num_idle_workers(), - log); + log, + new_slacks_, + original_lp_); } else { branch_var = pc_.variable_selection(fractional, solution, log); } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index f3f939c9d4..db28888c69 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -102,6 +102,7 @@ void strong_branch_helper(i_t start, if (branch == 0) { pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0); + ds_obj_down[k] = std::max(obj - root_obj, 0.0); ds_status_down[k] = status; if (verbose) { settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n", @@ -114,6 +115,7 @@ void strong_branch_helper(i_t start, } } else { pc.strong_branch_up[k] = std::max(obj - root_obj, 0.0); + ds_obj_up[k] = std::max(obj - root_obj, 0.0); ds_status_up[k] = status; if (verbose) { settings.log.printf( @@ -348,36 +350,6 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data return mps_model; } -template -static cuopt::mps_parser::mps_data_model_t lp_problem_to_mps_data_model( - const lp_problem_t& lp_problem) -{ - cuopt::mps_parser::mps_data_model_t mps_model; - int m = lp_problem.num_rows; - int n = lp_problem.num_cols; - - csr_matrix_t csr_A(m, n, 0); - lp_problem.A.to_compressed_row(csr_A); - - int nz = csr_A.row_start[m]; - - mps_model.set_csr_constraint_matrix( - 
csr_A.x.data(), nz, csr_A.j.data(), nz, csr_A.row_start.data(), m + 1); - - mps_model.set_objective_coefficients(lp_problem.objective.data(), n); - mps_model.set_objective_scaling_factor(lp_problem.obj_scale); - mps_model.set_objective_offset(lp_problem.obj_constant); - - mps_model.set_variable_lower_bounds(lp_problem.lower.data(), n); - mps_model.set_variable_upper_bounds(lp_problem.upper.data(), n); - - mps_model.set_constraint_lower_bounds(lp_problem.rhs.data(), m); - mps_model.set_constraint_upper_bounds(lp_problem.rhs.data(), m); - mps_model.set_maximize(lp_problem.obj_scale < 0); - - return mps_model; -} - // Merge a single strong branching result from Dual Simplex and PDLP. // Rules: // 1. If both found optimal -> keep DS (higher quality vertex solution) @@ -536,8 +508,8 @@ void strong_branching(const lp_problem_t& original_lp, ? solutions.get_dual_objective_value(k + fractional.size()) : std::numeric_limits::quiet_NaN(); - pdlp_obj_down[k] = obj_down - root_obj; - pdlp_obj_up[k] = obj_up - root_obj; + pdlp_obj_down[k] = std::max(obj_down - root_obj, f_t(0.0)); + pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); } // Batch PDLP finished – tell Dual Simplex to stop @@ -763,7 +735,9 @@ i_t pseudo_costs_t::reliable_variable_selection( const branch_and_bound_stats_t& bnb_stats, f_t upper_bound, int max_num_tasks, - logger_t& log) + logger_t& log, + const std::vector& new_slacks, + const lp_problem_t& original_lp) { constexpr f_t eps = 1e-6; f_t start_time = bnb_stats.start_time; @@ -873,12 +847,23 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t start_batch = tic(); - const auto mps_model = lp_problem_to_mps_data_model(worker->leaf_problem); + std::vector original_soln_x; + // Convert the original_lp that has cuts to a problem that is better for PDLP + auto mps_model = simplex_problem_to_mps_data_model( + original_lp, new_slacks, solution, original_soln_x); + // Apply the bounds of the current leaf problem + { + const i_t n_orig = 
original_lp.num_cols - new_slacks.size(); + for (i_t j = 0; j < n_orig; j++) { + mps_model.variable_lower_bounds_[j] = worker->leaf_problem.lower[j]; + mps_model.variable_upper_bounds_[j] = worker->leaf_problem.upper[j]; + } + } std::vector fraction_values; fraction_values.reserve(num_pdlp_vars); for (i_t j : pdlp_overflow_list) { - fraction_values.push_back(solution[j]); + fraction_values.push_back(original_soln_x[j]); } const f_t batch_elapsed_time = toc(start_time); diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 8a408c81e3..75cf660621 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -481,7 +481,9 @@ class pseudo_costs_t { const branch_and_bound_stats_t& bnb_stats, f_t upper_bound, int max_num_tasks, - logger_t& log); + logger_t& log, + const std::vector& new_slacks, + const lp_problem_t& original_lp); void update_pseudo_costs_from_strong_branching(const std::vector& fractional, const std::vector& root_soln); diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 7bdff6b4e7..dd1848e53a 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1506,9 +1506,6 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - cuopt_assert(movement + computed_interaction >= f_t(0.0), - "Movement + computed interaction must be >= 0"); - // Clamp to 0 to avoid NaN *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); From d513865b34b24dedb21daf6808dd2f9406e7d794 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Mar 2026 11:08:54 +0100 Subject: [PATCH 25/43] add initial pdlp iteartions to the warm start data and on by default --- .../linear_programming/pdlp/solver_settings.hpp | 11 +++++++++++ cpp/src/pdlp/pdlp.cu | 13 +++++++++++++ 
cpp/src/pdlp/solve.cu | 8 ++++++++ cpp/src/pdlp/solver_settings.cu | 12 ++++++++++++ 4 files changed, 44 insertions(+) diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index d3f59144cc..72be0943da 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -147,6 +147,12 @@ class pdlp_solver_settings_t { * @param[in] initial_primal_weight Initial primal weight. */ void set_initial_primal_weight(f_t initial_primal_weight); + /** + * @brief Set an initial pdlp iteration. + * + * @param[in] initial_pdlp_iteration Initial pdlp iteration. + */ + void set_initial_pdlp_iteration(i_t initial_pdlp_iteration); /** * @brief Set the pdlp warm start data. This allows to restart PDLP with a @@ -213,6 +219,8 @@ class pdlp_solver_settings_t { std::optional get_initial_step_size() const; // TODO batch mode: tmp std::optional get_initial_primal_weight() const; + // TODO batch mode: tmp + std::optional get_initial_pdlp_iteration() const; const rmm::device_uvector& get_initial_primal_solution() const; const rmm::device_uvector& get_initial_dual_solution() const; @@ -284,6 +292,9 @@ class pdlp_solver_settings_t { /** Initial primal weight */ // TODO batch mode: tmp std::optional initial_primal_weight_; + /** Initial pdlp iteration */ + // TODO batch mode: tmp + std::optional initial_pdlp_iteration_; /** GPU-backed warm start data (device_uvector), used by C++ API and local GPU solves */ pdlp_warm_start_data_t pdlp_warm_start_data_; /** Warm start data as spans over external memory, used by Cython/Python interface */ diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index dd1848e53a..7edbeaff15 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2195,6 +2195,19 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co pdhg_solver_.total_pdhg_iterations_ = initial_k_.value(); 
pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); } + if (settings_.get_initial_pdlp_iteration().has_value()) { + total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value(); + // This is meaningless in batch mode since pdhg step is never used, set it just to avoid assertions + pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, stream_view_); + pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_; + // Reset the fixed point error since at this pdlp iteration it is expected to already be initialized to some value + std::fill(restart_strategy_.initial_fixed_point_error_.begin(), + restart_strategy_.initial_fixed_point_error_.end(), + f_t(0.0)); + std::fill(restart_strategy_.fixed_point_error_.begin(), + restart_strategy_.fixed_point_error_.end(), + f_t(0.0)); + } // Only the primal_weight_ and step_size_ variables are initialized during the initial phase // The associated primal/dual step_size (computed using the two firstly mentionned) are not diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 90f2a03590..9676ef483f 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -906,6 +906,7 @@ optimization_problem_solution_t run_batch_pdlp( // Hyper parameter than can be changed, I have put what I believe to be the best bool pdlp_primal_dual_init = true; bool primal_weight_init = true; + bool use_initial_pdlp_iterations = true; bool use_optimal_batch_size = false; constexpr int iteration_limit = 100000; @@ -915,6 +916,7 @@ optimization_problem_solution_t run_batch_pdlp( rmm::device_uvector initial_dual(0, stream); f_t initial_step_size = std::numeric_limits::signaling_NaN(); f_t initial_primal_weight = std::numeric_limits::signaling_NaN(); + i_t initial_pdlp_iteration = -1; cuopt_assert(settings.new_bounds.size() > 0, "Batch size should be greater than 0"); const size_t max_batch_size = settings.new_bounds.size(); @@ -993,6 +995,9 @@ 
optimization_problem_solution_t run_batch_pdlp( if (primal_weight_init) { initial_primal_weight = original_solution.get_pdlp_warm_start_data().initial_primal_weight_; } + if (use_initial_pdlp_iterations) { + initial_pdlp_iteration = original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + } } // We don't use the solutions vectors for now @@ -1021,6 +1026,9 @@ optimization_problem_solution_t run_batch_pdlp( if (!std::isnan(initial_step_size)) { batch_settings.set_initial_step_size(initial_step_size); } + if (use_initial_pdlp_iterations) { + batch_settings.set_initial_pdlp_iteration(initial_pdlp_iteration); + } } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } diff --git a/cpp/src/pdlp/solver_settings.cu b/cpp/src/pdlp/solver_settings.cu index 7acfc7481c..30d5ccaea5 100644 --- a/cpp/src/pdlp/solver_settings.cu +++ b/cpp/src/pdlp/solver_settings.cu @@ -348,6 +348,18 @@ std::optional pdlp_solver_settings_t::get_initial_primal_weight() return initial_primal_weight_; } +template +void pdlp_solver_settings_t::set_initial_pdlp_iteration(i_t initial_pdlp_iteration) +{ + initial_pdlp_iteration_ = std::make_optional(initial_pdlp_iteration); +} + +template +std::optional pdlp_solver_settings_t::get_initial_pdlp_iteration() const +{ + return initial_pdlp_iteration_; +} + template const pdlp_warm_start_data_t& pdlp_solver_settings_t::get_pdlp_warm_start_data() const noexcept From a3a458d65c834e50098c83d1d67dce174ddba9ae Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Mar 2026 12:39:34 +0000 Subject: [PATCH 26/43] put clique table in lp necessary file, add solver setting flag to generate solution in batch pdlp only for the test that needs it --- .../pdlp/solver_settings.hpp | 3 +++ cpp/src/mip_heuristics/CMakeLists.txt | 2 +- cpp/src/pdlp/solve.cu | 25 ++++++++++++++++--- cpp/tests/linear_programming/pdlp_test.cu | 9 ++++--- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git 
a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 72be0943da..91ca14e96c 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -280,6 +280,9 @@ class pdlp_solver_settings_t { // concurrently i.e. if new_bounds.size() == 2, then 2 versions of the problem with updated bounds // will be solved concurrently std::vector> new_bounds; + // By default to save memory and speed we don't store and copy each climber's primal and dual solutions + // We only retrieve termination statistics and the objective values + bool generate_batch_primal_dual_solution{false}; private: /** Initial primal solution */ diff --git a/cpp/src/mip_heuristics/CMakeLists.txt b/cpp/src/mip_heuristics/CMakeLists.txt index a200d4265b..5e3d19c8b0 100644 --- a/cpp/src/mip_heuristics/CMakeLists.txt +++ b/cpp/src/mip_heuristics/CMakeLists.txt @@ -14,6 +14,7 @@ set(MIP_LP_NECESSARY_FILES ${CMAKE_CURRENT_SOURCE_DIR}/presolve/third_party_presolve.cpp ${CMAKE_CURRENT_SOURCE_DIR}/presolve/gf2_presolve.cpp ${CMAKE_CURRENT_SOURCE_DIR}/solution/solution.cu + ${CMAKE_CURRENT_SOURCE_DIR}/presolve/conflict_graph/clique_table.cu ) # Files that are MIP-specific and not needed for pure LP @@ -38,7 +39,6 @@ set(MIP_NON_LP_FILES ${CMAKE_CURRENT_SOURCE_DIR}/presolve/multi_probe.cu ${CMAKE_CURRENT_SOURCE_DIR}/presolve/probing_cache.cu ${CMAKE_CURRENT_SOURCE_DIR}/presolve/trivial_presolve.cu - ${CMAKE_CURRENT_SOURCE_DIR}/presolve/conflict_graph/clique_table.cu ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump.cu ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump_kernels.cu ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu) diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 9676ef483f..c59bf1bbdb 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -1000,10 +1000,12 @@ optimization_problem_solution_t 
run_batch_pdlp( } } - // We don't use the solutions vectors for now - rmm::device_uvector full_primal_solution(0, stream); - rmm::device_uvector full_dual_solution(0, stream); - rmm::device_uvector full_reduced_cost(0, stream); + + const bool collect_solutions = settings.generate_batch_primal_dual_solution; + + rmm::device_uvector full_primal_solution((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); + rmm::device_uvector full_dual_solution((collect_solutions) ? problem.get_n_constraints() * max_batch_size : 0, stream); + rmm::device_uvector full_reduced_cost((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); std::vector< typename optimization_problem_solution_t::additional_termination_information_t> @@ -1045,6 +1047,21 @@ optimization_problem_solution_t run_batch_pdlp( auto status = sol.get_terminations_status(); full_status.insert(full_status.end(), status.begin(), status.end()); + + if (collect_solutions) { + raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), + sol.get_primal_solution().data(), + sol.get_primal_solution().size(), + stream); + raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), + sol.get_dual_solution().data(), + sol.get_dual_solution().size(), + stream); + raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), + sol.get_reduced_cost().data(), + sol.get_reduced_cost().size(), + stream); + } } return optimization_problem_solution_t(full_primal_solution, diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index d5a8d69008..9cbca2d86e 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -1677,10 +1677,11 @@ TEST(pdlp_class, strong_branching_test) const std::vector fractional = {1, 2, 4}; const std::vector root_soln_x = {0.891, 0.109, 0.636429}; - auto solver_settings = pdlp_solver_settings_t{}; - solver_settings.method = 
cuopt::linear_programming::method_t::PDLP; - solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.generate_batch_primal_dual_solution = true; const int n_fractional = fractional.size(); const int batch_size = n_fractional * 2; From 79d05e770d1c65bf0976b6b9c72d56edeb74c606 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Mar 2026 15:02:55 +0100 Subject: [PATCH 27/43] initial version of work stealing --- .../pdlp/solver_settings.hpp | 6 + cpp/src/branch_and_bound/pseudo_costs.cpp | 71 +++-- .../shared_strong_branching_context.hpp | 50 +++ cpp/src/pdlp/pdlp.cu | 40 ++- cpp/src/pdlp/solve.cu | 8 + .../termination_strategy.cu | 10 +- .../termination_strategy.hpp | 1 + cpp/tests/linear_programming/pdlp_test.cu | 298 ++++++++++++++++++ 8 files changed, 461 insertions(+), 23 deletions(-) create mode 100644 cpp/src/branch_and_bound/shared_strong_branching_context.hpp diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 91ca14e96c..f3521edc54 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -18,6 +18,8 @@ #include +#include + namespace cuopt::linear_programming { // Forward declare solver_settings_t for friend class @@ -272,6 +274,8 @@ class pdlp_solver_settings_t { bool inside_mip{false}; // For concurrent termination std::atomic* concurrent_halt{nullptr}; + // Shared strong branching context view for cooperative DS + PDLP + dual_simplex::shared_strong_branching_context_view_t shared_sb_view; static constexpr f_t 
minimal_absolute_tolerance = 1.0e-12; pdlp_hyper_params::pdlp_hyper_params_t hyper_params; // Holds the information of new variable lower and upper bounds for each climber in the format: @@ -283,6 +287,8 @@ class pdlp_solver_settings_t { // By default to save memory and speed we don't store and copy each climber's primal and dual solutions // We only retrieve termination statistics and the objective values bool generate_batch_primal_dual_solution{false}; + // Used to force batch PDLP to solve a subbatch of the problems at a time + i_t sub_batch_size{0}; private: /** Initial primal solution */ diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index db28888c69..52a7a0ac78 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -6,6 +6,7 @@ /* clang-format on */ #include +#include #include #include @@ -41,7 +42,8 @@ void strong_branch_helper(i_t start, std::vector& ds_obj_up, std::vector& ds_status_down, std::vector& ds_status_up, - std::atomic* concurrent_halt) + std::atomic* concurrent_halt, + shared_strong_branching_context_view_t& sb_view) { raft::common::nvtx::range scope("BB::strong_branch_helper"); lp_problem_t child_problem = original_lp; @@ -56,6 +58,15 @@ void strong_branch_helper(i_t start, for (i_t branch = 0; branch < 2; branch++) { // Do the down branch + const i_t shared_idx = (branch == 0) ? k : k + static_cast(fractional.size()); + // Batch PDLP has already solved this subproblem, skip it + if (sb_view.is_valid() && sb_view.is_solved(shared_idx)) { + settings.log.printf( + "[COOP SB] DS thread %d skipping variable %d branch %s (shared_idx %d): already solved by PDLP\n", + thread_id, j, branch == 0 ? 
"down" : "up", shared_idx); + continue; + } + if (branch == 0) { child_problem.lower[j] = original_lp.lower[j]; child_problem.upper[j] = std::floor(root_soln[j]); @@ -131,6 +142,13 @@ void strong_branch_helper(i_t start, toc(start_time)); } } + // Mark the subproblem as solved so that batch PDLP removes it from the batch + if (sb_view.is_valid()) { + sb_view.mark_solved(shared_idx); + settings.log.printf( + "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in shared context\n", + thread_id, j, branch == 0 ? "down" : "up", shared_idx); + } if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; } @@ -408,7 +426,10 @@ void strong_branching(const lp_problem_t& original_lp, settings.num_threads, fractional.size()); - // Race both batch PDLP and parallel Dual Simplex + // Cooperative DS + PDLP: shared context tracks which subproblems are solved + shared_strong_branching_context_t shared_ctx(2 * fractional.size()); + shared_strong_branching_context_view_t sb_view(std::span(shared_ctx.solved)); + std::atomic concurrent_halt{0}; std::vector pdlp_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); @@ -446,6 +467,7 @@ void strong_branching(const lp_problem_t& original_lp, pdlp_solver_settings_t pdlp_settings; pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_view = sb_view; pdlp_settings.time_limit = batch_remaining_time; const raft::handle_t batch_pdlp_handle; @@ -512,8 +534,6 @@ void strong_branching(const lp_problem_t& original_lp, pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); } - // Batch PDLP finished – tell Dual Simplex to stop - concurrent_halt.store(1); }); std::vector ds_status_down(fractional.size(), dual::status_t::UNSET); @@ -559,20 +579,12 @@ void strong_branching(const lp_problem_t& original_lp, ds_obj_up, ds_status_down, ds_status_up, - &concurrent_halt); + &concurrent_halt, + sb_view); } } - if (settings.mip_batch_pdlp_strong_branching == 1) { - if 
(concurrent_halt.load() == 1) { - settings.log.printf("Batch PDLP finished before Dual Simplex\n"); - } - else { - settings.log.printf("Dual Simplex finished before Batch PDLP\n"); - } - } - - // Dual Simplex finished all subproblems – tell Batch PDLP to stop + // DS done: signal PDLP to stop (time-limit or all work done) and wait concurrent_halt.store(1); pdlp_thread.join(); @@ -614,25 +626,46 @@ void strong_branching(const lp_problem_t& original_lp, i_t merged_from_ds = 0; i_t merged_from_pdlp = 0; i_t merged_nan = 0; + i_t solved_by_both_down = 0; + i_t solved_by_both_up = 0; for (i_t k = 0; k < fractional.size(); k++) { - const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], pdlp_obj_down[k], !std::isnan(pdlp_obj_down[k])); + bool ds_has_down = ds_status_down[k] != dual::status_t::UNSET; + bool pdlp_has_down = !std::isnan(pdlp_obj_down[k]); + const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], pdlp_obj_down[k], pdlp_has_down); pc.strong_branch_down[k] = value_down; if (source_down == 0) merged_from_ds++; else if (source_down == 1) merged_from_pdlp++; else merged_nan++; - const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], !std::isnan(pdlp_obj_up[k])); + if (ds_has_down && pdlp_has_down) { + solved_by_both_down++; + settings.log.printf( + "[COOP SB] Merge: variable %d DOWN solved by BOTH (DS=%e PDLP=%e) -> kept %s\n", + fractional[k], ds_obj_down[k], pdlp_obj_down[k], source_down == 0 ? 
"DS" : "PDLP"); + } + + bool ds_has_up = ds_status_up[k] != dual::status_t::UNSET; + bool pdlp_has_up = !std::isnan(pdlp_obj_up[k]); + const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], pdlp_has_up); pc.strong_branch_up[k] = value_up; if (source_up == 0) merged_from_ds++; else if (source_up == 1) merged_from_pdlp++; else merged_nan++; + if (ds_has_up && pdlp_has_up) { + solved_by_both_up++; + settings.log.printf( + "[COOP SB] Merge: variable %d UP solved by BOTH (DS=%e PDLP=%e) -> kept %s\n", + fractional[k], ds_obj_up[k], pdlp_obj_up[k], source_up == 0 ? "DS" : "PDLP"); + } } if (settings.mip_batch_pdlp_strong_branching == 1) { settings.log.printf( - "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN)\n", + "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both (down/up)\n", merged_from_ds, merged_from_pdlp, - merged_nan); + merged_nan, + solved_by_both_down, + solved_by_both_up); } pc.update_pseudo_costs_from_strong_branching(fractional, root_soln); diff --git a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp new file mode 100644 index 0000000000..6cbea737f5 --- /dev/null +++ b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp @@ -0,0 +1,50 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +struct shared_strong_branching_context_t { + std::vector> solved; + + explicit shared_strong_branching_context_t(size_t num_subproblems) : solved(num_subproblems) + { + for (auto& s : solved) + s.store(0); + } +}; + +template +struct shared_strong_branching_context_view_t { + std::span> solved; + + shared_strong_branching_context_view_t() = default; + + shared_strong_branching_context_view_t(std::span> s) : solved(s) {} + + bool is_valid() const { return !solved.empty(); } + + bool is_solved(i_t local_idx) const + { + return solved[local_idx].load() != 0; + } + + void mark_solved(i_t local_idx) const { solved[local_idx].store(1); } + + shared_strong_branching_context_view_t subview(i_t offset, i_t count) const + { + return {solved.subspan(offset, count)}; + } +}; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 7edbeaff15..9d5715a936 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -776,7 +776,27 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) } #endif - // All are optimal or infeasible + // Sync external solved status into internal termination strategy before all_done() check + if (settings_.shared_sb_view.is_valid()) { + for (size_t i = 0; i < climber_strategies_.size(); ++i) { + // If PDLP has solved it to optimality we want to keep it and resolved both solvers having solved the problem later + if (current_termination_strategy_.is_done( + current_termination_strategy_.get_termination_status(i))) + continue; + const i_t local_idx = climber_strategies_[i].original_index; + if (settings_.shared_sb_view.is_solved(local_idx)) { + current_termination_strategy_.set_termination_status(i, + pdlp_termination_status_t::ConcurrentLimit); +#ifdef BATCH_VERBOSE_MODE + std::cout << "[COOP SB] DS 
already solved climber " << i << " (original_index " + << local_idx << "), synced to ConcurrentLimit at step " + << total_pdlp_iterations_ << std::endl; +#endif + } + } + } + + // All are optimal, infeasible, or externally solved if (current_termination_strategy_.all_done()) { const auto original_batch_size = settings_.new_bounds.size(); // Some climber got removed from the batch while the optimization was running @@ -823,6 +843,9 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) .get_additional_termination_informations()[climber_strategies_[i].original_index] .solved_by_pdlp = (current_termination_strategy_.get_termination_status(i) != pdlp_termination_status_t::ConcurrentLimit); + if (settings_.shared_sb_view.is_valid()) { + settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); + } } current_termination_strategy_.fill_gpu_terms_stats(total_pdlp_iterations_); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -839,6 +862,11 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) std::move(batch_solution_to_return_.get_additional_termination_informations()), std::move(batch_solution_to_return_.get_terminations_status())}; } + if (settings_.shared_sb_view.is_valid()) { + for (size_t i = 0; i < climber_strategies_.size(); ++i) { + settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); + } + } RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, @@ -857,8 +885,11 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) current_termination_strategy_.get_termination_status(i))) { raft::common::nvtx::range fun_scope("remove_done_climber"); #ifdef BATCH_VERBOSE_MODE - std::cout << "Removing climber " << i << " because it is done. 
Its original index is " - << climber_strategies_[i].original_index << std::endl; + const bool externally_solved = (current_termination_strategy_.get_termination_status(i) == + pdlp_termination_status_t::ConcurrentLimit); + std::cout << "Removing climber " << i << " (original_index " + << climber_strategies_[i].original_index << ") because it is done" + << (externally_solved ? " [solved by DS]" : " [solved by PDLP]") << std::endl; #endif to_remove.emplace(i); // Copy current climber solution information @@ -891,6 +922,9 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) .get_additional_termination_informations()[climber_strategies_[i].original_index] .solved_by_pdlp = (current_termination_strategy_.get_termination_status(i) != pdlp_termination_status_t::ConcurrentLimit); + if (settings_.shared_sb_view.is_valid()) { + settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); + } } } if (to_remove.size() > 0) { diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index c59bf1bbdb..ced3844a9b 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -962,6 +962,9 @@ optimization_problem_solution_t run_batch_pdlp( size_t optimal_batch_size = use_optimal_batch_size ? 
detail::optimal_batch_size_handler(problem, memory_max_batch_size) : max_batch_size; + if (settings.sub_batch_size > 0) { + optimal_batch_size = settings.sub_batch_size; + } cuopt_assert(optimal_batch_size != 0 && optimal_batch_size <= max_batch_size, "Optimal batch size should be between 1 and max batch size"); using f_t2 = typename type_2::type; @@ -1040,6 +1043,11 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.new_bounds = std::vector>( original_new_bounds.begin() + i, original_new_bounds.begin() + i + current_batch_size); + if (settings.shared_sb_view.is_valid()) { + batch_settings.shared_sb_view = + settings.shared_sb_view.subview(i, current_batch_size); + } + auto sol = solve_lp(problem, batch_settings); auto info = sol.get_additional_termination_informations(); diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.cu b/cpp/src/pdlp/termination_strategy/termination_strategy.cu index 7179df6a49..563850dc0c 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.cu +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.cu @@ -124,6 +124,13 @@ pdlp_termination_status_t pdlp_termination_strategy_t::get_termination return (pdlp_termination_status_t)termination_status_[id]; } +template +void pdlp_termination_strategy_t::set_termination_status( + i_t id, pdlp_termination_status_t status) +{ + termination_status_[id] = (i_t)status; +} + template std::vector pdlp_termination_strategy_t::get_terminations_status() @@ -389,7 +396,8 @@ __host__ __device__ bool pdlp_termination_strategy_t::is_done( { return termination_status == pdlp_termination_status_t::Optimal || termination_status == pdlp_termination_status_t::PrimalInfeasible || - termination_status == pdlp_termination_status_t::DualInfeasible; + termination_status == pdlp_termination_status_t::DualInfeasible || + termination_status == pdlp_termination_status_t::ConcurrentLimit; } template diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp 
b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp index 6fe118c488..efb7a41d7b 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp @@ -140,6 +140,7 @@ class pdlp_termination_strategy_t { f_t get_relative_primal_tolerance_factor() const; pdlp_termination_status_t get_termination_status(i_t id) const; + void set_termination_status(i_t id, pdlp_termination_status_t status); std::vector get_terminations_status(); bool all_optimal_status() const; bool all_done() const; diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index 9cbca2d86e..be91e96015 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -43,8 +43,11 @@ #include #include #include +#include #include +#include + namespace cuopt::linear_programming::test { constexpr double afiro_primal_objective = -464.0; @@ -2044,6 +2047,301 @@ TEST(pdlp_class, precision_single_pslp_presolve) afiro_primal_objective, solution.get_additional_termination_information().primal_objective)); } +// --------------------------------------------------------------------------- +// Cooperative strong branching tests +// --------------------------------------------------------------------------- + +TEST(pdlp_class, shared_sb_context_unit) +{ + using namespace cuopt::linear_programming::dual_simplex; + + constexpr int N = 10; + shared_strong_branching_context_t ctx(N); + shared_strong_branching_context_view_t view(std::span(ctx.solved)); + + EXPECT_TRUE(view.is_valid()); + + shared_strong_branching_context_view_t empty_view; + EXPECT_FALSE(empty_view.is_valid()); + + for (int i = 0; i < N; ++i) { + EXPECT_FALSE(view.is_solved(i)); + } + + view.mark_solved(0); + view.mark_solved(3); + view.mark_solved(7); + + EXPECT_TRUE(view.is_solved(0)); + EXPECT_FALSE(view.is_solved(1)); + EXPECT_FALSE(view.is_solved(2)); + EXPECT_TRUE(view.is_solved(3)); + 
EXPECT_FALSE(view.is_solved(4)); + EXPECT_FALSE(view.is_solved(5)); + EXPECT_FALSE(view.is_solved(6)); + EXPECT_TRUE(view.is_solved(7)); + EXPECT_FALSE(view.is_solved(8)); + EXPECT_FALSE(view.is_solved(9)); + + // subview(2, 5) covers global indices [2..6] + auto sv = view.subview(2, 5); + EXPECT_TRUE(sv.is_valid()); + EXPECT_FALSE(sv.is_solved(0)); // global 2 + EXPECT_TRUE(sv.is_solved(1)); // global 3 + EXPECT_FALSE(sv.is_solved(2)); // global 4 + EXPECT_FALSE(sv.is_solved(3)); // global 5 + EXPECT_FALSE(sv.is_solved(4)); // global 6 + + // Mark through subview: local 4 -> global 6 + sv.mark_solved(4); + EXPECT_TRUE(view.is_solved(6)); + EXPECT_TRUE(sv.is_solved(4)); +} + +TEST(pdlp_class, shared_sb_view_batch_pre_solved) +{ + using namespace cuopt::linear_programming::dual_simplex; + + const raft::handle_t handle_{}; + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::mps_parser::mps_data_model_t op_problem = + cuopt::mps_parser::parse_mps(path, true); + + const std::vector fractional = {1, 2, 4}; + const std::vector root_soln_x = {0.891, 0.109, 0.636429}; + const int n_fractional = fractional.size(); + const int batch_size = n_fractional * 2; // 6 + + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + + // Build new_bounds: down branches [0..2], up branches [3..5] + for (int i = 0; i < n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[i], + op_problem.get_variable_lower_bounds()[fractional[i]], + std::floor(root_soln_x[i])}); + for (int i = 0; i < n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[i], + std::ceil(root_soln_x[i]), + op_problem.get_variable_upper_bounds()[fractional[i]]}); + + shared_strong_branching_context_t ctx(batch_size); + + // Pre-mark entries 1 and 4 as solved 
(simulating DS) + ctx.solved[1].store(1); + ctx.solved[4].store(1); + + solver_settings.shared_sb_view = + shared_strong_branching_context_view_t(std::span(ctx.solved)); + + auto solution = solve_lp(&handle_, op_problem, solver_settings); + + ASSERT_EQ(solution.get_terminations_status().size(), batch_size); + + // Pre-solved entries should have ConcurrentLimit + EXPECT_EQ(solution.get_termination_status(1), pdlp_termination_status_t::ConcurrentLimit); + EXPECT_EQ(solution.get_termination_status(4), pdlp_termination_status_t::ConcurrentLimit); + + // Others should be Optimal + EXPECT_EQ(solution.get_termination_status(0), pdlp_termination_status_t::Optimal); + EXPECT_EQ(solution.get_termination_status(2), pdlp_termination_status_t::Optimal); + EXPECT_EQ(solution.get_termination_status(3), pdlp_termination_status_t::Optimal); + EXPECT_EQ(solution.get_termination_status(5), pdlp_termination_status_t::Optimal); + + // All entries should now be marked solved in the shared context + for (int i = 0; i < batch_size; ++i) { + EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + } +} + +TEST(pdlp_class, shared_sb_view_subbatch) +{ + using namespace cuopt::linear_programming::dual_simplex; + + const raft::handle_t handle_{}; + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::mps_parser::mps_data_model_t op_problem = + cuopt::mps_parser::parse_mps(path, true); + + const std::vector fractional = {1, 2, 4}; + const std::vector root_soln_x = {0.891, 0.109, 0.636429}; + const int n_fractional = fractional.size(); + const int batch_size = n_fractional * 2; + + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.sub_batch_size = 2; + + shared_strong_branching_context_t ctx(batch_size); + + // 
Pre-mark one entry in each sub-batch of size 2: indices 1, 4 + ctx.solved[1].store(1); + ctx.solved[4].store(1); + + solver_settings.shared_sb_view = + shared_strong_branching_context_view_t(std::span(ctx.solved)); + + auto solution = batch_pdlp_solve(&handle_, op_problem, fractional, root_soln_x, solver_settings); + + ASSERT_EQ(solution.get_terminations_status().size(), batch_size); + + // Pre-solved entries should have ConcurrentLimit + EXPECT_EQ(solution.get_termination_status(1), pdlp_termination_status_t::ConcurrentLimit); + EXPECT_EQ(solution.get_termination_status(4), pdlp_termination_status_t::ConcurrentLimit); + + // Others should be Optimal + for (int i = 0; i < batch_size; ++i) { + if (i == 1 || i == 4) continue; + EXPECT_EQ(solution.get_termination_status(i), pdlp_termination_status_t::Optimal) + << "Entry " << i << " should be Optimal"; + } + + // All should be marked solved + for (int i = 0; i < batch_size; ++i) { + EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + } +} + +TEST(pdlp_class, shared_sb_view_concurrent_mark) +{ + using namespace cuopt::linear_programming::dual_simplex; + + const raft::handle_t handle_{}; + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::mps_parser::mps_data_model_t op_problem = + cuopt::mps_parser::parse_mps(path, true); + + const std::vector fractional = {1, 2, 4}; + const std::vector root_soln_x = {0.891, 0.109, 0.636429}; + const int n_fractional = fractional.size(); + const int batch_size = n_fractional * 2; + + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.iteration_limit = 1000000; + + for (int i = 0; i < n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[0], + -5, + -5}); + + for (int i = 0; i < 
n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[i], + std::ceil(root_soln_x[i]), + op_problem.get_variable_upper_bounds()[fractional[i]]}); + + shared_strong_branching_context_t ctx(batch_size); + + solver_settings.shared_sb_view = + shared_strong_branching_context_view_t(std::span(ctx.solved)); + + optimization_problem_solution_t* result_ptr = nullptr; + + auto pdlp_thread = std::thread([&]() { + auto sol = new optimization_problem_solution_t( + solve_lp(&handle_, op_problem, solver_settings)); + result_ptr = sol; + }); + + // Wait a bit then mark entries 0, 2, 4 as solved (simulating DS) + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + for (int i = 0; i < n_fractional; ++i) + ctx.solved[i].store(1); + + pdlp_thread.join(); + + ASSERT_NE(result_ptr, nullptr); + auto& solution = *result_ptr; + + ASSERT_EQ(solution.get_terminations_status().size(), batch_size); + + for (int i = 0; i < batch_size; ++i) { + auto status = solution.get_termination_status(i); + // Each entry should be either Optimal (PDLP solved it first) or ConcurrentLimit (DS marked it) + EXPECT_TRUE(status == pdlp_termination_status_t::Optimal || + status == pdlp_termination_status_t::ConcurrentLimit) + << "Entry " << i << " has unexpected status " << cuopt::linear_programming::optimization_problem_solution_t::get_termination_status_string(status); + } + + // All entries should end up marked solved + for (int i = 0; i < batch_size; ++i) { + EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + } + + delete result_ptr; +} + +TEST(pdlp_class, shared_sb_view_all_infeasible) +{ + using namespace cuopt::linear_programming::dual_simplex; + + const raft::handle_t handle_{}; + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::mps_parser::mps_data_model_t op_problem = + cuopt::mps_parser::parse_mps(path, true); + + const std::vector fractional = {1, 2, 4}; + const std::vector root_soln_x = {0.891, 0.109, 
0.636429}; + const int n_fractional = fractional.size(); + const int batch_size = n_fractional; + + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.iteration_limit = 1000000; + + for (int i = 0; i < n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[0], + -5, + -5}); + + shared_strong_branching_context_t ctx(batch_size); + + solver_settings.shared_sb_view = + shared_strong_branching_context_view_t(std::span(ctx.solved)); + + optimization_problem_solution_t* result_ptr = nullptr; + + auto pdlp_thread = std::thread([&]() { + auto sol = new optimization_problem_solution_t( + solve_lp(&handle_, op_problem, solver_settings)); + result_ptr = sol; + }); + + // Wait a bit then mark entries 0, 2, 4 as solved (simulating DS) + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + for (int i = 0; i < n_fractional; ++i) + ctx.solved[i].store(1); + + pdlp_thread.join(); + + ASSERT_NE(result_ptr, nullptr); + auto& solution = *result_ptr; + + ASSERT_EQ(solution.get_terminations_status().size(), batch_size); + + for (int i = 0; i < batch_size; ++i) { + auto status = solution.get_termination_status(i); + // Each entry should be either Optimal (PDLP solved it first) or ConcurrentLimit (DS marked it) + EXPECT_TRUE(status == pdlp_termination_status_t::ConcurrentLimit) + << "Entry " << i << " has unexpected status " << cuopt::linear_programming::optimization_problem_solution_t::get_termination_status_string(status); + } + + // All entries should end up marked solved + for (int i = 0; i < batch_size; ++i) { + EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + } + + delete result_ptr; +} + } // namespace cuopt::linear_programming::test CUOPT_TEST_PROGRAM_MAIN() From 
2c8bbfd56ea23c18a016601cb0719d5122597ae7 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Mar 2026 17:11:21 +0100 Subject: [PATCH 28/43] add option to use either dual simplex, bpdlp, or both with work stealing --- cpp/src/branch_and_bound/pseudo_costs.cpp | 57 ++++++++++++------- .../dual_simplex/simplex_solver_settings.hpp | 3 +- cpp/src/math_optimization/solver_settings.cu | 2 +- .../linear_programming/data_definition.py | 5 +- 4 files changed, 43 insertions(+), 24 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 52a7a0ac78..503d958a83 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -440,7 +440,9 @@ void strong_branching(const lp_problem_t& original_lp, if (settings.mip_batch_pdlp_strong_branching == 0) return; - settings.log.printf("Racing batch PDLP and Dual Simplex for strong branching\n"); + settings.log.printf(settings.mip_batch_pdlp_strong_branching == 2 + ? 
"Batch PDLP only for strong branching\n" + : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); f_t start_batch = tic(); std::vector original_root_soln_x; @@ -466,8 +468,10 @@ void strong_branching(const lp_problem_t& original_lp, if (batch_remaining_time <= 0.0) { return; } pdlp_solver_settings_t pdlp_settings; - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.shared_sb_view = sb_view; + if (settings.mip_batch_pdlp_strong_branching == 1) { + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_view = sb_view; + } pdlp_settings.time_limit = batch_remaining_time; const raft::handle_t batch_pdlp_handle; @@ -542,6 +546,7 @@ void strong_branching(const lp_problem_t& original_lp, std::vector ds_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); f_t dual_simplex_strong_branching_time = tic(); + if (settings.mip_batch_pdlp_strong_branching != 2) { #pragma omp parallel num_threads(settings.num_threads) { i_t n = std::min(4 * settings.num_threads, fractional.size()); @@ -586,6 +591,7 @@ void strong_branching(const lp_problem_t& original_lp, // DS done: signal PDLP to stop (time-limit or all work done) and wait concurrent_halt.store(1); + } pdlp_thread.join(); @@ -593,24 +599,37 @@ void strong_branching(const lp_problem_t& original_lp, // Collect Dual Simplex statistics - i_t ds_optimal_count = 0; - i_t ds_dual_feasible_only_count = 0; + i_t ds_optimal = 0, ds_infeasible = 0, ds_iter_limit = 0; + i_t ds_numerical = 0, ds_cutoff = 0, ds_time_limit = 0; + i_t ds_concurrent = 0, ds_work_limit = 0, ds_unset = 0; + const i_t total_subproblems = fractional.size() * 2; for (i_t k = 0; k < fractional.size(); k++) { - if (ds_status_down[k] == dual::status_t::OPTIMAL) ds_optimal_count++; - if (ds_status_up[k] == dual::status_t::OPTIMAL) ds_optimal_count++; - if (ds_status_down[k] == dual::status_t::ITERATION_LIMIT) ds_dual_feasible_only_count++; - if (ds_status_up[k] == dual::status_t::ITERATION_LIMIT) 
ds_dual_feasible_only_count++; + for (auto st : {ds_status_down[k], ds_status_up[k]}) { + switch (st) { + case dual::status_t::OPTIMAL: ds_optimal++; break; + case dual::status_t::DUAL_UNBOUNDED: ds_infeasible++; break; + case dual::status_t::ITERATION_LIMIT: ds_iter_limit++; break; + case dual::status_t::NUMERICAL: ds_numerical++; break; + case dual::status_t::CUTOFF: ds_cutoff++; break; + case dual::status_t::TIME_LIMIT: ds_time_limit++; break; + case dual::status_t::CONCURRENT_LIMIT: ds_concurrent++; break; + case dual::status_t::WORK_LIMIT: ds_work_limit++; break; + case dual::status_t::UNSET: ds_unset++; break; + } + } } - settings.log.printf( - "Dual Simplex found %d/%d optimal solutions and %d/%d dual feasible only solutions\n", - ds_optimal_count, - fractional.size() * 2, - ds_dual_feasible_only_count, - fractional.size() * 2); - - if (settings.mip_batch_pdlp_strong_branching == 1) { - // Collect Batch PDLP statistics + settings.log.printf("Dual Simplex: %d/%d optimal, %d infeasible, %d iter-limit", + ds_optimal, total_subproblems, ds_infeasible, ds_iter_limit); + if (ds_cutoff) settings.log.printf(", %d cutoff", ds_cutoff); + if (ds_time_limit) settings.log.printf(", %d time-limit", ds_time_limit); + if (ds_numerical) settings.log.printf(", %d numerical", ds_numerical); + if (ds_concurrent) settings.log.printf(", %d concurrent-halt", ds_concurrent); + if (ds_work_limit) settings.log.printf(", %d work-limit", ds_work_limit); + if (ds_unset) settings.log.printf(", %d unset/skipped", ds_unset); + settings.log.printf("\n"); + + if (settings.mip_batch_pdlp_strong_branching != 0) { i_t pdlp_optimal_count = 0; for (i_t k = 0; k < fractional.size(); k++) { if (!std::isnan(pdlp_obj_down[k])) pdlp_optimal_count++; @@ -658,7 +677,7 @@ void strong_branching(const lp_problem_t& original_lp, } } - if (settings.mip_batch_pdlp_strong_branching == 1) { + if (settings.mip_batch_pdlp_strong_branching != 0) { settings.log.printf( "Merged results: %d from DS, %d from PDLP, %d 
unresolved (NaN), %d/%d solved by both (down/up)\n", merged_from_ds, diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 8de5302978..29c7d7a80f 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -187,8 +187,7 @@ struct simplex_solver_settings_t { // strengthening f_t cut_change_threshold; // threshold for cut change f_t cut_min_orthogonality; // minimum orthogonality for cuts - i_t mip_batch_pdlp_strong_branching{0}; // 0 if not using batch PDLP for strong branching, 1 if - // using batch PDLP for strong branching + i_t mip_batch_pdlp_strong_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only i_t mip_batch_pdlp_reliability_branching{0}; // 0 if not using batch PDLP for reliability branching, 1 if diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index aa3741852f..cc2f09d58d 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -99,7 +99,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits::max(), -1}, {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, - {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 1, 0}, + {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 1, 0}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), 
CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 59ea62089d..9ea5cf4e1b 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -452,8 +452,9 @@ class SolverConfig(BaseModel): ) mip_batch_pdlp_strong_branching: Optional[int] = Field( default=0, - description="Set 1 to enable batch PDLP strong branching " - "in the MIP solver, 0 to disable.", + description="Strong branching mode: 0 = Dual Simplex only, " + "1 = cooperative work-stealing (DS + batch PDLP), " + "2 = batch PDLP only.", ) num_cpu_threads: Optional[int] = Field( default=None, From 0968167a551448a20ec80e4865a037e4274cb91b Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 23 Mar 2026 13:04:45 +0000 Subject: [PATCH 29/43] fix: resize the buffers to handle the case where we go to a single column which internally makes the spmm switch to spmv which need a new buffer --- cpp/src/pdlp/pdlp.cu | 86 ++++++++++++++++++++++++++++++++++++++++++- cpp/src/pdlp/solve.cu | 8 ++++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 9d5715a936..bd53b1d93b 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -790,7 +790,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) #ifdef BATCH_VERBOSE_MODE std::cout << "[COOP SB] DS already solved climber " << i << " (original_index " << local_idx << "), synced to ConcurrentLimit at step " - << total_pdlp_iterations_ << std::endl; + << internal_solver_iterations_ << std::endl; #endif } } @@ -1798,6 +1798,90 @@ void pdlp_solver_t::resize_and_swap_all_context_loop( pdhg_solver_.get_primal_tmp_resource().data(), CUSPARSE_ORDER_COL); + // Recalculate SpMM buffer sizes for the new batch 
dimensions. + // cuSparse may require different buffer sizes when the number of columns changes + // (e.g. SpMM with 1 column may internally fall back to SpMV with larger buffer needs). + { + size_t new_buf_size = 0; + + // PDHG row-row: A_T * batch_dual_solutions -> batch_current_AtYs + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + pdhg_cusparse_view.A_T, + pdhg_cusparse_view.batch_dual_solutions, + reusable_device_scalar_value_0_.data(), + pdhg_cusparse_view.batch_current_AtYs, + (deterministic_batch_pdlp) ? CUSPARSE_SPMM_CSR_ALG3 : CUSPARSE_SPMM_CSR_ALG2, + &new_buf_size, + stream_view_)); + pdhg_cusparse_view.buffer_transpose_batch_row_row_.resize(new_buf_size, stream_view_); + + // PDHG row-row: A * batch_reflected_primal_solutions -> batch_dual_gradients + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + pdhg_cusparse_view.A, + pdhg_cusparse_view.batch_reflected_primal_solutions, + reusable_device_scalar_value_0_.data(), + pdhg_cusparse_view.batch_dual_gradients, + (deterministic_batch_pdlp) ? 
CUSPARSE_SPMM_CSR_ALG3 : CUSPARSE_SPMM_CSR_ALG2, + &new_buf_size, + stream_view_)); + pdhg_cusparse_view.buffer_non_transpose_batch_row_row_.resize(new_buf_size, stream_view_); + + // Adaptive step size: A_T * batch_potential_next_dual_solution -> batch_next_AtYs + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + pdhg_cusparse_view.A_T, + pdhg_cusparse_view.batch_potential_next_dual_solution, + reusable_device_scalar_value_0_.data(), + pdhg_cusparse_view.batch_next_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + &new_buf_size, + stream_view_)); + pdhg_cusparse_view.buffer_transpose_batch.resize(new_buf_size, stream_view_); + + // Convergence info: A_T * batch_dual_solutions -> batch_tmp_primals + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + current_op_problem_evaluation_cusparse_view_.A_T, + current_op_problem_evaluation_cusparse_view_.batch_dual_solutions, + reusable_device_scalar_value_0_.data(), + current_op_problem_evaluation_cusparse_view_.batch_tmp_primals, + CUSPARSE_SPMM_CSR_ALG3, + &new_buf_size, + stream_view_)); + current_op_problem_evaluation_cusparse_view_.buffer_transpose_batch.resize(new_buf_size, + stream_view_); + + // Convergence info: A * batch_primal_solutions -> batch_tmp_duals + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + current_op_problem_evaluation_cusparse_view_.A, + current_op_problem_evaluation_cusparse_view_.batch_primal_solutions, + reusable_device_scalar_value_0_.data(), + current_op_problem_evaluation_cusparse_view_.batch_tmp_duals, + 
CUSPARSE_SPMM_CSR_ALG3, + &new_buf_size, + stream_view_)); + current_op_problem_evaluation_cusparse_view_.buffer_non_transpose_batch.resize(new_buf_size, + stream_view_); + } + // Rerun preprocess // PDHG SpMM preprocess diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index ced3844a9b..6bb2456c31 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -986,8 +986,16 @@ optimization_problem_solution_t run_batch_pdlp( warm_start_settings.detect_infeasibility = false; warm_start_settings.iteration_limit = iteration_limit; warm_start_settings.inside_mip = true; + #ifdef BATCH_VERBOSE_MODE + auto start_time = std::chrono::high_resolution_clock::now(); + #endif optimization_problem_solution_t original_solution = solve_lp(problem, warm_start_settings); + #ifdef BATCH_VERBOSE_MODE + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time).count(); + std::cout << "Original problem solved in " << duration << " milliseconds" << " and iterations: " << original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; + #endif if (pdlp_primal_dual_init) { initial_primal = rmm::device_uvector(original_solution.get_primal_solution(), original_solution.get_primal_solution().stream()); From 7642ded0d45ffade79097a54a77e0f5dec2e3b82 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 23 Mar 2026 18:36:11 +0100 Subject: [PATCH 30/43] general batch pdlp improvements and support work stealing in RB --- .../mip/solver_settings.hpp | 4 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 488 ++++++++++++------ cpp/src/branch_and_bound/pseudo_costs.hpp | 16 + .../dual_simplex/simplex_solver_settings.hpp | 2 +- cpp/src/math_optimization/solver_settings.cu | 2 +- cpp/src/pdlp/pdlp.cu | 2 +- cpp/src/pdlp/solve.cu | 91 ++-- .../linear_programming/data_definition.py | 6 + 8 files changed, 388 insertions(+), 223 deletions(-) diff --git 
a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 07a28a7748..62e88d5eb0 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -98,8 +98,8 @@ class mip_solver_settings_t { i_t reduced_cost_strengthening = -1; f_t cut_change_threshold = -1.0; f_t cut_min_orthogonality = 0.5; - i_t mip_batch_pdlp_strong_branching = 1; - i_t mip_batch_pdlp_reliability_branching = 1; + i_t mip_batch_pdlp_strong_branching{1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t mip_batch_pdlp_reliability_branching{1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only i_t num_gpus = 1; bool log_to_console = true; diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 503d958a83..204d28c386 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -177,7 +177,7 @@ void strong_branch_helper(i_t start, } template -f_t trial_branching(const lp_problem_t& original_lp, +std::pair trial_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, const std::vector& var_types, const std::vector& vstatus, @@ -244,12 +244,12 @@ f_t trial_branching(const lp_problem_t& original_lp, if (status == dual::status_t::DUAL_UNBOUNDED) { // LP was infeasible - return std::numeric_limits::infinity(); + return {std::numeric_limits::infinity(), dual::status_t::DUAL_UNBOUNDED}; } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF) { - return compute_objective(child_problem, solution.x); + return {compute_objective(child_problem, solution.x), status}; } else { - return std::numeric_limits::quiet_NaN(); + return {std::numeric_limits::quiet_NaN(), dual::status_t::NUMERICAL}; } } @@ -394,8 +394,8 @@ static std::pair 
merge_sb_result(f_t ds_val, if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { return {ds_val, 0}; } if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { return {pdlp_dual_obj, 1}; } - // Rule 4: Dual Simplex hit iteration limit -> keep DS - if (ds_status == dual::status_t::ITERATION_LIMIT) { return {ds_val, 0}; } + // Rule 4: Dual Simplex hit iteration limit or work limit or cutoff -> keep DS + if (ds_status == dual::status_t::ITERATION_LIMIT || ds_status == dual::status_t::WORK_LIMIT || ds_status == dual::status_t::CUTOFF) { return {ds_val, 0}; } // Rule 5: None converged -> NaN return {std::numeric_limits::quiet_NaN(), 2}; @@ -447,6 +447,8 @@ void strong_branching(const lp_problem_t& original_lp, f_t start_batch = tic(); std::vector original_root_soln_x; + if (concurrent_halt.load() == 1) { return; } + const auto mps_model = simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); @@ -462,6 +464,8 @@ void strong_branching(const lp_problem_t& original_lp, fraction_values.push_back(original_root_soln_x[j]); } + if (concurrent_halt.load() == 1) { return; } + const f_t batch_elapsed_time = toc(start_time); const f_t batch_remaining_time = std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); @@ -474,16 +478,80 @@ void strong_branching(const lp_problem_t& original_lp, } pdlp_settings.time_limit = batch_remaining_time; - const raft::handle_t batch_pdlp_handle; - constexpr bool dual_simplex_primal_dual = false; - if (dual_simplex_primal_dual) { + + if (!pc.pdlp_warm_cache.populated) { + pdlp_solver_settings_t ws_settings; + ws_settings.method = method_t::PDLP; + ws_settings.presolver = presolver_t::None; + ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + ws_settings.detect_infeasibility = false; + // Since the warm start will be used over and over again we want to maximize the chance of convergeance + // Batch PDLP is very compute intensive so we want to minimize the number of iterations 
+ constexpr int warm_start_iteration_limit = 500000; + ws_settings.iteration_limit = warm_start_iteration_limit; + constexpr f_t pdlp_tolerance = 1e-6; + ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; + ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + ws_settings.inside_mip = true; + if (settings.mip_batch_pdlp_strong_branching == 1) { + ws_settings.concurrent_halt = &concurrent_halt; + } + + #ifdef BATCH_VERBOSE_MODE + auto start_time = std::chrono::high_resolution_clock::now(); + #endif + + auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); + + #ifdef BATCH_VERBOSE_MODE + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time).count(); + std::cout << "Original problem solved in " << duration << " milliseconds" << " and iterations: " << original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; + #endif + + if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) { + auto& cache = pc.pdlp_warm_cache; + const auto& ws_primal = ws_solution.get_primal_solution(); + const auto& ws_dual = ws_solution.get_dual_solution(); + // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm start + cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); + cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); + cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; + cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; + cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + 
cache.populated = true; + + settings.log.printf("Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", + cache.initial_primal.size(), cache.initial_dual.size(), + cache.step_size, cache.primal_weight, cache.pdlp_iteration); + } else { + settings.log.printf("PDLP warm start solve did not reach optimality (%s), skipping cache and batch PDLP\n", + ws_solution.get_termination_status_string().c_str()); + return; + } + } + + if (concurrent_halt.load() == 1) { return; } + + if (pc.pdlp_warm_cache.populated) { + auto& cache = pc.pdlp_warm_cache; pdlp_settings.set_initial_primal_solution( - original_root_soln_x.data(), original_root_soln_x.size(), batch_pdlp_handle.get_stream()); + cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_dual_solution( - original_root_soln_y.data(), original_root_soln_y.size(), batch_pdlp_handle.get_stream()); + cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache.step_size); + pdlp_settings.set_initial_primal_weight(cache.primal_weight); + pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); } + + if (concurrent_halt.load() == 1) { return; } + const auto solutions = - batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); + batch_pdlp_solve(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Fail safe in case the batch PDLP failed and produced no solutions @@ -856,9 +924,25 @@ i_t pseudo_costs_t::reliable_variable_selection( return branch_var; } + const i_t rb_mode = settings.mip_batch_pdlp_reliability_branching; + // We don't use batch PDLP in reliability branching if the PDLP warm start data was not filled + // This indicates that PDLP alone (not batched) couldn't even run at the root node + // So it will 
most likely perform poorly compared to DS + // Also, if the number of candidate is very small we don't use batch PDLP + constexpr i_t min_num_candidates_for_pdlp = 5; + const bool use_pdlp = (rb_mode != 0) && (pdlp_warm_cache.populated) && unreliable_list.size() > min_num_candidates_for_pdlp; + + if (rb_mode != 0 && !pdlp_warm_cache.populated) { + log.printf("PDLP warm start data not populated, using DS only\n"); + } + if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { + log.printf("Not enough candidates to use batch PDLP, using DS only\n"); + } + const int num_tasks = std::max(max_num_tasks, 1); const int task_priority = reliability_branching_settings.task_priority; - const i_t max_num_candidates = reliability_branching_settings.max_num_candidates; + // If both batch PDLP and DS are used we double the max number of candidates + const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates : reliability_branching_settings.max_num_candidates; const i_t num_candidates = std::min(unreliable_list.size(), max_num_candidates); assert(task_priority > 0); @@ -877,33 +961,36 @@ i_t pseudo_costs_t::reliable_variable_selection( // Shuffle the unreliable list so every variable has the same chance to be selected. 
if (unreliable_list.size() > max_num_candidates) { worker->rng.shuffle(unreliable_list); } - // Variables beyond num_candidates are solved by batch PDLP instead of Dual Simplex - std::vector pdlp_overflow_list; - bool use_pdlp = settings.mip_batch_pdlp_reliability_branching == 1 && - static_cast(unreliable_list.size()) > num_candidates; - if (use_pdlp) { - pdlp_overflow_list.assign(unreliable_list.begin() + num_candidates, unreliable_list.end()); - } + // Both DS and PDLP work on the same candidate set + std::vector candidate_vars(unreliable_list.begin(), + unreliable_list.begin() + num_candidates); + + // Shared context for cooperative work-stealing (mode 1) + // [0..num_candidates) = down, [num_candidates..2*num_candidates) = up + shared_strong_branching_context_t shared_ctx(2 * num_candidates); + shared_strong_branching_context_view_t sb_view(std::span(shared_ctx.solved)); - const i_t num_pdlp_vars = pdlp_overflow_list.size(); - std::vector pdlp_obj_down(num_pdlp_vars, std::numeric_limits::quiet_NaN()); - std::vector pdlp_obj_up(num_pdlp_vars, std::numeric_limits::quiet_NaN()); + std::vector pdlp_obj_down(num_candidates, std::numeric_limits::quiet_NaN()); + std::vector pdlp_obj_up(num_candidates, std::numeric_limits::quiet_NaN()); - // DS can halt PDLP via concurrent_halt, but not the other way around std::atomic concurrent_halt{0}; std::thread pdlp_thread; if (use_pdlp) { pdlp_thread = std::thread([&]() { - log.printf("RB batch PDLP: solving %d overflow unreliable variables\n", num_pdlp_vars); + log.printf(rb_mode == 2 + ? 
"RB batch PDLP only for %d candidates\n" + : "RB cooperative batch PDLP and DS for %d candidates\n", + num_candidates); f_t start_batch = tic(); std::vector original_soln_x; - // Convert the original_lp that has cuts to a problem that is better for PDLP + + if (concurrent_halt.load() == 1) { return; } + auto mps_model = simplex_problem_to_mps_data_model( original_lp, new_slacks, solution, original_soln_x); - // Apply the bounds of the current leaf problem { const i_t n_orig = original_lp.num_cols - new_slacks.size(); for (i_t j = 0; j < n_orig; j++) { @@ -913,59 +1000,74 @@ i_t pseudo_costs_t::reliable_variable_selection( } std::vector fraction_values; - fraction_values.reserve(num_pdlp_vars); - for (i_t j : pdlp_overflow_list) { + fraction_values.reserve(num_candidates); + for (i_t j : candidate_vars) { fraction_values.push_back(original_soln_x[j]); } - const f_t batch_elapsed_time = toc(start_time); + if (concurrent_halt.load() == 1) { return; } + + const f_t batch_elapsed_time = toc(start_time); const f_t batch_remaining_time = std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (batch_remaining_time <= 0.0) { return; } pdlp_solver_settings_t pdlp_settings; - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.time_limit = batch_remaining_time; + if (rb_mode == 1) { + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_view = sb_view; + } + pdlp_settings.time_limit = batch_remaining_time; + + + if (pdlp_warm_cache.populated) { + auto& cache = pdlp_warm_cache; + pdlp_settings.set_initial_primal_solution( + cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution( + cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache.step_size); + pdlp_settings.set_initial_primal_weight(cache.primal_weight); + 
pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + } + + if (concurrent_halt.load() == 1) { return; } - const raft::handle_t batch_pdlp_handle; const auto solutions = batch_pdlp_solve( - &batch_pdlp_handle, mps_model, pdlp_overflow_list, fraction_values, pdlp_settings); + &pdlp_warm_cache.batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); f_t batch_pdlp_time = toc(start_batch); if (solutions.get_additional_termination_informations().size() != - static_cast(num_pdlp_vars) * 2) { + static_cast(num_candidates) * 2) { log.printf("RB batch PDLP failed and produced no solutions\n"); return; } i_t amount_done = 0; - for (i_t k = 0; k < num_pdlp_vars * 2; k++) { + for (i_t k = 0; k < num_candidates * 2; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { amount_done++; } } - log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d in %.2fs\n", - batch_pdlp_time, - amount_done, - num_pdlp_vars * 2, - toc(start_batch)); + log.printf("RB batch PDLP completed in %.2fs. 
Solved %d/%d\n", + batch_pdlp_time, amount_done, num_candidates * 2); - for (i_t k = 0; k < num_pdlp_vars; k++) { + for (i_t k = 0; k < num_candidates; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { pdlp_obj_down[k] = solutions.get_dual_objective_value(k); } - if (solutions.get_termination_status(k + num_pdlp_vars) == + if (solutions.get_termination_status(k + num_candidates) == pdlp_termination_status_t::Optimal) { - pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_pdlp_vars); + pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_candidates); } } }); } if (toc(start_time) > settings.time_limit) { - log.printf("Time limit reached"); + log.printf("Time limit reached\n"); if (use_pdlp) { concurrent_halt.store(1); pdlp_thread.join(); @@ -973,165 +1075,211 @@ i_t pseudo_costs_t::reliable_variable_selection( return branch_var; } + std::vector ds_obj_down(num_candidates, std::numeric_limits::quiet_NaN()); + std::vector ds_obj_up(num_candidates, std::numeric_limits::quiet_NaN()); + std::vector ds_status_down(num_candidates, dual::status_t::UNSET); + std::vector ds_status_up(num_candidates, dual::status_t::UNSET); + omp_atomic_t ds_optimal{0}; omp_atomic_t ds_infeasible{0}; omp_atomic_t ds_failed{0}; + omp_atomic_t ds_skipped{0}; f_t ds_start_time = tic(); + if (rb_mode != 2) { #pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ - shared(score_mutex, ds_optimal, ds_infeasible, ds_failed) - for (i_t i = 0; i < num_candidates; ++i) { - const i_t j = unreliable_list[i]; - - if (toc(start_time) > settings.time_limit) { continue; } - - pseudo_cost_mutex_down[j].lock(); - if (pseudo_cost_num_down[j] < reliable_threshold) { - // Do trial branching on the down branch - f_t obj = trial_branching(worker->leaf_problem, - settings, - var_types, - node_ptr->vstatus, - worker->leaf_edge_norms, - worker->basis_factors, - worker->basic_list, - worker->nonbasic_list, - j, - 
worker->leaf_problem.lower[j], - std::floor(solution[j]), - upper_bound, - branch_and_bound_lp_iter_per_node, - start_time, - reliability_branching_settings.upper_max_lp_iter, - reliability_branching_settings.lower_max_lp_iter, - strong_branching_lp_iter); - - if (std::isnan(obj)) { - ds_failed++; - } else if (std::isinf(obj)) { - ds_infeasible++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = solution[j] - std::floor(solution[j]); - pseudo_cost_sum_down[j] += change_in_obj / change_in_x; - pseudo_cost_num_down[j]++; + shared(score_mutex, ds_optimal, ds_infeasible, ds_failed, ds_skipped, ds_obj_down, ds_obj_up, ds_status_down, ds_status_up, sb_view) + for (i_t i = 0; i < num_candidates; ++i) { + const i_t j = unreliable_list[i]; + + if (toc(start_time) > settings.time_limit) { continue; } + + if (rb_mode == 1 && sb_view.is_solved(i)) { + ds_skipped++; } else { - ds_optimal++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = solution[j] - std::floor(solution[j]); - pseudo_cost_sum_down[j] += change_in_obj / change_in_x; - pseudo_cost_num_down[j]++; + pseudo_cost_mutex_down[j].lock(); + if (pseudo_cost_num_down[j] < reliable_threshold) { + // Do trial branching on the down branch + const auto [obj, status] = trial_branching(worker->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker->leaf_edge_norms, + worker->basis_factors, + worker->basic_list, + worker->nonbasic_list, + j, + worker->leaf_problem.lower[j], + std::floor(solution[j]), + upper_bound, + branch_and_bound_lp_iter_per_node, + start_time, + reliability_branching_settings.upper_max_lp_iter, + reliability_branching_settings.lower_max_lp_iter, + strong_branching_lp_iter); + + ds_obj_down[i] = obj; + ds_status_down[i] = status; + if (std::isnan(obj)) { + ds_failed++; + } else if (std::isinf(obj)) { + ds_infeasible++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - 
std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + } else { + ds_optimal++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + } + if (rb_mode == 1) { sb_view.mark_solved(i); } + } + pseudo_cost_mutex_down[j].unlock(); } - } - pseudo_cost_mutex_down[j].unlock(); - - if (toc(start_time) > settings.time_limit) { continue; } - - pseudo_cost_mutex_up[j].lock(); - if (pseudo_cost_num_up[j] < reliable_threshold) { - f_t obj = trial_branching(worker->leaf_problem, - settings, - var_types, - node_ptr->vstatus, - worker->leaf_edge_norms, - worker->basis_factors, - worker->basic_list, - worker->nonbasic_list, - j, - std::ceil(solution[j]), - worker->leaf_problem.upper[j], - upper_bound, - branch_and_bound_lp_iter_per_node, - start_time, - reliability_branching_settings.upper_max_lp_iter, - reliability_branching_settings.lower_max_lp_iter, - strong_branching_lp_iter); - - if (std::isnan(obj)) { - ds_failed++; - } else if (std::isinf(obj)) { - // Is it ok to process infinity obj like this? 
- ds_infeasible++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(solution[j]) - solution[j]; - pseudo_cost_sum_up[j] += change_in_obj / change_in_x; - pseudo_cost_num_up[j]++; + + if (toc(start_time) > settings.time_limit) { continue; } + + const i_t shared_idx = i + num_candidates; + if (rb_mode == 1 && sb_view.is_solved(shared_idx)) { + ds_skipped++; } else { - ds_optimal++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(solution[j]) - solution[j]; - pseudo_cost_sum_up[j] += change_in_obj / change_in_x; - pseudo_cost_num_up[j]++; + pseudo_cost_mutex_up[j].lock(); + if (pseudo_cost_num_up[j] < reliable_threshold) { + const auto [obj, status] = trial_branching(worker->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker->leaf_edge_norms, + worker->basis_factors, + worker->basic_list, + worker->nonbasic_list, + j, + std::ceil(solution[j]), + worker->leaf_problem.upper[j], + upper_bound, + branch_and_bound_lp_iter_per_node, + start_time, + reliability_branching_settings.upper_max_lp_iter, + reliability_branching_settings.lower_max_lp_iter, + strong_branching_lp_iter); + + ds_obj_up[i] = obj; + ds_status_up[i] = status; + if (std::isnan(obj)) { + ds_failed++; + } else if (std::isinf(obj)) { + ds_infeasible++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + } else { + ds_optimal++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + } + if (rb_mode == 1) { sb_view.mark_solved(shared_idx); } + } + pseudo_cost_mutex_up[j].unlock(); } - } - pseudo_cost_mutex_up[j].unlock(); - if (toc(start_time) > settings.time_limit) { continue; } + if 
(toc(start_time) > settings.time_limit) { continue; } - f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); - score_mutex.lock(); - if (score > max_score) { - max_score = score; - branch_var = j; + score_mutex.lock(); + if (score > max_score) { + max_score = score; + branch_var = j; + } + score_mutex.unlock(); } - score_mutex.unlock(); + + concurrent_halt.store(1); } f_t ds_elapsed = toc(ds_start_time); - log.printf( - "RB Dual Simplex: %d candidates, %d/%d optimal/dual-feasible, %d/%d infeasible, " - "%d/%d failed in %.2fs\n", - num_candidates, - ds_optimal.load(), - num_candidates * 2, - ds_infeasible.load(), - num_candidates * 2, - ds_failed.load(), - num_candidates * 2, - ds_elapsed); + + if (rb_mode != 2) { + if (rb_mode == 1) { + log.printf( + "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped (PDLP) in %.2fs\n", + num_candidates, + ds_optimal.load(), num_candidates * 2, + ds_infeasible.load(), num_candidates * 2, + ds_failed.load(), num_candidates * 2, + ds_skipped.load(), ds_elapsed); + } else { + log.printf( + "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in %.2fs\n", + num_candidates, + ds_optimal.load(), num_candidates * 2, + ds_infeasible.load(), num_candidates * 2, + ds_failed.load(), num_candidates * 2, + ds_elapsed); + } + } if (use_pdlp) { - // Dual Simplex is done on the main thread, telling Batch PDLP to stop - concurrent_halt.store(1); pdlp_thread.join(); - i_t pdlp_optimal = 0; - for (i_t k = 0; k < num_pdlp_vars; k++) { - const i_t j = pdlp_overflow_list[k]; + i_t pdlp_applied = 0; + i_t pdlp_optimal = 0; + for (i_t i = 0; i < num_candidates; i++) { + const i_t j = candidate_vars[i]; - pseudo_cost_mutex_down[j].lock(); - if (!std::isnan(pdlp_obj_down[k])) { - f_t change_in_obj = std::max(pdlp_obj_down[k] - node_ptr->lower_bound, eps); 
- f_t change_in_x = solution[j] - std::floor(solution[j]); - pseudo_cost_sum_down[j] += change_in_obj / change_in_x; - pseudo_cost_num_down[j]++; + // Down: check if PDLP should override DS + if (!std::isnan(pdlp_obj_down[i])) { pdlp_optimal++; + const auto [merged_obj, source] = + merge_sb_result(ds_obj_down[i], ds_status_down[i], pdlp_obj_down[i], true); + // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent calls may have made it reliable) + if (source == 1) { + pseudo_cost_mutex_down[j].lock(); + if (pseudo_cost_num_down[j] < reliable_threshold) { + f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + pdlp_applied++; + } + pseudo_cost_mutex_down[j].unlock(); + } } - pseudo_cost_mutex_down[j].unlock(); - pseudo_cost_mutex_up[j].lock(); - if (!std::isnan(pdlp_obj_up[k])) { - f_t change_in_obj = std::max(pdlp_obj_up[k] - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(solution[j]) - solution[j]; - pseudo_cost_sum_up[j] += change_in_obj / change_in_x; - pseudo_cost_num_up[j]++; + // Up: check if PDLP should override DS + if (!std::isnan(pdlp_obj_up[i])) { pdlp_optimal++; + const auto [merged_obj, source] = + merge_sb_result(ds_obj_up[i], ds_status_up[i], pdlp_obj_up[i], true); + // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent calls may have made it reliable) + if (source == 1) { + pseudo_cost_mutex_up[j].lock(); + if (pseudo_cost_num_up[j] < reliable_threshold) { + f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + pdlp_applied++; + } + pseudo_cost_mutex_up[j].unlock(); + } } - pseudo_cost_mutex_up[j].unlock(); - f_t score = - 
calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); if (score > max_score) { max_score = score; branch_var = j; } } - log.printf( - "RB batch PDLP: %d candidates, %d/%d optimal\n", - num_pdlp_vars, - pdlp_optimal, - num_pdlp_vars * 2); + log.printf("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n", + num_candidates, pdlp_optimal, num_candidates * 2, pdlp_applied); } log.printf( diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 75cf660621..c48ed908d7 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -20,7 +20,10 @@ #include #include +#include + #include +#include namespace cuopt::linear_programming::dual_simplex { @@ -405,6 +408,17 @@ struct reliability_branching_settings_t { i_t min_reliable_threshold = 1; }; +template +struct batch_pdlp_warm_cache_t { + const raft::handle_t batch_pdlp_handle{}; + rmm::device_uvector initial_primal{0, batch_pdlp_handle.get_stream()}; + rmm::device_uvector initial_dual{0, batch_pdlp_handle.get_stream()}; + f_t step_size{std::numeric_limits::signaling_NaN()}; + f_t primal_weight{std::numeric_limits::signaling_NaN()}; + i_t pdlp_iteration{-1}; + bool populated{false}; +}; + template class pseudo_costs_t { public: @@ -516,6 +530,8 @@ class pseudo_costs_t { std::vector pseudo_cost_mutex_down; omp_atomic_t num_strong_branches_completed = 0; omp_atomic_t strong_branching_lp_iter = 0; + + batch_pdlp_warm_cache_t pdlp_warm_cache; }; template diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 29c7d7a80f..c097baf561 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -188,7 +188,7 @@ struct simplex_solver_settings_t { f_t cut_change_threshold; // 
threshold for cut change f_t cut_min_orthogonality; // minimum orthogonality for cuts i_t mip_batch_pdlp_strong_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only - i_t mip_batch_pdlp_reliability_branching{0}; // 0 if not using batch PDLP for reliability branching, 1 if + i_t mip_batch_pdlp_reliability_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index cc2f09d58d..749d89a35c 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -100,7 +100,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, - {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 1, 0}, + {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index bd53b1d93b..37e9e1a31f 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -770,7 +770,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) if (current_termination_strategy_.is_done(term)) { std::cout << "[BATCH MODE]: Climber " << i << " is done with " << 
optimization_problem_solution_t::get_termination_status_string(term) - << " at step " << total_pdlp_iterations_ << ". It's original index is " + << " at step " << internal_solver_iterations_ << ". It's original index is " << climber_strategies_[i].original_index << std::endl; } } diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 6bb2456c31..b9cdb8c9c6 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -904,11 +904,12 @@ optimization_problem_solution_t run_batch_pdlp( optimization_problem_t& problem, pdlp_solver_settings_t const& settings) { // Hyper parameter than can be changed, I have put what I believe to be the best - bool pdlp_primal_dual_init = true; - bool primal_weight_init = true; - bool use_initial_pdlp_iterations = true; + constexpr bool pdlp_primal_dual_init = true; + constexpr bool primal_weight_init = true; + constexpr bool use_initial_pdlp_iterations = true; bool use_optimal_batch_size = false; - constexpr int iteration_limit = 100000; + constexpr int batch_iteration_limit = 100000; + constexpr f_t pdlp_tolerance = 1e-6; rmm::cuda_stream_view stream = problem.get_handle_ptr()->get_stream(); @@ -967,47 +968,31 @@ optimization_problem_solution_t run_batch_pdlp( } cuopt_assert(optimal_batch_size != 0 && optimal_batch_size <= max_batch_size, "Optimal batch size should be between 1 and max batch size"); - using f_t2 = typename type_2::type; - - // In case Dual Simplex already provided the initial primal and dual solution - if (settings.has_initial_primal_solution() && settings.has_initial_dual_solution()) { - initial_primal = rmm::device_uvector( - settings.get_initial_primal_solution(), settings.get_initial_primal_solution().stream()); - initial_dual = rmm::device_uvector( - settings.get_initial_dual_solution(), settings.get_initial_dual_solution().stream()); - } - if (pdlp_primal_dual_init || primal_weight_init) { - pdlp_solver_settings_t warm_start_settings = settings; - warm_start_settings.new_bounds.clear(); - 
warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; - warm_start_settings.presolver = cuopt::linear_programming::presolver_t::None; - warm_start_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - warm_start_settings.detect_infeasibility = false; - warm_start_settings.iteration_limit = iteration_limit; - warm_start_settings.inside_mip = true; - #ifdef BATCH_VERBOSE_MODE - auto start_time = std::chrono::high_resolution_clock::now(); - #endif - optimization_problem_solution_t original_solution = - solve_lp(problem, warm_start_settings); + const bool warm_start_from_settings = + settings.has_initial_primal_solution() || settings.has_initial_dual_solution() || + settings.get_initial_step_size().has_value() || + settings.get_initial_primal_weight().has_value() || + settings.get_initial_pdlp_iteration().has_value(); + + if (warm_start_from_settings) { #ifdef BATCH_VERBOSE_MODE - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time).count(); - std::cout << "Original problem solved in " << duration << " milliseconds" << " and iterations: " << original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; + std::cout << "Using warm start from settings" << std::endl; #endif - if (pdlp_primal_dual_init) { - initial_primal = rmm::device_uvector(original_solution.get_primal_solution(), - original_solution.get_primal_solution().stream()); - initial_dual = rmm::device_uvector(original_solution.get_dual_solution(), - original_solution.get_dual_solution().stream()); - initial_step_size = original_solution.get_pdlp_warm_start_data().initial_step_size_; + if (settings.has_initial_primal_solution() && pdlp_primal_dual_init) { + initial_primal = rmm::device_uvector(settings.get_initial_primal_solution(), settings.get_initial_primal_solution().stream()); + } + if (settings.has_initial_dual_solution() && pdlp_primal_dual_init) { + initial_dual = 
rmm::device_uvector(settings.get_initial_dual_solution(), settings.get_initial_dual_solution().stream()); + } + if (settings.get_initial_step_size().has_value() && pdlp_primal_dual_init) { + initial_step_size = *settings.get_initial_step_size(); } - if (primal_weight_init) { - initial_primal_weight = original_solution.get_pdlp_warm_start_data().initial_primal_weight_; + if (settings.get_initial_primal_weight().has_value() && primal_weight_init) { + initial_primal_weight = *settings.get_initial_primal_weight(); } - if (use_initial_pdlp_iterations) { - initial_pdlp_iteration = original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + if (settings.get_initial_pdlp_iteration().has_value() && use_initial_pdlp_iterations) { + initial_pdlp_iteration = *settings.get_initial_pdlp_iteration(); } } @@ -1029,21 +1014,31 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.presolver = presolver_t::None; batch_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; batch_settings.detect_infeasibility = false; - batch_settings.iteration_limit = iteration_limit; + batch_settings.iteration_limit = batch_iteration_limit; batch_settings.inside_mip = true; + batch_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + batch_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; + batch_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; if (initial_primal.size() > 0) { batch_settings.set_initial_primal_solution( initial_primal.data(), initial_primal.size(), initial_primal.stream()); + } + if (initial_dual.size() > 0) { batch_settings.set_initial_dual_solution( initial_dual.data(), initial_dual.size(), initial_dual.stream()); - if (!std::isnan(initial_step_size)) { - batch_settings.set_initial_step_size(initial_step_size); - 
} - if (use_initial_pdlp_iterations) { - batch_settings.set_initial_pdlp_iteration(initial_pdlp_iteration); - } } - if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } + if (!std::isnan(initial_step_size)) { + batch_settings.set_initial_step_size(initial_step_size); + } + if (initial_pdlp_iteration != -1) { + batch_settings.set_initial_pdlp_iteration(initial_pdlp_iteration); + } + if (!std::isnan(initial_primal_weight)) { + batch_settings.set_initial_primal_weight(initial_primal_weight); + } for (size_t i = 0; i < max_batch_size; i += optimal_batch_size) { const size_t current_batch_size = std::min(optimal_batch_size, max_batch_size - i); diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 9ea5cf4e1b..32cf860f28 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -456,6 +456,12 @@ class SolverConfig(BaseModel): "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", ) + mip_batch_pdlp_reliability_branching: Optional[int] = Field( + default=0, + description="Reliability branching mode: 0 = Dual Simplex only, " + "1 = cooperative work-stealing (DS + batch PDLP), " + "2 = batch PDLP only.", + ) num_cpu_threads: Optional[int] = Field( default=None, description="Set the number of CPU threads to use for branch and bound.", # noqa From 84dab81fb8c98631b8e5cb8efabefd2f73d3b3e5 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 24 Mar 2026 14:09:56 +0100 Subject: [PATCH 31/43] turn off logs --- cpp/src/branch_and_bound/branch_and_bound.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index eccadfbb74..b940134fbb 100644 --- 
a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -827,8 +827,7 @@ branch_variable_t branch_and_bound_t::variable_selection( branch_and_bound_worker_t* worker) { logger_t log; - // TODO put back false - log.log = true; + log.log = false; i_t branch_var = -1; rounding_direction_t round_dir = rounding_direction_t::NONE; std::vector current_incumbent; From a97cca7ad29b0df0c7603bac55fe44f58f3428f4 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 24 Mar 2026 18:07:15 +0100 Subject: [PATCH 32/43] few improvements to BPDLP --- .../pdlp/solver_settings.hpp | 1 + cpp/src/branch_and_bound/pseudo_costs.cpp | 128 +++++++++--------- cpp/src/pdlp/solve.cu | 32 ++--- 3 files changed, 80 insertions(+), 81 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index f3521edc54..17fa7c548f 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -288,6 +288,7 @@ class pdlp_solver_settings_t { // We only retrieve termination statistics and the objective values bool generate_batch_primal_dual_solution{false}; // Used to force batch PDLP to solve a subbatch of the problems at a time + // The 0 default value will make the solver use its heuristic to determine the subbatch size i_t sub_batch_size{0}; private: diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 204d28c386..c9f96b3666 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -25,6 +25,11 @@ namespace cuopt::linear_programming::dual_simplex { namespace { +static bool ds_is_valid_done(dual::status_t status) +{ + return status == dual::status_t::DUAL_UNBOUNDED || status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF; +} + 
template void strong_branch_helper(i_t start, i_t end, @@ -100,7 +105,7 @@ void strong_branch_helper(i_t start, if (status == dual::status_t::DUAL_UNBOUNDED) { // LP was infeasible obj = std::numeric_limits::infinity(); - } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT) { + } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF) { obj = compute_objective(child_problem, solution.x); } else { settings.log.debug("Thread id %2d remaining %d variable %d branch %d status %d\n", @@ -144,10 +149,14 @@ void strong_branch_helper(i_t start, } // Mark the subproblem as solved so that batch PDLP removes it from the batch if (sb_view.is_valid()) { - sb_view.mark_solved(shared_idx); - settings.log.printf( - "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in shared context\n", - thread_id, j, branch == 0 ? "down" : "up", shared_idx); + // We could not mark as solved nodes hitting iteartion limit in DS + if ((branch == 0 && ds_is_valid_done(ds_status_down[k])) || (branch == 1 && ds_is_valid_done(ds_status_up[k]))) + { + sb_view.mark_solved(shared_idx); + settings.log.printf( + "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in shared context\n", + thread_id, j, branch == 0 ? 
"down" : "up", shared_idx); + } } if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; @@ -422,6 +431,9 @@ void strong_branching(const lp_problem_t& original_lp, pc.strong_branch_up.assign(fractional.size(), 0); pc.num_strong_branches_completed = 0; + const f_t elapsed_time = toc(start_time); + if (elapsed_time > settings.time_limit) { return; } + settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", settings.num_threads, fractional.size()); @@ -466,18 +478,10 @@ void strong_branching(const lp_problem_t& original_lp, if (concurrent_halt.load() == 1) { return; } - const f_t batch_elapsed_time = toc(start_time); - const f_t batch_remaining_time = + f_t batch_elapsed_time = toc(start_time); + const f_t warm_start_remaining_time = std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); - if (batch_remaining_time <= 0.0) { return; } - - pdlp_solver_settings_t pdlp_settings; - if (settings.mip_batch_pdlp_strong_branching == 1) { - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.shared_sb_view = sb_view; - } - - pdlp_settings.time_limit = batch_remaining_time; + if (warm_start_remaining_time <= 0.0) { return; } if (!pc.pdlp_warm_cache.populated) { pdlp_solver_settings_t ws_settings; @@ -489,6 +493,7 @@ void strong_branching(const lp_problem_t& original_lp, // Batch PDLP is very compute intensive so we want to minimize the number of iterations constexpr int warm_start_iteration_limit = 500000; ws_settings.iteration_limit = warm_start_iteration_limit; + ws_settings.time_limit = warm_start_remaining_time; constexpr f_t pdlp_tolerance = 1e-6; ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; @@ -537,6 +542,18 @@ void strong_branching(const lp_problem_t& original_lp, if (concurrent_halt.load() == 1) { return; } + pdlp_solver_settings_t pdlp_settings; + if (settings.mip_batch_pdlp_strong_branching == 1) { + 
pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_view = sb_view; + } + + batch_elapsed_time = toc(start_time); + const f_t batch_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (batch_remaining_time <= 0.0) { return; } + pdlp_settings.time_limit = batch_remaining_time; + if (pc.pdlp_warm_cache.populated) { auto& cache = pc.pdlp_warm_cache; pdlp_settings.set_initial_primal_solution( @@ -928,7 +945,7 @@ i_t pseudo_costs_t::reliable_variable_selection( // We don't use batch PDLP in reliability branching if the PDLP warm start data was not filled // This indicates that PDLP alone (not batched) couldn't even run at the root node // So it will most likely perform poorly compared to DS - // Also, if the number of candidate is very small we don't use batch PDLP + // It is also off if the number of candidate is very small constexpr i_t min_num_candidates_for_pdlp = 5; const bool use_pdlp = (rb_mode != 0) && (pdlp_warm_cache.populated) && unreliable_list.size() > min_num_candidates_for_pdlp; @@ -1080,22 +1097,18 @@ i_t pseudo_costs_t::reliable_variable_selection( std::vector ds_status_down(num_candidates, dual::status_t::UNSET); std::vector ds_status_up(num_candidates, dual::status_t::UNSET); - omp_atomic_t ds_optimal{0}; - omp_atomic_t ds_infeasible{0}; - omp_atomic_t ds_failed{0}; - omp_atomic_t ds_skipped{0}; f_t ds_start_time = tic(); if (rb_mode != 2) { #pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ - shared(score_mutex, ds_optimal, ds_infeasible, ds_failed, ds_skipped, ds_obj_down, ds_obj_up, ds_status_down, ds_status_up, sb_view) + shared(score_mutex, sb_view) for (i_t i = 0; i < num_candidates; ++i) { const i_t j = unreliable_list[i]; if (toc(start_time) > settings.time_limit) { continue; } if (rb_mode == 1 && sb_view.is_solved(i)) { - ds_skipped++; + log.printf("DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i); } else 
{ pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { @@ -1120,22 +1133,14 @@ i_t pseudo_costs_t::reliable_variable_selection( ds_obj_down[i] = obj; ds_status_down[i] = status; - if (std::isnan(obj)) { - ds_failed++; - } else if (std::isinf(obj)) { - ds_infeasible++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = solution[j] - std::floor(solution[j]); - pseudo_cost_sum_down[j] += change_in_obj / change_in_x; - pseudo_cost_num_down[j]++; - } else { - ds_optimal++; + if (!std::isnan(obj)) { f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); f_t change_in_x = solution[j] - std::floor(solution[j]); pseudo_cost_sum_down[j] += change_in_obj / change_in_x; pseudo_cost_num_down[j]++; + // Should be valid if were are already here + if (rb_mode == 1 && ds_is_valid_done(status)) { sb_view.mark_solved(i); } } - if (rb_mode == 1) { sb_view.mark_solved(i); } } pseudo_cost_mutex_down[j].unlock(); } @@ -1144,7 +1149,7 @@ i_t pseudo_costs_t::reliable_variable_selection( const i_t shared_idx = i + num_candidates; if (rb_mode == 1 && sb_view.is_solved(shared_idx)) { - ds_skipped++; + log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", j, shared_idx); } else { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { @@ -1168,22 +1173,14 @@ i_t pseudo_costs_t::reliable_variable_selection( ds_obj_up[i] = obj; ds_status_up[i] = status; - if (std::isnan(obj)) { - ds_failed++; - } else if (std::isinf(obj)) { - ds_infeasible++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(solution[j]) - solution[j]; - pseudo_cost_sum_up[j] += change_in_obj / change_in_x; - pseudo_cost_num_up[j]++; - } else { - ds_optimal++; + if (!std::isnan(obj)) { f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); f_t change_in_x = std::ceil(solution[j]) - solution[j]; pseudo_cost_sum_up[j] += change_in_obj / 
change_in_x; pseudo_cost_num_up[j]++; + // Should be valid if were are already here + if (rb_mode == 1 && ds_is_valid_done(status)) { sb_view.mark_solved(shared_idx); } } - if (rb_mode == 1) { sb_view.mark_solved(shared_idx); } } pseudo_cost_mutex_up[j].unlock(); } @@ -1205,25 +1202,26 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t ds_elapsed = toc(ds_start_time); - if (rb_mode != 2) { - if (rb_mode == 1) { - log.printf( - "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped (PDLP) in %.2fs\n", - num_candidates, - ds_optimal.load(), num_candidates * 2, - ds_infeasible.load(), num_candidates * 2, - ds_failed.load(), num_candidates * 2, - ds_skipped.load(), ds_elapsed); - } else { - log.printf( - "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in %.2fs\n", - num_candidates, - ds_optimal.load(), num_candidates * 2, - ds_infeasible.load(), num_candidates * 2, - ds_failed.load(), num_candidates * 2, - ds_elapsed); - } - } + // TODO put back + //if (rb_mode != 2) { + // if (rb_mode == 1) { + // log.printf( + // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped (PDLP) in %.2fs\n", + // num_candidates, + // ds_optimal.load(), num_candidates * 2, + // ds_infeasible.load(), num_candidates * 2, + // ds_failed.load(), num_candidates * 2, + // ds_skipped.load(), ds_elapsed); + // } else { + // log.printf( + // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in %.2fs\n", + // num_candidates, + // ds_optimal.load(), num_candidates * 2, + // ds_infeasible.load(), num_candidates * 2, + // ds_failed.load(), num_candidates * 2, + // ds_elapsed); + // } + //} if (use_pdlp) { pdlp_thread.join(); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index b9cdb8c9c6..a27ecd965c 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -996,7 +996,7 @@ optimization_problem_solution_t run_batch_pdlp( } } - + // Only used in 
tests const bool collect_solutions = settings.generate_batch_primal_dual_solution; rmm::device_uvector full_primal_solution((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); @@ -1053,26 +1053,26 @@ optimization_problem_solution_t run_batch_pdlp( auto sol = solve_lp(problem, batch_settings); + + if (collect_solutions) { + raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), + sol.get_primal_solution().data(), + sol.get_primal_solution().size(), + stream); + raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), + sol.get_dual_solution().data(), + sol.get_dual_solution().size(), + stream); + raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), + sol.get_reduced_cost().data(), + sol.get_reduced_cost().size(), + stream); + } auto info = sol.get_additional_termination_informations(); full_info.insert(full_info.end(), info.begin(), info.end()); auto status = sol.get_terminations_status(); full_status.insert(full_status.end(), status.begin(), status.end()); - - if (collect_solutions) { - raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), - sol.get_primal_solution().data(), - sol.get_primal_solution().size(), - stream); - raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), - sol.get_dual_solution().data(), - sol.get_dual_solution().size(), - stream); - raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), - sol.get_reduced_cost().data(), - sol.get_reduced_cost().size(), - stream); - } } return optimization_problem_solution_t(full_primal_solution, From 697908624d49fa51f1b41cb616f8a887c8d638ef Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 25 Mar 2026 15:05:31 +0100 Subject: [PATCH 33/43] reduce accuracy to 1e-5, no BPDLP if in sub mip, disable BPDLP in RB if root BPDLP couldn't solve more than 5% --- cpp/src/branch_and_bound/pseudo_costs.cpp | 35 ++++++++++++++++------- cpp/src/branch_and_bound/pseudo_costs.hpp | 1 + 
cpp/src/pdlp/solve.cu | 2 +- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index c9f96b3666..a9b2177a29 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -434,6 +434,8 @@ void strong_branching(const lp_problem_t& original_lp, const f_t elapsed_time = toc(start_time); if (elapsed_time > settings.time_limit) { return; } + const i_t effective_batch_pdlp = settings.sub_mip ? 0 : settings.mip_batch_pdlp_strong_branching; + settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", settings.num_threads, fractional.size()); @@ -449,10 +451,10 @@ void strong_branching(const lp_problem_t& original_lp, auto pdlp_thread = std::thread([&]() { - if (settings.mip_batch_pdlp_strong_branching == 0) + if (effective_batch_pdlp == 0) return; - settings.log.printf(settings.mip_batch_pdlp_strong_branching == 2 + settings.log.printf(effective_batch_pdlp == 2 ? 
"Batch PDLP only for strong branching\n" : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); @@ -494,7 +496,7 @@ void strong_branching(const lp_problem_t& original_lp, constexpr int warm_start_iteration_limit = 500000; ws_settings.iteration_limit = warm_start_iteration_limit; ws_settings.time_limit = warm_start_remaining_time; - constexpr f_t pdlp_tolerance = 1e-6; + constexpr f_t pdlp_tolerance = 1e-5; ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; @@ -502,7 +504,7 @@ void strong_branching(const lp_problem_t& original_lp, ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; ws_settings.inside_mip = true; - if (settings.mip_batch_pdlp_strong_branching == 1) { + if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } @@ -543,7 +545,7 @@ void strong_branching(const lp_problem_t& original_lp, if (concurrent_halt.load() == 1) { return; } pdlp_solver_settings_t pdlp_settings; - if (settings.mip_batch_pdlp_strong_branching == 1) { + if (effective_batch_pdlp == 1) { pdlp_settings.concurrent_halt = &concurrent_halt; pdlp_settings.shared_sb_view = sb_view; } @@ -631,7 +633,7 @@ void strong_branching(const lp_problem_t& original_lp, std::vector ds_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); f_t dual_simplex_strong_branching_time = tic(); - if (settings.mip_batch_pdlp_strong_branching != 2) { + if (effective_batch_pdlp != 2) { #pragma omp parallel num_threads(settings.num_threads) { i_t n = std::min(4 * settings.num_threads, fractional.size()); @@ -714,7 +716,7 @@ void strong_branching(const lp_problem_t& original_lp, if (ds_unset) settings.log.printf(", %d unset/skipped", ds_unset); settings.log.printf("\n"); - if (settings.mip_batch_pdlp_strong_branching != 0) { + if (effective_batch_pdlp != 
0) { i_t pdlp_optimal_count = 0; for (i_t k = 0; k < fractional.size(); k++) { if (!std::isnan(pdlp_obj_down[k])) pdlp_optimal_count++; @@ -724,7 +726,7 @@ void strong_branching(const lp_problem_t& original_lp, settings.log.printf( "Batch PDLP found %d/%d optimal solutions\n", pdlp_optimal_count, - fractional.size() * 2); + static_cast(fractional.size() * 2)); } i_t merged_from_ds = 0; @@ -762,7 +764,10 @@ void strong_branching(const lp_problem_t& original_lp, } } - if (settings.mip_batch_pdlp_strong_branching != 0) { + + if (effective_batch_pdlp != 0) { + pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root = (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; + settings.log.printf("Batch PDLP only for strong branching. Pourcent solved by batch PDLP at root: %f\n", pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root); settings.log.printf( "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both (down/up)\n", merged_from_ds, @@ -946,15 +951,23 @@ i_t pseudo_costs_t::reliable_variable_selection( // This indicates that PDLP alone (not batched) couldn't even run at the root node // So it will most likely perform poorly compared to DS // It is also off if the number of candidate is very small + // If warm start could run but almost none of the BPDLP results were used, we also want to avoid using batch PDLP constexpr i_t min_num_candidates_for_pdlp = 5; - const bool use_pdlp = (rb_mode != 0) && (pdlp_warm_cache.populated) && unreliable_list.size() > min_num_candidates_for_pdlp; + constexpr f_t min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; + const bool use_pdlp = (rb_mode != 0) && pdlp_warm_cache.populated && unreliable_list.size() > min_num_candidates_for_pdlp && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; if (rb_mode != 0 && !pdlp_warm_cache.populated) { log.printf("PDLP warm start data not populated, using DS only\n"); } - if (rb_mode != 0 && 
unreliable_list.size() < min_num_candidates_for_pdlp) { + else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { log.printf("Not enough candidates to use batch PDLP, using DS only\n"); } + else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { + log.printf("Pourcent solved by batch PDLP at root is too low, using DS only\n"); + } + else if (use_pdlp) { + log.printf("Using batch PDLP because populated, unreliable list size is %d (> %d), and pourcent solved by batch PDLP at root is %f%% (> %f%%)\n", static_cast(unreliable_list.size()), min_num_candidates_for_pdlp, pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root, min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp); + } const int num_tasks = std::max(max_num_tasks, 1); const int task_priority = reliability_branching_settings.task_priority; diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index c48ed908d7..be8f9f71d4 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -416,6 +416,7 @@ struct batch_pdlp_warm_cache_t { f_t step_size{std::numeric_limits::signaling_NaN()}; f_t primal_weight{std::numeric_limits::signaling_NaN()}; i_t pdlp_iteration{-1}; + f_t pourcent_solved_by_batch_pdlp_at_root{f_t(0.0)}; bool populated{false}; }; diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index a27ecd965c..275c119d03 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -909,7 +909,7 @@ optimization_problem_solution_t run_batch_pdlp( constexpr bool use_initial_pdlp_iterations = true; bool use_optimal_batch_size = false; constexpr int batch_iteration_limit = 100000; - constexpr f_t pdlp_tolerance = 1e-6; + constexpr f_t pdlp_tolerance = 1e-5; rmm::cuda_stream_view stream = problem.get_handle_ptr()->get_stream(); From b0061e4805a0b33f0b6aa5b7a834ebc314c5aaff Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 25 Mar 2026 15:21:20 +0100 Subject: 
[PATCH 34/43] empty just to run a new benchmark From f504a75561060f38963ac476e52dd32db13ab743 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 11:29:06 +0200 Subject: [PATCH 35/43] fix PR review comments --- .../cuopt/linear_programming/constants.h | 112 ++-- .../mip/solver_settings.hpp | 18 +- .../pdlp/solver_settings.hpp | 8 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 483 ++++++++++-------- .../shared_strong_branching_context.hpp | 5 +- .../dual_simplex/simplex_solver_settings.hpp | 6 +- cpp/src/math_optimization/solver_settings.cu | 4 +- cpp/src/pdlp/pdlp.cu | 34 +- cpp/src/pdlp/pdlp_constants.hpp | 2 - cpp/src/pdlp/solve.cu | 123 +++-- cpp/src/pdlp/solver_settings.cu | 21 + .../termination_strategy.cu | 5 +- cpp/src/pdlp/utilities/ping_pong_graph.cu | 1 + cpp/tests/linear_programming/pdlp_test.cu | 36 +- .../linear_programming/data_definition.py | 4 +- 15 files changed, 470 insertions(+), 392 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index c20a20a571..1b9d7e85a4 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -20,63 +20,63 @@ #define CUOPT_INSTANTIATE_INT64 0 /* @brief LP/MIP parameter string constants */ -#define CUOPT_ABSOLUTE_DUAL_TOLERANCE "absolute_dual_tolerance" -#define CUOPT_RELATIVE_DUAL_TOLERANCE "relative_dual_tolerance" -#define CUOPT_ABSOLUTE_PRIMAL_TOLERANCE "absolute_primal_tolerance" -#define CUOPT_RELATIVE_PRIMAL_TOLERANCE "relative_primal_tolerance" -#define CUOPT_ABSOLUTE_GAP_TOLERANCE "absolute_gap_tolerance" -#define CUOPT_RELATIVE_GAP_TOLERANCE "relative_gap_tolerance" -#define CUOPT_INFEASIBILITY_DETECTION "infeasibility_detection" -#define CUOPT_STRICT_INFEASIBILITY "strict_infeasibility" -#define CUOPT_PRIMAL_INFEASIBLE_TOLERANCE "primal_infeasible_tolerance" -#define CUOPT_DUAL_INFEASIBLE_TOLERANCE "dual_infeasible_tolerance" -#define CUOPT_ITERATION_LIMIT 
"iteration_limit" -#define CUOPT_TIME_LIMIT "time_limit" -#define CUOPT_WORK_LIMIT "work_limit" -#define CUOPT_PDLP_SOLVER_MODE "pdlp_solver_mode" -#define CUOPT_METHOD "method" -#define CUOPT_PER_CONSTRAINT_RESIDUAL "per_constraint_residual" -#define CUOPT_SAVE_BEST_PRIMAL_SO_FAR "save_best_primal_so_far" -#define CUOPT_FIRST_PRIMAL_FEASIBLE "first_primal_feasible" -#define CUOPT_LOG_FILE "log_file" -#define CUOPT_LOG_TO_CONSOLE "log_to_console" -#define CUOPT_CROSSOVER "crossover" -#define CUOPT_FOLDING "folding" -#define CUOPT_AUGMENTED "augmented" -#define CUOPT_DUALIZE "dualize" -#define CUOPT_ORDERING "ordering" -#define CUOPT_BARRIER_DUAL_INITIAL_POINT "barrier_dual_initial_point" -#define CUOPT_ELIMINATE_DENSE_COLUMNS "eliminate_dense_columns" -#define CUOPT_CUDSS_DETERMINISTIC "cudss_deterministic" -#define CUOPT_PRESOLVE "presolve" -#define CUOPT_DUAL_POSTSOLVE "dual_postsolve" -#define CUOPT_MIP_DETERMINISM_MODE "mip_determinism_mode" -#define CUOPT_MIP_ABSOLUTE_TOLERANCE "mip_absolute_tolerance" -#define CUOPT_MIP_RELATIVE_TOLERANCE "mip_relative_tolerance" -#define CUOPT_MIP_INTEGRALITY_TOLERANCE "mip_integrality_tolerance" -#define CUOPT_MIP_ABSOLUTE_GAP "mip_absolute_gap" -#define CUOPT_MIP_RELATIVE_GAP "mip_relative_gap" -#define CUOPT_MIP_HEURISTICS_ONLY "mip_heuristics_only" -#define CUOPT_MIP_SCALING "mip_scaling" -#define CUOPT_MIP_PRESOLVE "mip_presolve" -#define CUOPT_MIP_RELIABILITY_BRANCHING "mip_reliability_branching" -#define CUOPT_MIP_CUT_PASSES "mip_cut_passes" -#define CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS "mip_mixed_integer_rounding_cuts" -#define CUOPT_MIP_MIXED_INTEGER_GOMORY_CUTS "mip_mixed_integer_gomory_cuts" -#define CUOPT_MIP_KNAPSACK_CUTS "mip_knapsack_cuts" -#define CUOPT_MIP_CLIQUE_CUTS "mip_clique_cuts" -#define CUOPT_MIP_STRONG_CHVATAL_GOMORY_CUTS "mip_strong_chvatal_gomory_cuts" -#define CUOPT_MIP_REDUCED_COST_STRENGTHENING "mip_reduced_cost_strengthening" -#define CUOPT_MIP_CUT_CHANGE_THRESHOLD "mip_cut_change_threshold" 
-#define CUOPT_MIP_CUT_MIN_ORTHOGONALITY "mip_cut_min_orthogonality" -#define CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING "mip_batch_pdlp_strong_branching" +#define CUOPT_ABSOLUTE_DUAL_TOLERANCE "absolute_dual_tolerance" +#define CUOPT_RELATIVE_DUAL_TOLERANCE "relative_dual_tolerance" +#define CUOPT_ABSOLUTE_PRIMAL_TOLERANCE "absolute_primal_tolerance" +#define CUOPT_RELATIVE_PRIMAL_TOLERANCE "relative_primal_tolerance" +#define CUOPT_ABSOLUTE_GAP_TOLERANCE "absolute_gap_tolerance" +#define CUOPT_RELATIVE_GAP_TOLERANCE "relative_gap_tolerance" +#define CUOPT_INFEASIBILITY_DETECTION "infeasibility_detection" +#define CUOPT_STRICT_INFEASIBILITY "strict_infeasibility" +#define CUOPT_PRIMAL_INFEASIBLE_TOLERANCE "primal_infeasible_tolerance" +#define CUOPT_DUAL_INFEASIBLE_TOLERANCE "dual_infeasible_tolerance" +#define CUOPT_ITERATION_LIMIT "iteration_limit" +#define CUOPT_TIME_LIMIT "time_limit" +#define CUOPT_WORK_LIMIT "work_limit" +#define CUOPT_PDLP_SOLVER_MODE "pdlp_solver_mode" +#define CUOPT_METHOD "method" +#define CUOPT_PER_CONSTRAINT_RESIDUAL "per_constraint_residual" +#define CUOPT_SAVE_BEST_PRIMAL_SO_FAR "save_best_primal_so_far" +#define CUOPT_FIRST_PRIMAL_FEASIBLE "first_primal_feasible" +#define CUOPT_LOG_FILE "log_file" +#define CUOPT_LOG_TO_CONSOLE "log_to_console" +#define CUOPT_CROSSOVER "crossover" +#define CUOPT_FOLDING "folding" +#define CUOPT_AUGMENTED "augmented" +#define CUOPT_DUALIZE "dualize" +#define CUOPT_ORDERING "ordering" +#define CUOPT_BARRIER_DUAL_INITIAL_POINT "barrier_dual_initial_point" +#define CUOPT_ELIMINATE_DENSE_COLUMNS "eliminate_dense_columns" +#define CUOPT_CUDSS_DETERMINISTIC "cudss_deterministic" +#define CUOPT_PRESOLVE "presolve" +#define CUOPT_DUAL_POSTSOLVE "dual_postsolve" +#define CUOPT_MIP_DETERMINISM_MODE "mip_determinism_mode" +#define CUOPT_MIP_ABSOLUTE_TOLERANCE "mip_absolute_tolerance" +#define CUOPT_MIP_RELATIVE_TOLERANCE "mip_relative_tolerance" +#define CUOPT_MIP_INTEGRALITY_TOLERANCE "mip_integrality_tolerance" 
+#define CUOPT_MIP_ABSOLUTE_GAP "mip_absolute_gap" +#define CUOPT_MIP_RELATIVE_GAP "mip_relative_gap" +#define CUOPT_MIP_HEURISTICS_ONLY "mip_heuristics_only" +#define CUOPT_MIP_SCALING "mip_scaling" +#define CUOPT_MIP_PRESOLVE "mip_presolve" +#define CUOPT_MIP_RELIABILITY_BRANCHING "mip_reliability_branching" +#define CUOPT_MIP_CUT_PASSES "mip_cut_passes" +#define CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS "mip_mixed_integer_rounding_cuts" +#define CUOPT_MIP_MIXED_INTEGER_GOMORY_CUTS "mip_mixed_integer_gomory_cuts" +#define CUOPT_MIP_KNAPSACK_CUTS "mip_knapsack_cuts" +#define CUOPT_MIP_CLIQUE_CUTS "mip_clique_cuts" +#define CUOPT_MIP_STRONG_CHVATAL_GOMORY_CUTS "mip_strong_chvatal_gomory_cuts" +#define CUOPT_MIP_REDUCED_COST_STRENGTHENING "mip_reduced_cost_strengthening" +#define CUOPT_MIP_CUT_CHANGE_THRESHOLD "mip_cut_change_threshold" +#define CUOPT_MIP_CUT_MIN_ORTHOGONALITY "mip_cut_min_orthogonality" +#define CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING "mip_batch_pdlp_strong_branching" #define CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING "mip_batch_pdlp_reliability_branching" -#define CUOPT_SOLUTION_FILE "solution_file" -#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" -#define CUOPT_NUM_GPUS "num_gpus" -#define CUOPT_USER_PROBLEM_FILE "user_problem_file" -#define CUOPT_RANDOM_SEED "random_seed" -#define CUOPT_PDLP_PRECISION "pdlp_precision" +#define CUOPT_SOLUTION_FILE "solution_file" +#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" +#define CUOPT_NUM_GPUS "num_gpus" +#define CUOPT_USER_PROBLEM_FILE "user_problem_file" +#define CUOPT_RANDOM_SEED "random_seed" +#define CUOPT_PDLP_PRECISION "pdlp_precision" /* @brief MIP determinism mode constants */ #define CUOPT_MODE_OPPORTUNISTIC 0 diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 62e88d5eb0..4af5e727d8 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ 
b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -94,14 +94,16 @@ class mip_solver_settings_t { i_t mixed_integer_gomory_cuts = -1; i_t knapsack_cuts = -1; i_t clique_cuts = -1; - i_t strong_chvatal_gomory_cuts = -1; - i_t reduced_cost_strengthening = -1; - f_t cut_change_threshold = -1.0; - f_t cut_min_orthogonality = 0.5; - i_t mip_batch_pdlp_strong_branching{1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only - i_t mip_batch_pdlp_reliability_branching{1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only - i_t num_gpus = 1; - bool log_to_console = true; + i_t strong_chvatal_gomory_cuts = -1; + i_t reduced_cost_strengthening = -1; + f_t cut_change_threshold = -1.0; + f_t cut_min_orthogonality = 0.5; + i_t mip_batch_pdlp_strong_branching{ + 1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t mip_batch_pdlp_reliability_branching{ + 1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t num_gpus = 1; + bool log_to_console = true; std::string log_file; std::string sol_file; diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 17fa7c548f..40b61d4ab0 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -149,12 +149,12 @@ class pdlp_solver_settings_t { * @param[in] initial_primal_weight Initial primal weight. */ void set_initial_primal_weight(f_t initial_primal_weight); - /** + /** * @brief Set an initial pdlp iteration. * * @param[in] initial_pdlp_iteration Initial pdlp iteration. */ - void set_initial_pdlp_iteration(i_t initial_pdlp_iteration); + void set_initial_pdlp_iteration(i_t initial_pdlp_iteration); /** * @brief Set the pdlp warm start data. This allows to restart PDLP with a @@ -284,8 +284,8 @@ class pdlp_solver_settings_t { // concurrently i.e. 
if new_bounds.size() == 2, then 2 versions of the problem with updated bounds // will be solved concurrently std::vector> new_bounds; - // By default to save memory and speed we don't store and copy each climber's primal and dual solutions - // We only retrieve termination statistics and the objective values + // By default to save memory and speed we don't store and copy each climber's primal and dual + // solutions We only retrieve termination statistics and the objective values bool generate_batch_primal_dual_solution{false}; // Used to force batch PDLP to solve a subbatch of the problems at a time // The 0 default value will make the solver use its heuristic to determine the subbatch size diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index a9b2177a29..0dbc4764f5 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -13,6 +13,8 @@ #include #include +#include + #include #include @@ -27,7 +29,8 @@ namespace { static bool ds_is_valid_done(dual::status_t status) { - return status == dual::status_t::DUAL_UNBOUNDED || status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF; + return status == dual::status_t::DUAL_UNBOUNDED || status == dual::status_t::OPTIMAL || + status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF; } template @@ -67,8 +70,12 @@ void strong_branch_helper(i_t start, // Batch PDLP has already solved this subproblem, skip it if (sb_view.is_valid() && sb_view.is_solved(shared_idx)) { settings.log.printf( - "[COOP SB] DS thread %d skipping variable %d branch %s (shared_idx %d): already solved by PDLP\n", - thread_id, j, branch == 0 ? "down" : "up", shared_idx); + "[COOP SB] DS thread %d skipping variable %d branch %s (shared_idx %d): already solved " + "by PDLP\n", + thread_id, + j, + branch == 0 ? 
"down" : "up", + shared_idx); continue; } @@ -105,7 +112,8 @@ void strong_branch_helper(i_t start, if (status == dual::status_t::DUAL_UNBOUNDED) { // LP was infeasible obj = std::numeric_limits::infinity(); - } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF) { + } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || + status == dual::status_t::CUTOFF) { obj = compute_objective(child_problem, solution.x); } else { settings.log.debug("Thread id %2d remaining %d variable %d branch %d status %d\n", @@ -118,7 +126,7 @@ void strong_branch_helper(i_t start, if (branch == 0) { pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0); - ds_obj_down[k] = std::max(obj - root_obj, 0.0); + ds_obj_down[k] = std::max(obj - root_obj, 0.0); ds_status_down[k] = status; if (verbose) { settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n", @@ -131,7 +139,7 @@ void strong_branch_helper(i_t start, } } else { pc.strong_branch_up[k] = std::max(obj - root_obj, 0.0); - ds_obj_up[k] = std::max(obj - root_obj, 0.0); + ds_obj_up[k] = std::max(obj - root_obj, 0.0); ds_status_up[k] = status; if (verbose) { settings.log.printf( @@ -150,21 +158,21 @@ void strong_branch_helper(i_t start, // Mark the subproblem as solved so that batch PDLP removes it from the batch if (sb_view.is_valid()) { // We could not mark as solved nodes hitting iteartion limit in DS - if ((branch == 0 && ds_is_valid_done(ds_status_down[k])) || (branch == 1 && ds_is_valid_done(ds_status_up[k]))) - { + if ((branch == 0 && ds_is_valid_done(ds_status_down[k])) || + (branch == 1 && ds_is_valid_done(ds_status_up[k]))) { sb_view.mark_solved(shared_idx); settings.log.printf( - "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in shared context\n", - thread_id, j, branch == 0 ? 
"down" : "up", shared_idx); + "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in " + "shared context\n", + thread_id, + j, + branch == 0 ? "down" : "up", + shared_idx); } } - if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { - break; - } - } - if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { - break; + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; } } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; } const i_t completed = pc.num_strong_branches_completed++; @@ -179,30 +187,28 @@ void strong_branch_helper(i_t start, child_problem.lower[j] = original_lp.lower[j]; child_problem.upper[j] = original_lp.upper[j]; - if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { - break; - } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; } } } template std::pair trial_branching(const lp_problem_t& original_lp, - const simplex_solver_settings_t& settings, - const std::vector& var_types, - const std::vector& vstatus, - const std::vector& edge_norms, - const basis_update_mpf_t& basis_factors, - const std::vector& basic_list, - const std::vector& nonbasic_list, - i_t branch_var, - f_t branch_var_lower, - f_t branch_var_upper, - f_t upper_bound, - i_t bnb_lp_iter_per_node, - f_t start_time, - i_t upper_max_lp_iter, - i_t lower_max_lp_iter, - omp_atomic_t& total_lp_iter) + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const std::vector& vstatus, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + i_t branch_var, + f_t branch_var_lower, + f_t branch_var_upper, + f_t upper_bound, + i_t bnb_lp_iter_per_node, + f_t start_time, + i_t upper_max_lp_iter, + i_t lower_max_lp_iter, + omp_atomic_t& total_lp_iter) { lp_problem_t child_problem = original_lp; child_problem.lower[branch_var] = 
branch_var_lower; @@ -271,7 +277,6 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data const std::vector& root_soln, std::vector& original_root_soln_x) { - // Branch and bound has a problem of the form: // minimize c^T x // subject to A*x + Es = b @@ -285,7 +290,6 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data // subject to lb <= A*x <= ub // l <= x <= u - cuopt::mps_parser::mps_data_model_t mps_model; int m = lp.num_rows; int n = lp.num_cols - new_slacks.size(); @@ -331,8 +335,8 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data std::vector slack_map(m, -1); for (i_t j : new_slacks) { const i_t col_start = lp.A.col_start[j]; - const i_t i = lp.A.i[col_start]; - slack_map[i] = j; + const i_t i = lp.A.i[col_start]; + slack_map[i] = j; } for (i_t i = 0; i < m; ++i) { @@ -354,8 +358,8 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data const i_t slack = slack_map[i]; assert(slack != -1); - const i_t col_start = lp.A.col_start[slack]; - const f_t sigma = lp.A.x[col_start]; + const i_t col_start = lp.A.col_start[slack]; + const f_t sigma = lp.A.x[col_start]; const f_t slack_lower = lp.lower[slack]; const f_t slack_upper = lp.upper[slack]; @@ -387,9 +391,9 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data // Return {value, source} where source is 0 if Dual Simplex, 1 if PDLP, 2 if both template static std::pair merge_sb_result(f_t ds_val, - dual::status_t ds_status, - f_t pdlp_dual_obj, - bool pdlp_optimal) + dual::status_t ds_status, + f_t pdlp_dual_obj, + bool pdlp_optimal) { // Dual simplex always maintains dual feasibility, so OPTIMAL and ITERATION_LIMIT both qualify @@ -397,20 +401,24 @@ static std::pair merge_sb_result(f_t ds_val, if (ds_status == dual::status_t::OPTIMAL && pdlp_optimal) { return {ds_val, 0}; } // Rule 2: Dual Simplex found infeasible -> declare infeasible - if (ds_status == dual::status_t::DUAL_UNBOUNDED) { return 
{std::numeric_limits::infinity(), 0}; } + if (ds_status == dual::status_t::DUAL_UNBOUNDED) { + return {std::numeric_limits::infinity(), 0}; + } // Rule 3: Only one converged -> keep that if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { return {ds_val, 0}; } if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { return {pdlp_dual_obj, 1}; } // Rule 4: Dual Simplex hit iteration limit or work limit or cutoff -> keep DS - if (ds_status == dual::status_t::ITERATION_LIMIT || ds_status == dual::status_t::WORK_LIMIT || ds_status == dual::status_t::CUTOFF) { return {ds_val, 0}; } + if (ds_status == dual::status_t::ITERATION_LIMIT || ds_status == dual::status_t::WORK_LIMIT || + ds_status == dual::status_t::CUTOFF) { + return {ds_val, 0}; + } // Rule 5: None converged -> NaN return {std::numeric_limits::quiet_NaN(), 2}; } - template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, @@ -450,28 +458,27 @@ void strong_branching(const lp_problem_t& original_lp, std::vector pdlp_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); auto pdlp_thread = std::thread([&]() { - - if (effective_batch_pdlp == 0) - return; - + if (effective_batch_pdlp == 0) return; + settings.log.printf(effective_batch_pdlp == 2 - ? "Batch PDLP only for strong branching\n" - : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); + ? 
"Batch PDLP only for strong branching\n" + : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); f_t start_batch = tic(); std::vector original_root_soln_x; if (concurrent_halt.load() == 1) { return; } - const auto mps_model = simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); - + const auto mps_model = + simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); std::vector fraction_values; std::vector original_root_soln_y, original_root_soln_z; // TODO put back later once Chris has this part /*uncrush_dual_solution( - original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, original_root_soln_z);*/ + original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, + original_root_soln_z);*/ for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; @@ -485,58 +492,69 @@ void strong_branching(const lp_problem_t& original_lp, std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (warm_start_remaining_time <= 0.0) { return; } + assert(!pc.pdlp_warm_cache.populated && + "PDLP warm cache should not be populated at this point"); + if (!pc.pdlp_warm_cache.populated) { pdlp_solver_settings_t ws_settings; ws_settings.method = method_t::PDLP; ws_settings.presolver = presolver_t::None; ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; ws_settings.detect_infeasibility = false; - // Since the warm start will be used over and over again we want to maximize the chance of convergeance - // Batch PDLP is very compute intensive so we want to minimize the number of iterations - constexpr int warm_start_iteration_limit = 500000; - ws_settings.iteration_limit = warm_start_iteration_limit; - ws_settings.time_limit = warm_start_remaining_time; - constexpr f_t pdlp_tolerance = 1e-5; - ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + // 
Since the warm start will be used over and over again we want to maximize the chance of + // convergeance Batch PDLP is very compute intensive so we want to minimize the number of + // iterations + constexpr int warm_start_iteration_limit = 500000; + ws_settings.iteration_limit = warm_start_iteration_limit; + ws_settings.time_limit = warm_start_remaining_time; + constexpr f_t pdlp_tolerance = 1e-5; + ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; ws_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; - ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; - ws_settings.inside_mip = true; - if (effective_batch_pdlp == 1) { - ws_settings.concurrent_halt = &concurrent_halt; - } + ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + ws_settings.inside_mip = true; + if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE auto start_time = std::chrono::high_resolution_clock::now(); - #endif +#endif auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time).count(); - std::cout << "Original problem solved in " << duration << " milliseconds" << " and iterations: " << original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; - #endif + auto duration = + std::chrono::duration_cast(end_time - start_time).count(); + std::cout << "Original problem solved in " << duration << " milliseconds" + << " and iterations: " + << 
ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; +#endif if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) { - auto& cache = pc.pdlp_warm_cache; + auto& cache = pc.pdlp_warm_cache; const auto& ws_primal = ws_solution.get_primal_solution(); const auto& ws_dual = ws_solution.get_dual_solution(); - // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm start - cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); - cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); - cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; - cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; - cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; - cache.populated = true; - - settings.log.printf("Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", - cache.initial_primal.size(), cache.initial_dual.size(), - cache.step_size, cache.primal_weight, cache.pdlp_iteration); + // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm + // start + cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); + cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); + cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; + cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; + cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + cache.populated = true; + + settings.log.printf( + "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", + cache.initial_primal.size(), + cache.initial_dual.size(), + cache.step_size, + cache.primal_weight, + cache.pdlp_iteration); } else { - settings.log.printf("PDLP warm start solve did not reach optimality (%s), skipping cache and batch 
PDLP\n", + settings.log.printf( + "PDLP warm start solve did not reach optimality (%s), skipping cache and batch PDLP\n", ws_solution.get_termination_status_string().c_str()); return; } @@ -552,14 +570,15 @@ void strong_branching(const lp_problem_t& original_lp, batch_elapsed_time = toc(start_time); const f_t batch_remaining_time = - std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (batch_remaining_time <= 0.0) { return; } pdlp_settings.time_limit = batch_remaining_time; if (pc.pdlp_warm_cache.populated) { auto& cache = pc.pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution( - cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), + cache.initial_primal.size(), + cache.batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_dual_solution( cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_step_size(cache.step_size); @@ -569,8 +588,8 @@ void strong_branching(const lp_problem_t& original_lp, if (concurrent_halt.load() == 1) { return; } - const auto solutions = - batch_pdlp_solve(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); + const auto solutions = batch_pdlp_solve( + &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Fail safe in case the batch PDLP failed and produced no solutions @@ -624,9 +643,8 @@ void strong_branching(const lp_problem_t& original_lp, pdlp_obj_down[k] = std::max(obj_down - root_obj, f_t(0.0)); pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); } - }); - + std::vector ds_status_down(fractional.size(), dual::status_t::UNSET); std::vector ds_status_up(fractional.size(), dual::status_t::UNSET); std::vector 
ds_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); @@ -676,15 +694,14 @@ void strong_branching(const lp_problem_t& original_lp, } } - // DS done: signal PDLP to stop (time-limit or all work done) and wait - concurrent_halt.store(1); + // DS done: signal PDLP to stop (time-limit or all work done) and wait + concurrent_halt.store(1); } pdlp_thread.join(); settings.log.printf("Strong branching took %.2fs\n", toc(dual_simplex_strong_branching_time)); - // Collect Dual Simplex statistics i_t ds_optimal = 0, ds_infeasible = 0, ds_iter_limit = 0; i_t ds_numerical = 0, ds_cutoff = 0, ds_time_limit = 0; @@ -693,27 +710,30 @@ void strong_branching(const lp_problem_t& original_lp, for (i_t k = 0; k < fractional.size(); k++) { for (auto st : {ds_status_down[k], ds_status_up[k]}) { switch (st) { - case dual::status_t::OPTIMAL: ds_optimal++; break; - case dual::status_t::DUAL_UNBOUNDED: ds_infeasible++; break; - case dual::status_t::ITERATION_LIMIT: ds_iter_limit++; break; - case dual::status_t::NUMERICAL: ds_numerical++; break; - case dual::status_t::CUTOFF: ds_cutoff++; break; - case dual::status_t::TIME_LIMIT: ds_time_limit++; break; - case dual::status_t::CONCURRENT_LIMIT: ds_concurrent++; break; - case dual::status_t::WORK_LIMIT: ds_work_limit++; break; - case dual::status_t::UNSET: ds_unset++; break; + case dual::status_t::OPTIMAL: ds_optimal++; break; + case dual::status_t::DUAL_UNBOUNDED: ds_infeasible++; break; + case dual::status_t::ITERATION_LIMIT: ds_iter_limit++; break; + case dual::status_t::NUMERICAL: ds_numerical++; break; + case dual::status_t::CUTOFF: ds_cutoff++; break; + case dual::status_t::TIME_LIMIT: ds_time_limit++; break; + case dual::status_t::CONCURRENT_LIMIT: ds_concurrent++; break; + case dual::status_t::WORK_LIMIT: ds_work_limit++; break; + case dual::status_t::UNSET: ds_unset++; break; } } } settings.log.printf("Dual Simplex: %d/%d optimal, %d infeasible, %d iter-limit", - ds_optimal, total_subproblems, ds_infeasible, ds_iter_limit); 
- if (ds_cutoff) settings.log.printf(", %d cutoff", ds_cutoff); + ds_optimal, + total_subproblems, + ds_infeasible, + ds_iter_limit); + if (ds_cutoff) settings.log.printf(", %d cutoff", ds_cutoff); if (ds_time_limit) settings.log.printf(", %d time-limit", ds_time_limit); - if (ds_numerical) settings.log.printf(", %d numerical", ds_numerical); + if (ds_numerical) settings.log.printf(", %d numerical", ds_numerical); if (ds_concurrent) settings.log.printf(", %d concurrent-halt", ds_concurrent); if (ds_work_limit) settings.log.printf(", %d work-limit", ds_work_limit); - if (ds_unset) settings.log.printf(", %d unset/skipped", ds_unset); + if (ds_unset) settings.log.printf(", %d unset/skipped", ds_unset); settings.log.printf("\n"); if (effective_batch_pdlp != 0) { @@ -723,53 +743,69 @@ void strong_branching(const lp_problem_t& original_lp, if (!std::isnan(pdlp_obj_up[k])) pdlp_optimal_count++; } - settings.log.printf( - "Batch PDLP found %d/%d optimal solutions\n", - pdlp_optimal_count, - static_cast(fractional.size() * 2)); + settings.log.printf("Batch PDLP found %d/%d optimal solutions\n", + pdlp_optimal_count, + static_cast(fractional.size() * 2)); } - i_t merged_from_ds = 0; - i_t merged_from_pdlp = 0; - i_t merged_nan = 0; + i_t merged_from_ds = 0; + i_t merged_from_pdlp = 0; + i_t merged_nan = 0; i_t solved_by_both_down = 0; - i_t solved_by_both_up = 0; + i_t solved_by_both_up = 0; for (i_t k = 0; k < fractional.size(); k++) { bool ds_has_down = ds_status_down[k] != dual::status_t::UNSET; bool pdlp_has_down = !std::isnan(pdlp_obj_down[k]); - const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], pdlp_obj_down[k], pdlp_has_down); + const auto [value_down, source_down] = + merge_sb_result(ds_obj_down[k], ds_status_down[k], pdlp_obj_down[k], pdlp_has_down); pc.strong_branch_down[k] = value_down; - if (source_down == 0) merged_from_ds++; - else if (source_down == 1) merged_from_pdlp++; - else merged_nan++; + if (source_down == 0) + 
merged_from_ds++; + else if (source_down == 1) + merged_from_pdlp++; + else + merged_nan++; if (ds_has_down && pdlp_has_down) { solved_by_both_down++; settings.log.printf( "[COOP SB] Merge: variable %d DOWN solved by BOTH (DS=%e PDLP=%e) -> kept %s\n", - fractional[k], ds_obj_down[k], pdlp_obj_down[k], source_down == 0 ? "DS" : "PDLP"); + fractional[k], + ds_obj_down[k], + pdlp_obj_down[k], + source_down == 0 ? "DS" : "PDLP"); } bool ds_has_up = ds_status_up[k] != dual::status_t::UNSET; bool pdlp_has_up = !std::isnan(pdlp_obj_up[k]); - const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], pdlp_has_up); + const auto [value_up, source_up] = + merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], pdlp_has_up); pc.strong_branch_up[k] = value_up; - if (source_up == 0) merged_from_ds++; - else if (source_up == 1) merged_from_pdlp++; - else merged_nan++; + if (source_up == 0) + merged_from_ds++; + else if (source_up == 1) + merged_from_pdlp++; + else + merged_nan++; if (ds_has_up && pdlp_has_up) { solved_by_both_up++; settings.log.printf( "[COOP SB] Merge: variable %d UP solved by BOTH (DS=%e PDLP=%e) -> kept %s\n", - fractional[k], ds_obj_up[k], pdlp_obj_up[k], source_up == 0 ? "DS" : "PDLP"); + fractional[k], + ds_obj_up[k], + pdlp_obj_up[k], + source_up == 0 ? "DS" : "PDLP"); } } - if (effective_batch_pdlp != 0) { - pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root = (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; - settings.log.printf("Batch PDLP only for strong branching. Pourcent solved by batch PDLP at root: %f\n", pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root); + pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root = + (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; settings.log.printf( - "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both (down/up)\n", + "Batch PDLP only for strong branching. 
Pourcent solved by batch PDLP at root: %f\n", + pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root); + settings.log.printf( + "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both " + "(down/up)\n", merged_from_ds, merged_from_pdlp, merged_nan, @@ -951,28 +987,36 @@ i_t pseudo_costs_t::reliable_variable_selection( // This indicates that PDLP alone (not batched) couldn't even run at the root node // So it will most likely perform poorly compared to DS // It is also off if the number of candidate is very small - // If warm start could run but almost none of the BPDLP results were used, we also want to avoid using batch PDLP - constexpr i_t min_num_candidates_for_pdlp = 5; + // If warm start could run but almost none of the BPDLP results were used, we also want to avoid + // using batch PDLP + constexpr i_t min_num_candidates_for_pdlp = 5; constexpr f_t min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; - const bool use_pdlp = (rb_mode != 0) && pdlp_warm_cache.populated && unreliable_list.size() > min_num_candidates_for_pdlp && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; + const bool use_pdlp = (rb_mode != 0) && pdlp_warm_cache.populated && + unreliable_list.size() > min_num_candidates_for_pdlp && + pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > + min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; if (rb_mode != 0 && !pdlp_warm_cache.populated) { log.printf("PDLP warm start data not populated, using DS only\n"); - } - else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { + } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { log.printf("Not enough candidates to use batch PDLP, using DS only\n"); - } - else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { + } else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { 
log.printf("Pourcent solved by batch PDLP at root is too low, using DS only\n"); - } - else if (use_pdlp) { - log.printf("Using batch PDLP because populated, unreliable list size is %d (> %d), and pourcent solved by batch PDLP at root is %f%% (> %f%%)\n", static_cast(unreliable_list.size()), min_num_candidates_for_pdlp, pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root, min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp); + } else if (use_pdlp) { + log.printf( + "Using batch PDLP because populated, unreliable list size is %d (> %d), and pourcent solved " + "by batch PDLP at root is %f%% (> %f%%)\n", + static_cast(unreliable_list.size()), + min_num_candidates_for_pdlp, + pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root, + min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp); } - const int num_tasks = std::max(max_num_tasks, 1); - const int task_priority = reliability_branching_settings.task_priority; + const int num_tasks = std::max(max_num_tasks, 1); + const int task_priority = reliability_branching_settings.task_priority; // If both batch PDLP and DS are used we double the max number of candidates - const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates : reliability_branching_settings.max_num_candidates; + const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates + : reliability_branching_settings.max_num_candidates; const i_t num_candidates = std::min(unreliable_list.size(), max_num_candidates); assert(task_priority > 0); @@ -1008,10 +1052,9 @@ i_t pseudo_costs_t::reliable_variable_selection( if (use_pdlp) { pdlp_thread = std::thread([&]() { - log.printf(rb_mode == 2 - ? "RB batch PDLP only for %d candidates\n" - : "RB cooperative batch PDLP and DS for %d candidates\n", - num_candidates); + log.printf(rb_mode == 2 ? 
"RB batch PDLP only for %d candidates\n" + : "RB cooperative batch PDLP and DS for %d candidates\n", + num_candidates); f_t start_batch = tic(); @@ -1019,8 +1062,8 @@ i_t pseudo_costs_t::reliable_variable_selection( if (concurrent_halt.load() == 1) { return; } - auto mps_model = simplex_problem_to_mps_data_model( - original_lp, new_slacks, solution, original_soln_x); + auto mps_model = + simplex_problem_to_mps_data_model(original_lp, new_slacks, solution, original_soln_x); { const i_t n_orig = original_lp.num_cols - new_slacks.size(); for (i_t j = 0; j < n_orig; j++) { @@ -1049,13 +1092,14 @@ i_t pseudo_costs_t::reliable_variable_selection( } pdlp_settings.time_limit = batch_remaining_time; - if (pdlp_warm_cache.populated) { auto& cache = pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution( - cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), + cache.initial_primal.size(), + cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution(cache.initial_dual.data(), + cache.initial_dual.size(), + cache.batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_step_size(cache.step_size); pdlp_settings.set_initial_primal_weight(cache.primal_weight); pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); @@ -1063,8 +1107,11 @@ i_t pseudo_costs_t::reliable_variable_selection( if (concurrent_halt.load() == 1) { return; } - const auto solutions = batch_pdlp_solve( - &pdlp_warm_cache.batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); + const auto solutions = batch_pdlp_solve(&pdlp_warm_cache.batch_pdlp_handle, + mps_model, + candidate_vars, + fraction_values, + pdlp_settings); f_t batch_pdlp_time = toc(start_batch); @@ -1082,7 +1129,9 @@ i_t 
pseudo_costs_t::reliable_variable_selection( } log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", - batch_pdlp_time, amount_done, num_candidates * 2); + batch_pdlp_time, + amount_done, + num_candidates * 2); for (i_t k = 0; k < num_candidates; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { @@ -1121,30 +1170,32 @@ i_t pseudo_costs_t::reliable_variable_selection( if (toc(start_time) > settings.time_limit) { continue; } if (rb_mode == 1 && sb_view.is_solved(i)) { - log.printf("DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i); + log.printf( + "DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i); } else { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { // Do trial branching on the down branch - const auto [obj, status] = trial_branching(worker->leaf_problem, - settings, - var_types, - node_ptr->vstatus, - worker->leaf_edge_norms, - worker->basis_factors, - worker->basic_list, - worker->nonbasic_list, - j, - worker->leaf_problem.lower[j], - std::floor(solution[j]), - upper_bound, - branch_and_bound_lp_iter_per_node, - start_time, - reliability_branching_settings.upper_max_lp_iter, - reliability_branching_settings.lower_max_lp_iter, - strong_branching_lp_iter); - - ds_obj_down[i] = obj; + const auto [obj, status] = + trial_branching(worker->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker->leaf_edge_norms, + worker->basis_factors, + worker->basic_list, + worker->nonbasic_list, + j, + worker->leaf_problem.lower[j], + std::floor(solution[j]), + upper_bound, + branch_and_bound_lp_iter_per_node, + start_time, + reliability_branching_settings.upper_max_lp_iter, + reliability_branching_settings.lower_max_lp_iter, + strong_branching_lp_iter); + + ds_obj_down[i] = obj; ds_status_down[i] = status; if (!std::isnan(obj)) { f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); @@ -1162,29 
+1213,32 @@ i_t pseudo_costs_t::reliable_variable_selection( const i_t shared_idx = i + num_candidates; if (rb_mode == 1 && sb_view.is_solved(shared_idx)) { - log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", j, shared_idx); + log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", + j, + shared_idx); } else { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { - const auto [obj, status] = trial_branching(worker->leaf_problem, - settings, - var_types, - node_ptr->vstatus, - worker->leaf_edge_norms, - worker->basis_factors, - worker->basic_list, - worker->nonbasic_list, - j, - std::ceil(solution[j]), - worker->leaf_problem.upper[j], - upper_bound, - branch_and_bound_lp_iter_per_node, - start_time, - reliability_branching_settings.upper_max_lp_iter, - reliability_branching_settings.lower_max_lp_iter, - strong_branching_lp_iter); - - ds_obj_up[i] = obj; + const auto [obj, status] = + trial_branching(worker->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker->leaf_edge_norms, + worker->basis_factors, + worker->basic_list, + worker->nonbasic_list, + j, + std::ceil(solution[j]), + worker->leaf_problem.upper[j], + upper_bound, + branch_and_bound_lp_iter_per_node, + start_time, + reliability_branching_settings.upper_max_lp_iter, + reliability_branching_settings.lower_max_lp_iter, + strong_branching_lp_iter); + + ds_obj_up[i] = obj; ds_status_up[i] = status; if (!std::isnan(obj)) { f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); @@ -1216,23 +1270,19 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t ds_elapsed = toc(ds_start_time); // TODO put back - //if (rb_mode != 2) { + // if (rb_mode != 2) { // if (rb_mode == 1) { // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped (PDLP) in %.2fs\n", - // num_candidates, - // ds_optimal.load(), num_candidates * 2, + // "RB Dual Simplex: 
%d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped + // (PDLP) in %.2fs\n", num_candidates, ds_optimal.load(), num_candidates * 2, // ds_infeasible.load(), num_candidates * 2, // ds_failed.load(), num_candidates * 2, // ds_skipped.load(), ds_elapsed); // } else { // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in %.2fs\n", - // num_candidates, - // ds_optimal.load(), num_candidates * 2, - // ds_infeasible.load(), num_candidates * 2, - // ds_failed.load(), num_candidates * 2, - // ds_elapsed); + // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in + // %.2fs\n", num_candidates, ds_optimal.load(), num_candidates * 2, ds_infeasible.load(), + // num_candidates * 2, ds_failed.load(), num_candidates * 2, ds_elapsed); // } //} @@ -1249,7 +1299,8 @@ i_t pseudo_costs_t::reliable_variable_selection( pdlp_optimal++; const auto [merged_obj, source] = merge_sb_result(ds_obj_down[i], ds_status_down[i], pdlp_obj_down[i], true); - // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent calls may have made it reliable) + // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent + // calls may have made it reliable) if (source == 1) { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { @@ -1268,7 +1319,8 @@ i_t pseudo_costs_t::reliable_variable_selection( pdlp_optimal++; const auto [merged_obj, source] = merge_sb_result(ds_obj_up[i], ds_status_up[i], pdlp_obj_up[i], true); - // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent calls may have made it reliable) + // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent + // calls may have made it reliable) if (source == 1) { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { @@ -1290,7 +1342,10 @@ i_t 
pseudo_costs_t::reliable_variable_selection( } log.printf("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n", - num_candidates, pdlp_optimal, num_candidates * 2, pdlp_applied); + num_candidates, + pdlp_optimal, + num_candidates * 2, + pdlp_applied); } log.printf( diff --git a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp index 6cbea737f5..6840ccbb77 100644 --- a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp +++ b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp @@ -34,10 +34,7 @@ struct shared_strong_branching_context_view_t { bool is_valid() const { return !solved.empty(); } - bool is_solved(i_t local_idx) const - { - return solved[local_idx].load() != 0; - } + bool is_solved(i_t local_idx) const { return solved[local_idx].load() != 0; } void mark_solved(i_t local_idx) const { solved[local_idx].store(1); } diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index c097baf561..882f7a14f7 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -187,8 +187,10 @@ struct simplex_solver_settings_t { // strengthening f_t cut_change_threshold; // threshold for cut change f_t cut_min_orthogonality; // minimum orthogonality for cuts - i_t mip_batch_pdlp_strong_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only - i_t mip_batch_pdlp_reliability_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t + mip_batch_pdlp_strong_branching; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t mip_batch_pdlp_reliability_branching; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch + // PDLP only diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics diff --git a/cpp/src/math_optimization/solver_settings.cu 
b/cpp/src/math_optimization/solver_settings.cu index 749d89a35c..5440809754 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -99,8 +99,8 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits::max(), -1}, {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, - {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, - {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0}, + {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 1}, + {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 1}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 37e9e1a31f..642c17758d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -779,18 +779,19 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) // Sync external solved status into internal termination strategy before all_done() check if (settings_.shared_sb_view.is_valid()) { for (size_t i = 0; i < climber_strategies_.size(); ++i) { - // If PDLP has solved it to optimality we want to keep it and resolved both solvers having solved the problem later + // If PDLP has solved it to optimality we want to keep it and resolved both solvers having + // solved the problem later if 
(current_termination_strategy_.is_done( current_termination_strategy_.get_termination_status(i))) continue; const i_t local_idx = climber_strategies_[i].original_index; if (settings_.shared_sb_view.is_solved(local_idx)) { - current_termination_strategy_.set_termination_status(i, - pdlp_termination_status_t::ConcurrentLimit); + current_termination_strategy_.set_termination_status( + i, pdlp_termination_status_t::ConcurrentLimit); #ifdef BATCH_VERBOSE_MODE - std::cout << "[COOP SB] DS already solved climber " << i << " (original_index " - << local_idx << "), synced to ConcurrentLimit at step " - << internal_solver_iterations_ << std::endl; + std::cout << "[COOP SB] DS already solved climber " << i << " (original_index " << local_idx + << "), synced to ConcurrentLimit at step " << internal_solver_iterations_ + << std::endl; #endif } } @@ -1863,7 +1864,7 @@ void pdlp_solver_t::resize_and_swap_all_context_loop( &new_buf_size, stream_view_)); current_op_problem_evaluation_cusparse_view_.buffer_transpose_batch.resize(new_buf_size, - stream_view_); + stream_view_); // Convergence info: A * batch_primal_solutions -> batch_tmp_duals RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( @@ -1879,7 +1880,7 @@ void pdlp_solver_t::resize_and_swap_all_context_loop( &new_buf_size, stream_view_)); current_op_problem_evaluation_cusparse_view_.buffer_non_transpose_batch.resize(new_buf_size, - stream_view_); + stream_view_); } // Rerun preprocess @@ -2315,13 +2316,16 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co } if (settings_.get_initial_pdlp_iteration().has_value()) { total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value(); - // This is meaningless in batch mode since pdhg step is never used, set it just to avoid assertions - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, stream_view_); + // This is meaningless in batch mode since pdhg step is never used, set it just to avoid + // assertions + 
pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, + stream_view_); pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_; - // Reset the fixed point error since at this pdlp iteration it is expected to already be initialized to some value + // Reset the fixed point error since at this pdlp iteration it is expected to already be + // initialized to some value std::fill(restart_strategy_.initial_fixed_point_error_.begin(), - restart_strategy_.initial_fixed_point_error_.end(), - f_t(0.0)); + restart_strategy_.initial_fixed_point_error_.end(), + f_t(0.0)); std::fill(restart_strategy_.fixed_point_error_.begin(), restart_strategy_.fixed_point_error_.end(), f_t(0.0)); @@ -2472,8 +2476,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co if (is_major_iteration || artificial_restart_check_main_loop || error_occured || is_conditional_major) { if (verbose) { - std::cout << "-------------------------------" << std::endl; - std::cout << internal_solver_iterations_ << std::endl; + std::cout << "-------------------------------" << std::endl; + std::cout << internal_solver_iterations_ << std::endl; raft::print_device_vector("step_size", step_size_.data(), step_size_.size(), std::cout); raft::print_device_vector( "primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); diff --git a/cpp/src/pdlp/pdlp_constants.hpp b/cpp/src/pdlp/pdlp_constants.hpp index cf17cc985b..568d7d00b0 100644 --- a/cpp/src/pdlp/pdlp_constants.hpp +++ b/cpp/src/pdlp/pdlp_constants.hpp @@ -7,8 +7,6 @@ #pragma once -#include - #include namespace cuopt::linear_programming::detail { diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 275c119d03..4763391d0e 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -904,19 +904,19 @@ optimization_problem_solution_t run_batch_pdlp( optimization_problem_t& problem, pdlp_solver_settings_t const& settings) { // Hyper parameter than can be changed, I have put what I believe to be 
the best - constexpr bool pdlp_primal_dual_init = true; - constexpr bool primal_weight_init = true; + constexpr bool pdlp_primal_dual_init = true; + constexpr bool primal_weight_init = true; constexpr bool use_initial_pdlp_iterations = true; - bool use_optimal_batch_size = false; - constexpr int batch_iteration_limit = 100000; - constexpr f_t pdlp_tolerance = 1e-5; + bool use_optimal_batch_size = false; + constexpr int batch_iteration_limit = 100000; + constexpr f_t pdlp_tolerance = 1e-5; rmm::cuda_stream_view stream = problem.get_handle_ptr()->get_stream(); rmm::device_uvector initial_primal(0, stream); rmm::device_uvector initial_dual(0, stream); - f_t initial_step_size = std::numeric_limits::signaling_NaN(); - f_t initial_primal_weight = std::numeric_limits::signaling_NaN(); + f_t initial_step_size = std::numeric_limits::signaling_NaN(); + f_t initial_primal_weight = std::numeric_limits::signaling_NaN(); i_t initial_pdlp_iteration = -1; cuopt_assert(settings.new_bounds.size() > 0, "Batch size should be greater than 0"); @@ -927,63 +927,61 @@ optimization_problem_solution_t run_batch_pdlp( const double memory_estimate = batch_pdlp_memory_estimator(problem, max_batch_size); size_t st_free_mem, st_total_mem; RAFT_CUDA_TRY(cudaMemGetInfo(&st_free_mem, &st_total_mem)); - const double free_mem = static_cast(st_free_mem); + const double free_mem = static_cast(st_free_mem); const double total_mem = static_cast(st_total_mem); - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE std::cout << "Memory estimate: " << memory_estimate << std::endl; std::cout << "Free memory: " << free_mem << std::endl; std::cout << "Total memory: " << total_mem << std::endl; - #endif +#endif if (memory_estimate > free_mem) { use_optimal_batch_size = true; // Decrement batch size iteratively until we find a batch size that fits while (memory_max_batch_size > 1) { - const double memory_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size); + const double memory_estimate = 
batch_pdlp_memory_estimator(problem, memory_max_batch_size); if (memory_estimate <= free_mem) { break; } - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE std::cout << "Memory estimate: " << memory_estimate << std::endl; std::cout << "Memory max batch size: " << memory_max_batch_size << std::endl; std::cout << "Free memory: " << free_mem << std::endl; std::cout << "Total memory: " << total_mem << std::endl; std::cout << "--------------------------------" << std::endl; - #endif +#endif memory_max_batch_size--; } - const double min_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size); + const double min_estimate = batch_pdlp_memory_estimator(problem, memory_max_batch_size); if (min_estimate > free_mem) { - return optimization_problem_solution_t( - pdlp_termination_status_t::NumericalError, stream); + return optimization_problem_solution_t(pdlp_termination_status_t::NumericalError, + stream); } } size_t optimal_batch_size = use_optimal_batch_size - ? detail::optimal_batch_size_handler(problem, memory_max_batch_size) - : max_batch_size; - if (settings.sub_batch_size > 0) { - optimal_batch_size = settings.sub_batch_size; - } + ? 
detail::optimal_batch_size_handler(problem, memory_max_batch_size) + : max_batch_size; + if (settings.sub_batch_size > 0) { optimal_batch_size = settings.sub_batch_size; } cuopt_assert(optimal_batch_size != 0 && optimal_batch_size <= max_batch_size, "Optimal batch size should be between 1 and max batch size"); - const bool warm_start_from_settings = - settings.has_initial_primal_solution() || settings.has_initial_dual_solution() || - settings.get_initial_step_size().has_value() || - settings.get_initial_primal_weight().has_value() || - settings.get_initial_pdlp_iteration().has_value(); + const bool warm_start_from_settings = settings.has_initial_primal_solution() || + settings.has_initial_dual_solution() || + settings.get_initial_step_size().has_value() || + settings.get_initial_primal_weight().has_value() || + settings.get_initial_pdlp_iteration().has_value(); if (warm_start_from_settings) { - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE std::cout << "Using warm start from settings" << std::endl; - #endif +#endif if (settings.has_initial_primal_solution() && pdlp_primal_dual_init) { - initial_primal = rmm::device_uvector(settings.get_initial_primal_solution(), settings.get_initial_primal_solution().stream()); + initial_primal = rmm::device_uvector(settings.get_initial_primal_solution(), + settings.get_initial_primal_solution().stream()); } if (settings.has_initial_dual_solution() && pdlp_primal_dual_init) { - initial_dual = rmm::device_uvector(settings.get_initial_dual_solution(), settings.get_initial_dual_solution().stream()); + initial_dual = rmm::device_uvector(settings.get_initial_dual_solution(), + settings.get_initial_dual_solution().stream()); } if (settings.get_initial_step_size().has_value() && pdlp_primal_dual_init) { initial_step_size = *settings.get_initial_step_size(); @@ -998,30 +996,33 @@ optimization_problem_solution_t run_batch_pdlp( // Only used in tests const bool collect_solutions = settings.generate_batch_primal_dual_solution; - - 
rmm::device_uvector full_primal_solution((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); - rmm::device_uvector full_dual_solution((collect_solutions) ? problem.get_n_constraints() * max_batch_size : 0, stream); - rmm::device_uvector full_reduced_cost((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); + + rmm::device_uvector full_primal_solution( + (collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); + rmm::device_uvector full_dual_solution( + (collect_solutions) ? problem.get_n_constraints() * max_batch_size : 0, stream); + rmm::device_uvector full_reduced_cost( + (collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); std::vector< typename optimization_problem_solution_t::additional_termination_information_t> full_info; std::vector full_status; - pdlp_solver_settings_t batch_settings = settings; - const auto original_new_bounds = batch_settings.new_bounds; - batch_settings.method = cuopt::linear_programming::method_t::PDLP; - batch_settings.presolver = presolver_t::None; - batch_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - batch_settings.detect_infeasibility = false; - batch_settings.iteration_limit = batch_iteration_limit; - batch_settings.inside_mip = true; - batch_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; - batch_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + pdlp_solver_settings_t batch_settings = settings; + const auto original_new_bounds = batch_settings.new_bounds; + batch_settings.method = cuopt::linear_programming::method_t::PDLP; + batch_settings.presolver = presolver_t::None; + batch_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + batch_settings.detect_infeasibility = false; + batch_settings.iteration_limit = batch_iteration_limit; + batch_settings.inside_mip = true; + batch_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_dual_tolerance 
= pdlp_tolerance; batch_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; batch_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; - batch_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; - batch_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; + batch_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; if (initial_primal.size() > 0) { batch_settings.set_initial_primal_solution( initial_primal.data(), initial_primal.size(), initial_primal.stream()); @@ -1030,9 +1031,7 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.set_initial_dual_solution( initial_dual.data(), initial_dual.size(), initial_dual.stream()); } - if (!std::isnan(initial_step_size)) { - batch_settings.set_initial_step_size(initial_step_size); - } + if (!std::isnan(initial_step_size)) { batch_settings.set_initial_step_size(initial_step_size); } if (initial_pdlp_iteration != -1) { batch_settings.set_initial_pdlp_iteration(initial_pdlp_iteration); } @@ -1047,26 +1046,24 @@ optimization_problem_solution_t run_batch_pdlp( original_new_bounds.begin() + i, original_new_bounds.begin() + i + current_batch_size); if (settings.shared_sb_view.is_valid()) { - batch_settings.shared_sb_view = - settings.shared_sb_view.subview(i, current_batch_size); + batch_settings.shared_sb_view = settings.shared_sb_view.subview(i, current_batch_size); } auto sol = solve_lp(problem, batch_settings); - if (collect_solutions) { raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), - sol.get_primal_solution().data(), - sol.get_primal_solution().size(), - stream); + sol.get_primal_solution().data(), + sol.get_primal_solution().size(), + stream); raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), - sol.get_dual_solution().data(), - sol.get_dual_solution().size(), - stream); + sol.get_dual_solution().data(), + sol.get_dual_solution().size(), + stream); 
raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), - sol.get_reduced_cost().data(), - sol.get_reduced_cost().size(), - stream); + sol.get_reduced_cost().data(), + sol.get_reduced_cost().size(), + stream); } auto info = sol.get_additional_termination_informations(); full_info.insert(full_info.end(), info.begin(), info.end()); diff --git a/cpp/src/pdlp/solver_settings.cu b/cpp/src/pdlp/solver_settings.cu index 30d5ccaea5..ac2564bb16 100644 --- a/cpp/src/pdlp/solver_settings.cu +++ b/cpp/src/pdlp/solver_settings.cu @@ -61,12 +61,30 @@ void pdlp_solver_settings_t::set_initial_dual_solution(const f_t* init template void pdlp_solver_settings_t::set_initial_step_size(f_t initial_step_size) { + cuopt_expects(initial_step_size > f_t(0), + error_type_t::ValidationError, + "Initial step size must be greater than 0"); + cuopt_expects(!std::isinf(initial_step_size), + error_type_t::ValidationError, + "Initial step size must be finite"); + cuopt_expects(!std::isnan(initial_step_size), + error_type_t::ValidationError, + "Initial step size must be a number"); initial_step_size_ = std::make_optional(initial_step_size); } template void pdlp_solver_settings_t::set_initial_primal_weight(f_t initial_primal_weight) { + cuopt_expects(initial_primal_weight > f_t(0), + error_type_t::ValidationError, + "Initial primal weight must be greater than 0"); + cuopt_expects(!std::isinf(initial_primal_weight), + error_type_t::ValidationError, + "Initial primal weight must be finite"); + cuopt_expects(!std::isnan(initial_primal_weight), + error_type_t::ValidationError, + "Initial primal weight must be a number"); initial_primal_weight_ = std::make_optional(initial_primal_weight); } @@ -351,6 +369,9 @@ std::optional pdlp_solver_settings_t::get_initial_primal_weight() template void pdlp_solver_settings_t::set_initial_pdlp_iteration(i_t initial_pdlp_iteration) { + cuopt_expects(initial_pdlp_iteration >= 0, + error_type_t::ValidationError, + "Initial pdlp iteration must be greater than 
or equal to 0"); initial_pdlp_iteration_ = std::make_optional(initial_pdlp_iteration); } diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.cu b/cpp/src/pdlp/termination_strategy/termination_strategy.cu index 563850dc0c..167cf33e73 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.cu +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.cu @@ -125,9 +125,10 @@ pdlp_termination_status_t pdlp_termination_strategy_t::get_termination } template -void pdlp_termination_strategy_t::set_termination_status( - i_t id, pdlp_termination_status_t status) +void pdlp_termination_strategy_t::set_termination_status(i_t id, + pdlp_termination_status_t status) { + cuopt_assert(id < termination_status_.size(), "id too big for batch size"); termination_status_[id] = (i_t)status; } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 4ec5bff8c1..0df3861b5a 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -8,6 +8,7 @@ #include #include +#include #include diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index be91e96015..ef43b1a591 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -1680,11 +1680,11 @@ TEST(pdlp_class, strong_branching_test) const std::vector fractional = {1, 2, 4}; const std::vector root_soln_x = {0.891, 0.109, 0.636429}; - auto solver_settings = pdlp_solver_settings_t{}; - solver_settings.method = cuopt::linear_programming::method_t::PDLP; - solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - solver_settings.presolver = cuopt::linear_programming::presolver_t::None; - solver_settings.generate_batch_primal_dual_solution = true; + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = 
pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.generate_batch_primal_dual_solution = true; const int n_fractional = fractional.size(); const int batch_size = n_fractional * 2; @@ -2170,11 +2170,11 @@ TEST(pdlp_class, shared_sb_view_subbatch) const int n_fractional = fractional.size(); const int batch_size = n_fractional * 2; - auto solver_settings = pdlp_solver_settings_t{}; - solver_settings.method = cuopt::linear_programming::method_t::PDLP; - solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - solver_settings.presolver = cuopt::linear_programming::presolver_t::None; - solver_settings.sub_batch_size = 2; + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.sub_batch_size = 2; shared_strong_branching_context_t ctx(batch_size); @@ -2227,9 +2227,7 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) solver_settings.iteration_limit = 1000000; for (int i = 0; i < n_fractional; ++i) - solver_settings.new_bounds.push_back({fractional[0], - -5, - -5}); + solver_settings.new_bounds.push_back({fractional[0], -5, -5}); for (int i = 0; i < n_fractional; ++i) solver_settings.new_bounds.push_back({fractional[i], @@ -2266,7 +2264,9 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) // Each entry should be either Optimal (PDLP solved it first) or ConcurrentLimit (DS marked it) EXPECT_TRUE(status == pdlp_termination_status_t::Optimal || status == pdlp_termination_status_t::ConcurrentLimit) - << "Entry " << i << " has unexpected status " << cuopt::linear_programming::optimization_problem_solution_t::get_termination_status_string(status); + << "Entry " << i << " has unexpected status " + << cuopt::linear_programming::optimization_problem_solution_t:: + 
get_termination_status_string(status); } // All entries should end up marked solved @@ -2298,9 +2298,7 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) solver_settings.iteration_limit = 1000000; for (int i = 0; i < n_fractional; ++i) - solver_settings.new_bounds.push_back({fractional[0], - -5, - -5}); + solver_settings.new_bounds.push_back({fractional[0], -5, -5}); shared_strong_branching_context_t ctx(batch_size); @@ -2331,7 +2329,9 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) auto status = solution.get_termination_status(i); // Each entry should be either Optimal (PDLP solved it first) or ConcurrentLimit (DS marked it) EXPECT_TRUE(status == pdlp_termination_status_t::ConcurrentLimit) - << "Entry " << i << " has unexpected status " << cuopt::linear_programming::optimization_problem_solution_t::get_termination_status_string(status); + << "Entry " << i << " has unexpected status " + << cuopt::linear_programming::optimization_problem_solution_t:: + get_termination_status_string(status); } // All entries should end up marked solved diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 32cf860f28..ddc38539f5 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -451,13 +451,13 @@ class SolverConfig(BaseModel): "heuristics and branch and bound for MILP", ) mip_batch_pdlp_strong_branching: Optional[int] = Field( - default=0, + default=1, description="Strong branching mode: 0 = Dual Simplex only, " "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", ) mip_batch_pdlp_reliability_branching: Optional[int] = Field( - default=0, + default=1, description="Reliability branching mode: 0 = Dual Simplex only, " "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", From 
962c2eabff9704b1de9c85cf144ad3e8d39a0547 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 13:34:10 +0200 Subject: [PATCH 36/43] fix: disable batch pdlp if deterministic mode --- cpp/src/branch_and_bound/pseudo_costs.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 0dbc4764f5..6287150e86 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -442,7 +442,14 @@ void strong_branching(const lp_problem_t& original_lp, const f_t elapsed_time = toc(start_time); if (elapsed_time > settings.time_limit) { return; } - const i_t effective_batch_pdlp = settings.sub_mip ? 0 : settings.mip_batch_pdlp_strong_branching; + const i_t effective_batch_pdlp = + (settings.sub_mip || settings.deterministic) ? 0 : settings.mip_batch_pdlp_strong_branching; + + if (settings.mip_batch_pdlp_strong_branching != 0 && + (settings.sub_mip || settings.deterministic)) { + settings.log.printf( + "Batch PDLP strong branching is disabled because sub-MIP or deterministic mode is enabled\n"); + } settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", settings.num_threads, @@ -991,13 +998,19 @@ i_t pseudo_costs_t::reliable_variable_selection( // using batch PDLP constexpr i_t min_num_candidates_for_pdlp = 5; constexpr f_t min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; - const bool use_pdlp = (rb_mode != 0) && pdlp_warm_cache.populated && + const bool use_pdlp = (rb_mode != 0) && !settings.sub_mip && !settings.deterministic && + pdlp_warm_cache.populated && unreliable_list.size() > min_num_candidates_for_pdlp && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; if (rb_mode != 0 && !pdlp_warm_cache.populated) { log.printf("PDLP warm start data not populated, using DS only\n"); + } else if (rb_mode != 0 && 
settings.sub_mip) { + log.printf("Batch PDLP reliability branching is disabled because sub-MIP is enabled\n"); + } else if (rb_mode != 0 && settings.deterministic) { + log.printf( + "Batch PDLP reliability branching is disabled because deterministic mode is enabled\n"); } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { log.printf("Not enough candidates to use batch PDLP, using DS only\n"); } else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { From 496c4fd91bc355101588aded0bf56e6a96ace4bb Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 13:40:20 +0200 Subject: [PATCH 37/43] fix: add size assertion to shared strong branching context --- .../shared_strong_branching_context.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp index 6840ccbb77..60982d9344 100644 --- a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp +++ b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp @@ -34,12 +34,21 @@ struct shared_strong_branching_context_view_t { bool is_valid() const { return !solved.empty(); } - bool is_solved(i_t local_idx) const { return solved[local_idx].load() != 0; } + bool is_solved(i_t local_idx) const + { + assert(local_idx < solved.size() && "local_idx out of bounds"); + return solved[local_idx].load() != 0; + } - void mark_solved(i_t local_idx) const { solved[local_idx].store(1); } + void mark_solved(i_t local_idx) const + { + assert(local_idx < solved.size() && "local_idx out of bounds"); + solved[local_idx].store(1); + } shared_strong_branching_context_view_t subview(i_t offset, i_t count) const { + assert(offset + count <= solved.size() && "subview out of bounds"); return {solved.subspan(offset, count)}; } }; From 9ec3f40576e942bd58e20dd49ebff84c189c400e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: 
Mon, 30 Mar 2026 14:34:44 +0200 Subject: [PATCH 38/43] cleanup names --- cpp/src/branch_and_bound/pseudo_costs.cpp | 79 ++++++++++++----------- cpp/src/branch_and_bound/pseudo_costs.hpp | 2 +- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 6287150e86..a04dd6a1f5 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -27,7 +27,7 @@ namespace cuopt::linear_programming::dual_simplex { namespace { -static bool ds_is_valid_done(dual::status_t status) +static bool is_dual_simplex_done(dual::status_t status) { return status == dual::status_t::DUAL_UNBOUNDED || status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF; @@ -158,8 +158,8 @@ void strong_branch_helper(i_t start, // Mark the subproblem as solved so that batch PDLP removes it from the batch if (sb_view.is_valid()) { // We could not mark as solved nodes hitting iteartion limit in DS - if ((branch == 0 && ds_is_valid_done(ds_status_down[k])) || - (branch == 1 && ds_is_valid_done(ds_status_up[k]))) { + if ((branch == 0 && is_dual_simplex_done(ds_status_down[k])) || + (branch == 1 && is_dual_simplex_done(ds_status_up[k]))) { sb_view.mark_solved(shared_idx); settings.log.printf( "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in " @@ -381,6 +381,8 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data return mps_model; } +enum class sb_source_t { DUAL_SIMPLEX, PDLP, NONE }; + // Merge a single strong branching result from Dual Simplex and PDLP. // Rules: // 1. If both found optimal -> keep DS (higher quality vertex solution) @@ -388,35 +390,40 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data // 3. Else if one is optimal -> keep the optimal one // 4. Else if Dual Simplex hit iteration limit -> keep DS // 5. 
Else if none converged -> NaN (original objective) -// Return {value, source} where source is 0 if Dual Simplex, 1 if PDLP, 2 if both template -static std::pair merge_sb_result(f_t ds_val, - dual::status_t ds_status, - f_t pdlp_dual_obj, - bool pdlp_optimal) +static std::pair merge_sb_result(f_t ds_val, + dual::status_t ds_status, + f_t pdlp_dual_obj, + bool pdlp_optimal) { // Dual simplex always maintains dual feasibility, so OPTIMAL and ITERATION_LIMIT both qualify // Rule 1: Both optimal -> keep DS - if (ds_status == dual::status_t::OPTIMAL && pdlp_optimal) { return {ds_val, 0}; } + if (ds_status == dual::status_t::OPTIMAL && pdlp_optimal) { + return {ds_val, sb_source_t::DUAL_SIMPLEX}; + } // Rule 2: Dual Simplex found infeasible -> declare infeasible if (ds_status == dual::status_t::DUAL_UNBOUNDED) { - return {std::numeric_limits::infinity(), 0}; + return {std::numeric_limits::infinity(), sb_source_t::DUAL_SIMPLEX}; } // Rule 3: Only one converged -> keep that - if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { return {ds_val, 0}; } - if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { return {pdlp_dual_obj, 1}; } + if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { + return {ds_val, sb_source_t::DUAL_SIMPLEX}; + } + if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { + return {pdlp_dual_obj, sb_source_t::PDLP}; + } // Rule 4: Dual Simplex hit iteration limit or work limit or cutoff -> keep DS if (ds_status == dual::status_t::ITERATION_LIMIT || ds_status == dual::status_t::WORK_LIMIT || ds_status == dual::status_t::CUTOFF) { - return {ds_val, 0}; + return {ds_val, sb_source_t::DUAL_SIMPLEX}; } // Rule 5: None converged -> NaN - return {std::numeric_limits::quiet_NaN(), 2}; + return {std::numeric_limits::quiet_NaN(), sb_source_t::NONE}; } template @@ -766,9 +773,9 @@ void strong_branching(const lp_problem_t& original_lp, const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], 
pdlp_obj_down[k], pdlp_has_down); pc.strong_branch_down[k] = value_down; - if (source_down == 0) + if (source_down == sb_source_t::DUAL_SIMPLEX) merged_from_ds++; - else if (source_down == 1) + else if (source_down == sb_source_t::PDLP) merged_from_pdlp++; else merged_nan++; @@ -779,7 +786,7 @@ void strong_branching(const lp_problem_t& original_lp, fractional[k], ds_obj_down[k], pdlp_obj_down[k], - source_down == 0 ? "DS" : "PDLP"); + source_down == sb_source_t::DUAL_SIMPLEX ? "DS" : "PDLP"); } bool ds_has_up = ds_status_up[k] != dual::status_t::UNSET; @@ -787,9 +794,9 @@ void strong_branching(const lp_problem_t& original_lp, const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], pdlp_has_up); pc.strong_branch_up[k] = value_up; - if (source_up == 0) + if (source_up == sb_source_t::DUAL_SIMPLEX) merged_from_ds++; - else if (source_up == 1) + else if (source_up == sb_source_t::PDLP) merged_from_pdlp++; else merged_nan++; @@ -800,16 +807,16 @@ void strong_branching(const lp_problem_t& original_lp, fractional[k], ds_obj_up[k], pdlp_obj_up[k], - source_up == 0 ? "DS" : "PDLP"); + source_up == sb_source_t::DUAL_SIMPLEX ? "DS" : "PDLP"); } } if (effective_batch_pdlp != 0) { - pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root = + pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root = (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; settings.log.printf( - "Batch PDLP only for strong branching. Pourcent solved by batch PDLP at root: %f\n", - pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root); + "Batch PDLP only for strong branching. 
percent solved by batch PDLP at root: %f\n", + pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root); settings.log.printf( "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both " "(down/up)\n", @@ -996,13 +1003,13 @@ i_t pseudo_costs_t::reliable_variable_selection( // It is also off if the number of candidate is very small // If warm start could run but almost none of the BPDLP results were used, we also want to avoid // using batch PDLP - constexpr i_t min_num_candidates_for_pdlp = 5; - constexpr f_t min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; + constexpr i_t min_num_candidates_for_pdlp = 5; + constexpr f_t min_percent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; const bool use_pdlp = (rb_mode != 0) && !settings.sub_mip && !settings.deterministic && pdlp_warm_cache.populated && unreliable_list.size() > min_num_candidates_for_pdlp && - pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > - min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; + pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root > + min_percent_solved_by_batch_pdlp_at_root_for_pdlp; if (rb_mode != 0 && !pdlp_warm_cache.populated) { log.printf("PDLP warm start data not populated, using DS only\n"); @@ -1013,16 +1020,16 @@ i_t pseudo_costs_t::reliable_variable_selection( "Batch PDLP reliability branching is disabled because deterministic mode is enabled\n"); } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { log.printf("Not enough candidates to use batch PDLP, using DS only\n"); - } else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { - log.printf("Pourcent solved by batch PDLP at root is too low, using DS only\n"); + } else if (rb_mode != 0 && pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root < 5.0) { + log.printf("Percent solved by batch PDLP at root is too low, using DS only\n"); } else if (use_pdlp) { log.printf( - "Using batch PDLP because populated, unreliable list size is %d 
(> %d), and pourcent solved " + "Using batch PDLP because populated, unreliable list size is %d (> %d), and percent solved " "by batch PDLP at root is %f%% (> %f%%)\n", static_cast(unreliable_list.size()), min_num_candidates_for_pdlp, - pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root, - min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp); + pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root, + min_percent_solved_by_batch_pdlp_at_root_for_pdlp); } const int num_tasks = std::max(max_num_tasks, 1); @@ -1216,7 +1223,7 @@ i_t pseudo_costs_t::reliable_variable_selection( pseudo_cost_sum_down[j] += change_in_obj / change_in_x; pseudo_cost_num_down[j]++; // Should be valid if were are already here - if (rb_mode == 1 && ds_is_valid_done(status)) { sb_view.mark_solved(i); } + if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(i); } } } pseudo_cost_mutex_down[j].unlock(); @@ -1259,7 +1266,7 @@ i_t pseudo_costs_t::reliable_variable_selection( pseudo_cost_sum_up[j] += change_in_obj / change_in_x; pseudo_cost_num_up[j]++; // Should be valid if were are already here - if (rb_mode == 1 && ds_is_valid_done(status)) { sb_view.mark_solved(shared_idx); } + if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(shared_idx); } } } pseudo_cost_mutex_up[j].unlock(); @@ -1314,7 +1321,7 @@ i_t pseudo_costs_t::reliable_variable_selection( merge_sb_result(ds_obj_down[i], ds_status_down[i], pdlp_obj_down[i], true); // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent // calls may have made it reliable) - if (source == 1) { + if (source == sb_source_t::PDLP) { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); @@ -1334,7 +1341,7 @@ i_t pseudo_costs_t::reliable_variable_selection( merge_sb_result(ds_obj_up[i], ds_status_up[i], pdlp_obj_up[i], true); // PDLP won the merge, update the pseudo-cost only if node 
is still unreliable (concurrent // calls may have made it reliable) - if (source == 1) { + if (source == sb_source_t::PDLP) { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index be8f9f71d4..322daa8907 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -416,7 +416,7 @@ struct batch_pdlp_warm_cache_t { f_t step_size{std::numeric_limits::signaling_NaN()}; f_t primal_weight{std::numeric_limits::signaling_NaN()}; i_t pdlp_iteration{-1}; - f_t pourcent_solved_by_batch_pdlp_at_root{f_t(0.0)}; + f_t percent_solved_by_batch_pdlp_at_root{f_t(0.0)}; bool populated{false}; }; From 16e4e5fbec08770973c8cb24122e6b81be30053f Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 18:00:00 +0200 Subject: [PATCH 39/43] multiple fixes: use span only in solver, use tasks to launch bpdlp rather than a thread, put both bpdlp calls in functions --- .../pdlp/solver_settings.hpp | 7 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 623 ++++++++++-------- .../shared_strong_branching_context.hpp | 10 +- cpp/src/pdlp/pdlp.cu | 16 +- cpp/src/pdlp/pdlp.cuh | 3 + cpp/src/pdlp/solve.cu | 4 +- cpp/tests/linear_programming/pdlp_test.cu | 49 +- 7 files changed, 381 insertions(+), 331 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index ded180fdf3..6abefb2d5d 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -17,8 +17,7 @@ #include #include - -#include +#include namespace cuopt::linear_programming { @@ -275,8 +274,8 @@ class pdlp_solver_settings_t { bool inside_mip{false}; // For concurrent termination std::atomic* concurrent_halt{nullptr};
- // Shared strong branching context view for cooperative DS + PDLP - dual_simplex::shared_strong_branching_context_view_t shared_sb_view; + // Shared strong branching solved flags for cooperative DS + PDLP + std::span> shared_sb_solved; static constexpr f_t minimal_absolute_tolerance = 1.0e-12; pdlp_hyper_params::pdlp_hyper_params_t hyper_params; // Holds the information of new variable lower and upper bounds for each climber in the format: diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index a04dd6a1f5..5bcd819ba5 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -157,7 +157,7 @@ void strong_branch_helper(i_t start, } // Mark the subproblem as solved so that batch PDLP removes it from the batch if (sb_view.is_valid()) { - // We could not mark as solved nodes hitting iteartion limit in DS + // We could not mark as solved nodes hitting iteration limit in DS if ((branch == 0 && is_dual_simplex_done(ds_status_down[k])) || (branch == 1 && is_dual_simplex_done(ds_status_up[k]))) { sb_view.mark_solved(shared_idx); @@ -426,6 +426,299 @@ static std::pair merge_sb_result(f_t ds_val, return {std::numeric_limits::quiet_NaN(), sb_source_t::NONE}; } +template +static void batch_pdlp_strong_branching_task( + const simplex_solver_settings_t& settings, + i_t effective_batch_pdlp, + f_t start_time, + std::atomic& concurrent_halt, + const lp_problem_t& original_lp, + const std::vector& new_slacks, + const std::vector& root_soln, + const std::vector& fractional, + f_t root_obj, + pseudo_costs_t& pc, + shared_strong_branching_context_view_t& sb_view, + std::vector& pdlp_obj_down, + std::vector& pdlp_obj_up) +{ + settings.log.printf(effective_batch_pdlp == 2 + ? 
"Batch PDLP only for strong branching\n" + : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); + + f_t start_batch = tic(); + std::vector original_root_soln_x; + + if (concurrent_halt.load() == 1) { return; } + + const auto mps_model = + simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); + + std::vector fraction_values; + + std::vector original_root_soln_y, original_root_soln_z; + // TODO put back later once Chris has this part + /*uncrush_dual_solution( + original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, + original_root_soln_z);*/ + + for (i_t k = 0; k < fractional.size(); k++) { + const i_t j = fractional[k]; + fraction_values.push_back(original_root_soln_x[j]); + } + + if (concurrent_halt.load() == 1) { return; } + + f_t batch_elapsed_time = toc(start_time); + const f_t warm_start_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (warm_start_remaining_time <= 0.0) { return; } + + assert(!pc.pdlp_warm_cache.populated && "PDLP warm cache should not be populated at this point"); + + if (!pc.pdlp_warm_cache.populated) { + pdlp_solver_settings_t ws_settings; + ws_settings.method = method_t::PDLP; + ws_settings.presolver = presolver_t::None; + ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + ws_settings.detect_infeasibility = false; + // Since the warm start will be used over and over again we want to maximize the chance of + // convergeance Batch PDLP is very compute intensive so we want to minimize the number of + // iterations + constexpr int warm_start_iteration_limit = 500000; + ws_settings.iteration_limit = warm_start_iteration_limit; + ws_settings.time_limit = warm_start_remaining_time; + constexpr f_t pdlp_tolerance = 1e-5; + ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.relative_primal_tolerance = 
pdlp_tolerance; + ws_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; + ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + ws_settings.inside_mip = true; + if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } + +#ifdef BATCH_VERBOSE_MODE + auto start_time = std::chrono::high_resolution_clock::now(); +#endif + + auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); + +#ifdef BATCH_VERBOSE_MODE + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(end_time - start_time).count(); + std::cout << "Original problem solved in " << duration << " milliseconds" + << " and iterations: " + << ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; +#endif + + if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) { + auto& cache = pc.pdlp_warm_cache; + const auto& ws_primal = ws_solution.get_primal_solution(); + const auto& ws_dual = ws_solution.get_dual_solution(); + // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm + // start + cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); + cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); + cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; + cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; + cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + cache.populated = true; + + settings.log.printf( + "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", + cache.initial_primal.size(), + cache.initial_dual.size(), + cache.step_size, + cache.primal_weight, + cache.pdlp_iteration); + } else { + settings.log.printf( + "PDLP warm start solve did not reach optimality (%s), 
skipping cache and batch PDLP\n", + ws_solution.get_termination_status_string().c_str()); + return; + } + } + + if (concurrent_halt.load() == 1) { return; } + + pdlp_solver_settings_t pdlp_settings; + if (effective_batch_pdlp == 1) { + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_solved = sb_view.solved; + } + + batch_elapsed_time = toc(start_time); + const f_t batch_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (batch_remaining_time <= 0.0) { return; } + pdlp_settings.time_limit = batch_remaining_time; + + if (pc.pdlp_warm_cache.populated) { + auto& cache = pc.pdlp_warm_cache; + pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), + cache.initial_primal.size(), + cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution( + cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache.step_size); + pdlp_settings.set_initial_primal_weight(cache.primal_weight); + pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + } + + if (concurrent_halt.load() == 1) { return; } + + const auto solutions = batch_pdlp_solve( + &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); + f_t batch_pdlp_strong_branching_time = toc(start_batch); + + // Fail safe in case the batch PDLP failed and produced no solutions + if (solutions.get_additional_termination_informations().size() != fractional.size() * 2) { + settings.log.printf("Batch PDLP failed and produced no solutions\n"); + return; + } + + // Find max iteration on how many are done accross the batch + i_t max_iterations = 0; + i_t amount_done = 0; + for (i_t k = 0; k < solutions.get_additional_termination_informations().size(); k++) { + max_iterations = std::max( + max_iterations, solutions.get_additional_termination_information(k).number_of_steps_taken); + // TODO batch mode infeasible: 
should also count as done if infeasible + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + amount_done++; + } + } + + settings.log.printf( + "Batch PDLP strong branching completed in %.2fs. Solved %d/%d with max %d iterations\n", + batch_pdlp_strong_branching_time, + amount_done, + fractional.size() * 2, + max_iterations); + + for (i_t k = 0; k < fractional.size(); k++) { + f_t obj_down = (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) + ? solutions.get_dual_objective_value(k) + : std::numeric_limits::quiet_NaN(); + + f_t obj_up = (solutions.get_termination_status(k + fractional.size()) == + pdlp_termination_status_t::Optimal) + ? solutions.get_dual_objective_value(k + fractional.size()) + : std::numeric_limits::quiet_NaN(); + + pdlp_obj_down[k] = std::max(obj_down - root_obj, f_t(0.0)); + pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); + } +} + +template +static void batch_pdlp_reliability_branching_task( + logger_t& log, + i_t rb_mode, + i_t num_candidates, + f_t start_time, + std::atomic& concurrent_halt, + const lp_problem_t& original_lp, + const std::vector& new_slacks, + const std::vector& solution, + branch_and_bound_worker_t* worker, + const std::vector& candidate_vars, + const simplex_solver_settings_t& settings, + shared_strong_branching_context_view_t& sb_view, + batch_pdlp_warm_cache_t& pdlp_warm_cache, + std::vector& pdlp_obj_down, + std::vector& pdlp_obj_up) +{ + log.printf(rb_mode == 2 ? 
"RB batch PDLP only for %d candidates\n" + : "RB cooperative batch PDLP and DS for %d candidates\n", + num_candidates); + + f_t start_batch = tic(); + + std::vector original_soln_x; + + if (concurrent_halt.load() == 1) { return; } + + auto mps_model = + simplex_problem_to_mps_data_model(original_lp, new_slacks, solution, original_soln_x); + { + const i_t n_orig = original_lp.num_cols - new_slacks.size(); + for (i_t j = 0; j < n_orig; j++) { + mps_model.variable_lower_bounds_[j] = worker->leaf_problem.lower[j]; + mps_model.variable_upper_bounds_[j] = worker->leaf_problem.upper[j]; + } + } + + std::vector fraction_values; + fraction_values.reserve(num_candidates); + for (i_t j : candidate_vars) { + fraction_values.push_back(original_soln_x[j]); + } + + if (concurrent_halt.load() == 1) { return; } + + const f_t batch_elapsed_time = toc(start_time); + const f_t batch_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (batch_remaining_time <= 0.0) { return; } + + pdlp_solver_settings_t pdlp_settings; + if (rb_mode == 1) { + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_solved = sb_view.solved; + } + pdlp_settings.time_limit = batch_remaining_time; + + if (pdlp_warm_cache.populated) { + auto& cache = pdlp_warm_cache; + pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), + cache.initial_primal.size(), + cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution( + cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache.step_size); + pdlp_settings.set_initial_primal_weight(cache.primal_weight); + pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + } + + if (concurrent_halt.load() == 1) { return; } + + const auto solutions = batch_pdlp_solve( + &pdlp_warm_cache.batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); + + f_t batch_pdlp_time = 
toc(start_batch); + + if (solutions.get_additional_termination_informations().size() != + static_cast(num_candidates) * 2) { + log.printf("RB batch PDLP failed and produced no solutions\n"); + return; + } + + i_t amount_done = 0; + for (i_t k = 0; k < num_candidates * 2; k++) { + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + amount_done++; + } + } + + log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", + batch_pdlp_time, + amount_done, + num_candidates * 2); + + for (i_t k = 0; k < num_candidates; k++) { + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + pdlp_obj_down[k] = solutions.get_dual_objective_value(k); + } + if (solutions.get_termination_status(k + num_candidates) == + pdlp_termination_status_t::Optimal) { + pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_candidates); + } + } +} + template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, @@ -450,7 +743,9 @@ void strong_branching(const lp_problem_t& original_lp, if (elapsed_time > settings.time_limit) { return; } const i_t effective_batch_pdlp = - (settings.sub_mip || settings.deterministic) ? 0 : settings.mip_batch_pdlp_strong_branching; + (settings.sub_mip || (settings.deterministic && settings.mip_batch_pdlp_strong_branching == 1)) + ? 
0 + : settings.mip_batch_pdlp_strong_branching; if (settings.mip_batch_pdlp_strong_branching != 0 && (settings.sub_mip || settings.deterministic)) { @@ -464,200 +759,29 @@ void strong_branching(const lp_problem_t& original_lp, // Cooperative DS + PDLP: shared context tracks which subproblems are solved shared_strong_branching_context_t shared_ctx(2 * fractional.size()); - shared_strong_branching_context_view_t sb_view(std::span(shared_ctx.solved)); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); std::atomic concurrent_halt{0}; std::vector pdlp_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); std::vector pdlp_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); - auto pdlp_thread = std::thread([&]() { - if (effective_batch_pdlp == 0) return; - - settings.log.printf(effective_batch_pdlp == 2 - ? "Batch PDLP only for strong branching\n" - : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); - - f_t start_batch = tic(); - std::vector original_root_soln_x; - - if (concurrent_halt.load() == 1) { return; } - - const auto mps_model = - simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); - - std::vector fraction_values; - - std::vector original_root_soln_y, original_root_soln_z; - // TODO put back later once Chris has this part - /*uncrush_dual_solution( - original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, - original_root_soln_z);*/ - - for (i_t k = 0; k < fractional.size(); k++) { - const i_t j = fractional[k]; - fraction_values.push_back(original_root_soln_x[j]); - } - - if (concurrent_halt.load() == 1) { return; } - - f_t batch_elapsed_time = toc(start_time); - const f_t warm_start_remaining_time = - std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); - if (warm_start_remaining_time <= 0.0) { return; } - - assert(!pc.pdlp_warm_cache.populated && - "PDLP warm cache should not be populated at this point"); - - if 
(!pc.pdlp_warm_cache.populated) { - pdlp_solver_settings_t ws_settings; - ws_settings.method = method_t::PDLP; - ws_settings.presolver = presolver_t::None; - ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - ws_settings.detect_infeasibility = false; - // Since the warm start will be used over and over again we want to maximize the chance of - // convergeance Batch PDLP is very compute intensive so we want to minimize the number of - // iterations - constexpr int warm_start_iteration_limit = 500000; - ws_settings.iteration_limit = warm_start_iteration_limit; - ws_settings.time_limit = warm_start_remaining_time; - constexpr f_t pdlp_tolerance = 1e-5; - ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; - ws_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; - ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; - ws_settings.inside_mip = true; - if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } - -#ifdef BATCH_VERBOSE_MODE - auto start_time = std::chrono::high_resolution_clock::now(); -#endif - - auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); - -#ifdef BATCH_VERBOSE_MODE - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = - std::chrono::duration_cast(end_time - start_time).count(); - std::cout << "Original problem solved in " << duration << " milliseconds" - << " and iterations: " - << ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; -#endif - - if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) { - auto& cache = pc.pdlp_warm_cache; - const auto& ws_primal = ws_solution.get_primal_solution(); - const auto& ws_dual = ws_solution.get_dual_solution(); - // Need to 
use the pc steam since the batch pdlp handle will get destroyed after the warm - // start - cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); - cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); - cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; - cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; - cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; - cache.populated = true; - - settings.log.printf( - "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", - cache.initial_primal.size(), - cache.initial_dual.size(), - cache.step_size, - cache.primal_weight, - cache.pdlp_iteration); - } else { - settings.log.printf( - "PDLP warm start solve did not reach optimality (%s), skipping cache and batch PDLP\n", - ws_solution.get_termination_status_string().c_str()); - return; - } - } - - if (concurrent_halt.load() == 1) { return; } - - pdlp_solver_settings_t pdlp_settings; - if (effective_batch_pdlp == 1) { - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.shared_sb_view = sb_view; - } - - batch_elapsed_time = toc(start_time); - const f_t batch_remaining_time = - std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); - if (batch_remaining_time <= 0.0) { return; } - pdlp_settings.time_limit = batch_remaining_time; - - if (pc.pdlp_warm_cache.populated) { - auto& cache = pc.pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), - cache.initial_primal.size(), - cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_step_size(cache.step_size); - pdlp_settings.set_initial_primal_weight(cache.primal_weight); - pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); - } - - if 
(concurrent_halt.load() == 1) { return; } - - const auto solutions = batch_pdlp_solve( - &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); - f_t batch_pdlp_strong_branching_time = toc(start_batch); - - // Fail safe in case the batch PDLP failed and produced no solutions - if (solutions.get_additional_termination_informations().size() != fractional.size() * 2) { - settings.log.printf("Batch PDLP failed and produced no solutions\n"); - return; - } - - // Find max iteration on how many are done accross the batch - i_t max_iterations = 0; - i_t amount_done = 0; - for (i_t k = 0; k < solutions.get_additional_termination_informations().size(); k++) { - max_iterations = std::max( - max_iterations, solutions.get_additional_termination_information(k).number_of_steps_taken); - // TODO batch mode infeasible: should also count as done if infeasible - if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { - amount_done++; - } - } - - settings.log.printf( - "Batch PDLP strong branching completed in %.2fs. Solved %d/%d with max %d iterations\n", - batch_pdlp_strong_branching_time, - amount_done, - fractional.size() * 2, - max_iterations); - - for (i_t k = 0; k < fractional.size(); k++) { - // Call BatchLP solver. Solve 2*fractional.size() subproblems. - // Let j = fractional[k]. We want to solve the two trial branching problems - // Branch down: - // minimize c^T x - // subject to lb <= A*x <= ub - // x_j <= floor(root_soln[j]) - // l <= x < u - // Let the optimal objective value of thie problem be obj_down - f_t obj_down = (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) - ? 
solutions.get_dual_objective_value(k) - : std::numeric_limits::quiet_NaN(); - - // Branch up: - // minimize c^T x - // subject to lb <= A*x <= ub - // x_j >= ceil(root_soln[j]) - // Let the optimal objective value of thie problem be obj_up - f_t obj_up = (solutions.get_termination_status(k + fractional.size()) == - pdlp_termination_status_t::Optimal) - ? solutions.get_dual_objective_value(k + fractional.size()) - : std::numeric_limits::quiet_NaN(); - - pdlp_obj_down[k] = std::max(obj_down - root_obj, f_t(0.0)); - pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); - } - }); + if (effective_batch_pdlp != 0) { +#pragma omp task default(shared) + batch_pdlp_strong_branching_task(settings, + effective_batch_pdlp, + start_time, + concurrent_halt, + original_lp, + new_slacks, + root_soln, + fractional, + root_obj, + pc, + sb_view, + pdlp_obj_down, + pdlp_obj_up); + } std::vector ds_status_down(fractional.size(), dual::status_t::UNSET); std::vector ds_status_up(fractional.size(), dual::status_t::UNSET); @@ -712,7 +836,9 @@ void strong_branching(const lp_problem_t& original_lp, concurrent_halt.store(1); } - pdlp_thread.join(); + if (effective_batch_pdlp != 0) { +#pragma omp taskwait + } settings.log.printf("Strong branching took %.2fs\n", toc(dual_simplex_strong_branching_time)); @@ -1062,114 +1188,37 @@ i_t pseudo_costs_t::reliable_variable_selection( // Shared context for cooperative work-stealing (mode 1) // [0..num_candidates) = down, [num_candidates..2*num_candidates) = up shared_strong_branching_context_t shared_ctx(2 * num_candidates); - shared_strong_branching_context_view_t sb_view(std::span(shared_ctx.solved)); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); std::vector pdlp_obj_down(num_candidates, std::numeric_limits::quiet_NaN()); std::vector pdlp_obj_up(num_candidates, std::numeric_limits::quiet_NaN()); std::atomic concurrent_halt{0}; - std::thread pdlp_thread; if (use_pdlp) { - pdlp_thread = std::thread([&]() { - log.printf(rb_mode 
== 2 ? "RB batch PDLP only for %d candidates\n" - : "RB cooperative batch PDLP and DS for %d candidates\n", - num_candidates); - - f_t start_batch = tic(); - - std::vector original_soln_x; - - if (concurrent_halt.load() == 1) { return; } - - auto mps_model = - simplex_problem_to_mps_data_model(original_lp, new_slacks, solution, original_soln_x); - { - const i_t n_orig = original_lp.num_cols - new_slacks.size(); - for (i_t j = 0; j < n_orig; j++) { - mps_model.variable_lower_bounds_[j] = worker->leaf_problem.lower[j]; - mps_model.variable_upper_bounds_[j] = worker->leaf_problem.upper[j]; - } - } - - std::vector fraction_values; - fraction_values.reserve(num_candidates); - for (i_t j : candidate_vars) { - fraction_values.push_back(original_soln_x[j]); - } - - if (concurrent_halt.load() == 1) { return; } - - const f_t batch_elapsed_time = toc(start_time); - const f_t batch_remaining_time = - std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); - if (batch_remaining_time <= 0.0) { return; } - - pdlp_solver_settings_t pdlp_settings; - if (rb_mode == 1) { - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.shared_sb_view = sb_view; - } - pdlp_settings.time_limit = batch_remaining_time; - - if (pdlp_warm_cache.populated) { - auto& cache = pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), - cache.initial_primal.size(), - cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution(cache.initial_dual.data(), - cache.initial_dual.size(), - cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_step_size(cache.step_size); - pdlp_settings.set_initial_primal_weight(cache.primal_weight); - pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); - } - - if (concurrent_halt.load() == 1) { return; } - - const auto solutions = batch_pdlp_solve(&pdlp_warm_cache.batch_pdlp_handle, - mps_model, - candidate_vars, - fraction_values, - pdlp_settings); - - f_t batch_pdlp_time = 
toc(start_batch); - - if (solutions.get_additional_termination_informations().size() != - static_cast(num_candidates) * 2) { - log.printf("RB batch PDLP failed and produced no solutions\n"); - return; - } - - i_t amount_done = 0; - for (i_t k = 0; k < num_candidates * 2; k++) { - if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { - amount_done++; - } - } - - log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", - batch_pdlp_time, - amount_done, - num_candidates * 2); - - for (i_t k = 0; k < num_candidates; k++) { - if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { - pdlp_obj_down[k] = solutions.get_dual_objective_value(k); - } - if (solutions.get_termination_status(k + num_candidates) == - pdlp_termination_status_t::Optimal) { - pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_candidates); - } - } - }); +#pragma omp task default(shared) + batch_pdlp_reliability_branching_task(log, + rb_mode, + num_candidates, + start_time, + concurrent_halt, + original_lp, + new_slacks, + solution, + worker, + candidate_vars, + settings, + sb_view, + pdlp_warm_cache, + pdlp_obj_down, + pdlp_obj_up); } if (toc(start_time) > settings.time_limit) { log.printf("Time limit reached\n"); if (use_pdlp) { concurrent_halt.store(1); - pdlp_thread.join(); +#pragma omp taskwait } return branch_var; } @@ -1307,7 +1356,7 @@ i_t pseudo_costs_t::reliable_variable_selection( //} if (use_pdlp) { - pdlp_thread.join(); +#pragma omp taskwait i_t pdlp_applied = 0; i_t pdlp_optimal = 0; diff --git a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp index 60982d9344..a9e697ae58 100644 --- a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp +++ b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include @@ -36,19 +37,22 @@ struct shared_strong_branching_context_view_t { bool 
is_solved(i_t local_idx) const { - assert(local_idx < solved.size() && "local_idx out of bounds"); + assert(local_idx >= 0 && static_cast(local_idx) < solved.size() && + "local_idx out of bounds"); return solved[local_idx].load() != 0; } void mark_solved(i_t local_idx) const { - assert(local_idx < solved.size() && "local_idx out of bounds"); + assert(local_idx >= 0 && static_cast(local_idx) < solved.size() && + "local_idx out of bounds"); solved[local_idx].store(1); } shared_strong_branching_context_view_t subview(i_t offset, i_t count) const { - assert(offset + count <= solved.size() && "subview out of bounds"); + assert(offset >= 0 && count >= 0 && static_cast(offset + count) <= solved.size() && + "subview out of bounds"); return {solved.subspan(offset, count)}; } }; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 642c17758d..85cba335ba 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -777,7 +777,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) #endif // Sync external solved status into internal termination strategy before all_done() check - if (settings_.shared_sb_view.is_valid()) { + if (sb_view_.is_valid()) { for (size_t i = 0; i < climber_strategies_.size(); ++i) { // If PDLP has solved it to optimality we want to keep it and resolved both solvers having // solved the problem later @@ -785,7 +785,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) current_termination_strategy_.get_termination_status(i))) continue; const i_t local_idx = climber_strategies_[i].original_index; - if (settings_.shared_sb_view.is_solved(local_idx)) { + if (sb_view_.is_solved(local_idx)) { current_termination_strategy_.set_termination_status( i, pdlp_termination_status_t::ConcurrentLimit); #ifdef BATCH_VERBOSE_MODE @@ -844,9 +844,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) .get_additional_termination_informations()[climber_strategies_[i].original_index] .solved_by_pdlp = 
(current_termination_strategy_.get_termination_status(i) != pdlp_termination_status_t::ConcurrentLimit); - if (settings_.shared_sb_view.is_valid()) { - settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); - } + if (sb_view_.is_valid()) { sb_view_.mark_solved(climber_strategies_[i].original_index); } } current_termination_strategy_.fill_gpu_terms_stats(total_pdlp_iterations_); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -863,9 +861,9 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) std::move(batch_solution_to_return_.get_additional_termination_informations()), std::move(batch_solution_to_return_.get_terminations_status())}; } - if (settings_.shared_sb_view.is_valid()) { + if (sb_view_.is_valid()) { for (size_t i = 0; i < climber_strategies_.size(); ++i) { - settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); + sb_view_.mark_solved(climber_strategies_[i].original_index); } } RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -923,9 +921,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) .get_additional_termination_informations()[climber_strategies_[i].original_index] .solved_by_pdlp = (current_termination_strategy_.get_termination_status(i) != pdlp_termination_status_t::ConcurrentLimit); - if (settings_.shared_sb_view.is_valid()) { - settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); - } + if (sb_view_.is_valid()) { sb_view_.mark_solved(climber_strategies_[i].original_index); } } } if (to_remove.size() > 0) { diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index de0cf69c91..d03430f150 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -138,6 +139,8 @@ class pdlp_solver_t { rmm::cuda_stream_view stream_view_; // Intentionnaly take a copy to avoid an unintentional modification in the calling context const pdlp_solver_settings_t settings_; + 
dual_simplex::shared_strong_branching_context_view_t sb_view_{ + settings_.shared_sb_solved}; problem_t* problem_ptr; // Combined bounds in op_problem_scaled_ will only be scaled if diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 267e149029..341edb2c1f 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -1045,8 +1045,8 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.new_bounds = std::vector>( original_new_bounds.begin() + i, original_new_bounds.begin() + i + current_batch_size); - if (settings.shared_sb_view.is_valid()) { - batch_settings.shared_sb_view = settings.shared_sb_view.subview(i, current_batch_size); + if (!settings.shared_sb_solved.empty()) { + batch_settings.shared_sb_solved = settings.shared_sb_solved.subspan(i, current_batch_size); } auto sol = solve_lp(problem, batch_settings); diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index ef43b1a591..5c6edad27b 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -5,6 +5,7 @@ */ /* clang-format on */ +#include #include #include #include @@ -46,8 +47,6 @@ #include #include -#include - namespace cuopt::linear_programming::test { constexpr double afiro_primal_objective = -464.0; @@ -2057,7 +2056,7 @@ TEST(pdlp_class, shared_sb_context_unit) constexpr int N = 10; shared_strong_branching_context_t ctx(N); - shared_strong_branching_context_view_t view(std::span(ctx.solved)); + shared_strong_branching_context_view_t view(ctx.solved); EXPECT_TRUE(view.is_valid()); @@ -2127,14 +2126,14 @@ TEST(pdlp_class, shared_sb_view_batch_pre_solved) std::ceil(root_soln_x[i]), op_problem.get_variable_upper_bounds()[fractional[i]]}); - shared_strong_branching_context_t ctx(batch_size); + shared_strong_branching_context_t shared_ctx(batch_size); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); // Pre-mark entries 1 and 4 as solved (simulating DS) - 
ctx.solved[1].store(1); - ctx.solved[4].store(1); + sb_view.mark_solved(1); + sb_view.mark_solved(4); - solver_settings.shared_sb_view = - shared_strong_branching_context_view_t(std::span(ctx.solved)); + solver_settings.shared_sb_solved = sb_view.solved; auto solution = solve_lp(&handle_, op_problem, solver_settings); @@ -2152,7 +2151,7 @@ TEST(pdlp_class, shared_sb_view_batch_pre_solved) // All entries should now be marked solved in the shared context for (int i = 0; i < batch_size; ++i) { - EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + EXPECT_TRUE(sb_view.is_solved(i)) << "Entry " << i << " should be solved"; } } @@ -2176,14 +2175,14 @@ TEST(pdlp_class, shared_sb_view_subbatch) solver_settings.presolver = cuopt::linear_programming::presolver_t::None; solver_settings.sub_batch_size = 2; - shared_strong_branching_context_t ctx(batch_size); + shared_strong_branching_context_t shared_ctx(batch_size); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); // Pre-mark one entry in each sub-batch of size 2: indices 1, 4 - ctx.solved[1].store(1); - ctx.solved[4].store(1); + sb_view.mark_solved(1); + sb_view.mark_solved(4); - solver_settings.shared_sb_view = - shared_strong_branching_context_view_t(std::span(ctx.solved)); + solver_settings.shared_sb_solved = sb_view.solved; auto solution = batch_pdlp_solve(&handle_, op_problem, fractional, root_soln_x, solver_settings); @@ -2202,7 +2201,7 @@ TEST(pdlp_class, shared_sb_view_subbatch) // All should be marked solved for (int i = 0; i < batch_size; ++i) { - EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + EXPECT_TRUE(sb_view.is_solved(i)) << "Entry " << i << " should be solved"; } } @@ -2234,10 +2233,10 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) std::ceil(root_soln_x[i]), op_problem.get_variable_upper_bounds()[fractional[i]]}); - shared_strong_branching_context_t ctx(batch_size); + shared_strong_branching_context_t 
shared_ctx(batch_size); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); - solver_settings.shared_sb_view = - shared_strong_branching_context_view_t(std::span(ctx.solved)); + solver_settings.shared_sb_solved = sb_view.solved; optimization_problem_solution_t* result_ptr = nullptr; @@ -2250,7 +2249,7 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) // Wait a bit then mark entries 0, 2, 4 as solved (simulating DS) std::this_thread::sleep_for(std::chrono::milliseconds(200)); for (int i = 0; i < n_fractional; ++i) - ctx.solved[i].store(1); + sb_view.mark_solved(i); pdlp_thread.join(); @@ -2271,7 +2270,7 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) // All entries should end up marked solved for (int i = 0; i < batch_size; ++i) { - EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + EXPECT_TRUE(sb_view.is_solved(i)) << "Entry " << i << " should be solved"; } delete result_ptr; @@ -2300,10 +2299,10 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) for (int i = 0; i < n_fractional; ++i) solver_settings.new_bounds.push_back({fractional[0], -5, -5}); - shared_strong_branching_context_t ctx(batch_size); + shared_strong_branching_context_t shared_ctx(batch_size); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); - solver_settings.shared_sb_view = - shared_strong_branching_context_view_t(std::span(ctx.solved)); + solver_settings.shared_sb_solved = sb_view.solved; optimization_problem_solution_t* result_ptr = nullptr; @@ -2316,7 +2315,7 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) // Wait a bit then mark entries 0, 2, 4 as solved (simulating DS) std::this_thread::sleep_for(std::chrono::milliseconds(200)); for (int i = 0; i < n_fractional; ++i) - ctx.solved[i].store(1); + sb_view.mark_solved(i); pdlp_thread.join(); @@ -2336,7 +2335,7 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) // All entries should end up marked solved for (int i = 0; i < batch_size; ++i) { - 
EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + EXPECT_TRUE(sb_view.is_solved(i)) << "Entry " << i << " should be solved"; } delete result_ptr; From 843c53236b3b0a6f7ef2083a6eb961cb9fe133ac Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 18:25:20 +0200 Subject: [PATCH 40/43] two improvements: mark variables as solved in DS if node became reliable, use one stream per BPDLP in RB --- cpp/src/branch_and_bound/pseudo_costs.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 5bcd819ba5..9e5ff12bbd 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -664,6 +664,9 @@ static void batch_pdlp_reliability_branching_task( std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (batch_remaining_time <= 0.0) { return; } + // One handle per batch PDLP since there can be concurrent calls + const raft::handle_t batch_pdlp_handle; + pdlp_solver_settings_t pdlp_settings; if (rb_mode == 1) { pdlp_settings.concurrent_halt = &concurrent_halt; @@ -675,9 +678,9 @@ static void batch_pdlp_reliability_branching_task( auto& cache = pdlp_warm_cache; pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), cache.initial_primal.size(), - cache.batch_pdlp_handle.get_stream()); + batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + cache.initial_dual.data(), cache.initial_dual.size(), batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_step_size(cache.step_size); pdlp_settings.set_initial_primal_weight(cache.primal_weight); pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); @@ -686,7 +689,7 @@ static void batch_pdlp_reliability_branching_task( if (concurrent_halt.load() == 1) { return; } const auto 
solutions = batch_pdlp_solve( - &pdlp_warm_cache.batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); + &batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); f_t batch_pdlp_time = toc(start_batch); @@ -1274,6 +1277,9 @@ i_t pseudo_costs_t::reliable_variable_selection( // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(i); } } + } else { + // Variable became reliable, make it as solved so that batch PDLP does not solve it again + if (rb_mode == 1) sb_view.mark_solved(i); } pseudo_cost_mutex_down[j].unlock(); } @@ -1317,6 +1323,9 @@ i_t pseudo_costs_t::reliable_variable_selection( // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(shared_idx); } } + } else { + // Variable became reliable, make it as solved so that batch PDLP does not solve it again + if (rb_mode == 1) sb_view.mark_solved(shared_idx); } pseudo_cost_mutex_up[j].unlock(); } From a9fd42095cfebc23dd48dd62b1ce69367d7b7c02 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 18:52:52 +0200 Subject: [PATCH 41/43] fix: avoid early exit if solved at step 0 even when initial pdlp iteartion is given --- cpp/src/pdlp/pdlp.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 85cba335ba..33c080ee3c 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -995,7 +995,7 @@ std::optional> pdlp_solver_t // To avoid that we allow at least two iterations at first before checking (in practice 0 wasn't // enough) We still need to check iteration and time limit prior without breaking the logic below // of first checking termination before the limit - if (total_pdlp_iterations_ <= 1) { + if (internal_solver_iterations_ <= 1) { print_termination_criteria(timer); return check_limits(timer); } From edae2997fca04bc3690a4f3adcc93f9e5f795396 Mon Sep 17 00:00:00 2001 
From: Nicolas Blin Date: Mon, 30 Mar 2026 19:02:59 +0200 Subject: [PATCH 42/43] disable both by default --- cpp/include/cuopt/linear_programming/mip/solver_settings.hpp | 4 ++-- cpp/src/math_optimization/solver_settings.cu | 4 ++-- .../cuopt_server/utils/linear_programming/data_definition.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 77b18dc17e..3da9ea8f1f 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -99,9 +99,9 @@ class mip_solver_settings_t { f_t cut_change_threshold = -1.0; f_t cut_min_orthogonality = 0.5; i_t mip_batch_pdlp_strong_branching{ - 1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + 0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only i_t mip_batch_pdlp_reliability_branching{ - 1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + 0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only i_t num_gpus = 1; bool log_to_console = true; diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 52fc95a6bd..9d933f3c98 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -99,8 +99,8 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits::max(), -1}, {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, - {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 1}, - {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 1}, + {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 
&mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, + {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index ddc38539f5..32cf860f28 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -451,13 +451,13 @@ class SolverConfig(BaseModel): "heuristics and branch and bound for MILP", ) mip_batch_pdlp_strong_branching: Optional[int] = Field( - default=1, + default=0, description="Strong branching mode: 0 = Dual Simplex only, " "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", ) mip_batch_pdlp_reliability_branching: Optional[int] = Field( - default=1, + default=0, description="Reliability branching mode: 0 = Dual Simplex only, " "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", From 6e5baa5acea722cdce3231ef46ff4b56f7ef1e0a Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 19:03:14 +0200 Subject: [PATCH 43/43] styl --- cpp/src/branch_and_bound/pseudo_costs.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 9e5ff12bbd..4625b4343b 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -676,9 +676,8 
@@ static void batch_pdlp_reliability_branching_task( if (pdlp_warm_cache.populated) { auto& cache = pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), - cache.initial_primal.size(), - batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_primal_solution( + cache.initial_primal.data(), cache.initial_primal.size(), batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_dual_solution( cache.initial_dual.data(), cache.initial_dual.size(), batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_step_size(cache.step_size); @@ -688,8 +687,8 @@ static void batch_pdlp_reliability_branching_task( if (concurrent_halt.load() == 1) { return; } - const auto solutions = batch_pdlp_solve( - &batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); + const auto solutions = + batch_pdlp_solve(&batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); f_t batch_pdlp_time = toc(start_batch);