From c72f7c8e8f4ff30a095a30c371cae83c9935cdb9 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Sun, 8 Mar 2026 20:52:20 +0530 Subject: [PATCH 1/5] Wip config regex hardening --- .claude/agents/pr-creator.md | 2 + .claude/agents/pr-reviewer.md | 3 + crates/common/src/auction/mod.rs | 19 +- crates/common/src/auth.rs | 61 +++- .../common/src/integrations/adserver_mock.rs | 15 +- crates/common/src/integrations/aps.rs | 15 +- crates/common/src/integrations/datadome.rs | 38 +-- crates/common/src/integrations/didomi.rs | 38 ++- .../src/integrations/google_tag_manager.rs | 36 ++- crates/common/src/integrations/gpt.rs | 48 ++-- crates/common/src/integrations/lockr.rs | 32 ++- crates/common/src/integrations/mod.rs | 6 +- .../integrations/nextjs/html_post_process.rs | 9 +- crates/common/src/integrations/nextjs/mod.rs | 43 ++- crates/common/src/integrations/nextjs/rsc.rs | 96 ++++--- .../integrations/nextjs/script_rewriter.rs | 265 +++++++++--------- .../common/src/integrations/nextjs/shared.rs | 115 ++++---- crates/common/src/integrations/permutive.rs | 39 ++- crates/common/src/integrations/prebid.rs | 58 ++-- crates/common/src/integrations/registry.rs | 2 +- crates/common/src/integrations/testlight.rs | 45 +-- crates/common/src/settings.rs | 246 +++++++++++++++- crates/common/src/settings_data.rs | 2 + crates/fastly/src/main.rs | 13 +- 24 files changed, 849 insertions(+), 397 deletions(-) diff --git a/.claude/agents/pr-creator.md b/.claude/agents/pr-creator.md index 1ac4c337..9ac03b38 100644 --- a/.claude/agents/pr-creator.md +++ b/.claude/agents/pr-creator.md @@ -49,6 +49,7 @@ Using the `.github/pull_request_template.md` structure, draft: - **Changes table**: list each file modified and what changed. - **Closes**: `Closes #` to auto-close the linked issue. - **Test plan**: check off which verification steps were run. +- **Hardening note**: when config-derived regex or pattern compilation is touched, state how invalid enabled config fails startup and which regression tests cover that path. - **Checklist**: verify each item applies. ### 5. Create the PR @@ -172,6 +173,7 @@ Do **not** use labels as a substitute for types. - Use sentence case for the title. - Use imperative mood (e.g., "Add caching to proxy" not "Added caching"). - The summary should focus on _why_, not just _what_. +- Do not describe config-derived regex/pattern compilation as safe unless invalid enabled config is handled without `panic!`, `unwrap()`, or `expect()`. - Always base PRs against `main` unless told otherwise. - Always assign the PR to the current user (`--assignee @me`). - Never force-push or rebase without explicit user approval. diff --git a/.claude/agents/pr-reviewer.md b/.claude/agents/pr-reviewer.md index 0a596f41..f1c3b8cd 100644 --- a/.claude/agents/pr-reviewer.md +++ b/.claude/agents/pr-reviewer.md @@ -78,6 +78,8 @@ For each changed file, evaluate: - `expect("should ...")` instead of `unwrap()` in production code - `error-stack` (`Report`) with `derive_more::Display` for errors (not thiserror/anyhow) - `log` macros (not `println!`) +- Config-derived regex/pattern compilation must not use panic-prone `expect()`/`unwrap()`; invalid enabled config should surface as startup/config errors +- Invalid enabled integrations/providers must not be silently logged-and-disabled during startup or registration - `vi.hoisted()` for mock definitions in JS tests - Integration IDs match JS directory names - Colocated tests with `#[cfg(test)]` @@ -105,6 +107,7 @@ For each changed file, evaluate: - Are new code paths tested? - Are edge cases covered (empty input, max values, error paths)? +- If config-derived regex/pattern compilation changed: are invalid enabled-config startup failures and explicit `enabled = false` bypass cases both covered? - Rust tests: `cargo test --workspace` - JS tests: `npx vitest run` in `crates/js/lib/` diff --git a/crates/common/src/auction/mod.rs b/crates/common/src/auction/mod.rs index 92e4d56c..6fa9cc2d 100644 --- a/crates/common/src/auction/mod.rs +++ b/crates/common/src/auction/mod.rs @@ -7,6 +7,9 @@ //! Note: Individual auction providers are located in the `integrations` module //! (e.g., `crate::integrations::aps`, `crate::integrations::prebid`). +use error_stack::Report; + +use crate::error::TrustedServerError; use crate::settings::Settings; use std::sync::Arc; @@ -27,7 +30,8 @@ pub use types::{ }; /// Type alias for provider builder functions. -type ProviderBuilder = fn(&Settings) -> Vec>; +type ProviderBuilder = + fn(&Settings) -> Result>, Report>; /// Returns the list of all available provider builder functions. /// @@ -49,15 +53,20 @@ fn provider_builders() -> &'static [ProviderBuilder] { /// /// # Arguments /// * `settings` - Application settings used to configure the orchestrator and providers -#[must_use] -pub fn build_orchestrator(settings: &Settings) -> AuctionOrchestrator { +/// +/// # Errors +/// +/// Returns an error when an enabled auction provider has invalid configuration. +pub fn build_orchestrator( + settings: &Settings, +) -> Result> { log::info!("Building auction orchestrator"); let mut orchestrator = AuctionOrchestrator::new(settings.auction.clone()); // Auto-discover and register all auction providers from settings for builder in provider_builders() { - for provider in builder(settings) { + for provider in builder(settings)? { orchestrator.register_provider(provider); } } @@ -67,5 +76,5 @@ pub fn build_orchestrator(settings: &Settings) -> AuctionOrchestrator { orchestrator.provider_count() ); - orchestrator + Ok(orchestrator) } diff --git a/crates/common/src/auth.rs b/crates/common/src/auth.rs index ba669e6a..20d2c412 100644 --- a/crates/common/src/auth.rs +++ b/crates/common/src/auth.rs @@ -1,23 +1,40 @@ use base64::{engine::general_purpose::STANDARD, Engine as _}; +use error_stack::Report; use fastly::http::{header, StatusCode}; use fastly::{Request, Response}; +use crate::error::TrustedServerError; use crate::settings::Settings; const BASIC_AUTH_REALM: &str = r#"Basic realm="Trusted Server""#; -pub fn enforce_basic_auth(settings: &Settings, req: &Request) -> Option { - let handler = settings.handler_for_path(req.get_path())?; +/// Enforce HTTP basic auth for the matched handler, if any. +/// +/// Returns `Ok(None)` when the request does not target a protected handler or +/// when the supplied credentials are valid. Returns `Ok(Some(Response))` with +/// the auth challenge when credentials are missing or invalid. +/// +/// # Errors +/// +/// Returns an error when handler configuration is invalid, such as an +/// un-compilable path regex. +pub fn enforce_basic_auth( + settings: &Settings, + req: &Request, +) -> Result, Report> { + let Some(handler) = settings.handler_for_path(req.get_path())? else { + return Ok(None); + }; let (username, password) = match extract_credentials(req) { Some(credentials) => credentials, - None => return Some(unauthorized_response()), + None => return Ok(Some(unauthorized_response())), }; if username == handler.username && password == handler.password { - None + Ok(None) } else { - Some(unauthorized_response()) + Ok(Some(unauthorized_response())) } } @@ -72,7 +89,9 @@ mod tests { let settings = settings_with_handlers(); let req = Request::new(Method::GET, "https://example.com/open"); - assert!(enforce_basic_auth(&settings, &req).is_none()); + assert!(enforce_basic_auth(&settings, &req) + .expect("should evaluate auth") + .is_none()); } #[test] @@ -80,7 +99,9 @@ mod tests { let settings = settings_with_handlers(); let req = Request::new(Method::GET, "https://example.com/secure"); - let response = enforce_basic_auth(&settings, &req).expect("should challenge"); + let response = enforce_basic_auth(&settings, &req) + .expect("should evaluate auth") + .expect("should challenge"); assert_eq!(response.get_status(), StatusCode::UNAUTHORIZED); let realm = response .get_header(header::WWW_AUTHENTICATE) @@ -95,7 +116,9 @@ mod tests { let token = STANDARD.encode("user:pass"); req.set_header(header::AUTHORIZATION, format!("Basic {token}")); - assert!(enforce_basic_auth(&settings, &req).is_none()); + assert!(enforce_basic_auth(&settings, &req) + .expect("should evaluate auth") + .is_none()); } #[test] @@ -105,7 +128,9 @@ mod tests { let token = STANDARD.encode("user:wrong"); req.set_header(header::AUTHORIZATION, format!("Basic {token}")); - let response = enforce_basic_auth(&settings, &req).expect("should challenge"); + let response = enforce_basic_auth(&settings, &req) + .expect("should evaluate auth") + .expect("should challenge"); assert_eq!(response.get_status(), StatusCode::UNAUTHORIZED); } @@ -115,7 +140,23 @@ mod tests { let mut req = Request::new(Method::GET, "https://example.com/secure"); req.set_header(header::AUTHORIZATION, "Bearer token"); - let response = enforce_basic_auth(&settings, &req).expect("should challenge"); + let response = enforce_basic_auth(&settings, &req) + .expect("should evaluate auth") + .expect("should challenge"); assert_eq!(response.get_status(), StatusCode::UNAUTHORIZED); } + + #[test] + fn returns_error_for_invalid_handler_regex_without_panicking() { + let config = crate_test_settings_str().replace(r#"path = "^/secure""#, r#"path = "(""#); + let settings = Settings::from_toml(&config).expect("should parse invalid regex TOML"); + let req = Request::new(Method::GET, "https://example.com/secure"); + + let err = enforce_basic_auth(&settings, &req).expect_err("should return config error"); + assert!( + err.to_string() + .contains("Handler path regex `(` failed to compile"), + "should describe the invalid handler regex" + ); + } } diff --git a/crates/common/src/integrations/adserver_mock.rs b/crates/common/src/integrations/adserver_mock.rs index 0d5422e6..1658bd44 100644 --- a/crates/common/src/integrations/adserver_mock.rs +++ b/crates/common/src/integrations/adserver_mock.rs @@ -34,6 +34,7 @@ pub struct AdServerMockConfig { pub enabled: bool, /// Mediation endpoint URL + #[validate(url)] pub endpoint: String, /// Timeout in milliseconds @@ -379,8 +380,14 @@ impl AuctionProvider for AdServerMockProvider { // ============================================================================ /// Auto-register ad server mock provider based on settings configuration. -#[must_use] -pub fn register_providers(settings: &Settings) -> Vec> { +/// +/// # Errors +/// +/// Returns an error when the ad server mock provider is enabled with invalid +/// configuration. +pub fn register_providers( + settings: &Settings, +) -> Result>, Report> { let mut providers: Vec> = Vec::new(); match settings.integration_config::("adserver_mock") { @@ -395,11 +402,11 @@ pub fn register_providers(settings: &Settings) -> Vec> log::debug!("AdServer Mock config found but is disabled"); } Err(e) => { - log::error!("Failed to load AdServer Mock config: {:?}", e); + return Err(e); } } - providers + Ok(providers) } // ============================================================================ diff --git a/crates/common/src/integrations/aps.rs b/crates/common/src/integrations/aps.rs index bdd9c25b..38acf5fb 100644 --- a/crates/common/src/integrations/aps.rs +++ b/crates/common/src/integrations/aps.rs @@ -170,6 +170,7 @@ pub struct ApsConfig { /// APS API endpoint #[serde(default = "default_endpoint")] + #[validate(url)] pub endpoint: String, /// Timeout in milliseconds @@ -532,8 +533,14 @@ use std::sync::Arc; /// Auto-register APS provider based on settings configuration. /// /// Returns the APS provider if enabled in settings. -#[must_use] -pub fn register_providers(settings: &Settings) -> Vec> { +/// +/// # Errors +/// +/// Returns an error when the APS provider is enabled with invalid +/// configuration. +pub fn register_providers( + settings: &Settings, +) -> Result>, Report> { let mut providers: Vec> = Vec::new(); // Check for real APS provider configuration @@ -550,11 +557,11 @@ pub fn register_providers(settings: &Settings) -> Vec> log::debug!("APS integration config found but is disabled"); } Err(e) => { - log::error!("Failed to load APS configuration: {:?}", e); + return Err(e); } } - providers + Ok(providers) } // ============================================================================ diff --git a/crates/common/src/integrations/datadome.rs b/crates/common/src/integrations/datadome.rs index e0f6e7c2..76470c78 100644 --- a/crates/common/src/integrations/datadome.rs +++ b/crates/common/src/integrations/datadome.rs @@ -461,17 +461,13 @@ impl IntegrationAttributeRewriter for DataDomeIntegration { } } -fn build(settings: &Settings) -> Option> { - let config = match settings.integration_config::(DATADOME_INTEGRATION_ID) { - Ok(Some(config)) => config, - Ok(None) => { - log::debug!("[datadome] Integration disabled or not configured"); - return None; - } - Err(err) => { - log::error!("[datadome] Failed to load integration config: {err:?}"); - return None; - } +fn build( + settings: &Settings, +) -> Result>, Report> { + let Some(config) = settings.integration_config::(DATADOME_INTEGRATION_ID)? + else { + log::debug!("[datadome] Integration disabled or not configured"); + return Ok(None); }; log::info!( @@ -480,20 +476,28 @@ fn build(settings: &Settings) -> Option> { config.rewrite_sdk ); - Some(DataDomeIntegration::new(config)) + Ok(Some(DataDomeIntegration::new(config))) } /// Register the `DataDome` integration with Trusted Server. -#[must_use] -pub fn register(settings: &Settings) -> Option { - let integration = build(settings)?; +/// +/// # Errors +/// +/// Returns an error when the `DataDome` integration is enabled with invalid +/// configuration. +pub fn register( + settings: &Settings, +) -> Result, Report> { + let Some(integration) = build(settings)? else { + return Ok(None); + }; - Some( + Ok(Some( IntegrationRegistration::builder(DATADOME_INTEGRATION_ID) .with_proxy(integration.clone()) .with_attribute_rewriter(integration) .build(), - ) + )) } #[cfg(test)] diff --git a/crates/common/src/integrations/didomi.rs b/crates/common/src/integrations/didomi.rs index b6f70567..7042af70 100644 --- a/crates/common/src/integrations/didomi.rs +++ b/crates/common/src/integrations/didomi.rs @@ -153,28 +153,36 @@ impl DidomiIntegration { } } -fn build(settings: &Settings) -> Option> { - let config = match settings.integration_config::(DIDOMI_INTEGRATION_ID) - { - Ok(Some(config)) => Arc::new(config), - Ok(None) => return None, - Err(err) => { - log::error!("Failed to load Didomi integration config: {err:?}"); - return None; - } +fn build( + settings: &Settings, +) -> Result>, Report> { + let Some(config) = + settings.integration_config::(DIDOMI_INTEGRATION_ID)? + else { + return Ok(None); }; - Some(DidomiIntegration::new(config)) + + Ok(Some(DidomiIntegration::new(Arc::new(config)))) } /// Register the Didomi consent notice integration when enabled. -#[must_use] -pub fn register(settings: &Settings) -> Option { - let integration = build(settings)?; - Some( +/// +/// # Errors +/// +/// Returns an error when the Didomi integration is enabled with invalid +/// configuration. +pub fn register( + settings: &Settings, +) -> Result, Report> { + let Some(integration) = build(settings)? else { + return Ok(None); + }; + + Ok(Some( IntegrationRegistration::builder(DIDOMI_INTEGRATION_ID) .with_proxy(integration) .build(), - ) + )) } #[async_trait(?Send)] diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 7bee4d49..6be4c7d9 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -315,29 +315,37 @@ impl GoogleTagManagerIntegration { } } -fn build(settings: &Settings) -> Option> { - let config = match settings.integration_config::(GTM_INTEGRATION_ID) { - Ok(Some(config)) => config, - Ok(None) => return None, - Err(err) => { - log::error!("Failed to load GTM integration config: {err:?}"); - return None; - } +fn build( + settings: &Settings, +) -> Result>, Report> { + let Some(config) = settings.integration_config::(GTM_INTEGRATION_ID)? + else { + return Ok(None); }; - Some(GoogleTagManagerIntegration::new(config)) + Ok(Some(GoogleTagManagerIntegration::new(config))) } -#[must_use] -pub fn register(settings: &Settings) -> Option { - let integration = build(settings)?; - Some( +/// Register the Google Tag Manager integration when enabled. +/// +/// # Errors +/// +/// Returns an error when the Google Tag Manager integration is enabled with +/// invalid configuration. +pub fn register( + settings: &Settings, +) -> Result, Report> { + let Some(integration) = build(settings)? else { + return Ok(None); + }; + + Ok(Some( IntegrationRegistration::builder(GTM_INTEGRATION_ID) .with_proxy(integration.clone()) .with_attribute_rewriter(integration.clone()) .with_script_rewriter(integration) .build(), - ) + )) } #[async_trait(?Send)] diff --git a/crates/common/src/integrations/gpt.rs b/crates/common/src/integrations/gpt.rs index 3e18ac9e..e14b61bc 100644 --- a/crates/common/src/integrations/gpt.rs +++ b/crates/common/src/integrations/gpt.rs @@ -325,33 +325,35 @@ impl GptIntegration { } } -fn build(settings: &Settings) -> Option> { - let config = match settings.integration_config::(GPT_INTEGRATION_ID) { - Ok(Some(config)) => config, - Ok(None) => { - log::debug!("[gpt] Integration disabled or not configured"); - return None; - } - Err(err) => { - log::error!("Failed to load GPT integration config: {err:?}"); - return None; - } +fn build(settings: &Settings) -> Result>, Report> { + let Some(config) = settings.integration_config::(GPT_INTEGRATION_ID)? else { + log::debug!("[gpt] Integration disabled or not configured"); + return Ok(None); }; - Some(GptIntegration::new(config)) + Ok(Some(GptIntegration::new(config))) } /// Register the GPT integration. -#[must_use] -pub fn register(settings: &Settings) -> Option { - let integration = build(settings)?; - Some( +/// +/// # Errors +/// +/// Returns an error when the GPT integration is enabled with invalid +/// configuration. +pub fn register( + settings: &Settings, +) -> Result, Report> { + let Some(integration) = build(settings)? else { + return Ok(None); + }; + + Ok(Some( IntegrationRegistration::builder(GPT_INTEGRATION_ID) .with_proxy(integration.clone()) .with_attribute_rewriter(integration.clone()) .with_head_injector(integration) .build(), - ) + )) } #[async_trait(?Send)] @@ -744,7 +746,9 @@ mod tests { fn build_requires_config() { let settings = create_test_settings(); assert!( - build(&settings).is_none(), + build(&settings) + .expect("should evaluate integration build") + .is_none(), "should not build without integration config" ); } @@ -766,7 +770,9 @@ mod tests { .expect("should insert GPT config"); assert!( - build(&settings).is_some(), + build(&settings) + .expect("should evaluate integration build") + .is_some(), "should build with valid integration config" ); } @@ -785,7 +791,9 @@ mod tests { .expect("should insert GPT config"); assert!( - build(&settings).is_none(), + build(&settings) + .expect("should evaluate integration build") + .is_none(), "should not build when integration is disabled" ); } diff --git a/crates/common/src/integrations/lockr.rs b/crates/common/src/integrations/lockr.rs index 29576e01..e999cbb1 100644 --- a/crates/common/src/integrations/lockr.rs +++ b/crates/common/src/integrations/lockr.rs @@ -293,29 +293,33 @@ impl LockrIntegration { } } -fn build(settings: &Settings) -> Option> { - let config = match settings.integration_config::(LOCKR_INTEGRATION_ID) { - Ok(Some(config)) => config, - Ok(None) => return None, - Err(err) => { - log::error!("Failed to load Lockr integration config: {err:?}"); - return None; - } +fn build(settings: &Settings) -> Result>, Report> { + let Some(config) = settings.integration_config::(LOCKR_INTEGRATION_ID)? else { + return Ok(None); }; - Some(LockrIntegration::new(config)) + Ok(Some(LockrIntegration::new(config))) } /// Register the Lockr integration. -#[must_use] -pub fn register(settings: &Settings) -> Option { - let integration = build(settings)?; - Some( +/// +/// # Errors +/// +/// Returns an error when the Lockr integration is enabled with invalid +/// configuration. +pub fn register( + settings: &Settings, +) -> Result, Report> { + let Some(integration) = build(settings)? else { + return Ok(None); + }; + + Ok(Some( IntegrationRegistration::builder(LOCKR_INTEGRATION_ID) .with_proxy(integration.clone()) .with_attribute_rewriter(integration) .build(), - ) + )) } #[async_trait(?Send)] diff --git a/crates/common/src/integrations/mod.rs b/crates/common/src/integrations/mod.rs index f2b8c69c..92f30219 100644 --- a/crates/common/src/integrations/mod.rs +++ b/crates/common/src/integrations/mod.rs @@ -1,5 +1,8 @@ //! Integration module registry and sample implementations. +use error_stack::Report; + +use crate::error::TrustedServerError; use crate::settings::Settings; pub mod adserver_mock; @@ -23,7 +26,8 @@ pub use registry::{ IntegrationRegistry, IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, }; -type IntegrationBuilder = fn(&Settings) -> Option; +type IntegrationBuilder = + fn(&Settings) -> Result, Report>; pub(crate) fn builders() -> &'static [IntegrationBuilder] { &[ diff --git a/crates/common/src/integrations/nextjs/html_post_process.rs b/crates/common/src/integrations/nextjs/html_post_process.rs index c85bf517..a80503c3 100644 --- a/crates/common/src/integrations/nextjs/html_post_process.rs +++ b/crates/common/src/integrations/nextjs/html_post_process.rs @@ -11,7 +11,7 @@ use super::rsc::rewrite_rsc_scripts_combined_with_limit; use super::rsc_placeholders::{ NextJsRscPostProcessState, RSC_PAYLOAD_PLACEHOLDER_PREFIX, RSC_PAYLOAD_PLACEHOLDER_SUFFIX, }; -use super::shared::find_rsc_push_payload_range; +use super::shared::{find_rsc_push_payload_range, RscUrlRewriter}; use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; pub(crate) struct NextJsHtmlPostProcessor { @@ -91,9 +91,10 @@ impl NextJsHtmlPostProcessor { payloads: Vec, ) -> bool { let payload_refs: Vec<&str> = payloads.iter().map(String::as_str).collect(); + let rsc_rewriter = RscUrlRewriter::new(ctx.origin_host); let mut rewritten_payloads = rewrite_rsc_scripts_combined_with_limit( payload_refs.as_slice(), - ctx.origin_host, + &rsc_rewriter, ctx.request_host, ctx.request_scheme, self.config.max_combined_payload_bytes, @@ -453,9 +454,11 @@ fn post_process_rsc_html_in_place_with_limit( ); } + let rewriter = RscUrlRewriter::new(origin_host); + let rewritten_payloads = rewrite_rsc_scripts_combined_with_limit( payloads.as_slice(), - origin_host, + &rewriter, request_host, request_scheme, max_combined_payload_bytes, diff --git a/crates/common/src/integrations/nextjs/mod.rs b/crates/common/src/integrations/nextjs/mod.rs index 549e420d..4074ab5d 100644 --- a/crates/common/src/integrations/nextjs/mod.rs +++ b/crates/common/src/integrations/nextjs/mod.rs @@ -1,8 +1,10 @@ use std::sync::Arc; +use error_stack::Report; use serde::{Deserialize, Serialize}; use validator::Validate; +use crate::error::TrustedServerError; use crate::integrations::IntegrationRegistration; use crate::settings::{IntegrationConfig, Settings}; @@ -56,9 +58,25 @@ fn default_max_combined_payload_bytes() -> usize { 10 * 1024 * 1024 } -#[must_use] -pub fn register(settings: &Settings) -> Option { - let config = match build(settings) { +pub(super) fn configuration_error(message: impl Into) -> Report { + Report::new(TrustedServerError::Configuration { + message: format!( + "Integration '{NEXTJS_INTEGRATION_ID}' configuration error: {}", + message.into() + ), + }) +} + +/// Register the Next.js integration when enabled. +/// +/// # Errors +/// +/// Returns an error when the Next.js integration is enabled with invalid +/// configuration. +pub fn register( + settings: &Settings, +) -> Result, Report> { + let config = match build(settings)? { Some(config) => { log::info!( "NextJS integration registered: enabled={}, rewrite_attributes={:?}, max_combined_payload_bytes={}", @@ -70,12 +88,11 @@ pub fn register(settings: &Settings) -> Option { } None => { log::info!("NextJS integration not registered (disabled or missing config)"); - return None; + return Ok(None); } }; - // Register a structured (Pages Router __NEXT_DATA__) rewriter. - let structured = Arc::new(NextJsNextDataRewriter::new(config.clone())); + let structured = Arc::new(NextJsNextDataRewriter::new(config.clone())?); // Insert placeholders for App Router RSC payload scripts during the initial HTML rewrite pass, // then substitute them during post-processing without re-parsing HTML. @@ -89,15 +106,15 @@ pub fn register(settings: &Settings) -> Option { .with_script_rewriter(placeholders) .with_html_post_processor(post_processor); - Some(builder.build()) + Ok(Some(builder.build())) } -fn build(settings: &Settings) -> Option> { - let config = settings +fn build( + settings: &Settings, +) -> Result>, Report> { + settings .integration_config::(NEXTJS_INTEGRATION_ID) - .ok() - .flatten()?; - Some(Arc::new(config)) + .map(|config| config.map(Arc::new)) } #[cfg(test)] @@ -342,7 +359,7 @@ mod tests { #[test] fn register_respects_enabled_flag() { let settings = create_test_settings(); - let registration = register(&settings); + let registration = register(&settings).expect("should evaluate registration"); assert!( registration.is_none(), diff --git a/crates/common/src/integrations/nextjs/rsc.rs b/crates/common/src/integrations/nextjs/rsc.rs index 704aa56f..3325f896 100644 --- a/crates/common/src/integrations/nextjs/rsc.rs +++ b/crates/common/src/integrations/nextjs/rsc.rs @@ -277,6 +277,8 @@ fn find_tchunks_with_markers(content: &str) -> Option> { pub(crate) fn rewrite_rsc_tchunks_with_rewriter( content: &str, rewriter: &RscUrlRewriter, + request_host: &str, + request_scheme: &str, ) -> String { let Some(chunks) = find_tchunks(content) else { log::warn!( @@ -286,7 +288,7 @@ pub(crate) fn rewrite_rsc_tchunks_with_rewriter( }; if chunks.is_empty() { - return rewriter.rewrite_to_string(content); + return rewriter.rewrite_to_string(content, request_host, request_scheme); } let mut result = String::with_capacity(content.len()); @@ -294,10 +296,15 @@ pub(crate) fn rewrite_rsc_tchunks_with_rewriter( for chunk in &chunks { let before = &content[last_end..chunk.match_start]; - result.push_str(rewriter.rewrite(before).as_ref()); + result.push_str( + rewriter + .rewrite(before, request_host, request_scheme) + .as_ref(), + ); let chunk_content = &content[chunk.header_end..chunk.content_end]; - let rewritten_content = rewriter.rewrite_to_string(chunk_content); + let rewritten_content = + rewriter.rewrite_to_string(chunk_content, request_host, request_scheme); let new_length = calculate_unescaped_byte_length(&rewritten_content); let new_length_hex = format!("{new_length:x}"); @@ -312,7 +319,11 @@ pub(crate) fn rewrite_rsc_tchunks_with_rewriter( } let remaining = &content[last_end..]; - result.push_str(rewriter.rewrite(remaining).as_ref()); + result.push_str( + rewriter + .rewrite(remaining, request_host, request_scheme) + .as_ref(), + ); result } @@ -335,9 +346,11 @@ pub fn rewrite_rsc_scripts_combined( request_host: &str, request_scheme: &str, ) -> Vec { + let rewriter = RscUrlRewriter::new(origin_host); + rewrite_rsc_scripts_combined_with_limit( payloads, - origin_host, + &rewriter, request_host, request_scheme, DEFAULT_MAX_COMBINED_PAYLOAD_BYTES, @@ -375,7 +388,7 @@ fn payload_contains_incomplete_tchunk(payload: &str) -> bool { pub(crate) fn rewrite_rsc_scripts_combined_with_limit( payloads: &[&str], - origin_host: &str, + rewriter: &RscUrlRewriter, request_host: &str, request_scheme: &str, max_combined_payload_bytes: usize, @@ -385,14 +398,17 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( } // Early exit if no payload contains the origin host - avoids regex compilation - if !payloads.iter().any(|p| p.contains(origin_host)) { + if !payloads.iter().any(|p| p.contains(rewriter.origin_host())) { return payloads.iter().map(|p| (*p).to_string()).collect(); } - let rewriter = RscUrlRewriter::new(origin_host, request_host, request_scheme); - if payloads.len() == 1 { - return vec![rewrite_rsc_tchunks_with_rewriter(payloads[0], &rewriter)]; + return vec![rewrite_rsc_tchunks_with_rewriter( + payloads[0], + rewriter, + request_host, + request_scheme, + )]; } let max_combined_payload_bytes = if max_combined_payload_bytes == 0 { @@ -427,7 +443,7 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( return payloads .iter() - .map(|p| rewrite_rsc_tchunks_with_rewriter(p, &rewriter)) + .map(|p| rewrite_rsc_tchunks_with_rewriter(p, rewriter, request_host, request_scheme)) .collect(); } @@ -447,7 +463,7 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( if chunks.is_empty() { return payloads .iter() - .map(|p| rewriter.rewrite_to_string(p)) + .map(|p| rewriter.rewrite_to_string(p, request_host, request_scheme)) .collect(); } @@ -456,10 +472,15 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( for chunk in &chunks { let before = &combined[last_end..chunk.match_start]; - result.push_str(rewriter.rewrite(before).as_ref()); + result.push_str( + rewriter + .rewrite(before, request_host, request_scheme) + .as_ref(), + ); let chunk_content = &combined[chunk.header_end..chunk.content_end]; - let rewritten_content = rewriter.rewrite_to_string(chunk_content); + let rewritten_content = + rewriter.rewrite_to_string(chunk_content, request_host, request_scheme); let new_length = calculate_unescaped_byte_length_skip_markers(&rewritten_content); let new_length_hex = format!("{new_length:x}"); @@ -474,7 +495,11 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( } let remaining = &combined[last_end..]; - result.push_str(rewriter.rewrite(remaining).as_ref()); + result.push_str( + rewriter + .rewrite(remaining, request_host, request_scheme) + .as_ref(), + ); result.split(RSC_MARKER).map(String::from).collect() } @@ -486,8 +511,9 @@ mod tests { #[test] fn tchunk_length_recalculation() { let content = r#"1a:T29,{"url":"https://origin.example.com/path"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com", "test.example.com", "https"); - let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + let rewriter = RscUrlRewriter::new("origin.example.com"); + let result = + rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); assert!( result.contains("test.example.com"), @@ -503,8 +529,9 @@ mod tests { #[test] fn tchunk_length_recalculation_with_length_increase() { let content = r#"1a:T1c,{"url":"https://short.io/x"}"#; - let rewriter = RscUrlRewriter::new("short.io", "test.example.com", "https"); - let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + let rewriter = RscUrlRewriter::new("short.io"); + let result = + rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); assert!( result.contains("test.example.com"), @@ -532,8 +559,9 @@ mod tests { #[test] fn multiple_tchunks() { let content = r#"1a:T1c,{"url":"https://short.io/x"}\n1b:T1c,{"url":"https://short.io/y"}"#; - let rewriter = RscUrlRewriter::new("short.io", "test.example.com", "https"); - let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + let rewriter = RscUrlRewriter::new("short.io"); + let result = + rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); assert!( result.contains("test.example.com"), @@ -610,8 +638,8 @@ mod tests { #[test] fn preserves_protocol_relative_urls() { let input = r#"{"url":"//origin.example.com/path"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); - let rewritten = rewriter.rewrite_to_string(input); + let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewritten = rewriter.rewrite_to_string(input, "proxy.example.com", "https"); assert!( rewritten.contains(r#""url":"//proxy.example.com/path""#), @@ -622,8 +650,8 @@ mod tests { #[test] fn rewrites_bare_host_occurrences() { let input = r#"{"siteProductionDomain":"origin.example.com"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); - let rewritten = rewriter.rewrite_to_string(input); + let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewritten = rewriter.rewrite_to_string(input, "proxy.example.com", "https"); assert!( rewritten.contains(r#""siteProductionDomain":"proxy.example.com""#), @@ -634,8 +662,8 @@ mod tests { #[test] fn bare_host_rewrite_respects_hostname_boundaries() { let input = r#"{"sub":"cdn.origin.example.com","prefix":"notorigin.example.com","suffix":"origin.example.com.uk","path":"origin.example.com/news","exact":"origin.example.com"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); - let rewritten = rewriter.rewrite_to_string(input); + let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewritten = rewriter.rewrite_to_string(input, "proxy.example.com", "https"); assert!( rewritten.contains(r#""sub":"cdn.origin.example.com""#), @@ -743,7 +771,7 @@ mod tests { let payloads: Vec<&str> = vec![script0, script1]; let results = rewrite_rsc_scripts_combined_with_limit( &payloads, - "origin.example.com", + &RscUrlRewriter::new("origin.example.com"), "test.example.com", "https", 1, @@ -768,7 +796,7 @@ mod tests { let payloads: Vec<&str> = vec![script0, script1]; let results = rewrite_rsc_scripts_combined_with_limit( &payloads, - "origin.example.com", + &RscUrlRewriter::new("origin.example.com"), "test.example.com", "https", 1, @@ -800,8 +828,9 @@ mod tests { #[test] fn invalid_or_unreasonable_tchunk_length_skips_rewriting() { let content = r#"1a:T10000000,{"url":"https://origin.example.com/path"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com", "test.example.com", "https"); - let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + let rewriter = RscUrlRewriter::new("origin.example.com"); + let result = + rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); assert_eq!( result, content, @@ -812,8 +841,9 @@ mod tests { #[test] fn incomplete_tchunk_skips_rewriting() { let content = r#"1a:Tff,{"url":"https://origin.example.com/path"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com", "test.example.com", "https"); - let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + let rewriter = RscUrlRewriter::new("origin.example.com"); + let result = + rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); assert_eq!( result, content, diff --git a/crates/common/src/integrations/nextjs/script_rewriter.rs b/crates/common/src/integrations/nextjs/script_rewriter.rs index 4df34935..f6a159ff 100644 --- a/crates/common/src/integrations/nextjs/script_rewriter.rs +++ b/crates/common/src/integrations/nextjs/script_rewriter.rs @@ -1,7 +1,10 @@ +use std::cell::Cell; use std::sync::Arc; +use error_stack::Report; use regex::{escape, Regex}; +use crate::error::TrustedServerError; use crate::integrations::{ IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, }; @@ -10,11 +13,17 @@ use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; pub(super) struct NextJsNextDataRewriter { config: Arc, + rewriter: UrlRewriter, } impl NextJsNextDataRewriter { - pub(super) fn new(config: Arc) -> Self { - Self { config } + pub(super) fn new( + config: Arc, + ) -> Result> { + Ok(Self { + rewriter: UrlRewriter::new(&config.rewrite_attributes)?, + config, + }) } fn rewrite_structured( @@ -29,14 +38,13 @@ impl NextJsNextDataRewriter { return ScriptRewriteAction::keep(); } - let rewriter = UrlRewriter::new( + if let Some(rewritten) = rewrite_nextjs_values_with_rewriter( + content, + &self.rewriter, ctx.origin_host, ctx.request_host, ctx.request_scheme, - &self.config.rewrite_attributes, - ); - - if let Some(rewritten) = rewrite_nextjs_values_with_rewriter(content, &rewriter) { + ) { ScriptRewriteAction::replace(rewritten) } else { ScriptRewriteAction::keep() @@ -62,8 +70,14 @@ impl IntegrationScriptRewriter for NextJsNextDataRewriter { } } -fn rewrite_nextjs_values_with_rewriter(content: &str, rewriter: &UrlRewriter) -> Option { - rewriter.rewrite_embedded(content) +fn rewrite_nextjs_values_with_rewriter( + content: &str, + rewriter: &UrlRewriter, + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> Option { + rewriter.rewrite_embedded(content, origin_host, request_host, request_scheme) } #[cfg(test)] @@ -78,9 +92,15 @@ fn rewrite_nextjs_values( return None; } - let rewriter = UrlRewriter::new(origin_host, request_host, request_scheme, attributes); + let rewriter = UrlRewriter::new(attributes).expect("should build Next.js URL rewriter"); - rewrite_nextjs_values_with_rewriter(content, &rewriter) + rewrite_nextjs_values_with_rewriter( + content, + &rewriter, + origin_host, + request_host, + request_scheme, + ) } /// Rewrites URLs in structured Next.js JSON payloads (e.g., `__NEXT_DATA__`). @@ -88,28 +108,15 @@ fn rewrite_nextjs_values( /// This rewriter uses combined regex patterns to find and replace URLs /// in JSON content. It handles full URLs, protocol-relative URLs, and bare hostnames. /// Patterns for all attributes are combined with alternation for efficiency. +#[derive(Clone)] struct UrlRewriter { - #[cfg_attr(not(test), allow(dead_code))] - origin_host: String, - request_host: String, - request_scheme: String, - /// Single regex matching URL patterns for all attributes - embedded_pattern: Option, - /// Single regex matching bare hostname patterns for all attributes - bare_host_pattern: Option, + /// Single regex matching value patterns for all configured attributes. + value_pattern: Option, } impl UrlRewriter { - fn new( - origin_host: &str, - request_host: &str, - request_scheme: &str, - attributes: &[String], - ) -> Self { - let escaped_origin = escape(origin_host); - - // Build a single regex with alternation for all attributes - let embedded_pattern = if attributes.is_empty() { + fn new(attributes: &[String]) -> Result> { + let value_pattern = if attributes.is_empty() { None } else { let attr_alternation = attributes @@ -118,116 +125,90 @@ impl UrlRewriter { .collect::>() .join("|"); let pattern = format!( - r#"(?P(?:\\*")?(?:{attrs})(?:\\*")?:\\*")(?Phttps?://|//){origin}(?P[^"\\]*)(?P\\*")"#, + r#"(?P(?:\\*")?(?:{attrs})(?:\\*")?:\\*")(?P[^"\\]*)(?P\\*")"#, attrs = attr_alternation, - origin = escaped_origin, ); - Some(Regex::new(&pattern).expect("valid Next.js rewrite regex")) + Some(Regex::new(&pattern).map_err(|err| { + super::configuration_error(format!( + "failed to compile __NEXT_DATA__ URL rewrite regex for attributes {:?}: {err}", + attributes + )) + })?) }; - let bare_host_pattern = if attributes.is_empty() { - None - } else { - let attr_alternation = attributes - .iter() - .map(|attr| escape(attr)) - .collect::>() - .join("|"); - let pattern = format!( - r#"(?P(?:\\*")?(?:{attrs})(?:\\*")?:\\*"){origin}(?P\\*")"#, - attrs = attr_alternation, - origin = escaped_origin, - ); - Some(Regex::new(&pattern).expect("valid Next.js bare host rewrite regex")) - }; + Ok(Self { value_pattern }) + } - Self { - origin_host: origin_host.to_string(), - request_host: request_host.to_string(), - request_scheme: request_scheme.to_string(), - embedded_pattern, - bare_host_pattern, + fn rewrite_url_value( + &self, + origin_host: &str, + url: &str, + request_host: &str, + request_scheme: &str, + ) -> Option { + fn strip_origin_host_with_boundary<'a>( + value: &'a str, + origin_host: &str, + ) -> Option<&'a str> { + let suffix = value.strip_prefix(origin_host)?; + let boundary_ok = suffix.is_empty() + || matches!( + suffix.as_bytes().first(), + Some(b'/') | Some(b'?') | Some(b'#') + ); + boundary_ok.then_some(suffix) } - } - #[cfg(test)] - fn rewrite_url_value(&self, url: &str) -> Option { if let Some(rest) = url.strip_prefix("https://") { - if rest.starts_with(&self.origin_host) { - let path = &rest[self.origin_host.len()..]; - return Some(format!( - "{}://{}{}", - self.request_scheme, self.request_host, path - )); + if let Some(path) = strip_origin_host_with_boundary(rest, origin_host) { + return Some(format!("{request_scheme}://{request_host}{path}")); } } else if let Some(rest) = url.strip_prefix("http://") { - if rest.starts_with(&self.origin_host) { - let path = &rest[self.origin_host.len()..]; - return Some(format!( - "{}://{}{}", - self.request_scheme, self.request_host, path - )); + if let Some(path) = strip_origin_host_with_boundary(rest, origin_host) { + return Some(format!("{request_scheme}://{request_host}{path}")); } } else if let Some(rest) = url.strip_prefix("//") { - if rest.starts_with(&self.origin_host) { - let path = &rest[self.origin_host.len()..]; - return Some(format!("//{}{}", self.request_host, path)); + if let Some(path) = strip_origin_host_with_boundary(rest, origin_host) { + return Some(format!("//{request_host}{path}")); } - } else if url == self.origin_host { - return Some(self.request_host.clone()); - } else if url.starts_with(&self.origin_host) { - let path = &url[self.origin_host.len()..]; - return Some(format!("{}{}", self.request_host, path)); + } else if url == origin_host { + return Some(request_host.to_string()); + } else if let Some(path) = strip_origin_host_with_boundary(url, origin_host) { + return Some(format!("{request_host}{path}")); } None } - fn rewrite_embedded(&self, input: &str) -> Option { - let mut result = input.to_string(); - let mut changed = false; - - if let Some(regex) = &self.embedded_pattern { - let request_host = &self.request_host; - let request_scheme = &self.request_scheme; - - let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { - let prefix = &caps["prefix"]; - let scheme = &caps["scheme"]; - let path = &caps["path"]; - let quote = &caps["quote"]; - - let new_url = if scheme == "//" { - format!("//{}{}", request_host, path) - } else { - format!("{}://{}{}", request_scheme, request_host, path) - }; - + fn rewrite_embedded( + &self, + input: &str, + origin_host: &str, + request_host: &str, + request_scheme: &str, + ) -> Option { + let Some(regex) = &self.value_pattern else { + return None; + }; + let changed = Cell::new(false); + let next_value = regex.replace_all(input, |caps: ®ex::Captures<'_>| { + let prefix = &caps["prefix"]; + let value = &caps["value"]; + let quote = &caps["quote"]; + + if let Some(new_url) = + self.rewrite_url_value(origin_host, value, request_host, request_scheme) + { + changed.set(true); format!("{prefix}{new_url}{quote}") - }); - - if next_value != result { - changed = true; - result = next_value.into_owned(); + } else { + caps.get(0) + .expect("should capture matched attribute value") + .as_str() + .to_string() } - } - - if let Some(regex) = &self.bare_host_pattern { - let request_host = &self.request_host; - - let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { - let prefix = &caps["prefix"]; - let suffix = &caps["suffix"]; - - format!("{prefix}{request_host}{suffix}") - }); + }); - if next_value != result { - changed = true; - result = next_value.into_owned(); - } - } - - changed.then_some(result) + changed.get().then(|| next_value.into_owned()) } } @@ -262,7 +243,8 @@ mod tests { #[test] fn structured_rewriter_updates_next_data_payload() { let payload = r#"{"props":{"pageProps":{"primary":{"href":"https://origin.example.com/reviews"},"secondary":{"href":"http://origin.example.com/sign-in"},"fallbackHref":"http://origin.example.com/legacy","protoRelative":"//origin.example.com/assets/logo.png"}}}"#; - let rewriter = NextJsNextDataRewriter::new(test_config()); + let rewriter = NextJsNextDataRewriter::new(test_config()) + .expect("should build Next.js structured rewriter"); let document_state = IntegrationDocumentState::default(); let result = rewriter.rewrite(payload, &ctx("script#__NEXT_DATA__", &document_state)); @@ -410,16 +392,45 @@ mod tests { #[test] fn url_rewriter_rewrites_url() { - let rewriter = UrlRewriter::new( - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - ); + let rewriter = UrlRewriter::new(&["url".into()]).expect("should build URL rewriter"); let new_url = rewriter - .rewrite_url_value("https://origin.example.com/news") + .rewrite_url_value( + "origin.example.com", + "https://origin.example.com/news", + "proxy.example.com", + "http", + ) .expect("URL should be rewritten"); assert_eq!(new_url, "http://proxy.example.com/news"); } + + #[test] + fn url_rewriter_does_not_rewrite_partial_hostname_matches() { + let rewriter = UrlRewriter::new(&["url".into(), "siteProductionDomain".into()]) + .expect("should build URL rewriter"); + let input = r#"{"url":"https://origin.example.com.evil/news","siteProductionDomain":"origin.example.com.evil"}"#; + + let rewritten = + rewriter.rewrite_embedded(input, "origin.example.com", "proxy.example.com", "https"); + + assert!( + rewritten.is_none(), + "should not rewrite partial hostname matches" + ); + } + + #[test] + fn url_rewriter_supports_regex_metacharacters_in_literals() { + let rewriter = UrlRewriter::new(&["href(".into(), "u|rl".into()]) + .expect("should build URL rewriter with metacharacters"); + let input = r#"{"href(":"https://origin.(example).com/news","u|rl":"//origin.(example).com/assets/logo.png"}"#; + + let rewritten = rewriter + .rewrite_embedded(input, "origin.(example).com", "proxy.example.com", "https") + .expect("should rewrite metacharacter-heavy literals"); + + assert!(rewritten.contains("https://proxy.example.com/news")); + assert!(rewritten.contains("//proxy.example.com/assets/logo.png")); + } } diff --git a/crates/common/src/integrations/nextjs/shared.rs b/crates/common/src/integrations/nextjs/shared.rs index aedac841..f7e7b3d4 100644 --- a/crates/common/src/integrations/nextjs/shared.rs +++ b/crates/common/src/integrations/nextjs/shared.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use once_cell::sync::Lazy; -use regex::{escape, Regex}; +use regex::Regex; use crate::host_rewrite::rewrite_bare_host_at_boundaries; @@ -15,6 +15,14 @@ pub(crate) static RSC_PUSH_CALL_PATTERN: Lazy = Lazy::new(|| { .expect("valid RSC push call regex") }); +/// Generic URL pattern for RSC payload rewriting. +pub(crate) static RSC_URL_PATTERN: Lazy = Lazy::new(|| { + Regex::new( + r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//)(?P\[[^\]]+\](?::\d+)?|[^/"'\\\s?#<>,)]+)"#, + ) + .expect("valid RSC URL rewrite regex") +}); + /// Find the payload string boundaries within an RSC push script. /// /// Returns `Some((start, end))` where `start` is the position after the opening quote @@ -61,51 +69,50 @@ pub(crate) fn find_rsc_push_payload_range(script: &str) -> Option<(usize, usize) /// Use this for RSC T-chunk content where any origin URL should be rewritten. /// For attribute-specific rewriting (e.g., only rewrite `"href"` values), use /// the `UrlRewriter` in `script_rewriter.rs` instead. +#[derive(Clone)] pub(crate) struct RscUrlRewriter { origin_host: String, - request_host: String, - request_scheme: String, - pattern: Regex, } impl RscUrlRewriter { - pub(crate) fn new(origin_host: &str, request_host: &str, request_scheme: &str) -> Self { - let escaped_origin = escape(origin_host); - - // Match: - // - https://origin_host or http://origin_host - // - //origin_host (protocol-relative) - // - escaped variants inside JSON-in-JS strings (e.g., \/\/origin_host) - let pattern = Regex::new(&format!( - r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//){}"#, - escaped_origin - )) - .expect("valid RSC URL rewrite regex"); + pub(crate) fn origin_host(&self) -> &str { + &self.origin_host + } + pub(crate) fn new(origin_host: &str) -> Self { Self { origin_host: origin_host.to_string(), - request_host: request_host.to_string(), - request_scheme: request_scheme.to_string(), - pattern, } } - pub(crate) fn rewrite<'a>(&self, input: &'a str) -> Cow<'a, str> { + pub(crate) fn rewrite<'a>( + &self, + input: &'a str, + request_host: &str, + request_scheme: &str, + ) -> Cow<'a, str> { if !input.contains(&self.origin_host) { return Cow::Borrowed(input); } // Phase 1: Regex-based URL pattern rewriting (handles escaped slashes, schemes, etc.) - let replaced = self - .pattern - .replace_all(input, |caps: ®ex::Captures<'_>| { - let slashes = caps.get(3).map_or("//", |m| m.as_str()); - if caps.get(1).is_some() { - format!("{}:{}{}", self.request_scheme, slashes, self.request_host) - } else { - format!("{}{}", slashes, self.request_host) - } - }); + let replaced = RSC_URL_PATTERN.replace_all(input, |caps: ®ex::Captures<'_>| { + let host = caps.name("host").map_or("", |m| m.as_str()); + if host != self.origin_host { + return caps + .get(0) + .expect("should capture the matched RSC URL") + .as_str() + .to_string(); + } + + let slashes = caps.get(3).map_or("//", |m| m.as_str()); + if caps.get(1).is_some() { + format!("{request_scheme}:{slashes}{request_host}") + } else { + format!("{slashes}{request_host}") + } + }); // Phase 2: Handle bare host occurrences not matched by the URL regex // (e.g., `siteProductionDomain`). Only check if regex made no changes, @@ -119,13 +126,19 @@ impl RscUrlRewriter { return replaced; } - rewrite_bare_host_at_boundaries(text, &self.origin_host, &self.request_host) + rewrite_bare_host_at_boundaries(text, &self.origin_host, request_host) .map(Cow::Owned) .unwrap_or(replaced) } - pub(crate) fn rewrite_to_string(&self, input: &str) -> String { - self.rewrite(input).into_owned() + pub(crate) fn rewrite_to_string( + &self, + input: &str, + request_host: &str, + request_scheme: &str, + ) -> String { + self.rewrite(input, request_host, request_scheme) + .into_owned() } } @@ -170,60 +183,68 @@ mod tests { #[test] fn rsc_url_rewriter_rewrites_https_url() { - let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewriter = RscUrlRewriter::new("origin.example.com"); let input = r#"{"url":"https://origin.example.com/path"}"#; - let result = rewriter.rewrite(input); + let result = rewriter.rewrite(input, "proxy.example.com", "https"); assert_eq!(result, r#"{"url":"https://proxy.example.com/path"}"#); } #[test] fn rsc_url_rewriter_rewrites_http_url() { - let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "http"); + let rewriter = RscUrlRewriter::new("origin.example.com"); let input = r#"{"url":"http://origin.example.com/path"}"#; - let result = rewriter.rewrite(input); + let result = rewriter.rewrite(input, "proxy.example.com", "http"); assert_eq!(result, r#"{"url":"http://proxy.example.com/path"}"#); } #[test] fn rsc_url_rewriter_rewrites_protocol_relative_url() { - let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewriter = RscUrlRewriter::new("origin.example.com"); let input = r#"{"url":"//origin.example.com/path"}"#; - let result = rewriter.rewrite(input); + let result = rewriter.rewrite(input, "proxy.example.com", "https"); assert_eq!(result, r#"{"url":"//proxy.example.com/path"}"#); } #[test] fn rsc_url_rewriter_rewrites_escaped_slashes() { - let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewriter = RscUrlRewriter::new("origin.example.com"); let input = r#"{"url":"\/\/origin.example.com/path"}"#; - let result = rewriter.rewrite(input); + let result = rewriter.rewrite(input, "proxy.example.com", "https"); assert_eq!(result, r#"{"url":"\/\/proxy.example.com/path"}"#); } #[test] fn rsc_url_rewriter_rewrites_bare_host() { - let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewriter = RscUrlRewriter::new("origin.example.com"); let input = r#"{"siteProductionDomain":"origin.example.com"}"#; - let result = rewriter.rewrite(input); + let result = rewriter.rewrite(input, "proxy.example.com", "https"); assert_eq!(result, r#"{"siteProductionDomain":"proxy.example.com"}"#); } #[test] fn rsc_url_rewriter_does_not_rewrite_partial_hostname() { - let rewriter = RscUrlRewriter::new("example.com", "proxy.example.com", "https"); + let rewriter = RscUrlRewriter::new("example.com"); let input = r#"{"domain":"subexample.com"}"#; - let result = rewriter.rewrite(input); + let result = rewriter.rewrite(input, "proxy.example.com", "https"); // Should not rewrite because "example.com" is not a standalone host here assert_eq!(result, r#"{"domain":"subexample.com"}"#); } #[test] fn rsc_url_rewriter_no_change_when_origin_not_present() { - let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewriter = RscUrlRewriter::new("origin.example.com"); let input = r#"{"url":"https://other.example.com/path"}"#; - let result = rewriter.rewrite(input); + let result = rewriter.rewrite(input, "proxy.example.com", "https"); // Should return borrowed reference (no allocation) assert!(matches!(result, Cow::Borrowed(_))); assert_eq!(result, input); } + + #[test] + fn rsc_url_rewriter_supports_regex_metacharacters_in_origin_literal() { + let rewriter = RscUrlRewriter::new("origin.(example).com"); + let input = r#"{"url":"https://origin.(example).com/path"}"#; + let result = rewriter.rewrite(input, "proxy.example.com", "https"); + assert_eq!(result, r#"{"url":"https://proxy.example.com/path"}"#); + } } diff --git a/crates/common/src/integrations/permutive.rs b/crates/common/src/integrations/permutive.rs index 7c2ab483..179d7766 100644 --- a/crates/common/src/integrations/permutive.rs +++ b/crates/common/src/integrations/permutive.rs @@ -498,29 +498,36 @@ impl PermutiveIntegration { } } -fn build(settings: &Settings) -> Option> { - let config = match settings.integration_config::(PERMUTIVE_INTEGRATION_ID) { - Ok(Some(config)) => config, - Ok(None) => return None, - Err(err) => { - log::error!("Failed to load Permutive integration config: {err:?}"); - return None; - } +fn build( + settings: &Settings, +) -> Result>, Report> { + let Some(config) = settings.integration_config::(PERMUTIVE_INTEGRATION_ID)? + else { + return Ok(None); }; - Some(PermutiveIntegration::new(config)) + Ok(Some(PermutiveIntegration::new(config))) } /// Register the Permutive integration. -#[must_use] -pub fn register(settings: &Settings) -> Option { - let integration = build(settings)?; - Some( +/// +/// # Errors +/// +/// Returns an error when the Permutive integration is enabled with invalid +/// configuration. +pub fn register( + settings: &Settings, +) -> Result, Report> { + let Some(integration) = build(settings)? else { + return Ok(None); + }; + + Ok(Some( IntegrationRegistration::builder(PERMUTIVE_INTEGRATION_ID) .with_proxy(integration.clone()) .with_attribute_rewriter(integration) .build(), - ) + )) } #[async_trait(?Send)] @@ -749,7 +756,9 @@ mod tests { let settings = create_test_settings(); // Without [integrations.permutive] config, should not build assert!( - build(&settings).is_none(), + build(&settings) + .expect("should evaluate integration build") + .is_none(), "Should not build without integration config" ); } diff --git a/crates/common/src/integrations/prebid.rs b/crates/common/src/integrations/prebid.rs index fb3fccc8..b942f7ea 100644 --- a/crates/common/src/integrations/prebid.rs +++ b/crates/common/src/integrations/prebid.rs @@ -38,6 +38,7 @@ const ZONE_KEY: &str = "zone"; pub struct PrebidIntegrationConfig { #[serde(default = "default_enabled")] pub enabled: bool, + #[validate(url)] pub server_url: String, /// Prebid Server account ID, injected into the client-side bundle via /// `window.__tsjs_prebid.accountId` so publishers don't need to configure @@ -206,31 +207,38 @@ impl PrebidIntegration { } } -fn build(settings: &Settings) -> Option> { - let config = settings - .integration_config::(PREBID_INTEGRATION_ID) - .ok() - .flatten()?; - if !config.enabled { - return None; - } - if config.server_url.trim().is_empty() { - log::warn!("Prebid integration disabled: prebid.server_url missing"); - return None; - } - Some(PrebidIntegration::new(config)) +fn build( + settings: &Settings, +) -> Result>, Report> { + let Some(config) = + settings.integration_config::(PREBID_INTEGRATION_ID)? + else { + return Ok(None); + }; + + Ok(Some(PrebidIntegration::new(config))) } -#[must_use] -pub fn register(settings: &Settings) -> Option { - let integration = build(settings)?; - Some( +/// Register the Prebid integration when enabled. +/// +/// # Errors +/// +/// Returns an error when the Prebid integration is enabled with invalid +/// configuration. +pub fn register( + settings: &Settings, +) -> Result, Report> { + let Some(integration) = build(settings)? else { + return Ok(None); + }; + + Ok(Some( IntegrationRegistration::builder(PREBID_INTEGRATION_ID) .with_proxy(integration.clone()) .with_attribute_rewriter(integration.clone()) .with_head_injector(integration) .build(), - ) + )) } #[async_trait(?Send)] @@ -958,8 +966,14 @@ impl AuctionProvider for PrebidAuctionProvider { /// /// This function checks the settings for Prebid configuration and returns /// the provider if enabled. -#[must_use] -pub fn register_auction_provider(settings: &Settings) -> Vec> { +/// +/// # Errors +/// +/// Returns an error when the Prebid provider is enabled with invalid +/// configuration. +pub fn register_auction_provider( + settings: &Settings, +) -> Result>, Report> { let mut providers: Vec> = Vec::new(); match settings.integration_config::("prebid") { @@ -980,11 +994,11 @@ pub fn register_auction_provider(settings: &Settings) -> Vec { - log::error!("Prebid auction provider not registered: config error: {e:?}"); + return Err(e); } } - providers + Ok(providers) } #[cfg(test)] diff --git a/crates/common/src/integrations/registry.rs b/crates/common/src/integrations/registry.rs index 07fec408..3df97193 100644 --- a/crates/common/src/integrations/registry.rs +++ b/crates/common/src/integrations/registry.rs @@ -541,7 +541,7 @@ impl IntegrationRegistry { let mut inner = IntegrationRegistryInner::default(); for builder in crate::integrations::builders() { - if let Some(registration) = builder(settings) { + if let Some(registration) = builder(settings)? { for proxy in registration.proxies { for route in proxy.routes() { let value = (proxy.clone(), registration.integration_id); diff --git a/crates/common/src/integrations/testlight.rs b/crates/common/src/integrations/testlight.rs index fda22ea9..01e26c2b 100644 --- a/crates/common/src/integrations/testlight.rs +++ b/crates/common/src/integrations/testlight.rs @@ -95,27 +95,36 @@ impl TestlightIntegration { } } -fn build(settings: &Settings) -> Option> { - let config = match settings.integration_config::(TESTLIGHT_INTEGRATION_ID) { - Ok(Some(config)) => config, - Ok(None) => return None, - Err(err) => { - log::error!("Failed to load Testlight integration config: {err:?}"); - return None; - } +fn build( + settings: &Settings, +) -> Result>, Report> { + let Some(config) = settings.integration_config::(TESTLIGHT_INTEGRATION_ID)? + else { + return Ok(None); }; - Some(TestlightIntegration::new(config)) + Ok(Some(TestlightIntegration::new(config))) } -#[must_use] -pub fn register(settings: &Settings) -> Option { - let integration = build(settings)?; - Some( + +/// Register the Testlight integration when enabled. +/// +/// # Errors +/// +/// Returns an error when the Testlight integration is enabled with invalid +/// configuration. +pub fn register( + settings: &Settings, +) -> Result, Report> { + let Some(integration) = build(settings)? else { + return Ok(None); + }; + + Ok(Some( IntegrationRegistration::builder(TESTLIGHT_INTEGRATION_ID) .with_proxy(integration.clone()) .with_attribute_rewriter(integration) .build(), - ) + )) } #[async_trait(?Send)] @@ -253,7 +262,9 @@ mod tests { fn build_requires_config() { let settings = create_test_settings(); assert!( - build(&settings).is_none(), + build(&settings) + .expect("should evaluate integration build") + .is_none(), "Should not build without integration config" ); } @@ -327,7 +338,9 @@ mod tests { ) .expect("should insert integration config"); - let integration = build(&settings).expect("Integration should build with config"); + let integration = build(&settings) + .expect("should evaluate integration build") + .expect("Integration should build with config"); let routes = integration.routes(); assert!( routes.iter().any(|route| route.method == Method::POST diff --git a/crates/common/src/settings.rs b/crates/common/src/settings.rs index eff23004..73f825dc 100644 --- a/crates/common/src/settings.rs +++ b/crates/common/src/settings.rs @@ -120,6 +120,13 @@ impl IntegrationSettings { } } + fn is_explicitly_disabled(raw: &JsonValue) -> bool { + raw.as_object() + .and_then(|map| map.get("enabled")) + .and_then(JsonValue::as_bool) + == Some(false) + } + /// Retrieves and validates a typed configuration for an integration. /// /// # Errors @@ -137,6 +144,10 @@ impl IntegrationSettings { None => return Ok(None), }; + if Self::is_explicitly_disabled(raw) { + return Ok(None); + } + let config: T = serde_json::from_value(raw.clone()).change_context( TrustedServerError::Configuration { message: format!( @@ -246,18 +257,41 @@ pub struct Handler { pub password: String, #[serde(skip, default)] #[validate(skip)] - regex: OnceLock, + regex: OnceLock>, } impl Handler { - fn compiled_regex(&self) -> &Regex { - self.regex.get_or_init(|| { - Regex::new(&self.path).expect("configuration validation should ensure regex compiles") - }) + fn compiled_regex(&self) -> Result<&Regex, Report> { + match self + .regex + .get_or_init(|| Regex::new(&self.path).map_err(|err| err.to_string())) + { + Ok(regex) => Ok(regex), + Err(message) => Err(Report::new(TrustedServerError::Configuration { + message: format!( + "Handler path regex `{}` failed to compile: {message}", + self.path + ), + })), + } } - pub fn matches_path(&self, path: &str) -> bool { - self.compiled_regex().is_match(path) + /// Eagerly compile the handler regex to fail fast during startup. + /// + /// # Errors + /// + /// Returns a configuration error if the handler path regex does not compile. + pub fn prepare_runtime(&self) -> Result<(), Report> { + self.compiled_regex().map(|_| ()) + } + + /// Determine whether this handler applies to the request path. + /// + /// # Errors + /// + /// Returns a configuration error if the handler path regex does not compile. + pub fn matches_path(&self, path: &str) -> Result> { + self.compiled_regex().map(|regex| regex.is_match(path)) } } @@ -374,14 +408,40 @@ impl Settings { }) })?; + settings.prepare_runtime()?; + Ok(settings) } - #[must_use] - pub fn handler_for_path(&self, path: &str) -> Option<&Handler> { - self.handlers - .iter() - .find(|handler| handler.matches_path(path)) + /// Eagerly prepare runtime-only settings artifacts. + /// + /// # Errors + /// + /// Returns a configuration error if any cached runtime artifact cannot be prepared. + pub fn prepare_runtime(&self) -> Result<(), Report> { + for handler in &self.handlers { + handler.prepare_runtime()?; + } + + Ok(()) + } + + /// Resolve the first handler whose regex matches the request path. + /// + /// # Errors + /// + /// Returns a configuration error if any handler regex does not compile. + pub fn handler_for_path( + &self, + path: &str, + ) -> Result, Report> { + for handler in &self.handlers { + if handler.matches_path(path)? { + return Ok(Some(handler)); + } + } + + Ok(None) } /// Retrieves the integration configuration of a specific type. @@ -542,9 +602,10 @@ mod tests { use serde_json::json; use std::collections::HashSet; + use crate::auction::build_orchestrator; use crate::integrations::{ - nextjs::NextJsIntegrationConfig, prebid::PrebidIntegrationConfig, - testlight::TestlightConfig, + gpt::GptConfig, nextjs::NextJsIntegrationConfig, prebid::PrebidIntegrationConfig, + testlight::TestlightConfig, IntegrationRegistry, }; use crate::test_support::tests::{crate_test_settings_str, create_test_settings}; @@ -610,6 +671,21 @@ mod tests { ); } + #[test] + fn prepare_runtime_rejects_invalid_handler_regex() { + let toml_str = crate_test_settings_str().replace(r#"path = "^/secure""#, r#"path = "(""#); + + let settings = Settings::from_toml(&toml_str).expect("should parse TOML"); + let err = settings + .prepare_runtime() + .expect_err("should reject invalid handler regex"); + assert!( + err.to_string() + .contains("Handler path regex `(` failed to compile"), + "should describe the invalid handler regex" + ); + } + #[test] fn test_settings_missing_required_fields() { let re = Regex::new(r"origin_url = .*").expect("regex should compile"); @@ -799,6 +875,36 @@ mod tests { ); } + #[test] + fn test_invalid_handler_override_fails_during_runtime_preparation() { + let toml_str = crate_test_settings_str(); + + let origin_key = format!( + "{}{}PUBLISHER{}ORIGIN_URL", + ENVIRONMENT_VARIABLE_PREFIX, + ENVIRONMENT_VARIABLE_SEPARATOR, + ENVIRONMENT_VARIABLE_SEPARATOR + ); + let path_key = format!( + "{}{}HANDLERS{}0{}PATH", + ENVIRONMENT_VARIABLE_PREFIX, + ENVIRONMENT_VARIABLE_SEPARATOR, + ENVIRONMENT_VARIABLE_SEPARATOR, + ENVIRONMENT_VARIABLE_SEPARATOR + ); + + temp_env::with_var( + origin_key, + Some("https://origin.test-publisher.com"), + || { + temp_env::with_var(path_key, Some("("), || { + let _ = Settings::from_toml_and_env(&toml_str) + .expect_err("should reject invalid handler regex override"); + }); + }, + ); + } + #[test] fn test_response_headers_override_with_json_env() { let toml_str = crate_test_settings_str(); @@ -1023,6 +1129,118 @@ mod tests { assert!(config.is_none(), "Disabled integrations should be skipped"); } + #[test] + fn disabled_invalid_integration_skips_validation() { + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "gpt", + &json!({ + "enabled": false, + "script_url": "not a url", + }), + ) + .expect("should insert GPT config"); + + let config = settings + .integration_config::("gpt") + .expect("disabled GPT config should be ignored"); + assert!(config.is_none(), "disabled GPT config should be skipped"); + IntegrationRegistry::new(&settings) + .expect("disabled invalid integration config should not fail registry startup"); + } + + #[test] + fn enabled_invalid_integration_fails_registry_startup() { + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "gpt", + &json!({ + "enabled": true, + "script_url": "not a url", + }), + ) + .expect("should insert GPT config"); + + let err = match IntegrationRegistry::new(&settings) { + Ok(_) => panic!("enabled invalid integration should fail registry startup"), + Err(err) => err, + }; + assert!( + err.to_string().contains("Integration 'gpt'"), + "should identify the invalid integration config" + ); + } + + #[test] + fn disabled_invalid_provider_config_does_not_fail_orchestrator_startup() { + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "adserver_mock", + &json!({ + "enabled": false, + "endpoint": "not a url", + }), + ) + .expect("should insert adserver mock config"); + + build_orchestrator(&settings).expect("disabled invalid provider config should be ignored"); + } + + #[test] + fn enabled_invalid_provider_config_fails_orchestrator_startup() { + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "adserver_mock", + &json!({ + "enabled": true, + "endpoint": "not a url", + }), + ) + .expect("should insert adserver mock config"); + + let err = match build_orchestrator(&settings) { + Ok(_) => panic!("enabled invalid provider config should fail startup"), + Err(err) => err, + }; + assert!( + err.to_string().contains("Integration 'adserver_mock'"), + "should identify the invalid provider config" + ); + } + + #[test] + fn empty_prebid_server_url_fails_orchestrator_startup() { + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "prebid", + &json!({ + "enabled": true, + "server_url": "", + }), + ) + .expect("should insert prebid config"); + + let err = match build_orchestrator(&settings) { + Ok(_) => panic!("empty prebid server_url should fail startup"), + Err(err) => err, + }; + assert!( + err.to_string() + .contains("Integration 'prebid' configuration failed validation"), + "should surface a validation error for prebid.server_url" + ); + } + /// Tests the full build.rs round-trip: env vars are baked into Settings /// at build time via `from_toml_and_env`, serialized to TOML, then parsed /// back at runtime via `from_toml`. Verifies that env-sourced integration diff --git a/crates/common/src/settings_data.rs b/crates/common/src/settings_data.rs index 4f8e36a0..430e7bb0 100644 --- a/crates/common/src/settings_data.rs +++ b/crates/common/src/settings_data.rs @@ -34,6 +34,8 @@ pub fn get_settings() -> Result> { message: "Failed to validate configuration".to_string(), })?; + settings.prepare_runtime()?; + if settings.synthetic.secret_key == "secret-key" { return Err(Report::new(TrustedServerError::InsecureSecretKey)); } diff --git a/crates/fastly/src/main.rs b/crates/fastly/src/main.rs index 77b37649..ce095808 100644 --- a/crates/fastly/src/main.rs +++ b/crates/fastly/src/main.rs @@ -42,7 +42,13 @@ fn main(req: Request) -> Result { log::info!("Settings {settings:?}"); // Build the auction orchestrator once at startup - let orchestrator = build_orchestrator(&settings); + let orchestrator = match build_orchestrator(&settings) { + Ok(orchestrator) => orchestrator, + Err(e) => { + log::error!("Failed to build auction orchestrator: {:?}", e); + return Ok(to_error_response(&e)); + } + }; let integration_registry = match IntegrationRegistry::new(&settings) { Ok(r) => r, @@ -69,7 +75,10 @@ async fn route_request( // Extract geo info before auth check or routing consumes the request let geo_info = GeoInfo::from_request(&req); - if let Some(mut response) = enforce_basic_auth(settings, &req) { + if let Some(mut response) = enforce_basic_auth(settings, &req).unwrap_or_else(|e| { + log::error!("Failed to evaluate basic auth: {:?}", e); + Some(to_error_response(&e)) + }) { finalize_response(settings, geo_info.as_ref(), &mut response); return Ok(response); } From 8b67018c5c53a70ae58590dc4c7a11b97acd51f6 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Sun, 8 Mar 2026 22:36:23 +0530 Subject: [PATCH 2/5] Harden config regex failures and close Next.js rewrite gaps --- crates/common/src/integrations/nextjs/mod.rs | 93 +++++++++++++++++++ .../integrations/nextjs/script_rewriter.rs | 84 +++++++++++++---- .../common/src/integrations/nextjs/shared.rs | 91 +++++++++++++++++- crates/common/src/publisher.rs | 25 ++++- crates/common/src/settings.rs | 24 +++++ 5 files changed, 294 insertions(+), 23 deletions(-) diff --git a/crates/common/src/integrations/nextjs/mod.rs b/crates/common/src/integrations/nextjs/mod.rs index 4074ab5d..50244438 100644 --- a/crates/common/src/integrations/nextjs/mod.rs +++ b/crates/common/src/integrations/nextjs/mod.rs @@ -203,6 +203,99 @@ mod tests { ); } + #[test] + fn html_processor_rewrites_next_data_fixture_like_payload() { + let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": [ + "href", + "link", + "siteBaseUrl", + "siteProductionDomain", + "url" + ], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings).expect("should create registry"); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .expect("pipeline should process HTML"); + let processed = String::from_utf8_lossy(&output); + + assert!( + processed.contains(r#""siteProductionDomain": "test.example.com""#), + "should rewrite siteProductionDomain in __NEXT_DATA__. Output: {processed}" + ); + assert!( + processed.contains(r#""siteBaseUrl": "https://test.example.com""#), + "should rewrite siteBaseUrl in __NEXT_DATA__. Output: {processed}" + ); + assert!( + processed.contains(r#""href": "https://test.example.com:8443/reviews""#), + "should preserve explicit ports while rewriting href in __NEXT_DATA__. Output: {processed}" + ); + assert!( + processed.contains(r#""link": "//test.example.com:9443/assets/logo.png""#), + "should preserve explicit ports while rewriting protocol-relative links in __NEXT_DATA__. Output: {processed}" + ); + assert!( + processed.contains(r#""url": "https://test.example.com/news""#), + "should rewrite url fields in __NEXT_DATA__. Output: {processed}" + ); + assert!( + processed.contains(r#""canonicalUrl": "https://origin.example.com/should-stay""#), + "should leave non-configured fields untouched in __NEXT_DATA__. Output: {processed}" + ); + assert!( + processed.contains(r#""ogUrl": "https://origin.example.com/should-stay-too""#), + "should leave nested non-configured fields untouched in __NEXT_DATA__. Output: {processed}" + ); + assert!( + !processed.contains(r#""siteProductionDomain": "origin.example.com""#), + "should not leave the origin host in rewritten __NEXT_DATA__ fields. Output: {processed}" + ); + } + #[test] fn html_processor_rewrites_rsc_stream_payload_with_length_preservation() { // RSC payloads (self.__next_f.push) are rewritten via post-processing. diff --git a/crates/common/src/integrations/nextjs/script_rewriter.rs b/crates/common/src/integrations/nextjs/script_rewriter.rs index f6a159ff..4482d650 100644 --- a/crates/common/src/integrations/nextjs/script_rewriter.rs +++ b/crates/common/src/integrations/nextjs/script_rewriter.rs @@ -9,6 +9,7 @@ use crate::integrations::{ IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, }; +use super::shared::strip_origin_host_with_optional_port; use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; pub(super) struct NextJsNextDataRewriter { @@ -125,7 +126,7 @@ impl UrlRewriter { .collect::>() .join("|"); let pattern = format!( - r#"(?P(?:\\*")?(?:{attrs})(?:\\*")?:\\*")(?P[^"\\]*)(?P\\*")"#, + r#"(?P(?:\\*")?(?:{attrs})(?:\\*")?\s*:\s*\\*")(?P[^"\\]*)(?P\\*")"#, attrs = attr_alternation, ); Some(Regex::new(&pattern).map_err(|err| { @@ -146,34 +147,21 @@ impl UrlRewriter { request_host: &str, request_scheme: &str, ) -> Option { - fn strip_origin_host_with_boundary<'a>( - value: &'a str, - origin_host: &str, - ) -> Option<&'a str> { - let suffix = value.strip_prefix(origin_host)?; - let boundary_ok = suffix.is_empty() - || matches!( - suffix.as_bytes().first(), - Some(b'/') | Some(b'?') | Some(b'#') - ); - boundary_ok.then_some(suffix) - } - if let Some(rest) = url.strip_prefix("https://") { - if let Some(path) = strip_origin_host_with_boundary(rest, origin_host) { + if let Some(path) = strip_origin_host_with_optional_port(rest, origin_host) { return Some(format!("{request_scheme}://{request_host}{path}")); } } else if let Some(rest) = url.strip_prefix("http://") { - if let Some(path) = strip_origin_host_with_boundary(rest, origin_host) { + if let Some(path) = strip_origin_host_with_optional_port(rest, origin_host) { return Some(format!("{request_scheme}://{request_host}{path}")); } } else if let Some(rest) = url.strip_prefix("//") { - if let Some(path) = strip_origin_host_with_boundary(rest, origin_host) { + if let Some(path) = strip_origin_host_with_optional_port(rest, origin_host) { return Some(format!("//{request_host}{path}")); } } else if url == origin_host { return Some(request_host.to_string()); - } else if let Some(path) = strip_origin_host_with_boundary(url, origin_host) { + } else if let Some(path) = strip_origin_host_with_optional_port(url, origin_host) { return Some(format!("{request_host}{path}")); } None @@ -274,6 +262,51 @@ mod tests { assert!(rewritten.contains("ts.example.com") && rewritten.contains("/image.png")); } + #[test] + fn rewrite_helper_handles_whitespace_around_colons() { + let content = r#"{ + "props": { + "pageProps": { + "siteProductionDomain": "origin.example.com", + "siteBaseUrl": "https://origin.example.com", + "href": "https://origin.example.com/reviews" + } + } + }"#; + let rewritten = rewrite_nextjs_values( + content, + "origin.example.com", + "ts.example.com", + "https", + &[ + "href".into(), + "siteBaseUrl".into(), + "siteProductionDomain".into(), + ], + ) + .expect("should rewrite pretty-printed JSON values"); + + assert!(rewritten.contains(r#""siteProductionDomain": "ts.example.com""#)); + assert!(rewritten.contains(r#""siteBaseUrl": "https://ts.example.com""#)); + assert!(rewritten.contains(r#""href": "https://ts.example.com/reviews""#)); + } + + #[test] + fn rewrite_helper_rewrites_explicit_port_urls() { + let content = r#"{"props":{"pageProps":{"url":"https://origin.example.com:8443/news","link":"//origin.example.com:9443/image.png"}}}"#; + let rewritten = rewrite_nextjs_values( + content, + "origin.example.com", + "ts.example.com", + "https", + &["url".into(), "link".into()], + ) + .expect("should rewrite explicit port URLs"); + + assert!(rewritten.contains("https://ts.example.com:8443/news")); + assert!(rewritten.contains("//ts.example.com:9443/image.png")); + } + #[test] fn truncated_string_without_urls_is_not_modified() { let truncated = r#"self.__next_f.push([ @@ -420,6 +453,21 @@ mod tests { ); } + #[test] + fn url_rewriter_rewrites_explicit_port_url() { + let rewriter = UrlRewriter::new(&["url".into()]).expect("should build URL rewriter"); + + let new_url = rewriter + .rewrite_url_value( + "origin.example.com", + "https://origin.example.com:8443/news", + "proxy.example.com", + "http", + ) + .expect("URL with explicit port should be rewritten"); + assert_eq!(new_url, "http://proxy.example.com:8443/news"); + } + #[test] fn url_rewriter_supports_regex_metacharacters_in_literals() { let rewriter = UrlRewriter::new(&["href(".into(), "u|rl".into()]) diff --git a/crates/common/src/integrations/nextjs/shared.rs b/crates/common/src/integrations/nextjs/shared.rs index f7e7b3d4..62cf912a 100644 --- a/crates/common/src/integrations/nextjs/shared.rs +++ b/crates/common/src/integrations/nextjs/shared.rs @@ -54,6 +54,48 @@ pub(crate) fn find_rsc_push_payload_range(script: &str) -> Option<(usize, usize) None } +/// Strip an origin host from the start of an authority/value while preserving an +/// optional explicit port and requiring a safe hostname boundary. +/// +/// Examples: +/// - `origin.example.com/path` -> `Some("/path")` +/// - `origin.example.com:8443/path` -> `Some(":8443/path")` +/// - `origin.example.com.evil/path` -> `None` +pub(crate) fn strip_origin_host_with_optional_port<'a>( + value: &'a str, + origin_host: &str, +) -> Option<&'a str> { + let suffix = value.strip_prefix(origin_host)?; + if suffix.is_empty() { + return Some(suffix); + } + + if matches!( + suffix.as_bytes().first(), + Some(b'/') | Some(b'?') | Some(b'#') + ) { + return Some(suffix); + } + + let port_and_rest = suffix.strip_prefix(':')?; + let port_len = port_and_rest.bytes().take_while(u8::is_ascii_digit).count(); + if port_len == 0 { + return None; + } + + let rest = &port_and_rest[port_len..]; + if rest.is_empty() + || matches!( + rest.as_bytes().first(), + Some(b'/') | Some(b'?') | Some(b'#') + ) + { + Some(suffix) + } else { + None + } +} + // ============================================================================= // URL Rewriting // ============================================================================= @@ -98,19 +140,20 @@ impl RscUrlRewriter { // Phase 1: Regex-based URL pattern rewriting (handles escaped slashes, schemes, etc.) let replaced = RSC_URL_PATTERN.replace_all(input, |caps: ®ex::Captures<'_>| { let host = caps.name("host").map_or("", |m| m.as_str()); - if host != self.origin_host { + let Some(host_suffix) = strip_origin_host_with_optional_port(host, &self.origin_host) + else { return caps .get(0) .expect("should capture the matched RSC URL") .as_str() .to_string(); - } + }; let slashes = caps.get(3).map_or("//", |m| m.as_str()); if caps.get(1).is_some() { - format!("{request_scheme}:{slashes}{request_host}") + format!("{request_scheme}:{slashes}{request_host}{host_suffix}") } else { - format!("{slashes}{request_host}") + format!("{slashes}{request_host}{host_suffix}") } }); @@ -221,6 +264,17 @@ mod tests { assert_eq!(result, r#"{"siteProductionDomain":"proxy.example.com"}"#); } + #[test] + fn rsc_url_rewriter_rewrites_explicit_port_urls() { + let rewriter = RscUrlRewriter::new("origin.example.com"); + let input = r#"{"url":"https://origin.example.com:8443/path","asset":"//origin.example.com:9443/file.js"}"#; + let result = rewriter.rewrite(input, "proxy.example.com", "https"); + assert_eq!( + result, + r#"{"url":"https://proxy.example.com:8443/path","asset":"//proxy.example.com:9443/file.js"}"# + ); + } + #[test] fn rsc_url_rewriter_does_not_rewrite_partial_hostname() { let rewriter = RscUrlRewriter::new("example.com"); @@ -247,4 +301,33 @@ mod tests { let result = rewriter.rewrite(input, "proxy.example.com", "https"); assert_eq!(result, r#"{"url":"https://proxy.example.com/path"}"#); } + + #[test] + fn strip_origin_host_with_optional_port_enforces_boundaries() { + assert_eq!( + strip_origin_host_with_optional_port( + "origin.example.com:8443/path", + "origin.example.com" + ), + Some(":8443/path") + ); + assert_eq!( + strip_origin_host_with_optional_port("origin.example.com/path", "origin.example.com"), + Some("/path") + ); + assert_eq!( + strip_origin_host_with_optional_port( + "origin.example.com.evil/path", + "origin.example.com" + ), + None + ); + assert_eq!( + strip_origin_host_with_optional_port( + "origin.example.com:not-a-port/path", + "origin.example.com" + ), + None + ); + } } diff --git a/crates/common/src/publisher.rs b/crates/common/src/publisher.rs index 78489d2e..ea53ed1b 100644 --- a/crates/common/src/publisher.rs +++ b/crates/common/src/publisher.rs @@ -15,6 +15,13 @@ use crate::streaming_processor::{Compression, PipelineConfig, StreamProcessor, S use crate::streaming_replacer::create_url_replacer; use crate::synthetic::get_or_generate_synthetic_id; +/// Encodings supported by the publisher response rewrite pipeline. +const SUPPORTED_ENCODINGS: &str = "gzip, deflate, br"; + +fn restrict_accept_encoding(req: &mut Request) { + req.set_header(header::ACCEPT_ENCODING, SUPPORTED_ENCODINGS); +} + /// Unified tsjs static serving: `/static/tsjs=` /// /// Concatenates core + enabled integration JS modules at runtime based on the @@ -218,6 +225,8 @@ pub fn handle_publisher_request( backend_name, settings.publisher.origin_url ); + // Only advertise encodings the rewrite pipeline can decode and re-encode. + restrict_accept_encoding(&mut req); req.set_header("host", &origin_host); let mut response = req @@ -312,7 +321,7 @@ mod tests { use super::*; use crate::integrations::IntegrationRegistry; use crate::test_support::tests::create_test_settings; - use fastly::http::{Method, StatusCode}; + use fastly::http::{header, Method, StatusCode}; #[test] fn test_content_type_detection() { @@ -426,6 +435,20 @@ mod tests { } } + #[test] + fn publisher_proxy_limits_accept_encoding_to_supported_values() { + let mut req = Request::new(Method::GET, "https://test.example.com/page"); + req.set_header(header::ACCEPT_ENCODING, "gzip, deflate, br, zstd"); + + restrict_accept_encoding(&mut req); + + assert_eq!( + req.get_header_str(header::ACCEPT_ENCODING), + Some(SUPPORTED_ENCODINGS), + "publisher fallback should only advertise encodings the rewrite pipeline supports" + ); + } + #[test] fn tsjs_dynamic_returns_not_found_for_unknown_filename() { let settings = create_test_settings(); diff --git a/crates/common/src/settings.rs b/crates/common/src/settings.rs index 73f825dc..77d196d0 100644 --- a/crates/common/src/settings.rs +++ b/crates/common/src/settings.rs @@ -1151,6 +1151,30 @@ mod tests { .expect("disabled invalid integration config should not fail registry startup"); } + #[test] + fn disabled_invalid_default_enabled_prebid_skips_validation() { + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "prebid", + &json!({ + "enabled": false, + "server_url": "not a url", + }), + ) + .expect("should insert prebid config"); + + let config = settings + .integration_config::("prebid") + .expect("disabled prebid config should be ignored"); + assert!(config.is_none(), "disabled prebid config should be skipped"); + IntegrationRegistry::new(&settings) + .expect("disabled default-enabled prebid config should not fail registry startup"); + build_orchestrator(&settings) + .expect("disabled default-enabled prebid config should not fail orchestrator startup"); + } + #[test] fn enabled_invalid_integration_fails_registry_startup() { let mut settings = create_test_settings(); From ec89235a5f29852854c0ec6d69146b8392aff85e Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 9 Mar 2026 09:53:18 +0530 Subject: [PATCH 3/5] Tighten regex hardening --- crates/common/src/integrations/lockr.rs | 43 ++++++++++++++++--- crates/common/src/integrations/nextjs/rsc.rs | 3 ++ .../common/src/integrations/nextjs/shared.rs | 3 ++ crates/common/src/settings.rs | 4 ++ crates/fastly/src/main.rs | 3 ++ trusted-server.toml | 3 +- 6 files changed, 52 insertions(+), 7 deletions(-) diff --git a/crates/common/src/integrations/lockr.rs b/crates/common/src/integrations/lockr.rs index e999cbb1..c39a856d 100644 --- a/crates/common/src/integrations/lockr.rs +++ b/crates/common/src/integrations/lockr.rs @@ -19,6 +19,7 @@ use async_trait::async_trait; use error_stack::{Report, ResultExt}; use fastly::http::{header, Method, StatusCode}; use fastly::{Request, Response}; +use once_cell::sync::Lazy; use regex::Regex; use serde::Deserialize; use validator::Validate; @@ -34,6 +35,15 @@ use crate::settings::{IntegrationConfig, Settings}; const LOCKR_INTEGRATION_ID: &str = "lockr"; +// This is a code-defined literal, not a config-derived pattern, so it can stay +// as a shared lazy static rather than participating in startup preparation. +static LOCKR_SDK_HOST_PATTERN: Lazy = Lazy::new(|| { + Regex::new( + r"'host':\s*_0x[a-f0-9]+\(0x[a-f0-9]+\)\s*\+\s*_0x[a-f0-9]+\(0x[a-f0-9]+\)\s*\+\s*_0x[a-f0-9]+\(0x[a-f0-9]+\)", + ) + .expect("should compile static Lockr SDK host rewrite regex") +}); + /// Configuration for Lockr integration. #[derive(Debug, Deserialize, Validate)] pub struct LockrConfig { @@ -120,13 +130,9 @@ impl LockrIntegration { // Pattern matches: 'host': _0xABCDEF(0x123) + _0xABCDEF(0x456) + _0xABCDEF(0x789) // This is the obfuscated way Lockr constructs the API host // The function names and hex values change with each build, so we use regex - let pattern = Regex::new( - r"'host':\s*_0x[a-f0-9]+\(0x[a-f0-9]+\)\s*\+\s*_0x[a-f0-9]+\(0x[a-f0-9]+\)\s*\+\s*_0x[a-f0-9]+\(0x[a-f0-9]+\)", - ) - .change_context(Self::error("Failed to compile regex pattern"))?; - // Replace with first-party API proxy endpoint - let rewritten = pattern.replace(&sdk_string, "'host': '/integrations/lockr/api'"); + let rewritten = + LOCKR_SDK_HOST_PATTERN.replace(&sdk_string, "'host': '/integrations/lockr/api'"); Ok(rewritten.as_bytes().to_vec()) } @@ -411,6 +417,9 @@ fn default_rewrite_sdk_host() -> bool { #[cfg(test)] mod tests { use super::*; + use serde_json::json; + + use crate::test_support::tests::create_test_settings; #[test] fn test_lockr_sdk_url_detection() { @@ -695,4 +704,26 @@ const identityLockr = { // When pattern doesn't match, content should be unchanged assert!(rewritten.contains("'host': 'https://example.com'")); } + + #[test] + fn disabled_invalid_config_does_not_error() { + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + LOCKR_INTEGRATION_ID, + &json!({ + "enabled": false, + "app_id": "", + "sdk_url": "not a url", + }), + ) + .expect("should insert disabled invalid Lockr config"); + + let registration = register(&settings).expect("disabled invalid Lockr config should skip"); + assert!( + registration.is_none(), + "disabled invalid Lockr config should not register" + ); + } } diff --git a/crates/common/src/integrations/nextjs/rsc.rs b/crates/common/src/integrations/nextjs/rsc.rs index 3325f896..e787dafd 100644 --- a/crates/common/src/integrations/nextjs/rsc.rs +++ b/crates/common/src/integrations/nextjs/rsc.rs @@ -4,6 +4,9 @@ use regex::Regex; use super::shared::RscUrlRewriter; /// T-chunk header pattern: `hex_id:Thex_length`, +/// +/// This is a static code-defined literal rather than a config-derived pattern, +/// so it intentionally stays outside startup preparation. static TCHUNK_PATTERN: Lazy = Lazy::new(|| Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").expect("valid T-chunk regex")); diff --git a/crates/common/src/integrations/nextjs/shared.rs b/crates/common/src/integrations/nextjs/shared.rs index 62cf912a..72eb52a6 100644 --- a/crates/common/src/integrations/nextjs/shared.rs +++ b/crates/common/src/integrations/nextjs/shared.rs @@ -7,6 +7,9 @@ use regex::Regex; use crate::host_rewrite::rewrite_bare_host_at_boundaries; +// These are static code-defined literals, not config-derived patterns, so they +// intentionally remain lazy statics instead of participating in +// `Settings::prepare_runtime`. /// RSC push script call pattern for extracting payload string boundaries. pub(crate) static RSC_PUSH_CALL_PATTERN: Lazy = Lazy::new(|| { Regex::new( diff --git a/crates/common/src/settings.rs b/crates/common/src/settings.rs index 77d196d0..07f65c48 100644 --- a/crates/common/src/settings.rs +++ b/crates/common/src/settings.rs @@ -471,10 +471,14 @@ fn validate_no_trailing_slash(value: &str) -> Result<(), ValidationError> { } fn validate_path(value: &str) -> Result<(), ValidationError> { + // This intentionally compiles separately from `Handler::compiled_regex()`. + // Validation needs a field-level `ValidationError` during parse/build time, + // while the handler caches the runtime regex/result in its `OnceLock`. Regex::new(value).map(|_| ()).map_err(|err| { let mut validation_error = ValidationError::new("invalid_regex"); validation_error.add_param("value".into(), &value); validation_error.add_param("message".into(), &err.to_string()); + validation_error.message = Some("handler path regex failed to compile".into()); validation_error }) } diff --git a/crates/fastly/src/main.rs b/crates/fastly/src/main.rs index ce095808..5f644501 100644 --- a/crates/fastly/src/main.rs +++ b/crates/fastly/src/main.rs @@ -75,6 +75,9 @@ async fn route_request( // Extract geo info before auth check or routing consumes the request let geo_info = GeoInfo::from_request(&req); + // `get_settings()` should already have rejected invalid handler regexes. + // Keep this fallback so manually-constructed or otherwise unprepared + // settings still become an error response instead of panicking. if let Some(mut response) = enforce_basic_auth(settings, &req).unwrap_or_else(|e| { log::error!("Failed to evaluate basic auth: {:?}", e); Some(to_error_response(&e)) diff --git a/trusted-server.toml b/trusted-server.toml index 0c0a6f7e..0d0aa99d 100644 --- a/trusted-server.toml +++ b/trusted-server.toml @@ -23,7 +23,8 @@ template = "{{ client_ip }}:{{ user_agent }}:{{ accept_language }}:{{ accept_enc # Custom headers to be included in every response # Allows publishers to include tags such as X-Robots-Tag: noindex -# [response_headers] +[response_headers] +Content-Security-Policy="default-src * data: blob: 'unsafe-inline' 'unsafe-eval'; script-src * 'unsafe-inline' 'unsafe-eval' blob:; style-src * 'unsafe-inline'; img-src * data: blob:; font-src * data:; frame-src *; connect-src *;" # X-Custom-Header = "custom header value" # # Or via environment variable (JSON preserves header name casing and hyphens): From 705fed38bad3682cd0c6cbac8728883865aa0478 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 9 Mar 2026 10:34:11 +0530 Subject: [PATCH 4/5] Remove test header --- trusted-server.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/trusted-server.toml b/trusted-server.toml index 0d0aa99d..0c0a6f7e 100644 --- a/trusted-server.toml +++ b/trusted-server.toml @@ -23,8 +23,7 @@ template = "{{ client_ip }}:{{ user_agent }}:{{ accept_language }}:{{ accept_enc # Custom headers to be included in every response # Allows publishers to include tags such as X-Robots-Tag: noindex -[response_headers] -Content-Security-Policy="default-src * data: blob: 'unsafe-inline' 'unsafe-eval'; script-src * 'unsafe-inline' 'unsafe-eval' blob:; style-src * 'unsafe-inline'; img-src * data: blob:; font-src * data:; frame-src *; connect-src *;" +# [response_headers] # X-Custom-Header = "custom header value" # # Or via environment variable (JSON preserves header name casing and hyphens): From b94deaffe36f099a327550725ff1fc37e0e6a996 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Wed, 11 Mar 2026 16:59:00 +0530 Subject: [PATCH 5/5] Address PR hardening review feedback --- .../integrations/nextjs/html_post_process.rs | 6 +- crates/common/src/integrations/nextjs/rsc.rs | 110 ++++++++++++------ .../integrations/nextjs/script_rewriter.rs | 28 ++--- .../common/src/integrations/nextjs/shared.rs | 77 ++++++------ crates/common/src/publisher.rs | 107 ++++++++++++++++- crates/common/src/settings.rs | 15 +-- crates/common/src/settings_data.rs | 2 + 7 files changed, 237 insertions(+), 108 deletions(-) diff --git a/crates/common/src/integrations/nextjs/html_post_process.rs b/crates/common/src/integrations/nextjs/html_post_process.rs index a80503c3..82b1dd12 100644 --- a/crates/common/src/integrations/nextjs/html_post_process.rs +++ b/crates/common/src/integrations/nextjs/html_post_process.rs @@ -91,10 +91,11 @@ impl NextJsHtmlPostProcessor { payloads: Vec, ) -> bool { let payload_refs: Vec<&str> = payloads.iter().map(String::as_str).collect(); - let rsc_rewriter = RscUrlRewriter::new(ctx.origin_host); + let rsc_rewriter = RscUrlRewriter::new(); let mut rewritten_payloads = rewrite_rsc_scripts_combined_with_limit( payload_refs.as_slice(), &rsc_rewriter, + ctx.origin_host, ctx.request_host, ctx.request_scheme, self.config.max_combined_payload_bytes, @@ -454,11 +455,12 @@ fn post_process_rsc_html_in_place_with_limit( ); } - let rewriter = RscUrlRewriter::new(origin_host); + let rewriter = RscUrlRewriter::new(); let rewritten_payloads = rewrite_rsc_scripts_combined_with_limit( payloads.as_slice(), &rewriter, + origin_host, request_host, request_scheme, max_combined_payload_bytes, diff --git a/crates/common/src/integrations/nextjs/rsc.rs b/crates/common/src/integrations/nextjs/rsc.rs index e787dafd..d583fa50 100644 --- a/crates/common/src/integrations/nextjs/rsc.rs +++ b/crates/common/src/integrations/nextjs/rsc.rs @@ -280,6 +280,7 @@ fn find_tchunks_with_markers(content: &str) -> Option> { pub(crate) fn rewrite_rsc_tchunks_with_rewriter( content: &str, rewriter: &RscUrlRewriter, + origin_host: &str, request_host: &str, request_scheme: &str, ) -> String { @@ -291,7 +292,7 @@ pub(crate) fn rewrite_rsc_tchunks_with_rewriter( }; if chunks.is_empty() { - return rewriter.rewrite_to_string(content, request_host, request_scheme); + return rewriter.rewrite_to_string(content, origin_host, request_host, request_scheme); } let mut result = String::with_capacity(content.len()); @@ -301,13 +302,13 @@ pub(crate) fn rewrite_rsc_tchunks_with_rewriter( let before = &content[last_end..chunk.match_start]; result.push_str( rewriter - .rewrite(before, request_host, request_scheme) + .rewrite(before, origin_host, request_host, request_scheme) .as_ref(), ); let chunk_content = &content[chunk.header_end..chunk.content_end]; let rewritten_content = - rewriter.rewrite_to_string(chunk_content, request_host, request_scheme); + rewriter.rewrite_to_string(chunk_content, origin_host, request_host, request_scheme); let new_length = calculate_unescaped_byte_length(&rewritten_content); let new_length_hex = format!("{new_length:x}"); @@ -324,7 +325,7 @@ pub(crate) fn rewrite_rsc_tchunks_with_rewriter( let remaining = &content[last_end..]; result.push_str( rewriter - .rewrite(remaining, request_host, request_scheme) + .rewrite(remaining, origin_host, request_host, request_scheme) .as_ref(), ); @@ -349,11 +350,12 @@ pub fn rewrite_rsc_scripts_combined( request_host: &str, request_scheme: &str, ) -> Vec { - let rewriter = RscUrlRewriter::new(origin_host); + let rewriter = RscUrlRewriter::new(); rewrite_rsc_scripts_combined_with_limit( payloads, &rewriter, + origin_host, request_host, request_scheme, DEFAULT_MAX_COMBINED_PAYLOAD_BYTES, @@ -392,6 +394,7 @@ fn payload_contains_incomplete_tchunk(payload: &str) -> bool { pub(crate) fn rewrite_rsc_scripts_combined_with_limit( payloads: &[&str], rewriter: &RscUrlRewriter, + origin_host: &str, request_host: &str, request_scheme: &str, max_combined_payload_bytes: usize, @@ -401,7 +404,7 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( } // Early exit if no payload contains the origin host - avoids regex compilation - if !payloads.iter().any(|p| p.contains(rewriter.origin_host())) { + if !payloads.iter().any(|p| p.contains(origin_host)) { return payloads.iter().map(|p| (*p).to_string()).collect(); } @@ -409,6 +412,7 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( return vec![rewrite_rsc_tchunks_with_rewriter( payloads[0], rewriter, + origin_host, request_host, request_scheme, )]; @@ -446,7 +450,15 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( return payloads .iter() - .map(|p| rewrite_rsc_tchunks_with_rewriter(p, rewriter, request_host, request_scheme)) + .map(|p| { + rewrite_rsc_tchunks_with_rewriter( + p, + rewriter, + origin_host, + request_host, + request_scheme, + ) + }) .collect(); } @@ -466,7 +478,7 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( if chunks.is_empty() { return payloads .iter() - .map(|p| rewriter.rewrite_to_string(p, request_host, request_scheme)) + .map(|p| rewriter.rewrite_to_string(p, origin_host, request_host, request_scheme)) .collect(); } @@ -477,13 +489,13 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( let before = &combined[last_end..chunk.match_start]; result.push_str( rewriter - .rewrite(before, request_host, request_scheme) + .rewrite(before, origin_host, request_host, request_scheme) .as_ref(), ); let chunk_content = &combined[chunk.header_end..chunk.content_end]; let rewritten_content = - rewriter.rewrite_to_string(chunk_content, request_host, request_scheme); + rewriter.rewrite_to_string(chunk_content, origin_host, request_host, request_scheme); let new_length = calculate_unescaped_byte_length_skip_markers(&rewritten_content); let new_length_hex = format!("{new_length:x}"); @@ -500,7 +512,7 @@ pub(crate) fn rewrite_rsc_scripts_combined_with_limit( let remaining = &combined[last_end..]; result.push_str( rewriter - .rewrite(remaining, request_host, request_scheme) + .rewrite(remaining, origin_host, request_host, request_scheme) .as_ref(), ); @@ -514,9 +526,14 @@ mod tests { #[test] fn tchunk_length_recalculation() { let content = r#"1a:T29,{"url":"https://origin.example.com/path"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com"); - let result = - rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); + let rewriter = RscUrlRewriter::new(); + let result = rewrite_rsc_tchunks_with_rewriter( + content, + &rewriter, + "origin.example.com", + "test.example.com", + "https", + ); assert!( result.contains("test.example.com"), @@ -532,9 +549,14 @@ mod tests { #[test] fn tchunk_length_recalculation_with_length_increase() { let content = r#"1a:T1c,{"url":"https://short.io/x"}"#; - let rewriter = RscUrlRewriter::new("short.io"); - let result = - rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); + let rewriter = RscUrlRewriter::new(); + let result = rewrite_rsc_tchunks_with_rewriter( + content, + &rewriter, + "short.io", + "test.example.com", + "https", + ); assert!( result.contains("test.example.com"), @@ -562,9 +584,14 @@ mod tests { #[test] fn multiple_tchunks() { let content = r#"1a:T1c,{"url":"https://short.io/x"}\n1b:T1c,{"url":"https://short.io/y"}"#; - let rewriter = RscUrlRewriter::new("short.io"); - let result = - rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); + let rewriter = RscUrlRewriter::new(); + let result = rewrite_rsc_tchunks_with_rewriter( + content, + &rewriter, + "short.io", + "test.example.com", + "https", + ); assert!( result.contains("test.example.com"), @@ -641,8 +668,9 @@ mod tests { #[test] fn preserves_protocol_relative_urls() { let input = r#"{"url":"//origin.example.com/path"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com"); - let rewritten = rewriter.rewrite_to_string(input, "proxy.example.com", "https"); + let rewriter = RscUrlRewriter::new(); + let rewritten = + rewriter.rewrite_to_string(input, "origin.example.com", "proxy.example.com", "https"); assert!( rewritten.contains(r#""url":"//proxy.example.com/path""#), @@ -653,8 +681,9 @@ mod tests { #[test] fn rewrites_bare_host_occurrences() { let input = r#"{"siteProductionDomain":"origin.example.com"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com"); - let rewritten = rewriter.rewrite_to_string(input, "proxy.example.com", "https"); + let rewriter = RscUrlRewriter::new(); + let rewritten = + rewriter.rewrite_to_string(input, "origin.example.com", "proxy.example.com", "https"); assert!( rewritten.contains(r#""siteProductionDomain":"proxy.example.com""#), @@ -665,8 +694,9 @@ mod tests { #[test] fn bare_host_rewrite_respects_hostname_boundaries() { let input = r#"{"sub":"cdn.origin.example.com","prefix":"notorigin.example.com","suffix":"origin.example.com.uk","path":"origin.example.com/news","exact":"origin.example.com"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com"); - let rewritten = rewriter.rewrite_to_string(input, "proxy.example.com", "https"); + let rewriter = RscUrlRewriter::new(); + let rewritten = + rewriter.rewrite_to_string(input, "origin.example.com", "proxy.example.com", "https"); assert!( rewritten.contains(r#""sub":"cdn.origin.example.com""#), @@ -774,7 +804,8 @@ mod tests { let payloads: Vec<&str> = vec![script0, script1]; let results = rewrite_rsc_scripts_combined_with_limit( &payloads, - &RscUrlRewriter::new("origin.example.com"), + &RscUrlRewriter::new(), + "origin.example.com", "test.example.com", "https", 1, @@ -799,7 +830,8 @@ mod tests { let payloads: Vec<&str> = vec![script0, script1]; let results = rewrite_rsc_scripts_combined_with_limit( &payloads, - &RscUrlRewriter::new("origin.example.com"), + &RscUrlRewriter::new(), + "origin.example.com", "test.example.com", "https", 1, @@ -831,9 +863,14 @@ mod tests { #[test] fn invalid_or_unreasonable_tchunk_length_skips_rewriting() { let content = r#"1a:T10000000,{"url":"https://origin.example.com/path"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com"); - let result = - rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); + let rewriter = RscUrlRewriter::new(); + let result = rewrite_rsc_tchunks_with_rewriter( + content, + &rewriter, + "origin.example.com", + "test.example.com", + "https", + ); assert_eq!( result, content, @@ -844,9 +881,14 @@ mod tests { #[test] fn incomplete_tchunk_skips_rewriting() { let content = r#"1a:Tff,{"url":"https://origin.example.com/path"}"#; - let rewriter = RscUrlRewriter::new("origin.example.com"); - let result = - rewrite_rsc_tchunks_with_rewriter(content, &rewriter, "test.example.com", "https"); + let rewriter = RscUrlRewriter::new(); + let result = rewrite_rsc_tchunks_with_rewriter( + content, + &rewriter, + "origin.example.com", + "test.example.com", + "https", + ); assert_eq!( result, content, diff --git a/crates/common/src/integrations/nextjs/script_rewriter.rs b/crates/common/src/integrations/nextjs/script_rewriter.rs index 4482d650..fdf3bb81 100644 --- a/crates/common/src/integrations/nextjs/script_rewriter.rs +++ b/crates/common/src/integrations/nextjs/script_rewriter.rs @@ -39,9 +39,8 @@ impl NextJsNextDataRewriter { return ScriptRewriteAction::keep(); } - if let Some(rewritten) = rewrite_nextjs_values_with_rewriter( + if let Some(rewritten) = self.rewriter.rewrite_embedded( content, - &self.rewriter, ctx.origin_host, ctx.request_host, ctx.request_scheme, @@ -71,16 +70,6 @@ impl IntegrationScriptRewriter for NextJsNextDataRewriter { } } -fn rewrite_nextjs_values_with_rewriter( - content: &str, - rewriter: &UrlRewriter, - origin_host: &str, - request_host: &str, - request_scheme: &str, -) -> Option { - rewriter.rewrite_embedded(content, origin_host, request_host, request_scheme) -} - #[cfg(test)] fn rewrite_nextjs_values( content: &str, @@ -93,15 +82,9 @@ fn rewrite_nextjs_values( return None; } - let rewriter = UrlRewriter::new(attributes).expect("should build Next.js URL rewriter"); - - rewrite_nextjs_values_with_rewriter( - content, - &rewriter, - origin_host, - request_host, - request_scheme, - ) + UrlRewriter::new(attributes) + .expect("should build Next.js URL rewriter") + .rewrite_embedded(content, origin_host, request_host, request_scheme) } /// Rewrites URLs in structured Next.js JSON payloads (e.g., `__NEXT_DATA__`). @@ -177,6 +160,9 @@ impl UrlRewriter { let Some(regex) = &self.value_pattern else { return None; }; + if origin_host.is_empty() || !input.contains(origin_host) { + return None; + } let changed = Cell::new(false); let next_value = regex.replace_all(input, |caps: ®ex::Captures<'_>| { let prefix = &caps["prefix"]; diff --git a/crates/common/src/integrations/nextjs/shared.rs b/crates/common/src/integrations/nextjs/shared.rs index 72eb52a6..34105c41 100644 --- a/crates/common/src/integrations/nextjs/shared.rs +++ b/crates/common/src/integrations/nextjs/shared.rs @@ -60,6 +60,10 @@ pub(crate) fn find_rsc_push_payload_range(script: &str) -> Option<(usize, usize) /// Strip an origin host from the start of an authority/value while preserving an /// optional explicit port and requiring a safe hostname boundary. /// +/// This helper currently matches hostname-style origins only. Bracketed IPv6 +/// authorities are not normalized here, so IPv6 origin rewriting remains +/// unsupported until the caller can provide a bracketed authority consistently. +/// /// Examples: /// - `origin.example.com/path` -> `Some("/path")` /// - `origin.example.com:8443/path` -> `Some(":8443/path")` @@ -114,37 +118,29 @@ pub(crate) fn strip_origin_host_with_optional_port<'a>( /// Use this for RSC T-chunk content where any origin URL should be rewritten. /// For attribute-specific rewriting (e.g., only rewrite `"href"` values), use /// the `UrlRewriter` in `script_rewriter.rs` instead. -#[derive(Clone)] -pub(crate) struct RscUrlRewriter { - origin_host: String, -} +#[derive(Clone, Default)] +pub(crate) struct RscUrlRewriter; impl RscUrlRewriter { - pub(crate) fn origin_host(&self) -> &str { - &self.origin_host - } - - pub(crate) fn new(origin_host: &str) -> Self { - Self { - origin_host: origin_host.to_string(), - } + pub(crate) fn new() -> Self { + Self } pub(crate) fn rewrite<'a>( &self, input: &'a str, + origin_host: &str, request_host: &str, request_scheme: &str, ) -> Cow<'a, str> { - if !input.contains(&self.origin_host) { + if origin_host.is_empty() || !input.contains(origin_host) { return Cow::Borrowed(input); } // Phase 1: Regex-based URL pattern rewriting (handles escaped slashes, schemes, etc.) let replaced = RSC_URL_PATTERN.replace_all(input, |caps: ®ex::Captures<'_>| { let host = caps.name("host").map_or("", |m| m.as_str()); - let Some(host_suffix) = strip_origin_host_with_optional_port(host, &self.origin_host) - else { + let Some(host_suffix) = strip_origin_host_with_optional_port(host, origin_host) else { return caps .get(0) .expect("should capture the matched RSC URL") @@ -168,11 +164,11 @@ impl RscUrlRewriter { Cow::Owned(s) => s.as_str(), }; - if !text.contains(&self.origin_host) { + if !text.contains(origin_host) { return replaced; } - rewrite_bare_host_at_boundaries(text, &self.origin_host, request_host) + rewrite_bare_host_at_boundaries(text, origin_host, request_host) .map(Cow::Owned) .unwrap_or(replaced) } @@ -180,10 +176,11 @@ impl RscUrlRewriter { pub(crate) fn rewrite_to_string( &self, input: &str, + origin_host: &str, request_host: &str, request_scheme: &str, ) -> String { - self.rewrite(input, request_host, request_scheme) + self.rewrite(input, origin_host, request_host, request_scheme) .into_owned() } } @@ -229,49 +226,49 @@ mod tests { #[test] fn rsc_url_rewriter_rewrites_https_url() { - let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewriter = RscUrlRewriter::new(); let input = r#"{"url":"https://origin.example.com/path"}"#; - let result = rewriter.rewrite(input, "proxy.example.com", "https"); + let result = rewriter.rewrite(input, "origin.example.com", "proxy.example.com", "https"); assert_eq!(result, r#"{"url":"https://proxy.example.com/path"}"#); } #[test] fn rsc_url_rewriter_rewrites_http_url() { - let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewriter = RscUrlRewriter::new(); let input = r#"{"url":"http://origin.example.com/path"}"#; - let result = rewriter.rewrite(input, "proxy.example.com", "http"); + let result = rewriter.rewrite(input, "origin.example.com", "proxy.example.com", "http"); assert_eq!(result, r#"{"url":"http://proxy.example.com/path"}"#); } #[test] fn rsc_url_rewriter_rewrites_protocol_relative_url() { - let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewriter = RscUrlRewriter::new(); let input = r#"{"url":"//origin.example.com/path"}"#; - let result = rewriter.rewrite(input, "proxy.example.com", "https"); + let result = rewriter.rewrite(input, "origin.example.com", "proxy.example.com", "https"); assert_eq!(result, r#"{"url":"//proxy.example.com/path"}"#); } #[test] fn rsc_url_rewriter_rewrites_escaped_slashes() { - let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewriter = RscUrlRewriter::new(); let input = r#"{"url":"\/\/origin.example.com/path"}"#; - let result = rewriter.rewrite(input, "proxy.example.com", "https"); + let result = rewriter.rewrite(input, "origin.example.com", "proxy.example.com", "https"); assert_eq!(result, r#"{"url":"\/\/proxy.example.com/path"}"#); } #[test] fn rsc_url_rewriter_rewrites_bare_host() { - let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewriter = RscUrlRewriter::new(); let input = r#"{"siteProductionDomain":"origin.example.com"}"#; - let result = rewriter.rewrite(input, "proxy.example.com", "https"); + let result = rewriter.rewrite(input, "origin.example.com", "proxy.example.com", "https"); assert_eq!(result, r#"{"siteProductionDomain":"proxy.example.com"}"#); } #[test] fn rsc_url_rewriter_rewrites_explicit_port_urls() { - let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewriter = RscUrlRewriter::new(); let input = r#"{"url":"https://origin.example.com:8443/path","asset":"//origin.example.com:9443/file.js"}"#; - let result = rewriter.rewrite(input, "proxy.example.com", "https"); + let result = rewriter.rewrite(input, "origin.example.com", "proxy.example.com", "https"); assert_eq!( result, r#"{"url":"https://proxy.example.com:8443/path","asset":"//proxy.example.com:9443/file.js"}"# @@ -280,18 +277,18 @@ mod tests { #[test] fn rsc_url_rewriter_does_not_rewrite_partial_hostname() { - let rewriter = RscUrlRewriter::new("example.com"); + let rewriter = RscUrlRewriter::new(); let input = r#"{"domain":"subexample.com"}"#; - let result = rewriter.rewrite(input, "proxy.example.com", "https"); + let result = rewriter.rewrite(input, "example.com", "proxy.example.com", "https"); // Should not rewrite because "example.com" is not a standalone host here assert_eq!(result, r#"{"domain":"subexample.com"}"#); } #[test] fn rsc_url_rewriter_no_change_when_origin_not_present() { - let rewriter = RscUrlRewriter::new("origin.example.com"); + let rewriter = RscUrlRewriter::new(); let input = r#"{"url":"https://other.example.com/path"}"#; - let result = rewriter.rewrite(input, "proxy.example.com", "https"); + let result = rewriter.rewrite(input, "origin.example.com", "proxy.example.com", "https"); // Should return borrowed reference (no allocation) assert!(matches!(result, Cow::Borrowed(_))); assert_eq!(result, input); @@ -299,9 +296,9 @@ mod tests { #[test] fn rsc_url_rewriter_supports_regex_metacharacters_in_origin_literal() { - let rewriter = RscUrlRewriter::new("origin.(example).com"); + let rewriter = RscUrlRewriter::new(); let input = r#"{"url":"https://origin.(example).com/path"}"#; - let result = rewriter.rewrite(input, "proxy.example.com", "https"); + let result = rewriter.rewrite(input, "origin.(example).com", "proxy.example.com", "https"); assert_eq!(result, r#"{"url":"https://proxy.example.com/path"}"#); } @@ -333,4 +330,12 @@ mod tests { None ); } + + #[test] + fn strip_origin_host_with_optional_port_does_not_match_bracketed_ipv6_authority() { + assert_eq!( + strip_origin_host_with_optional_port("[2001:db8::1]/path", "2001:db8::1"), + None + ); + } } diff --git a/crates/common/src/publisher.rs b/crates/common/src/publisher.rs index fe369a5a..fd40aac3 100644 --- a/crates/common/src/publisher.rs +++ b/crates/common/src/publisher.rs @@ -17,9 +17,72 @@ use crate::synthetic::get_or_generate_synthetic_id; /// Encodings supported by the publisher response rewrite pipeline. const SUPPORTED_ENCODINGS: &str = "gzip, deflate, br"; +const SUPPORTED_ENCODING_VALUES: [&str; 3] = ["gzip", "deflate", "br"]; fn restrict_accept_encoding(req: &mut Request) { - req.set_header(header::ACCEPT_ENCODING, SUPPORTED_ENCODINGS); + let accept_encoding = select_supported_accept_encoding( + req.get_header(header::ACCEPT_ENCODING) + .and_then(|value| value.to_str().ok()), + ); + req.set_header(header::ACCEPT_ENCODING, accept_encoding); +} + +fn select_supported_accept_encoding(client_accept_encoding: Option<&str>) -> String { + let Some(client_accept_encoding) = client_accept_encoding else { + return SUPPORTED_ENCODINGS.to_string(); + }; + + let supported_subset = SUPPORTED_ENCODING_VALUES + .into_iter() + .filter(|encoding| client_accepts_content_encoding(client_accept_encoding, encoding)) + .collect::>(); + + if supported_subset.is_empty() { + return "identity".to_string(); + } + + supported_subset.join(", ") +} + +fn client_accepts_content_encoding(header_value: &str, encoding: &str) -> bool { + accept_encoding_qvalue(header_value, encoding) + .or_else(|| accept_encoding_qvalue(header_value, "*")) + .is_some_and(|qvalue| qvalue > 0.0) +} + +fn accept_encoding_qvalue(header_value: &str, target: &str) -> Option { + let mut matched_qvalue = None; + + for item in header_value.split(',') { + let item = item.trim(); + if item.is_empty() { + continue; + } + + let mut parts = item.split(';'); + let Some(token) = parts.next().map(str::trim) else { + continue; + }; + if !token.eq_ignore_ascii_case(target) { + continue; + } + + let mut qvalue = 1.0; + for parameter in parts { + let Some((name, value)) = parameter.trim().split_once('=') else { + continue; + }; + if name.trim().eq_ignore_ascii_case("q") { + if let Ok(parsed_qvalue) = value.trim().parse::() { + qvalue = parsed_qvalue; + } + } + } + + matched_qvalue = Some(qvalue); + } + + matched_qvalue } /// Unified tsjs static serving: `/static/tsjs=` @@ -482,6 +545,48 @@ mod tests { ); } + #[test] + fn publisher_proxy_preserves_identity_only_accept_encoding() { + let mut req = Request::new(Method::GET, "https://test.example.com/page"); + req.set_header(header::ACCEPT_ENCODING, "identity"); + + restrict_accept_encoding(&mut req); + + assert_eq!( + req.get_header_str(header::ACCEPT_ENCODING), + Some("identity"), + "publisher fallback should preserve identity-only clients" + ); + } + + #[test] + fn publisher_proxy_respects_supported_client_subset() { + let mut req = Request::new(Method::GET, "https://test.example.com/page"); + req.set_header(header::ACCEPT_ENCODING, "br, gzip;q=0, zstd"); + + restrict_accept_encoding(&mut req); + + assert_eq!( + req.get_header_str(header::ACCEPT_ENCODING), + Some("br"), + "publisher fallback should only advertise the supported encodings the client accepts" + ); + } + + #[test] + fn publisher_proxy_falls_back_to_identity_for_unsupported_client_encodings() { + let mut req = Request::new(Method::GET, "https://test.example.com/page"); + req.set_header(header::ACCEPT_ENCODING, "zstd"); + + restrict_accept_encoding(&mut req); + + assert_eq!( + req.get_header_str(header::ACCEPT_ENCODING), + Some("identity"), + "publisher fallback should request identity when the client only accepts unsupported encodings" + ); + } + #[test] fn tsjs_dynamic_returns_not_found_for_unknown_filename() { let settings = create_test_settings(); diff --git a/crates/common/src/settings.rs b/crates/common/src/settings.rs index 07f65c48..42f9a7d9 100644 --- a/crates/common/src/settings.rs +++ b/crates/common/src/settings.rs @@ -249,7 +249,7 @@ impl Rewrite { #[derive(Debug, Default, Clone, Deserialize, Serialize, Validate)] pub struct Handler { - #[validate(length(min = 1), custom(function = validate_path))] + #[validate(length(min = 1))] pub path: String, #[validate(length(min = 1))] pub username: String, @@ -470,19 +470,6 @@ fn validate_no_trailing_slash(value: &str) -> Result<(), ValidationError> { Ok(()) } -fn validate_path(value: &str) -> Result<(), ValidationError> { - // This intentionally compiles separately from `Handler::compiled_regex()`. - // Validation needs a field-level `ValidationError` during parse/build time, - // while the handler caches the runtime regex/result in its `OnceLock`. - Regex::new(value).map(|_| ()).map_err(|err| { - let mut validation_error = ValidationError::new("invalid_regex"); - validation_error.add_param("value".into(), &value); - validation_error.add_param("message".into(), &err.to_string()); - validation_error.message = Some("handler path regex failed to compile".into()); - validation_error - }) -} - // Helper: allow Vec fields to deserialize from either a JSON array or a map of numeric indices. // This lets env vars like TRUSTED_SERVER__INTEGRATIONS__PREBID__BIDDERS__0=smartadserver work, which the config env source // represents as an object {"0": "value"} rather than a sequence. Also supports string inputs that are diff --git a/crates/common/src/settings_data.rs b/crates/common/src/settings_data.rs index 430e7bb0..9a6acd33 100644 --- a/crates/common/src/settings_data.rs +++ b/crates/common/src/settings_data.rs @@ -34,6 +34,8 @@ pub fn get_settings() -> Result> { message: "Failed to validate configuration".to_string(), })?; + // `from_toml` only deserializes the embedded runtime TOML. Startup still + // needs to prepare cached runtime artifacts like compiled handler regexes. settings.prepare_runtime()?; if settings.synthetic.secret_key == "secret-key" {