3 changes: 2 additions & 1 deletion Makefile
@@ -188,7 +188,8 @@ calibrate-modal-national:
modal run modal_app/remote_calibration_runner.py::main \
--branch $(BRANCH) --gpu $(NATIONAL_GPU) \
--epochs $(NATIONAL_EPOCHS) \
--push-results --national
--push-results --national \
--target-config policyengine_us_data/calibration/target_config_national.yaml

calibrate-both:
$(MAKE) calibrate-modal & $(MAKE) calibrate-modal-national & wait
1 change: 1 addition & 0 deletions changelog.d/acs-2024-donor.changed.md
@@ -0,0 +1 @@
Use 2024 ACS 1-year PUMS as the rent and property-tax donor source for source imputation, while keeping the historical 2022 ACS dataset available.
1 change: 1 addition & 0 deletions changelog.d/scf-2024-source-impute.changed.md
@@ -0,0 +1 @@
Age SCF donor values from 2022 to 2024 during source imputation, align household `net_worth` to the 2024 national total, and make the 2024 source-imputed CPS the tested net-worth dataset.
1 change: 1 addition & 0 deletions changelog.d/source-imputed-cps-datasets.changed.md
@@ -0,0 +1 @@
Add dataset classes for the stratified and source-imputed stratified extended CPS artifacts, and align `EnhancedCPS_2024` with the canonical source-imputed calibration input.
17 changes: 10 additions & 7 deletions docs/calibration.md
Expand Up @@ -182,20 +182,24 @@ weights = fit_l0_weights(

## Target Config

The target config controls which targets reach the optimizer. It uses a YAML exclusion list:
The target config controls which targets reach the optimizer. It can use either a YAML inclusion list or exclusion list:

```yaml
exclude:
include:
- variable: net_worth
geo_level: national
- variable: rent
geo_level: national

exclude:
- variable: eitc
geo_level: district
- variable: snap
geo_level: state
domain_variable: snap # optional: further narrow the match
```

Each rule drops rows from the calibration matrix where **all** specified fields match. Unrecognized variables silently match nothing.
`include` keeps only matching rows. `exclude` drops matching rows. If both are present, `include` is applied first and `exclude` removes from that set. Unrecognized variables silently match nothing.
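
The filtering rules above can be sketched in plain Python. This is a hypothetical illustration of the documented semantics, not the library's actual implementation; the `rows`, `matches`, and `filter_targets` names are invented for the example:

```python
# Sketch of the documented include/exclude semantics (assumed, not the real code).
def matches(rule: dict, row: dict) -> bool:
    # A rule matches a row only when ALL of its specified fields match.
    return all(row.get(field) == value for field, value in rule.items())


def filter_targets(rows: list[dict], config: dict) -> list[dict]:
    include = config.get("include")
    exclude = config.get("exclude", [])
    # include (if present) keeps only matching rows...
    if include:
        rows = [r for r in rows if any(matches(rule, r) for rule in include)]
    # ...then exclude removes from that set.
    return [r for r in rows if not any(matches(rule, r) for rule in exclude)]


rows = [
    {"variable": "net_worth", "geo_level": "national"},
    {"variable": "eitc", "geo_level": "district"},
    {"variable": "rent", "geo_level": "national"},
]
config = {
    "include": [
        {"variable": "net_worth", "geo_level": "national"},
        {"variable": "rent", "geo_level": "national"},
    ],
    "exclude": [{"variable": "eitc", "geo_level": "district"}],
}
kept = filter_targets(rows, config)  # keeps the net_worth and rent rows
```

Note that an unrecognized variable in a rule never equals any row's value, so it silently matches nothing, as described above.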

### Fields

@@ -207,12 +211,11 @@ Each rule drops rows from the calibration matrix where **all** specified fields

### Default config

The checked-in config at `policyengine_us_data/calibration/target_config.yaml` reproduces the junkyard notebook's 22 excluded target groups. It drops:
The default training config at `policyengine_us_data/calibration/target_config.yaml` is include-based. It defines the shared target subset used by local calibration and excludes national `net_worth`.

- **13 national-level variables**: alimony, charitable deduction, child support, interest deduction, medical expense deduction, net worth, person count, real estate taxes, rent, social security dependents/survivors
- **9 district-level variables**: ACA PTC, EITC, income tax before credits, medical expense deduction, net capital gains, rental income, tax unit count, partnership/S-corp income, taxable social security
The national calibration preset uses `policyengine_us_data/calibration/target_config_national.yaml`, which is the same include-based target set plus national `net_worth`.

Applying this config reduces targets from ~37K to ~21K, matching the junkyard's target selection.
The checked-in backup config at `policyengine_us_data/calibration/target_config_full.yaml` preserves the earlier junkyard-style exclusion list for reference.
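
Putting the three configs together, the national preset presumably looks like the shared include list with one extra entry. A hypothetical sketch (not the checked-in file; the actual variable list lives in the repo):

```yaml
# Hypothetical sketch of target_config_national.yaml: the shared
# include-based target set plus national net_worth.
include:
  - variable: rent
    geo_level: national
  # ... rest of the shared include list ...
  - variable: net_worth  # the national preset's addition
    geo_level: national
```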

### Writing a custom config

Expand Down
4 changes: 2 additions & 2 deletions modal_app/README.md
Expand Up @@ -185,7 +185,7 @@ Loads pre-built matrices from Modal volume, fits L0-regularized weights on GPU.
| **Modal CLI (national preset)** | `make calibrate-modal-national BRANCH=<branch>` |
| **Both presets** | `make calibrate-both BRANCH=<branch>` |

`make calibrate-modal` passes `--prebuilt-matrices --push-results` automatically. `make calibrate-modal-national` adds `--national`, which sets λ_L0=1e-4 for a smaller ~50K-record output. `make calibrate-both` runs both in parallel.
`make calibrate-modal` passes `--prebuilt-matrices --push-results` automatically. `make calibrate-modal-national` adds `--national`, which sets λ_L0=1e-4 for a smaller ~50K-record output, and points `--target-config` at `policyengine_us_data/calibration/target_config_national.yaml`, which currently adds national `net_worth`. `make calibrate-both` runs both in parallel.

Full example:
```
@@ -194,7 +194,7 @@ modal run modal_app/remote_calibration_runner.py::main \
--gpu T4 --epochs 1000 \
--beta 0.65 --lambda-l0 1e-6 --lambda-l2 1e-8 \
--log-freq 500 \
--target-config policyengine_us_data/calibration/target_config.yaml \
--target-config policyengine_us_data/calibration/target_config_national.yaml \
--prebuilt-matrices --push-results
```

2 changes: 1 addition & 1 deletion modal_app/data_build.py
Expand Up @@ -36,7 +36,7 @@
"policyengine_us_data/storage/uprating_factors.csv"
),
"policyengine_us_data/datasets/acs/acs.py": (
"policyengine_us_data/storage/acs_2022.h5"
"policyengine_us_data/storage/acs_2024.h5"
),
"policyengine_us_data/datasets/puf/irs_puf.py": (
"policyengine_us_data/storage/irs_puf_2015.h5"
39 changes: 27 additions & 12 deletions policyengine_us_data/calibration/create_source_imputed_cps.py
@@ -10,9 +10,9 @@

import logging
import sys
from pathlib import Path

import h5py
import numpy as np

from policyengine_us_data.storage import STORAGE_FOLDER

@@ -22,33 +22,48 @@
OUTPUT_PATH = str(STORAGE_FOLDER / "source_imputed_stratified_extended_cps_2024.h5")


def _resolve_household_state_fips(sim, n_records: int, seed: int, use_existing_state_fips: bool):
if use_existing_state_fips:
try:
existing_states = sim.calculate("state_fips", map_to="household").values
if len(existing_states) == n_records:
logger.info("Using existing household state_fips from input dataset")
return np.asarray(existing_states, dtype=np.int32)
except Exception as exc:
logger.info("Existing state_fips unavailable, assigning random geography: %s", exc)

from policyengine_us_data.calibration.clone_and_assign import assign_random_geography

geography = assign_random_geography(n_records=n_records, n_clones=1, seed=seed)
return geography.state_fips[:n_records].astype(np.int32)


def create_source_imputed_cps(
input_path: str = INPUT_PATH,
output_path: str = OUTPUT_PATH,
seed: int = 42,
use_existing_state_fips: bool = False,
time_period: int | None = None,
):
from policyengine_us import Microsimulation
from policyengine_us_data.calibration.clone_and_assign import (
assign_random_geography,
)
from policyengine_us_data.calibration.source_impute import (
impute_source_variables,
)

logger.info("Loading dataset from %s", input_path)
sim = Microsimulation(dataset=input_path)
n_records = len(sim.calculate("household_id", map_to="household").values)

raw_keys = sim.dataset.load_dataset()["household_id"]
if isinstance(raw_keys, dict):
time_period = int(next(iter(raw_keys)))
else:
time_period = 2024
if time_period is None:
time_period = int(sim.default_calculation_period)

logger.info("Loaded %d households, time_period=%d", n_records, time_period)

geography = assign_random_geography(n_records=n_records, n_clones=1, seed=seed)
base_states = geography.state_fips[:n_records]
base_states = _resolve_household_state_fips(
sim,
n_records=n_records,
seed=seed,
use_existing_state_fips=use_existing_state_fips,
)

raw_data = sim.dataset.load_dataset()
data_dict = {}