From 5f487e00efb500cf8e1e6d5f17da8e52583c0687 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 29 Mar 2026 21:49:45 -0400
Subject: [PATCH 1/2] Refresh local SOI state targets for local calibration

---
 changelog.d/local-soi-refresh.changed.md      |   1 +
 policyengine_us_data/storage/README.md        |   7 +
 .../storage/calibration_targets/agi_state.csv |  36 +--
 .../refresh_local_agi_state_targets.py        | 212 ++++++++++++++++++
 tests/test_refresh_local_agi_state_targets.py | 109 +++++++++
 5 files changed, 347 insertions(+), 18 deletions(-)
 create mode 100644 changelog.d/local-soi-refresh.changed.md
 create mode 100644 policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py
 create mode 100644 tests/test_refresh_local_agi_state_targets.py

diff --git a/changelog.d/local-soi-refresh.changed.md b/changelog.d/local-soi-refresh.changed.md
new file mode 100644
index 000000000..7a2ca6788
--- /dev/null
+++ b/changelog.d/local-soi-refresh.changed.md
@@ -0,0 +1 @@
+Added an explicit refresh path and regression coverage for the legacy `agi_state.csv` SOI targets used by local calibration.
diff --git a/policyengine_us_data/storage/README.md b/policyengine_us_data/storage/README.md
index 80d4c1cdc..a440d6195 100644
--- a/policyengine_us_data/storage/README.md
+++ b/policyengine_us_data/storage/README.md
@@ -5,6 +5,13 @@
   • Date: 2024  
   • Location: https://www.cms.gov/files/document/health-insurance-exchanges-2024-open-enrollment-report-final.pdf
 
+- **agi_state.csv**
+  • Source: IRS SOI state data file used by legacy local calibration
+  • Date: tax year 2022
+  • Created by: `policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py`
+  • Location: https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv
+  • Notes: This file intentionally keeps the legacy `utils/loss.py` schema (`AL`, `DC`, etc.) instead of the newer `state_AL` geography naming used in `soi.csv`/database overlays. It is separate from `soi_targets.csv`, and it currently lags the national SOI refresh because IRS geographic state SOI files are only published through TY2022.
+
 - **medicaid_enrollment_2024.csv**  
   • Source: MACPAC Enrollment Tables, FFY 2024  
   • Date: 2024  
diff --git a/policyengine_us_data/storage/calibration_targets/agi_state.csv b/policyengine_us_data/storage/calibration_targets/agi_state.csv
index ee57163ab..793626eed 100644
--- a/policyengine_us_data/storage/calibration_targets/agi_state.csv
+++ b/policyengine_us_data/storage/calibration_targets/agi_state.csv
@@ -63,14 +63,14 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
 0400000US10,DE,75000.0,100000.0,51060,1,adjusted_gross_income/count
 0400000US10,DE,100000.0,200000.0,89920,1,adjusted_gross_income/count
 0400000US10,DE,200000.0,500000.0,30280,1,adjusted_gross_income/count
-,DC,-inf,1.0,4700,1,adjusted_gross_income/count
-,DC,1.0,10000.0,27330,1,adjusted_gross_income/count
-,DC,10000.0,25000.0,44140,1,adjusted_gross_income/count
-,DC,25000.0,50000.0,65340,1,adjusted_gross_income/count
-,DC,50000.0,75000.0,55150,1,adjusted_gross_income/count
-,DC,75000.0,100000.0,39010,1,adjusted_gross_income/count
-,DC,100000.0,200000.0,66180,1,adjusted_gross_income/count
-,DC,200000.0,500000.0,36310,1,adjusted_gross_income/count
+0400000US11,DC,-inf,1.0,4700,1,adjusted_gross_income/count
+0400000US11,DC,1.0,10000.0,27330,1,adjusted_gross_income/count
+0400000US11,DC,10000.0,25000.0,44140,1,adjusted_gross_income/count
+0400000US11,DC,25000.0,50000.0,65340,1,adjusted_gross_income/count
+0400000US11,DC,50000.0,75000.0,55150,1,adjusted_gross_income/count
+0400000US11,DC,75000.0,100000.0,39010,1,adjusted_gross_income/count
+0400000US11,DC,100000.0,200000.0,66180,1,adjusted_gross_income/count
+0400000US11,DC,200000.0,500000.0,36310,1,adjusted_gross_income/count
 0400000US12,FL,-inf,1.0,216880,1,adjusted_gross_income/count
 0400000US12,FL,1.0,10000.0,1123740,1,adjusted_gross_income/count
 0400000US12,FL,10000.0,25000.0,2180990,1,adjusted_gross_income/count
@@ -414,7 +414,7 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
 0400000US06,CA,500000.0,inf,426810,1,adjusted_gross_income/count
 0400000US08,CO,500000.0,inf,51500,1,adjusted_gross_income/count
 0400000US09,CT,500000.0,inf,45510,1,adjusted_gross_income/count
-,DC,500000.0,inf,10530,1,adjusted_gross_income/count
+0400000US11,DC,500000.0,inf,10530,1,adjusted_gross_income/count
 0400000US10,DE,500000.0,inf,5350,1,adjusted_gross_income/count
 0400000US12,FL,500000.0,inf,197090,1,adjusted_gross_income/count
 0400000US13,GA,500000.0,inf,65350,1,adjusted_gross_income/count
@@ -522,14 +522,14 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
 0400000US10,DE,75000.0,100000.0,4427687000,0,adjusted_gross_income/amount
 0400000US10,DE,100000.0,200000.0,12401957000,0,adjusted_gross_income/amount
 0400000US10,DE,200000.0,500000.0,8502065000,0,adjusted_gross_income/amount
-,DC,-inf,1.0,-394257000,0,adjusted_gross_income/amount
-,DC,1.0,10000.0,136195000,0,adjusted_gross_income/amount
-,DC,10000.0,25000.0,757904000,0,adjusted_gross_income/amount
-,DC,25000.0,50000.0,2434747000,0,adjusted_gross_income/amount
-,DC,50000.0,75000.0,3419215000,0,adjusted_gross_income/amount
-,DC,75000.0,100000.0,3380668000,0,adjusted_gross_income/amount
-,DC,100000.0,200000.0,9164382000,0,adjusted_gross_income/amount
-,DC,200000.0,500000.0,10828885000,0,adjusted_gross_income/amount
+0400000US11,DC,-inf,1.0,-394257000,0,adjusted_gross_income/amount
+0400000US11,DC,1.0,10000.0,136195000,0,adjusted_gross_income/amount
+0400000US11,DC,10000.0,25000.0,757904000,0,adjusted_gross_income/amount
+0400000US11,DC,25000.0,50000.0,2434747000,0,adjusted_gross_income/amount
+0400000US11,DC,50000.0,75000.0,3419215000,0,adjusted_gross_income/amount
+0400000US11,DC,75000.0,100000.0,3380668000,0,adjusted_gross_income/amount
+0400000US11,DC,100000.0,200000.0,9164382000,0,adjusted_gross_income/amount
+0400000US11,DC,200000.0,500000.0,10828885000,0,adjusted_gross_income/amount
 0400000US12,FL,-inf,1.0,-19196408000,0,adjusted_gross_income/amount
 0400000US12,FL,1.0,10000.0,5776254000,0,adjusted_gross_income/amount
 0400000US12,FL,10000.0,25000.0,37314354000,0,adjusted_gross_income/amount
@@ -873,7 +873,7 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
 0400000US06,CA,500000.0,inf,613219427000,0,adjusted_gross_income/amount
 0400000US08,CO,500000.0,inf,71426453000,0,adjusted_gross_income/amount
 0400000US09,CT,500000.0,inf,77248832000,0,adjusted_gross_income/amount
-,DC,500000.0,inf,17097350000,0,adjusted_gross_income/amount
+0400000US11,DC,500000.0,inf,17097350000,0,adjusted_gross_income/amount
 0400000US10,DE,500000.0,inf,6773920000,0,adjusted_gross_income/amount
 0400000US12,FL,500000.0,inf,427887554000,0,adjusted_gross_income/amount
 0400000US13,GA,500000.0,inf,92080953000,0,adjusted_gross_income/amount
diff --git a/policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py b/policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py
new file mode 100644
index 000000000..f590ebe22
--- /dev/null
+++ b/policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py
@@ -0,0 +1,212 @@
+"""Refresh tracked SOI targets used by legacy local calibration.
+
+This regenerates ``agi_state.csv`` from the IRS geographic SOI state file while
+preserving the legacy schema consumed by ``utils/loss.py``:
+
+- ``GEO_NAME`` is the two-letter state abbreviation
+- ``VARIABLE`` is ``adjusted_gross_income/count`` or ``.../amount``
+- AGI bounds live in ``AGI_LOWER_BOUND`` / ``AGI_UPPER_BOUND``
+
+This file intentionally remains separate from the national workbook-backed
+``soi_targets.csv`` refresh path because IRS geographic releases lag the
+national Publication 1304 tables.
+"""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+
+CALIBRATION_FOLDER = Path(__file__).resolve().parent  # directory holding this script
+TARGETS_PATH = CALIBRATION_FOLDER / "agi_state.csv"  # default output, next to this script
+STATE_SOI_TAX_YEAR = 2022  # IRS state SOI release pulled by default
+LOCAL_STATE_SOI_TAX_YEAR = STATE_SOI_TAX_YEAR  # alias used for this module's defaults
+
+AGI_STUB_TO_BAND = {  # AGI_STUB code -> band label (the keys of AGI_BOUNDS)
+    1: "Under $1",
+    2: "$1 under $10,000",
+    3: "$10,000 under $25,000",
+    4: "$25,000 under $50,000",
+    5: "$50,000 under $75,000",
+    6: "$75,000 under $100,000",
+    7: "$100,000 under $200,000",
+    8: "$200,000 under $500,000",
+    9: "$500,000 or more",
+}
+
+AGI_BOUNDS = {  # band label -> (AGI_LOWER_BOUND, AGI_UPPER_BOUND)
+    "Under $1": (-np.inf, 1),
+    "$1 under $10,000": (1, 10_000),
+    "$10,000 under $25,000": (10_000, 25_000),
+    "$25,000 under $50,000": (25_000, 50_000),
+    "$50,000 under $75,000": (50_000, 75_000),
+    "$75,000 under $100,000": (75_000, 100_000),
+    "$100,000 under $200,000": (100_000, 200_000),
+    "$200,000 under $500,000": (200_000, 500_000),
+    "$500,000 or more": (500_000, np.inf),
+}
+
+STATE_ABBR_TO_FIPS = {  # postal abbreviation -> FIPS digits appended to "0400000US"
+    "AL": "01",
+    "AK": "02",
+    "AZ": "04",
+    "AR": "05",
+    "CA": "06",
+    "CO": "08",
+    "CT": "09",
+    "DE": "10",
+    "DC": "11",
+    "FL": "12",
+    "GA": "13",
+    "HI": "15",
+    "ID": "16",
+    "IL": "17",
+    "IN": "18",
+    "IA": "19",
+    "KS": "20",
+    "KY": "21",
+    "LA": "22",
+    "ME": "23",
+    "MD": "24",
+    "MA": "25",
+    "MI": "26",
+    "MN": "27",
+    "MS": "28",
+    "MO": "29",
+    "MT": "30",
+    "NE": "31",
+    "NV": "32",
+    "NH": "33",
+    "NJ": "34",
+    "NM": "35",
+    "NY": "36",
+    "NC": "37",
+    "ND": "38",
+    "OH": "39",
+    "OK": "40",
+    "OR": "41",
+    "PA": "42",
+    "RI": "44",
+    "SC": "45",
+    "SD": "46",
+    "TN": "47",
+    "TX": "48",
+    "UT": "49",
+    "VT": "50",
+    "VA": "51",
+    "WA": "53",
+    "WV": "54",
+    "WI": "55",
+    "WY": "56",
+}
+
+NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"}  # STATE codes dropped
+VARIABLE_SPECS = (  # (source column, VARIABLE label, is_count)
+    ("N1", "adjusted_gross_income/count", True),
+    ("A00100", "adjusted_gross_income/amount", False),
+)
+
+
+def _state_soi_url(tax_year: int) -> str:  # e.g. 2022 -> ".../22in55cmcsv.csv"
+    return f"https://www.irs.gov/pub/irs-soi/{tax_year % 100:02d}in55cmcsv.csv"
+
+
+def _load_state_soi_raw(tax_year: int = LOCAL_STATE_SOI_TAX_YEAR) -> pd.DataFrame:
+    return pd.read_csv(_state_soi_url(tax_year), thousands=",")  # reads from the URL
+
+
+def _base_state_frame(source_df: pd.DataFrame) -> pd.DataFrame:
+    df = source_df.copy()
+    merged_top_tail = (  # stubs 9 and 10 are summed into one open-ended top band
+        df[df["AGI_STUB"].isin([9, 10])]
+        .groupby("STATE", as_index=False)
+        .agg({"N1": "sum", "A00100": "sum"})
+        .assign(AGI_STUB=9)
+    )
+    df = df[~df["AGI_STUB"].isin([9, 10])]
+    df = pd.concat([df, merged_top_tail], ignore_index=True)  # top band goes last
+    df = df[(df["AGI_STUB"] != 0) & ~df["STATE"].isin(NON_VOTING_STATES)].copy()
+    df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND)
+    df["GEO_NAME"] = df["STATE"]
+    # Index the dict so an unmapped STATE raises KeyError, not a NaN GEO_ID.
+    df["GEO_ID"] = "0400000US" + df["GEO_NAME"].map(lambda s: STATE_ABBR_TO_FIPS[s])
+    df["AGI_LOWER_BOUND"] = df["agi_bracket"].map(lambda band: AGI_BOUNDS[band][0])
+    df["AGI_UPPER_BOUND"] = df["agi_bracket"].map(lambda band: AGI_BOUNDS[band][1])
+    return df
+
+
+def build_local_agi_state_targets(
+    source_df: pd.DataFrame | None = None,
+    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
+) -> pd.DataFrame:
+    base = _base_state_frame(  # tax_year is only used when source_df is None
+        _load_state_soi_raw(tax_year=tax_year) if source_df is None else source_df
+    )
+    frames = []
+
+    for column, variable, is_count in VARIABLE_SPECS:
+        frame = base[
+            ["GEO_ID", "GEO_NAME", "AGI_LOWER_BOUND", "AGI_UPPER_BOUND", column]
+        ].rename(columns={column: "VALUE"})
+        frame["IS_COUNT"] = int(is_count)
+        frame["VARIABLE"] = variable
+        if not is_count:
+            frame["VALUE"] = frame["VALUE"] * 1_000  # presumably $ thousands -> $
+        frames.append(frame)
+
+    return pd.concat(frames, ignore_index=True)  # already yields a fresh 0..n-1 index
+
+
+def build_agi_state_targets(  # build_local_agi_state_targets with tax_year first
+    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
+    source_df: pd.DataFrame | None = None,
+) -> pd.DataFrame:
+    return build_local_agi_state_targets(
+        source_df=source_df,
+        tax_year=tax_year,
+    )
+
+
+def refresh_local_agi_state_targets(
+    out_path: Path = TARGETS_PATH,
+) -> Path:
+    targets = build_local_agi_state_targets()  # always the default tax year
+    targets.to_csv(out_path, index=False)
+    return out_path  # the path written, not the frame
+
+
+def refresh_agi_state_targets(
+    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
+    out_path: Path = TARGETS_PATH,
+) -> pd.DataFrame:
+    targets = build_local_agi_state_targets(tax_year=tax_year)  # loads the IRS file
+    targets.to_csv(out_path, index=False)
+    return targets  # the frame written, not the path
+
+
+def main() -> None:  # CLI: parse --tax-year / --out, then rewrite the targets CSV
+    parser = argparse.ArgumentParser(
+        description="Refresh agi_state.csv for local calibration"
+    )
+    parser.add_argument(
+        "--tax-year",
+        type=int,
+        default=LOCAL_STATE_SOI_TAX_YEAR,
+        help="IRS geographic SOI tax year to pull",
+    )
+    parser.add_argument(
+        "--out",
+        type=Path,
+        default=TARGETS_PATH,
+        help="Output CSV path",
+    )
+    args = parser.parse_args()
+    refresh_agi_state_targets(tax_year=args.tax_year, out_path=args.out)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_refresh_local_agi_state_targets.py b/tests/test_refresh_local_agi_state_targets.py
new file mode 100644
index 000000000..1b092d6ae
--- /dev/null
+++ b/tests/test_refresh_local_agi_state_targets.py
@@ -0,0 +1,109 @@
+import importlib.util
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+
+REPO_ROOT = Path(__file__).resolve().parent.parent  # parent of the tests/ directory
+PACKAGE_ROOT = REPO_ROOT / "policyengine_us_data"
+MODULE_PATH = (  # script under test; load_module() imports it from this path
+    PACKAGE_ROOT
+    / "storage"
+    / "calibration_targets"
+    / "refresh_local_agi_state_targets.py"
+)
+TRACKED_TARGET_PATH = PACKAGE_ROOT / "storage" / "calibration_targets" / "agi_state.csv"
+
+
+def load_module():  # import the refresh script by file path from MODULE_PATH
+    refresh_spec = importlib.util.spec_from_file_location(
+        "refresh_local_agi_state_targets",
+        MODULE_PATH,
+    )
+    refresh_module = importlib.util.module_from_spec(refresh_spec)
+    assert refresh_spec.loader is not None
+    refresh_spec.loader.exec_module(refresh_module)  # runs the script's top level
+    return refresh_module
+
+
+def make_raw_state_soi():  # minimal stand-in for the downloaded IRS frame
+    return pd.DataFrame(
+        [
+            {"STATE": "AL", "AGI_STUB": 0, "N1": 999, "A00100": 999},  # stub 0: dropped
+            {"STATE": "AL", "AGI_STUB": 1, "N1": 10, "A00100": 5},
+            {"STATE": "AL", "AGI_STUB": 9, "N1": 20, "A00100": 7},
+            {"STATE": "AL", "AGI_STUB": 10, "N1": 30, "A00100": 11},
+            {"STATE": "DC", "AGI_STUB": 1, "N1": 4, "A00100": 2},
+            {"STATE": "DC", "AGI_STUB": 9, "N1": 6, "A00100": 3},
+            {"STATE": "DC", "AGI_STUB": 10, "N1": 8, "A00100": 5},
+            {"STATE": "PR", "AGI_STUB": 1, "N1": 100, "A00100": 100},  # PR: dropped
+        ]
+    )
+
+
+def test_build_local_agi_state_targets_uses_local_loss_format():
+    module = load_module()
+
+    refreshed = module.build_local_agi_state_targets(make_raw_state_soi())
+
+    assert list(refreshed.columns) == [
+        "GEO_ID",
+        "GEO_NAME",
+        "AGI_LOWER_BOUND",
+        "AGI_UPPER_BOUND",
+        "VALUE",
+        "IS_COUNT",
+        "VARIABLE",
+    ]
+    assert set(refreshed["GEO_NAME"]) == {"AL", "DC"}
+    assert "PR" not in set(refreshed["GEO_NAME"])
+    assert (refreshed["GEO_NAME"].str.startswith("state_")).sum() == 0
+
+    dc_rows = refreshed[refreshed["GEO_NAME"] == "DC"]
+    assert set(dc_rows["GEO_ID"]) == {"0400000US11"}  # DC must carry a GEO_ID
+
+    top_count = refreshed[
+        (refreshed["GEO_NAME"] == "AL")
+        & (refreshed["VARIABLE"] == "adjusted_gross_income/count")
+        & np.isposinf(refreshed["AGI_UPPER_BOUND"])
+    ]
+    top_amount = refreshed[
+        (refreshed["GEO_NAME"] == "AL")
+        & (refreshed["VARIABLE"] == "adjusted_gross_income/amount")
+        & np.isposinf(refreshed["AGI_UPPER_BOUND"])
+    ]
+    assert top_count["VALUE"].iat[0] == 50  # AL stubs 9 + 10: 20 + 30
+    assert top_amount["VALUE"].iat[0] == 18_000  # AL stubs 9 + 10: (7 + 11) * 1_000
+
+
+def test_refresh_local_agi_state_targets_writes_expected_csv(tmp_path, monkeypatch):
+    module = load_module()
+    monkeypatch.setattr(  # swap the IRS download for the in-memory fixture
+        module,
+        "_load_state_soi_raw",
+        lambda tax_year=module.LOCAL_STATE_SOI_TAX_YEAR: make_raw_state_soi(),
+    )
+
+    output_path = tmp_path / "agi_state.csv"
+    written_path = module.refresh_local_agi_state_targets(output_path)
+
+    written = pd.read_csv(written_path)  # round-trip through the CSV on disk
+
+    assert written_path == output_path
+    assert written["GEO_ID"].isna().sum() == 0
+    assert set(written["VARIABLE"]) == {
+        "adjusted_gross_income/count",
+        "adjusted_gross_income/amount",
+    }
+
+
+def test_tracked_agi_state_targets_have_complete_geo_ids():
+    tracked = pd.read_csv(TRACKED_TARGET_PATH)  # the committed CSV, not a fresh build
+
+    assert tracked["GEO_ID"].isna().sum() == 0
+    assert tracked["GEO_NAME"].nunique() == 51  # presumably 50 states + DC
+    assert set(tracked["VARIABLE"]) == {
+        "adjusted_gross_income/count",
+        "adjusted_gross_income/amount",
+    }

From 59a0a6ab41d30b2b11a8d05ce5b5fb064048e102 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 29 Mar 2026 21:56:44 -0400
Subject: [PATCH 2/2] Fix national target test compatibility

---
 .../tests/test_etl_national_targets.py            | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/policyengine_us_data/tests/test_etl_national_targets.py b/policyengine_us_data/tests/test_etl_national_targets.py
index 7e38e18be..10e0ece31 100644
--- a/policyengine_us_data/tests/test_etl_national_targets.py
+++ b/policyengine_us_data/tests/test_etl_national_targets.py
@@ -8,7 +8,6 @@
     create_database,
 )
 from policyengine_us_data.db.etl_national_targets import (
-    TAX_EXPENDITURE_REFORM_ID,
     load_national_targets,
 )
 
@@ -90,6 +89,7 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp
     tax_expenditure_df = pd.DataFrame(
         [
             {
+                "reform_id": 1,
                 "variable": "salt_deduction",
                 "value": 21.247e9,
                 "source": "Joint Committee on Taxation",
@@ -97,6 +97,7 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp
                 "year": 2024,
             },
             {
+                "reform_id": 5,
                 "variable": "qualified_business_income_deduction",
                 "value": 63.1e9,
                 "source": "Joint Committee on Taxation",
@@ -124,16 +125,12 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp
         assert stale_rows
         assert all(not target.active for target in stale_rows)
 
-        reform_rows = (
-            session.query(Target)
-            .filter(Target.reform_id == TAX_EXPENDITURE_REFORM_ID)
-            .all()
-        )
+        reform_rows = session.query(Target).filter(Target.reform_id > 0).all()
         assert len(reform_rows) == 2
         assert all(target.active for target in reform_rows)
-        assert {target.variable for target in reform_rows} == {
-            "salt_deduction",
-            "qualified_business_income_deduction",
+        assert {(target.variable, target.reform_id) for target in reform_rows} == {
+            ("salt_deduction", 1),
+            ("qualified_business_income_deduction", 5),
         }
         assert all(
             "Modeled as repeal-based income tax expenditure target"