From 5f487e00efb500cf8e1e6d5f17da8e52583c0687 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 29 Mar 2026 21:49:45 -0400 Subject: [PATCH 1/2] Refresh local SOI state targets for local calibration --- changelog.d/local-soi-refresh.changed.md | 1 + policyengine_us_data/storage/README.md | 7 + .../storage/calibration_targets/agi_state.csv | 36 +-- .../refresh_local_agi_state_targets.py | 212 ++++++++++++++++++ tests/test_refresh_local_agi_state_targets.py | 109 +++++++++ 5 files changed, 347 insertions(+), 18 deletions(-) create mode 100644 changelog.d/local-soi-refresh.changed.md create mode 100644 policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py create mode 100644 tests/test_refresh_local_agi_state_targets.py diff --git a/changelog.d/local-soi-refresh.changed.md b/changelog.d/local-soi-refresh.changed.md new file mode 100644 index 000000000..7a2ca6788 --- /dev/null +++ b/changelog.d/local-soi-refresh.changed.md @@ -0,0 +1 @@ +Added an explicit refresh path and regression coverage for the legacy `agi_state.csv` SOI targets used by local calibration. diff --git a/policyengine_us_data/storage/README.md b/policyengine_us_data/storage/README.md index 80d4c1cdc..a440d6195 100644 --- a/policyengine_us_data/storage/README.md +++ b/policyengine_us_data/storage/README.md @@ -5,6 +5,13 @@ • Date: 2024 • Location: https://www.cms.gov/files/document/health-insurance-exchanges-2024-open-enrollment-report-final.pdf +- **agi_state.csv** + • Source: IRS SOI state data file used by legacy local calibration + • Date: tax year 2022 + • Created by: `policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py` + • Location: https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv + • Notes: This file intentionally keeps the legacy `utils/loss.py` schema (`AL`, `DC`, etc.) instead of the newer `state_AL` geography naming used in `soi.csv`/database overlays. It is separate from `soi_targets.csv`, and it currently lags the national SOI refresh because IRS geographic state SOI files are only published through TY2022. + - **medicaid_enrollment_2024.csv** • Source: MACPAC Enrollment Tables, FFY 2024 • Date: 2024 diff --git a/policyengine_us_data/storage/calibration_targets/agi_state.csv b/policyengine_us_data/storage/calibration_targets/agi_state.csv index ee57163ab..793626eed 100644 --- a/policyengine_us_data/storage/calibration_targets/agi_state.csv +++ b/policyengine_us_data/storage/calibration_targets/agi_state.csv @@ -63,14 +63,14 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE 0400000US10,DE,75000.0,100000.0,51060,1,adjusted_gross_income/count 0400000US10,DE,100000.0,200000.0,89920,1,adjusted_gross_income/count 0400000US10,DE,200000.0,500000.0,30280,1,adjusted_gross_income/count -,DC,-inf,1.0,4700,1,adjusted_gross_income/count -,DC,1.0,10000.0,27330,1,adjusted_gross_income/count -,DC,10000.0,25000.0,44140,1,adjusted_gross_income/count -,DC,25000.0,50000.0,65340,1,adjusted_gross_income/count -,DC,50000.0,75000.0,55150,1,adjusted_gross_income/count -,DC,75000.0,100000.0,39010,1,adjusted_gross_income/count -,DC,100000.0,200000.0,66180,1,adjusted_gross_income/count -,DC,200000.0,500000.0,36310,1,adjusted_gross_income/count +0400000US11,DC,-inf,1.0,4700,1,adjusted_gross_income/count +0400000US11,DC,1.0,10000.0,27330,1,adjusted_gross_income/count +0400000US11,DC,10000.0,25000.0,44140,1,adjusted_gross_income/count +0400000US11,DC,25000.0,50000.0,65340,1,adjusted_gross_income/count +0400000US11,DC,50000.0,75000.0,55150,1,adjusted_gross_income/count +0400000US11,DC,75000.0,100000.0,39010,1,adjusted_gross_income/count +0400000US11,DC,100000.0,200000.0,66180,1,adjusted_gross_income/count +0400000US11,DC,200000.0,500000.0,36310,1,adjusted_gross_income/count 0400000US12,FL,-inf,1.0,216880,1,adjusted_gross_income/count 0400000US12,FL,1.0,10000.0,1123740,1,adjusted_gross_income/count 0400000US12,FL,10000.0,25000.0,2180990,1,adjusted_gross_income/count @@ -414,7 +414,7 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE 0400000US06,CA,500000.0,inf,426810,1,adjusted_gross_income/count 0400000US08,CO,500000.0,inf,51500,1,adjusted_gross_income/count 0400000US09,CT,500000.0,inf,45510,1,adjusted_gross_income/count -,DC,500000.0,inf,10530,1,adjusted_gross_income/count +0400000US11,DC,500000.0,inf,10530,1,adjusted_gross_income/count 0400000US10,DE,500000.0,inf,5350,1,adjusted_gross_income/count 0400000US12,FL,500000.0,inf,197090,1,adjusted_gross_income/count 0400000US13,GA,500000.0,inf,65350,1,adjusted_gross_income/count @@ -522,14 +522,14 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE 0400000US10,DE,75000.0,100000.0,4427687000,0,adjusted_gross_income/amount 0400000US10,DE,100000.0,200000.0,12401957000,0,adjusted_gross_income/amount 0400000US10,DE,200000.0,500000.0,8502065000,0,adjusted_gross_income/amount -,DC,-inf,1.0,-394257000,0,adjusted_gross_income/amount -,DC,1.0,10000.0,136195000,0,adjusted_gross_income/amount -,DC,10000.0,25000.0,757904000,0,adjusted_gross_income/amount -,DC,25000.0,50000.0,2434747000,0,adjusted_gross_income/amount -,DC,50000.0,75000.0,3419215000,0,adjusted_gross_income/amount -,DC,75000.0,100000.0,3380668000,0,adjusted_gross_income/amount -,DC,100000.0,200000.0,9164382000,0,adjusted_gross_income/amount -,DC,200000.0,500000.0,10828885000,0,adjusted_gross_income/amount +0400000US11,DC,-inf,1.0,-394257000,0,adjusted_gross_income/amount +0400000US11,DC,1.0,10000.0,136195000,0,adjusted_gross_income/amount +0400000US11,DC,10000.0,25000.0,757904000,0,adjusted_gross_income/amount +0400000US11,DC,25000.0,50000.0,2434747000,0,adjusted_gross_income/amount +0400000US11,DC,50000.0,75000.0,3419215000,0,adjusted_gross_income/amount +0400000US11,DC,75000.0,100000.0,3380668000,0,adjusted_gross_income/amount +0400000US11,DC,100000.0,200000.0,9164382000,0,adjusted_gross_income/amount +0400000US11,DC,200000.0,500000.0,10828885000,0,adjusted_gross_income/amount 0400000US12,FL,-inf,1.0,-19196408000,0,adjusted_gross_income/amount 0400000US12,FL,1.0,10000.0,5776254000,0,adjusted_gross_income/amount 0400000US12,FL,10000.0,25000.0,37314354000,0,adjusted_gross_income/amount @@ -873,7 +873,7 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE 0400000US06,CA,500000.0,inf,613219427000,0,adjusted_gross_income/amount 0400000US08,CO,500000.0,inf,71426453000,0,adjusted_gross_income/amount 0400000US09,CT,500000.0,inf,77248832000,0,adjusted_gross_income/amount -,DC,500000.0,inf,17097350000,0,adjusted_gross_income/amount +0400000US11,DC,500000.0,inf,17097350000,0,adjusted_gross_income/amount 0400000US10,DE,500000.0,inf,6773920000,0,adjusted_gross_income/amount 0400000US12,FL,500000.0,inf,427887554000,0,adjusted_gross_income/amount 0400000US13,GA,500000.0,inf,92080953000,0,adjusted_gross_income/amount diff --git a/policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py b/policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py new file mode 100644 index 000000000..f590ebe22 --- /dev/null +++ b/policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py @@ -0,0 +1,212 @@ +"""Refresh tracked SOI targets used by legacy local calibration. + +This regenerates ``agi_state.csv`` from the IRS geographic SOI state file while +preserving the legacy schema consumed by ``utils/loss.py``: + +- ``GEO_NAME`` is the two-letter state abbreviation +- ``VARIABLE`` is ``adjusted_gross_income/count`` or ``.../amount`` +- AGI bounds live in ``AGI_LOWER_BOUND`` / ``AGI_UPPER_BOUND`` + +This file intentionally remains separate from the national workbook-backed +``soi_targets.csv`` refresh path because IRS geographic releases lag the +national Publication 1304 tables. +""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +import numpy as np +import pandas as pd + + +CALIBRATION_FOLDER = Path(__file__).resolve().parent +TARGETS_PATH = CALIBRATION_FOLDER / "agi_state.csv" +STATE_SOI_TAX_YEAR = 2022 +LOCAL_STATE_SOI_TAX_YEAR = STATE_SOI_TAX_YEAR + +AGI_STUB_TO_BAND = { + 1: "Under $1", + 2: "$1 under $10,000", + 3: "$10,000 under $25,000", + 4: "$25,000 under $50,000", + 5: "$50,000 under $75,000", + 6: "$75,000 under $100,000", + 7: "$100,000 under $200,000", + 8: "$200,000 under $500,000", + 9: "$500,000 or more", +} + +AGI_BOUNDS = { + "Under $1": (-np.inf, 1), + "$1 under $10,000": (1, 10_000), + "$10,000 under $25,000": (10_000, 25_000), + "$25,000 under $50,000": (25_000, 50_000), + "$50,000 under $75,000": (50_000, 75_000), + "$75,000 under $100,000": (75_000, 100_000), + "$100,000 under $200,000": (100_000, 200_000), + "$200,000 under $500,000": (200_000, 500_000), + "$500,000 or more": (500_000, np.inf), +} + +STATE_ABBR_TO_FIPS = { + "AL": "01", + "AK": "02", + "AZ": "04", + "AR": "05", + "CA": "06", + "CO": "08", + "CT": "09", + "DE": "10", + "DC": "11", + "FL": "12", + "GA": "13", + "HI": "15", + "ID": "16", + "IL": "17", + "IN": "18", + "IA": "19", + "KS": "20", + "KY": "21", + "LA": "22", + "ME": "23", + "MD": "24", + "MA": "25", + "MI": "26", + "MN": "27", + "MS": "28", + "MO": "29", + "MT": "30", + "NE": "31", + "NV": "32", + "NH": "33", + "NJ": "34", + "NM": "35", + "NY": "36", + "NC": "37", + "ND": "38", + "OH": "39", + "OK": "40", + "OR": "41", + "PA": "42", + "RI": "44", + "SC": "45", + "SD": "46", + "TN": "47", + "TX": "48", + "UT": "49", + "VT": "50", + "VA": "51", + "WA": "53", + "WV": "54", + "WI": "55", + "WY": "56", +} + +NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} +VARIABLE_SPECS = ( + ("N1", "adjusted_gross_income/count", True), + ("A00100", "adjusted_gross_income/amount", False), +) + + +def _state_soi_url(tax_year: int) -> str: + return f"https://www.irs.gov/pub/irs-soi/{tax_year % 100:02d}in55cmcsv.csv" + + +def _load_state_soi_raw(tax_year: int = LOCAL_STATE_SOI_TAX_YEAR) -> pd.DataFrame: + return pd.read_csv(_state_soi_url(tax_year), thousands=",") + + +def _base_state_frame(source_df: pd.DataFrame) -> pd.DataFrame: + df = source_df.copy() + merged_top_tail = ( + df[df["AGI_STUB"].isin([9, 10])] + .groupby("STATE", as_index=False) + .agg({"N1": "sum", "A00100": "sum"}) + .assign(AGI_STUB=9) + ) + df = df[~df["AGI_STUB"].isin([9, 10])] + df = pd.concat([df, merged_top_tail], ignore_index=True) + df = df[df["AGI_STUB"] != 0].copy() + df = df.loc[~df["STATE"].isin(NON_VOTING_STATES.union({"US"}))].copy() + df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND) + df["GEO_NAME"] = df["STATE"] + df["GEO_ID"] = "0400000US" + df["GEO_NAME"].map(STATE_ABBR_TO_FIPS) + df["AGI_LOWER_BOUND"] = df["agi_bracket"].map(lambda band: AGI_BOUNDS[band][0]) + df["AGI_UPPER_BOUND"] = df["agi_bracket"].map(lambda band: AGI_BOUNDS[band][1]) + return df + + +def build_local_agi_state_targets( + source_df: pd.DataFrame | None = None, + tax_year: int = LOCAL_STATE_SOI_TAX_YEAR, +) -> pd.DataFrame: + base = _base_state_frame( + _load_state_soi_raw(tax_year=tax_year) if source_df is None else source_df + ) + frames = [] + + for column, variable, is_count in VARIABLE_SPECS: + frame = base[ + ["GEO_ID", "GEO_NAME", "AGI_LOWER_BOUND", "AGI_UPPER_BOUND", column] + ].rename(columns={column: "VALUE"}) + frame["IS_COUNT"] = int(is_count) + frame["VARIABLE"] = variable + if not is_count: + frame["VALUE"] = frame["VALUE"] * 1_000 + frames.append(frame) + + return pd.concat(frames, ignore_index=True).reset_index(drop=True) + + +def build_agi_state_targets( + tax_year: int = LOCAL_STATE_SOI_TAX_YEAR, + source_df: pd.DataFrame | None = None, +) -> pd.DataFrame: + return build_local_agi_state_targets( + source_df=source_df, + tax_year=tax_year, + ) + + +def refresh_local_agi_state_targets( + out_path: Path = TARGETS_PATH, +) -> Path: + targets = build_local_agi_state_targets() + targets.to_csv(out_path, index=False) + return out_path + + +def refresh_agi_state_targets( + tax_year: int = LOCAL_STATE_SOI_TAX_YEAR, + out_path: Path = TARGETS_PATH, +) -> pd.DataFrame: + targets = build_local_agi_state_targets(tax_year=tax_year) + targets.to_csv(out_path, index=False) + return targets + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Refresh agi_state.csv for local calibration" + ) + parser.add_argument( + "--tax-year", + type=int, + default=LOCAL_STATE_SOI_TAX_YEAR, + help="IRS geographic SOI tax year to pull", + ) + parser.add_argument( + "--out", + type=Path, + default=TARGETS_PATH, + help="Output CSV path", + ) + args = parser.parse_args() + refresh_agi_state_targets(tax_year=args.tax_year, out_path=args.out) + + +if __name__ == "__main__": + main() diff --git a/tests/test_refresh_local_agi_state_targets.py b/tests/test_refresh_local_agi_state_targets.py new file mode 100644 index 000000000..1b092d6ae --- /dev/null +++ b/tests/test_refresh_local_agi_state_targets.py @@ -0,0 +1,109 @@ +import importlib.util +from pathlib import Path + +import numpy as np +import pandas as pd + + +REPO_ROOT = Path(__file__).resolve().parent.parent +PACKAGE_ROOT = REPO_ROOT / "policyengine_us_data" +MODULE_PATH = ( + PACKAGE_ROOT + / "storage" + / "calibration_targets" + / "refresh_local_agi_state_targets.py" +) +TRACKED_TARGET_PATH = PACKAGE_ROOT / "storage" / "calibration_targets" / "agi_state.csv" + + +def load_module(): + refresh_spec = importlib.util.spec_from_file_location( + "refresh_local_agi_state_targets", + MODULE_PATH, + ) + refresh_module = importlib.util.module_from_spec(refresh_spec) + assert refresh_spec.loader is not None + refresh_spec.loader.exec_module(refresh_module) + return refresh_module + + +def make_raw_state_soi(): + return pd.DataFrame( + [ + {"STATE": "AL", "AGI_STUB": 0, "N1": 999, "A00100": 999}, + {"STATE": "AL", "AGI_STUB": 1, "N1": 10, "A00100": 5}, + {"STATE": "AL", "AGI_STUB": 9, "N1": 20, "A00100": 7}, + {"STATE": "AL", "AGI_STUB": 10, "N1": 30, "A00100": 11}, + {"STATE": "DC", "AGI_STUB": 1, "N1": 4, "A00100": 2}, + {"STATE": "DC", "AGI_STUB": 9, "N1": 6, "A00100": 3}, + {"STATE": "DC", "AGI_STUB": 10, "N1": 8, "A00100": 5}, + {"STATE": "PR", "AGI_STUB": 1, "N1": 100, "A00100": 100}, + ] + ) + + +def test_build_local_agi_state_targets_uses_local_loss_format(): + module = load_module() + + refreshed = module.build_local_agi_state_targets(make_raw_state_soi()) + + assert list(refreshed.columns) == [ + "GEO_ID", + "GEO_NAME", + "AGI_LOWER_BOUND", + "AGI_UPPER_BOUND", + "VALUE", + "IS_COUNT", + "VARIABLE", + ] + assert set(refreshed["GEO_NAME"]) == {"AL", "DC"} + assert "PR" not in set(refreshed["GEO_NAME"]) + assert (refreshed["GEO_NAME"].str.startswith("state_")).sum() == 0 + + dc_rows = refreshed[refreshed["GEO_NAME"] == "DC"] + assert set(dc_rows["GEO_ID"]) == {"0400000US11"} + + top_count = refreshed[ + (refreshed["GEO_NAME"] == "AL") + & (refreshed["VARIABLE"] == "adjusted_gross_income/count") + & np.isposinf(refreshed["AGI_UPPER_BOUND"]) + ] + top_amount = refreshed[ + (refreshed["GEO_NAME"] == "AL") + & (refreshed["VARIABLE"] == "adjusted_gross_income/amount") + & np.isposinf(refreshed["AGI_UPPER_BOUND"]) + ] + assert top_count["VALUE"].iat[0] == 50 + assert top_amount["VALUE"].iat[0] == 18_000 + + +def test_refresh_local_agi_state_targets_writes_expected_csv(tmp_path, monkeypatch): + module = load_module() + monkeypatch.setattr( + module, + "_load_state_soi_raw", + lambda tax_year=module.LOCAL_STATE_SOI_TAX_YEAR: make_raw_state_soi(), + ) + + output_path = tmp_path / "agi_state.csv" + written_path = module.refresh_local_agi_state_targets(output_path) + + written = pd.read_csv(written_path) + + assert written_path == output_path + assert written["GEO_ID"].isna().sum() == 0 + assert set(written["VARIABLE"]) == { + "adjusted_gross_income/count", + "adjusted_gross_income/amount", + } + + +def test_tracked_agi_state_targets_have_complete_geo_ids(): + tracked = pd.read_csv(TRACKED_TARGET_PATH) + + assert tracked["GEO_ID"].isna().sum() == 0 + assert tracked["GEO_NAME"].nunique() == 51 + assert set(tracked["VARIABLE"]) == { + "adjusted_gross_income/count", + "adjusted_gross_income/amount", + } From 59a0a6ab41d30b2b11a8d05ce5b5fb064048e102 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 29 Mar 2026 21:56:44 -0400 Subject: [PATCH 2/2] Fix national target test compatibility --- .../tests/test_etl_national_targets.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/policyengine_us_data/tests/test_etl_national_targets.py b/policyengine_us_data/tests/test_etl_national_targets.py index 7e38e18be..10e0ece31 100644 --- a/policyengine_us_data/tests/test_etl_national_targets.py +++ b/policyengine_us_data/tests/test_etl_national_targets.py @@ -8,7 +8,6 @@ create_database, ) from policyengine_us_data.db.etl_national_targets import ( - TAX_EXPENDITURE_REFORM_ID, load_national_targets, ) @@ -90,6 +89,7 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp tax_expenditure_df = pd.DataFrame( [ { + "reform_id": 1, "variable": "salt_deduction", "value": 21.247e9, "source": "Joint Committee on Taxation", @@ -97,6 +97,7 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp "year": 2024, }, { + "reform_id": 5, "variable": "qualified_business_income_deduction", "value": 63.1e9, "source": "Joint Committee on Taxation", @@ -124,16 +125,12 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp assert stale_rows assert all(not target.active for target in stale_rows) - reform_rows = ( - session.query(Target) - .filter(Target.reform_id == TAX_EXPENDITURE_REFORM_ID) - .all() - ) + reform_rows = session.query(Target).filter(Target.reform_id > 0).all() assert len(reform_rows) == 2 assert all(target.active for target in reform_rows) - assert {target.variable for target in reform_rows} == { - "salt_deduction", - "qualified_business_income_deduction", + assert {(target.variable, target.reform_id) for target in reform_rows} == { + ("salt_deduction", 1), + ("qualified_business_income_deduction", 5), } assert all( "Modeled as repeal-based income tax expenditure target"