Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/local-soi-refresh.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added an explicit refresh path and regression coverage for the legacy `agi_state.csv` SOI targets used by local calibration.
7 changes: 7 additions & 0 deletions policyengine_us_data/storage/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
• Date: 2024
• Location: https://www.cms.gov/files/document/health-insurance-exchanges-2024-open-enrollment-report-final.pdf

- **agi_state.csv**
• Source: IRS SOI state data file used by legacy local calibration
• Date: tax year 2022
• Created by: `policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py`
• Location: https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv
• Notes: This file intentionally keeps the legacy `utils/loss.py` schema (`AL`, `DC`, etc.) instead of the newer `state_AL` geography naming used in `soi.csv`/database overlays. It is separate from `soi_targets.csv`, and it currently lags the national SOI refresh because IRS geographic state SOI files are only published through TY2022.

- **medicaid_enrollment_2024.csv**
• Source: MACPAC Enrollment Tables, FFY 2024
• Date: 2024
Expand Down
36 changes: 18 additions & 18 deletions policyengine_us_data/storage/calibration_targets/agi_state.csv
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
0400000US10,DE,75000.0,100000.0,51060,1,adjusted_gross_income/count
0400000US10,DE,100000.0,200000.0,89920,1,adjusted_gross_income/count
0400000US10,DE,200000.0,500000.0,30280,1,adjusted_gross_income/count
,DC,-inf,1.0,4700,1,adjusted_gross_income/count
,DC,1.0,10000.0,27330,1,adjusted_gross_income/count
,DC,10000.0,25000.0,44140,1,adjusted_gross_income/count
,DC,25000.0,50000.0,65340,1,adjusted_gross_income/count
,DC,50000.0,75000.0,55150,1,adjusted_gross_income/count
,DC,75000.0,100000.0,39010,1,adjusted_gross_income/count
,DC,100000.0,200000.0,66180,1,adjusted_gross_income/count
,DC,200000.0,500000.0,36310,1,adjusted_gross_income/count
0400000US11,DC,-inf,1.0,4700,1,adjusted_gross_income/count
0400000US11,DC,1.0,10000.0,27330,1,adjusted_gross_income/count
0400000US11,DC,10000.0,25000.0,44140,1,adjusted_gross_income/count
0400000US11,DC,25000.0,50000.0,65340,1,adjusted_gross_income/count
0400000US11,DC,50000.0,75000.0,55150,1,adjusted_gross_income/count
0400000US11,DC,75000.0,100000.0,39010,1,adjusted_gross_income/count
0400000US11,DC,100000.0,200000.0,66180,1,adjusted_gross_income/count
0400000US11,DC,200000.0,500000.0,36310,1,adjusted_gross_income/count
0400000US12,FL,-inf,1.0,216880,1,adjusted_gross_income/count
0400000US12,FL,1.0,10000.0,1123740,1,adjusted_gross_income/count
0400000US12,FL,10000.0,25000.0,2180990,1,adjusted_gross_income/count
Expand Down Expand Up @@ -414,7 +414,7 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
0400000US06,CA,500000.0,inf,426810,1,adjusted_gross_income/count
0400000US08,CO,500000.0,inf,51500,1,adjusted_gross_income/count
0400000US09,CT,500000.0,inf,45510,1,adjusted_gross_income/count
,DC,500000.0,inf,10530,1,adjusted_gross_income/count
0400000US11,DC,500000.0,inf,10530,1,adjusted_gross_income/count
0400000US10,DE,500000.0,inf,5350,1,adjusted_gross_income/count
0400000US12,FL,500000.0,inf,197090,1,adjusted_gross_income/count
0400000US13,GA,500000.0,inf,65350,1,adjusted_gross_income/count
Expand Down Expand Up @@ -522,14 +522,14 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
0400000US10,DE,75000.0,100000.0,4427687000,0,adjusted_gross_income/amount
0400000US10,DE,100000.0,200000.0,12401957000,0,adjusted_gross_income/amount
0400000US10,DE,200000.0,500000.0,8502065000,0,adjusted_gross_income/amount
,DC,-inf,1.0,-394257000,0,adjusted_gross_income/amount
,DC,1.0,10000.0,136195000,0,adjusted_gross_income/amount
,DC,10000.0,25000.0,757904000,0,adjusted_gross_income/amount
,DC,25000.0,50000.0,2434747000,0,adjusted_gross_income/amount
,DC,50000.0,75000.0,3419215000,0,adjusted_gross_income/amount
,DC,75000.0,100000.0,3380668000,0,adjusted_gross_income/amount
,DC,100000.0,200000.0,9164382000,0,adjusted_gross_income/amount
,DC,200000.0,500000.0,10828885000,0,adjusted_gross_income/amount
0400000US11,DC,-inf,1.0,-394257000,0,adjusted_gross_income/amount
0400000US11,DC,1.0,10000.0,136195000,0,adjusted_gross_income/amount
0400000US11,DC,10000.0,25000.0,757904000,0,adjusted_gross_income/amount
0400000US11,DC,25000.0,50000.0,2434747000,0,adjusted_gross_income/amount
0400000US11,DC,50000.0,75000.0,3419215000,0,adjusted_gross_income/amount
0400000US11,DC,75000.0,100000.0,3380668000,0,adjusted_gross_income/amount
0400000US11,DC,100000.0,200000.0,9164382000,0,adjusted_gross_income/amount
0400000US11,DC,200000.0,500000.0,10828885000,0,adjusted_gross_income/amount
0400000US12,FL,-inf,1.0,-19196408000,0,adjusted_gross_income/amount
0400000US12,FL,1.0,10000.0,5776254000,0,adjusted_gross_income/amount
0400000US12,FL,10000.0,25000.0,37314354000,0,adjusted_gross_income/amount
Expand Down Expand Up @@ -873,7 +873,7 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
0400000US06,CA,500000.0,inf,613219427000,0,adjusted_gross_income/amount
0400000US08,CO,500000.0,inf,71426453000,0,adjusted_gross_income/amount
0400000US09,CT,500000.0,inf,77248832000,0,adjusted_gross_income/amount
,DC,500000.0,inf,17097350000,0,adjusted_gross_income/amount
0400000US11,DC,500000.0,inf,17097350000,0,adjusted_gross_income/amount
0400000US10,DE,500000.0,inf,6773920000,0,adjusted_gross_income/amount
0400000US12,FL,500000.0,inf,427887554000,0,adjusted_gross_income/amount
0400000US13,GA,500000.0,inf,92080953000,0,adjusted_gross_income/amount
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
"""Refresh tracked SOI targets used by legacy local calibration.

This regenerates ``agi_state.csv`` from the IRS geographic SOI state file while
preserving the legacy schema consumed by ``utils/loss.py``:

- ``GEO_NAME`` is the two-letter state abbreviation
- ``VARIABLE`` is ``adjusted_gross_income/count`` or ``adjusted_gross_income/amount``
- AGI bounds live in ``AGI_LOWER_BOUND`` / ``AGI_UPPER_BOUND``

This file intentionally remains separate from the national workbook-backed
``soi_targets.csv`` refresh path because IRS geographic releases lag the
national Publication 1304 tables.
"""

from __future__ import annotations

import argparse
from pathlib import Path

import numpy as np
import pandas as pd


# Directory that contains this script; used as the base for the default output path.
CALIBRATION_FOLDER = Path(__file__).resolve().parent
# Default location of the regenerated targets file.
TARGETS_PATH = CALIBRATION_FOLDER / "agi_state.csv"
# Tax year of the IRS geographic state SOI file pulled when no year is given.
STATE_SOI_TAX_YEAR = 2022
# Alias of STATE_SOI_TAX_YEAR; this is the name the functions below use for defaults.
LOCAL_STATE_SOI_TAX_YEAR = STATE_SOI_TAX_YEAR

# AGI bracket labels in ascending order. The 1-based position of a label is
# the AGI_STUB code that selects it.
_AGI_BAND_LABELS = (
    "Under $1",
    "$1 under $10,000",
    "$10,000 under $25,000",
    "$25,000 under $50,000",
    "$50,000 under $75,000",
    "$75,000 under $100,000",
    "$100,000 under $200,000",
    "$200,000 under $500,000",
    "$500,000 or more",
)

# Consecutive bracket boundaries: label i uses edges i and i + 1 as its
# (lower, upper) bounds.
_AGI_BAND_EDGES = (
    -np.inf,
    1,
    10_000,
    25_000,
    50_000,
    75_000,
    100_000,
    200_000,
    500_000,
    np.inf,
)

# AGI_STUB code (1-9) -> bracket label.
AGI_STUB_TO_BAND = dict(enumerate(_AGI_BAND_LABELS, start=1))

# Bracket label -> (lower bound, upper bound).
AGI_BOUNDS = {
    label: (lower, upper)
    for label, lower, upper in zip(
        _AGI_BAND_LABELS, _AGI_BAND_EDGES, _AGI_BAND_EDGES[1:]
    )
}

# Two-letter state abbreviation -> two-digit, zero-padded code that is appended
# to "0400000US" to form GEO_ID. Covers the 50 states plus DC (51 entries).
# The codes are not contiguous: 03, 07, 14, 43 and 52 are not used by any entry
# (presumably these are the standard state FIPS codes -- verify before editing).
STATE_ABBR_TO_FIPS = {
"AL": "01",
"AK": "02",
"AZ": "04",
"AR": "05",
"CA": "06",
"CO": "08",
"CT": "09",
"DE": "10",
"DC": "11",
"FL": "12",
"GA": "13",
"HI": "15",
"ID": "16",
"IL": "17",
"IN": "18",
"IA": "19",
"KS": "20",
"KY": "21",
"LA": "22",
"ME": "23",
"MD": "24",
"MA": "25",
"MI": "26",
"MN": "27",
"MS": "28",
"MO": "29",
"MT": "30",
"NE": "31",
"NV": "32",
"NH": "33",
"NJ": "34",
"NM": "35",
"NY": "36",
"NC": "37",
"ND": "38",
"OH": "39",
"OK": "40",
"OR": "41",
"PA": "42",
"RI": "44",
"SC": "45",
"SD": "46",
"TN": "47",
"TX": "48",
"UT": "49",
"VT": "50",
"VA": "51",
"WA": "53",
"WV": "54",
"WI": "55",
"WY": "56",
}

# STATE codes in the IRS file whose rows are dropped from the output.
# NOTE(review): despite the name this also holds "US" and "OA", presumably the
# national total and an "other areas" row -- confirm against the IRS layout.
NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"}
# One entry per emitted VARIABLE: (source column, VARIABLE label, is_count).
# Entries with is_count False have their values multiplied by 1,000 when the
# targets are built.
VARIABLE_SPECS = (
("N1", "adjusted_gross_income/count", True),
("A00100", "adjusted_gross_income/amount", False),
)


def _state_soi_url(tax_year: int) -> str:
return f"https://www.irs.gov/pub/irs-soi/{tax_year % 100:02d}in55cmcsv.csv"


def _load_state_soi_raw(tax_year: int = LOCAL_STATE_SOI_TAX_YEAR) -> pd.DataFrame:
    """Download the raw IRS state SOI CSV for ``tax_year`` into a DataFrame.

    Commas are treated as thousands separators while parsing numbers.
    """
    source_url = _state_soi_url(tax_year)
    raw_frame = pd.read_csv(source_url, thousands=",")
    return raw_frame


def _base_state_frame(source_df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the raw IRS state frame into one row per state and AGI bracket.

    Steps, in order:

    1. Rows with ``AGI_STUB`` 9 and 10 are summed per ``STATE`` (columns ``N1``
       and ``A00100`` only) and re-labelled as stub 9, so the top bracket
       becomes "$500,000 or more".
    2. Rows with ``AGI_STUB`` 0 and rows whose ``STATE`` is listed in
       ``NON_VOTING_STATES`` are dropped.
    3. ``agi_bracket``, ``GEO_NAME``, ``GEO_ID``, ``AGI_LOWER_BOUND`` and
       ``AGI_UPPER_BOUND`` columns are added.

    Args:
        source_df: Raw frame with at least the ``STATE``, ``AGI_STUB``, ``N1``
            and ``A00100`` columns. It is not modified.

    Returns:
        A new DataFrame with the added geography and bracket columns. Rows that
        come from the merged top bracket carry NaN in every source column other
        than ``STATE``, ``AGI_STUB``, ``N1`` and ``A00100``.

    Raises:
        ValueError: If a retained ``STATE`` value has no entry in
            ``STATE_ABBR_TO_FIPS``.
        KeyError: If a retained ``AGI_STUB`` has no entry in
            ``AGI_STUB_TO_BAND``.
    """
    df = source_df.copy()

    # Collapse the two highest stubs into a single stub-9 row per state.
    in_top_tail = df["AGI_STUB"].isin([9, 10])
    merged_top_tail = (
        df[in_top_tail]
        .groupby("STATE", as_index=False)
        .agg({"N1": "sum", "A00100": "sum"})
        .assign(AGI_STUB=9)
    )
    df = pd.concat([df[~in_top_tail], merged_top_tail], ignore_index=True)

    # Stub 0 is excluded (presumably the all-brackets total row -- confirm).
    df = df[df["AGI_STUB"] != 0]
    df = df.loc[~df["STATE"].isin(NON_VOTING_STATES)].copy()

    # Series.map would turn an unmapped abbreviation into NaN, which is then
    # written to the CSV as a blank GEO_ID. Fail loudly instead.
    has_fips = df["STATE"].isin(list(STATE_ABBR_TO_FIPS))
    if not has_fips.all():
        unmapped = df.loc[~has_fips, "STATE"].unique().tolist()
        raise ValueError(
            f"No FIPS code for STATE value(s) {unmapped}; add them to "
            "STATE_ABBR_TO_FIPS or NON_VOTING_STATES."
        )

    df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND)
    df["GEO_NAME"] = df["STATE"]
    df["GEO_ID"] = "0400000US" + df["GEO_NAME"].map(STATE_ABBR_TO_FIPS)
    df["AGI_LOWER_BOUND"] = df["agi_bracket"].map(lambda band: AGI_BOUNDS[band][0])
    df["AGI_UPPER_BOUND"] = df["agi_bracket"].map(lambda band: AGI_BOUNDS[band][1])
    return df


def build_local_agi_state_targets(
    source_df: pd.DataFrame | None = None,
    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
) -> pd.DataFrame:
    """Build the long-format ``agi_state.csv`` targets table.

    Args:
        source_df: Raw IRS state frame. When ``None`` the file for ``tax_year``
            is downloaded instead.
        tax_year: Tax year to download; only used when ``source_df`` is ``None``.

    Returns:
        A DataFrame with columns ``GEO_ID``, ``GEO_NAME``, ``AGI_LOWER_BOUND``,
        ``AGI_UPPER_BOUND``, ``VALUE``, ``IS_COUNT`` and ``VARIABLE``. It holds
        one stacked section per entry in ``VARIABLE_SPECS``, in that order.
    """
    if source_df is None:
        source_df = _load_state_soi_raw(tax_year=tax_year)
    base = _base_state_frame(source_df)

    key_columns = ["GEO_ID", "GEO_NAME", "AGI_LOWER_BOUND", "AGI_UPPER_BOUND"]

    def _section(column: str, variable: str, is_count: bool) -> pd.DataFrame:
        # Non-count values are scaled by 1,000 (source amounts are presumably
        # reported in thousands -- confirm against the IRS documentation).
        values = base[column] if is_count else base[column] * 1_000
        return base[key_columns].assign(
            VALUE=values,
            IS_COUNT=int(is_count),
            VARIABLE=variable,
        )

    sections = [_section(*spec) for spec in VARIABLE_SPECS]
    stacked = pd.concat(sections, ignore_index=True)
    return stacked.reset_index(drop=True)


def build_agi_state_targets(
    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
    source_df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """Build the targets table; same result as ``build_local_agi_state_targets``.

    The only difference is the parameter order: ``tax_year`` comes first here.
    """
    targets = build_local_agi_state_targets(source_df, tax_year)
    return targets


def refresh_local_agi_state_targets(
    out_path: Path = TARGETS_PATH,
) -> Path:
    """Rebuild the targets for the default tax year and write them to ``out_path``.

    The CSV is written without the index column.

    Returns:
        ``out_path``, unchanged.
    """
    build_local_agi_state_targets().to_csv(out_path, index=False)
    return out_path


def refresh_agi_state_targets(
    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
    out_path: Path = TARGETS_PATH,
) -> pd.DataFrame:
    """Rebuild the targets for ``tax_year`` and write them to ``out_path``.

    The CSV is written without the index column.

    Returns:
        The DataFrame that was written.
    """
    rebuilt = build_local_agi_state_targets(tax_year=tax_year)
    rebuilt.to_csv(out_path, index=False)
    return rebuilt


def main() -> None:
    """Command-line entry point: parse the options and refresh the targets CSV."""
    cli = argparse.ArgumentParser(
        description="Refresh agi_state.csv for local calibration"
    )
    # (flag, value type, default, help text) for each supported option.
    option_specs = (
        (
            "--tax-year",
            int,
            LOCAL_STATE_SOI_TAX_YEAR,
            "IRS geographic SOI tax year to pull",
        ),
        ("--out", Path, TARGETS_PATH, "Output CSV path"),
    )
    for flag, value_type, default_value, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)

    options = cli.parse_args()
    refresh_agi_state_targets(tax_year=options.tax_year, out_path=options.out)


if __name__ == "__main__":
    main()
15 changes: 6 additions & 9 deletions policyengine_us_data/tests/test_etl_national_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
create_database,
)
from policyengine_us_data.db.etl_national_targets import (
TAX_EXPENDITURE_REFORM_ID,
load_national_targets,
)

Expand Down Expand Up @@ -90,13 +89,15 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp
tax_expenditure_df = pd.DataFrame(
[
{
"reform_id": 1,
"variable": "salt_deduction",
"value": 21.247e9,
"source": "Joint Committee on Taxation",
"notes": "SALT deduction tax expenditure",
"year": 2024,
},
{
"reform_id": 5,
"variable": "qualified_business_income_deduction",
"value": 63.1e9,
"source": "Joint Committee on Taxation",
Expand Down Expand Up @@ -124,16 +125,12 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp
assert stale_rows
assert all(not target.active for target in stale_rows)

reform_rows = (
session.query(Target)
.filter(Target.reform_id == TAX_EXPENDITURE_REFORM_ID)
.all()
)
reform_rows = session.query(Target).filter(Target.reform_id > 0).all()
assert len(reform_rows) == 2
assert all(target.active for target in reform_rows)
assert {target.variable for target in reform_rows} == {
"salt_deduction",
"qualified_business_income_deduction",
assert {(target.variable, target.reform_id) for target in reform_rows} == {
("salt_deduction", 1),
("qualified_business_income_deduction", 5),
}
assert all(
"Modeled as repeal-based income tax expenditure target"
Expand Down
Loading
Loading