Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/impute-cps-clone-features.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Donor-impute race, Hispanic status, sex, and occupation-based CPS features onto the PUF clone half of the extended CPS so subgroup analyses and overtime-eligibility inputs better align with PUF-imputed incomes.
20 changes: 20 additions & 0 deletions policyengine_us_data/calibration/source_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@

import numpy as np
import pandas as pd
from policyengine_us_data.datasets.cps.tipped_occupation import (
derive_any_treasury_tipped_occupation_code,
derive_is_tipped_occupation,
)

from policyengine_us_data.datasets.org import (
ORG_BOOL_VARIABLES,
Expand Down Expand Up @@ -80,6 +84,7 @@
"age",
"count_under_18",
"count_under_6",
"is_tipped_occupation",
]

SIPP_ASSETS_PREDICTORS = [
Expand Down Expand Up @@ -109,6 +114,8 @@
"NONE": 0,
}

# SIPP occupation column names TJB1_OCC through TJB7_OCC (range(1, 8) yields
# the suffixes 1..7). Used to select the occupation columns from the SIPP
# frame when deriving the treasury tipped-occupation code.
# NOTE(review): presumably one column per reported job slot — confirm against
# the SIPP data dictionary.
SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{i}_OCC" for i in range(1, 8)]


def _encode_tenure_type(df: pd.DataFrame) -> pd.DataFrame:
"""Convert tenure_type enum strings to numeric codes."""
Expand Down Expand Up @@ -381,6 +388,12 @@ def _impute_sipp(
sipp_df["age"] = sipp_df.TAGE
sipp_df["household_weight"] = sipp_df.WPFINWGT
sipp_df["household_id"] = sipp_df.SSUID
sipp_df["treasury_tipped_occupation_code"] = (
derive_any_treasury_tipped_occupation_code(sipp_df[SIPP_JOB_OCCUPATION_COLUMNS])
)
sipp_df["is_tipped_occupation"] = derive_is_tipped_occupation(
sipp_df["treasury_tipped_occupation_code"]
)

sipp_df["is_under_18"] = sipp_df.TAGE < 18
sipp_df["is_under_6"] = sipp_df.TAGE < 6
Expand All @@ -398,6 +411,7 @@ def _impute_sipp(
"count_under_18",
"count_under_6",
"age",
"is_tipped_occupation",
"household_weight",
]
tip_train = sipp_df[tip_cols].dropna()
Expand Down Expand Up @@ -428,6 +442,12 @@ def _impute_sipp(
else:
cps_tip_df["count_under_18"] = 0.0
cps_tip_df["count_under_6"] = 0.0
if "treasury_tipped_occupation_code" in data:
cps_tip_df["is_tipped_occupation"] = derive_is_tipped_occupation(
data["treasury_tipped_occupation_code"][time_period]
).astype(np.float32)
else:
cps_tip_df["is_tipped_occupation"] = 0.0

qrf = QRF()
logger.info(
Expand Down
10 changes: 10 additions & 0 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
predict_org_features,
)
from policyengine_us_data.utils.randomness import seeded_rng
from policyengine_us_data.datasets.cps.tipped_occupation import (
derive_treasury_tipped_occupation_code,
derive_is_tipped_occupation,
)


class CPS(Dataset):
Expand Down Expand Up @@ -487,6 +491,9 @@ def children_per_parent(col: str) -> pd.DataFrame:
cps["is_full_time_college_student"] = person.A_HSCOL == 2

cps["detailed_occupation_recode"] = person.POCCU2
cps["treasury_tipped_occupation_code"] = derive_treasury_tipped_occupation_code(
person.PEIOOCC
)
add_overtime_occupation(cps, person)


Expand Down Expand Up @@ -1785,6 +1792,9 @@ def add_tips(self, cps: h5py.File):
raw_data = self.raw_cps(require=True).load()
raw_person = raw_data["person"]
cps["is_married"] = raw_person.A_MARITL.isin([1, 2]).values
cps["is_tipped_occupation"] = derive_is_tipped_occupation(
derive_treasury_tipped_occupation_code(raw_person.PEIOOCC)
)
raw_data.close()

cps["is_under_18"] = cps.age < 18
Expand Down
Loading
Loading