From 5e328c46c56dff7965fa2fd10163121c0a877be5 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Wed, 2 Apr 2025 12:18:57 +0100 Subject: [PATCH 1/5] Ensure US model always downloads datasets from HuggingFace Fixes #2321 --- Makefile | 2 +- .../jobs/calculate_economy_simulation_job.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b259866ec..275c7a5b4 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ install: - pip install -e .[dev] --config-settings editable_mode=compat + pip install -e ".[dev]" --config-settings editable_mode=compat debug: FLASK_APP=policyengine_api.api FLASK_DEBUG=1 flask run --without-threads diff --git a/policyengine_api/jobs/calculate_economy_simulation_job.py b/policyengine_api/jobs/calculate_economy_simulation_job.py index 54ee16713..244d4a685 100644 --- a/policyengine_api/jobs/calculate_economy_simulation_job.py +++ b/policyengine_api/jobs/calculate_economy_simulation_job.py @@ -230,6 +230,11 @@ def _compute_economy( seed=(region, time_period), time_period=time_period, ) + input_data = simulation.to_input_dataframe() + simulation = country.country_package.Microsimulation( + dataset=input_data, + reform=reform, + ) simulation.default_calculation_period = time_period for time_period in simulation.get_holder( @@ -360,6 +365,10 @@ def _create_simulation_us( else: sim_options["dataset"] = df[state_code == region.upper()] + if dataset == "default" and region == "us": + print(f"Running a default CPS simulation") + sim_options["dataset"] = CPS + # Return completed simulation return Microsimulation(**sim_options) From abb79d4c5871abdaa563bd0bff27d7be7f3b3051 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Wed, 2 Apr 2025 12:19:28 +0100 Subject: [PATCH 2/5] Versioning --- changelog_entry.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..179b568c0 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 
+1,5 @@ +- bump: patch + changes: + fixed: + - US model always downloads from HuggingFace. + - Subsampling improvements. From a4f3b846d2940861c0e9ae4d131765708078d464 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Fri, 4 Apr 2025 10:43:33 +0100 Subject: [PATCH 3/5] Move to function --- .../jobs/calculate_economy_simulation_job.py | 61 ++++++++++++++----- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/policyengine_api/jobs/calculate_economy_simulation_job.py b/policyengine_api/jobs/calculate_economy_simulation_job.py index 244d4a685..50cc4171d 100644 --- a/policyengine_api/jobs/calculate_economy_simulation_job.py +++ b/policyengine_api/jobs/calculate_economy_simulation_job.py @@ -16,7 +16,11 @@ from policyengine_api.endpoints.economy.compare import compare_economic_outputs from policyengine_api.endpoints.economy.reform_impact import set_comment_on_job from policyengine_api.constants import COUNTRY_PACKAGE_VERSIONS -from policyengine_api.country import COUNTRIES, create_policy_reform +from policyengine_api.country import ( + COUNTRIES, + create_policy_reform, + PolicyEngineCountry, +) from policyengine_core.simulations import Microsimulation from policyengine_core.tools.hugging_face import download_huggingface_dataset import h5py @@ -220,20 +224,8 @@ def _compute_economy( options.get("max_households", os.environ.get("MAX_HOUSEHOLDS")) is not None ): - simulation.subsample( - int( - options.get( - "max_households", - os.environ.get("MAX_HOUSEHOLDS", 1_000_000), - ) - ), - seed=(region, time_period), - time_period=time_period, - ) - input_data = simulation.to_input_dataframe() - simulation = country.country_package.Microsimulation( - dataset=input_data, - reform=reform, + simulation = subsample( + options, simulation, region, time_period, reform, country ) simulation.default_calculation_period = time_period @@ -428,3 +420,42 @@ def _compute_cliff_impacts(self, simulation: Microsimulation) -> Dict: "cliff_share": float(cliff_share), "type": 
"cliff", } + + +def subsample( + options: dict, + simulation: Microsimulation, + region: str, + time_period: str, + reform: dict, + country: PolicyEngineCountry, +) -> Microsimulation: + """ + Subsamples a microsimulation dataset and reinitializes the simulation with the subsampled data. + Args: + options (dict): A dictionary of options, which may include "max_households" to specify the maximum number of households to subsample. + simulation (Microsimulation): The original microsimulation object to be subsampled. + region (str): The region for which the simulation is being run. + time_period (str): The time period for which the simulation is being run. + reform (dict): A dictionary representing the policy reform to apply to the simulation. + country (PolicyEngineCountry): The country-specific policy engine object. + Returns: + Microsimulation: A new microsimulation object initialized with the subsampled data and the specified reform. + """ + + simulation.subsample( + int( + options.get( + "max_households", + os.environ.get("MAX_HOUSEHOLDS", 1_000_000), + ) + ), + seed=(region, time_period), + time_period=time_period, + ) + input_data = simulation.to_input_dataframe() + simulation = country.country_package.Microsimulation( + dataset=input_data, + reform=reform, + ) + return simulation From 1e25b4005be6959c4cf0fb0f52bef4cf1448c7b4 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Apr 2025 11:35:16 +0100 Subject: [PATCH 4/5] Respond to comment --- policyengine_api/jobs/calculate_economy_simulation_job.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/policyengine_api/jobs/calculate_economy_simulation_job.py b/policyengine_api/jobs/calculate_economy_simulation_job.py index 50cc4171d..1b020e8fc 100644 --- a/policyengine_api/jobs/calculate_economy_simulation_job.py +++ b/policyengine_api/jobs/calculate_economy_simulation_job.py @@ -225,7 +225,12 @@ def _compute_economy( is not None ): simulation = subsample( - options, simulation, 
region, time_period, reform, country + options=options, + simulation=simulation, + region=region, + time_period=time_period, + reform=reform, + country=country, ) simulation.default_calculation_period = time_period From b484710f887f8ce5a296e2282b1ea1b8d2ef1638 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Apr 2025 11:43:41 +0100 Subject: [PATCH 5/5] Format --- policyengine_api/jobs/calculate_economy_simulation_job.py | 1 + 1 file changed, 1 insertion(+) diff --git a/policyengine_api/jobs/calculate_economy_simulation_job.py b/policyengine_api/jobs/calculate_economy_simulation_job.py index a2f7d8139..10c72675f 100644 --- a/policyengine_api/jobs/calculate_economy_simulation_job.py +++ b/policyengine_api/jobs/calculate_economy_simulation_job.py @@ -503,6 +503,7 @@ def subsample( ) return simulation + def is_similar(x, y, parent_name: str = "") -> bool: if x is None or x == {}: if y is None or y == {}: