From 5e328c46c56dff7965fa2fd10163121c0a877be5 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@outlook.com>
Date: Wed, 2 Apr 2025 12:18:57 +0100
Subject: [PATCH 1/5] Ensure US model always downloads datasets from
 HuggingFace Fixes #2321

---
 Makefile                                                 | 2 +-
 .../jobs/calculate_economy_simulation_job.py             | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index b259866ec..275c7a5b4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 install:
-	pip install -e .[dev] --config-settings editable_mode=compat
+	pip install -e ".[dev]" --config-settings editable_mode=compat
 
 debug:
 	FLASK_APP=policyengine_api.api FLASK_DEBUG=1 flask run --without-threads
diff --git a/policyengine_api/jobs/calculate_economy_simulation_job.py b/policyengine_api/jobs/calculate_economy_simulation_job.py
index 54ee16713..244d4a685 100644
--- a/policyengine_api/jobs/calculate_economy_simulation_job.py
+++ b/policyengine_api/jobs/calculate_economy_simulation_job.py
@@ -230,6 +230,11 @@ def _compute_economy(
                     seed=(region, time_period),
                     time_period=time_period,
                 )
+                input_data = simulation.to_input_dataframe()
+                simulation = country.country_package.Microsimulation(
+                    dataset=input_data,
+                    reform=reform,
+                )
             simulation.default_calculation_period = time_period
 
             for time_period in simulation.get_holder(
@@ -360,6 +365,10 @@ def _create_simulation_us(
             else:
                 sim_options["dataset"] = df[state_code == region.upper()]
 
+        if dataset == "default" and region == "us":
+            print(f"Running a default CPS simulation")
+            sim_options["dataset"] = CPS
+
         # Return completed simulation
         return Microsimulation(**sim_options)
 

From abb79d4c5871abdaa563bd0bff27d7be7f3b3051 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@outlook.com>
Date: Wed, 2 Apr 2025 12:19:28 +0100
Subject: [PATCH 2/5] Versioning

---
 changelog_entry.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29bb..179b568c0 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: patch
+  changes:
+    fixed:
+    - US model always downloads from HuggingFace.
+    - Subsampling improvements.

From a4f3b846d2940861c0e9ae4d131765708078d464 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@outlook.com>
Date: Fri, 4 Apr 2025 10:43:33 +0100
Subject: [PATCH 3/5] Move to function

---
 .../jobs/calculate_economy_simulation_job.py  | 61 ++++++++++++++-----
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/policyengine_api/jobs/calculate_economy_simulation_job.py b/policyengine_api/jobs/calculate_economy_simulation_job.py
index 244d4a685..50cc4171d 100644
--- a/policyengine_api/jobs/calculate_economy_simulation_job.py
+++ b/policyengine_api/jobs/calculate_economy_simulation_job.py
@@ -16,7 +16,11 @@
 from policyengine_api.endpoints.economy.compare import compare_economic_outputs
 from policyengine_api.endpoints.economy.reform_impact import set_comment_on_job
 from policyengine_api.constants import COUNTRY_PACKAGE_VERSIONS
-from policyengine_api.country import COUNTRIES, create_policy_reform
+from policyengine_api.country import (
+    COUNTRIES,
+    create_policy_reform,
+    PolicyEngineCountry,
+)
 from policyengine_core.simulations import Microsimulation
 from policyengine_core.tools.hugging_face import download_huggingface_dataset
 import h5py
@@ -220,20 +224,8 @@ def _compute_economy(
                 options.get("max_households", os.environ.get("MAX_HOUSEHOLDS"))
                 is not None
             ):
-                simulation.subsample(
-                    int(
-                        options.get(
-                            "max_households",
-                            os.environ.get("MAX_HOUSEHOLDS", 1_000_000),
-                        )
-                    ),
-                    seed=(region, time_period),
-                    time_period=time_period,
-                )
-                input_data = simulation.to_input_dataframe()
-                simulation = country.country_package.Microsimulation(
-                    dataset=input_data,
-                    reform=reform,
+                simulation = subsample(
+                    options, simulation, region, time_period, reform, country
                 )
             simulation.default_calculation_period = time_period
 
@@ -428,3 +420,42 @@ def _compute_cliff_impacts(self, simulation: Microsimulation) -> Dict:
             "cliff_share": float(cliff_share),
             "type": "cliff",
         }
+
+
+def subsample(
+    options: dict,
+    simulation: Microsimulation,
+    region: str,
+    time_period: str,
+    reform: dict,
+    country: PolicyEngineCountry,
+) -> Microsimulation:
+    """
+    Subsample a microsimulation, then rebuild a fresh simulation from the subsampled input data.
+    Args:
+        options (dict): Job options; "max_households" (falling back to the MAX_HOUSEHOLDS environment variable, then 1_000_000) is cast to int and used as the subsample size.
+        simulation (Microsimulation): The simulation to subsample; `subsample` is called on this object and its return value is not used.
+        region (str): The region being simulated; combined with `time_period` to form the subsampling seed.
+        time_period (str): The period being simulated; passed to `subsample` and also used in the seed.
+        reform (dict): The reform handed to the rebuilt simulation. NOTE(review): annotated as dict; confirm against what the caller passes.
+        country (PolicyEngineCountry): Supplies `country_package.Microsimulation`, the class used to build the returned simulation.
+    Returns:
+        Microsimulation: A new simulation constructed from `simulation.to_input_dataframe()` and `reform`.
+    """
+
+    simulation.subsample(
+        int(
+            options.get(
+                "max_households",
+                os.environ.get("MAX_HOUSEHOLDS", 1_000_000),
+            )
+        ),
+        seed=(region, time_period),
+        time_period=time_period,
+    )
+    input_data = simulation.to_input_dataframe()
+    simulation = country.country_package.Microsimulation(
+        dataset=input_data,
+        reform=reform,
+    )
+    return simulation

From 1e25b4005be6959c4cf0fb0f52bef4cf1448c7b4 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@outlook.com>
Date: Thu, 10 Apr 2025 11:35:16 +0100
Subject: [PATCH 4/5] Respond to comment

---
 policyengine_api/jobs/calculate_economy_simulation_job.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/policyengine_api/jobs/calculate_economy_simulation_job.py b/policyengine_api/jobs/calculate_economy_simulation_job.py
index 50cc4171d..1b020e8fc 100644
--- a/policyengine_api/jobs/calculate_economy_simulation_job.py
+++ b/policyengine_api/jobs/calculate_economy_simulation_job.py
@@ -225,7 +225,12 @@ def _compute_economy(
                 is not None
             ):
                 simulation = subsample(
-                    options, simulation, region, time_period, reform, country
+                    options=options,
+                    simulation=simulation,
+                    region=region,
+                    time_period=time_period,
+                    reform=reform,
+                    country=country,
                 )
             simulation.default_calculation_period = time_period
 

From b484710f887f8ce5a296e2282b1ea1b8d2ef1638 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@outlook.com>
Date: Thu, 10 Apr 2025 11:43:41 +0100
Subject: [PATCH 5/5] Format

---
 policyengine_api/jobs/calculate_economy_simulation_job.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/policyengine_api/jobs/calculate_economy_simulation_job.py b/policyengine_api/jobs/calculate_economy_simulation_job.py
index a2f7d8139..10c72675f 100644
--- a/policyengine_api/jobs/calculate_economy_simulation_job.py
+++ b/policyengine_api/jobs/calculate_economy_simulation_job.py
@@ -503,6 +503,7 @@ def subsample(
     )
     return simulation
 
+
 def is_similar(x, y, parent_name: str = "") -> bool:
     if x is None or x == {}:
         if y is None or y == {}: