diff --git a/changelog.d/add-calibration-pipeline-internals-docs.added.md b/changelog.d/add-calibration-pipeline-internals-docs.added.md
new file mode 100644
index 000000000..d34601c8c
--- /dev/null
+++ b/changelog.d/add-calibration-pipeline-internals-docs.added.md
@@ -0,0 +1 @@
+Add `docs/internals/` developer reference: three notebooks covering all nine pipeline stages (Stage 1 data build, Stage 2 calibration matrix assembly, Stages 3–4 L0 optimization and H5 assembly) plus a README with pipeline orchestration reference, run ID format, Modal volume layout, and HuggingFace artifact paths.
diff --git a/changelog.d/update-methodology-docs.changed.md b/changelog.d/update-methodology-docs.changed.md
new file mode 100644
index 000000000..13f5e2fa1
--- /dev/null
+++ b/changelog.d/update-methodology-docs.changed.md
@@ -0,0 +1 @@
+Update public-facing methodology and data documentation to reflect the current pipeline implementation; the pipeline now uploads validation diagnostics to HuggingFace after H5 builds complete.
diff --git a/docs/calibration_internals.ipynb b/docs/calibration_internals.ipynb
deleted file mode 100644
index 8f7a0f5b7..000000000
--- a/docs/calibration_internals.ipynb
+++ /dev/null
@@ -1,1062 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Calibration Pipeline Internals\n",
- "\n",
- "Internal reference for debugging and development of the calibration pipeline.\n",
- "\n",
- "**Requirements:** `policy_data.db`, `block_cd_distributions.csv.gz`, and the stratified CPS H5 file in `STORAGE_FOLDER`."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "---\n",
- "# Part 1: The Calibration Matrix\n",
- "\n",
- "The calibration pipeline has three stages: (1) compute uprated target values, (2) assemble the sparse constraint matrix, and (3) optimize weights (`unified_calibration.py`). This section is the diagnostic checkpoint between stages 1 and 2 — understand your matrix before you optimize.\n",
- "\n",
- "We build the full calibration matrix using `UnifiedMatrixBuilder` with clone-based geography from `assign_random_geography`, then inspect its structure: what rows and columns represent, how target groups partition the loss function, and where sparsity patterns emerge.\n",
- "\n",
- "**Column layout:** `col = clone_idx * n_records + record_idx`"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1.1 Setup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "from policyengine_us import Microsimulation\n",
- "from policyengine_us_data.storage import STORAGE_FOLDER\n",
- "from policyengine_us_data.calibration.unified_matrix_builder import (\n",
- " UnifiedMatrixBuilder,\n",
- ")\n",
- "from policyengine_us_data.calibration.clone_and_assign import (\n",
- " assign_random_geography,\n",
- ")\n",
- "from policyengine_us_data.calibration.calibration_utils import (\n",
- " create_target_groups,\n",
- " drop_target_groups,\n",
- " get_geo_level,\n",
- " STATE_CODES,\n",
- ")\n",
- "\n",
- "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
- "db_uri = f\"sqlite:///{db_path}\"\n",
- "dataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Records: 11,999, Clones: 3, Total columns: 35,997\n",
- "Matrix shape: (1411, 35997)\n",
- "Non-zero entries: 27,035\n"
- ]
- }
- ],
- "source": [
- "sim = Microsimulation(dataset=str(dataset_path))\n",
- "n_records = sim.calculate(\"household_id\", map_to=\"household\").values.shape[0]\n",
- "\n",
- "N_CLONES = 3 # keep small for diagnostics\n",
- "geography = assign_random_geography(n_records, n_clones=N_CLONES, seed=42)\n",
- "\n",
- "builder = UnifiedMatrixBuilder(\n",
- " db_uri=db_uri,\n",
- " time_period=2024,\n",
- " dataset_path=str(dataset_path),\n",
- ")\n",
- "\n",
- "targets_df, X_sparse, target_names = builder.build_matrix(\n",
- " geography,\n",
- " sim,\n",
- " target_filter={\"domain_variables\": [\"aca_ptc\", \"snap\"]},\n",
- " hierarchical_domains=[\"aca_ptc\", \"snap\"],\n",
- ")\n",
- "\n",
- "n_total = n_records * N_CLONES\n",
- "print(f\"Records: {n_records:,}, Clones: {N_CLONES}, Total columns: {n_total:,}\")\n",
- "print(f\"Matrix shape: {X_sparse.shape}\")\n",
- "print(f\"Non-zero entries: {X_sparse.nnz:,}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1.2 Matrix overview"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Targets: 1411\n",
- "Columns: 35,997 (3 clones x 11,999 records)\n",
- "Non-zeros: 27,035\n",
- "Density: 0.000532\n",
- " National: 1 targets\n",
- " State: 102 targets\n",
- " District: 1308 targets\n"
- ]
- }
- ],
- "source": [
- "print(f\"Targets: {X_sparse.shape[0]}\")\n",
- "print(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\n",
- "print(f\"Non-zeros: {X_sparse.nnz:,}\")\n",
- "print(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n",
- "\n",
- "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
- "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
- "for level in [0, 1, 2]:\n",
- " n = (geo_levels == level).sum()\n",
- " if n > 0:\n",
- " print(f\" {level_names[level]}: {n} targets\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1.3 Anatomy of a row\n",
- "\n",
- "Each row is one calibration target — a known aggregate (dollar total, household count, person count) that the optimizer tries to match. The row vector's non-zero entries identify which cloned records can contribute to that target."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Row 705: cd_3402/household_count/[snap>0]\n",
- " variable: household_count\n",
- " geographic_id: 3402\n",
- " geo_level: district\n",
- " target value: 48,652\n",
- " uprating_factor: 1.0\n"
- ]
- }
- ],
- "source": [
- "mid_row = X_sparse.shape[0] // 2\n",
- "row = targets_df.iloc[mid_row]\n",
- "print(f\"Row {mid_row}: {target_names[mid_row]}\")\n",
- "print(f\" variable: {row['variable']}\")\n",
- "print(f\" geographic_id: {row['geographic_id']}\")\n",
- "print(f\" geo_level: {row['geo_level']}\")\n",
- "print(f\" target value: {row['value']:,.0f}\")\n",
- "print(f\" uprating_factor: {row.get('uprating_factor', 'N/A')}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Row 705 has 15 non-zero columns\n",
- " Spans 3 clone(s)\n",
- " Spans 15 unique record(s)\n",
- "\n",
- "First non-zero column (6161):\n",
- " clone_idx: 0\n",
- " record_idx: 6161\n",
- " state_fips: 34\n",
- " cd_geoid: 3402\n",
- " value: 1.00\n"
- ]
- }
- ],
- "source": [
- "row_vec = X_sparse[mid_row, :]\n",
- "nz_cols = row_vec.nonzero()[1]\n",
- "print(f\"Row {mid_row} has {len(nz_cols):,} non-zero columns\")\n",
- "\n",
- "if len(nz_cols) > 0:\n",
- " clone_indices = nz_cols // n_records\n",
- " record_indices = nz_cols % n_records\n",
- " print(f\" Spans {len(np.unique(clone_indices))} clone(s)\")\n",
- " print(f\" Spans {len(np.unique(record_indices))} unique record(s)\")\n",
- "\n",
- " first_col = nz_cols[0]\n",
- " print(f\"\\nFirst non-zero column ({first_col}):\")\n",
- " print(f\" clone_idx: {first_col // n_records}\")\n",
- " print(f\" record_idx: {first_col % n_records}\")\n",
- " print(f\" state_fips: {geography.state_fips[first_col]}\")\n",
- " print(f\" cd_geoid: {geography.cd_geoid[first_col]}\")\n",
- " print(f\" value: {X_sparse[mid_row, first_col]:.2f}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1.4 Anatomy of a column\n",
- "\n",
- "Each column represents one (record, clone) pair. Columns are organized in clone blocks: the first `n_records` columns belong to clone 0, the next to clone 1, and so on. The block formula is:\n",
- "\n",
- "$$\\text{column\\_idx} = \\text{clone\\_idx} \\times n_{\\text{records}} + \\text{record\\_idx}$$"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Column 12041:\n",
- " clone_idx: 1\n",
- " record_idx: 42\n",
- " state_fips: 45\n",
- " cd_geoid: 4507\n",
- " block_geoid: 450410002022009\n",
- "\n",
- "This column has non-zero values in 0 target rows\n"
- ]
- }
- ],
- "source": [
- "col_idx = 1 * n_records + 42 # clone 1, record 42\n",
- "clone_idx = col_idx // n_records\n",
- "record_idx = col_idx % n_records\n",
- "print(f\"Column {col_idx}:\")\n",
- "print(f\" clone_idx: {clone_idx}\")\n",
- "print(f\" record_idx: {record_idx}\")\n",
- "print(f\" state_fips: {geography.state_fips[col_idx]}\")\n",
- "print(f\" cd_geoid: {geography.cd_geoid[col_idx]}\")\n",
- "print(f\" block_geoid: {geography.block_geoid[col_idx]}\")\n",
- "\n",
- "col_vec = X_sparse[:, col_idx]\n",
- "nz_rows = col_vec.nonzero()[0]\n",
- "print(f\"\\nThis column has non-zero values in {len(nz_rows)} target rows\")\n",
- "if len(nz_rows) > 0:\n",
- " print(\"First 5 target rows:\")\n",
- " for r in nz_rows[:5]:\n",
- " row = targets_df.iloc[r]\n",
- " print(\n",
- " f\" row {r}: {row['variable']} \"\n",
- " f\"(geo={row['geographic_id']}, \"\n",
- " f\"val={X_sparse[r, col_idx]:.2f})\"\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1.5 Target groups and loss weighting\n",
- "\n",
- "Target groups partition the rows by (domain, variable, geographic level). Each group contributes equally to the loss function, so hundreds of district-level rows don't drown out a single national row."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "=== Creating Target Groups ===\n",
- "\n",
- "National targets:\n",
- " Group 0: ACA PTC Person Count = 19,743,689\n",
- "\n",
- "State targets:\n",
- " Group 1: SNAP Household Count (51 targets)\n",
- " Group 2: Snap (51 targets)\n",
- "\n",
- "District targets:\n",
- " Group 3: Aca Ptc (436 targets)\n",
- " Group 4: ACA PTC Tax Unit Count (436 targets)\n",
- " Group 5: SNAP Household Count (436 targets)\n",
- "\n",
- "Total groups created: 6\n",
- "========================================\n",
- " group_id description n_targets min_value median_value max_value\n",
- " 0 Group 0: National ACA PTC Person Count (1 target, value=19,743,689) 1 1.974369e+07 1.974369e+07 1.974369e+07\n",
- " 1 Group 1: State SNAP Household Count (51 targets) 51 1.369100e+04 2.772372e+05 3.128640e+06\n",
- " 2 Group 2: State Snap (51 targets) 51 5.670186e+07 1.293585e+09 1.237718e+10\n",
- " 3 Group 3: District Aca Ptc (436 targets) 436 5.420354e+06 2.937431e+07 3.880971e+08\n",
- " 4 Group 4: District ACA PTC Tax Unit Count (436 targets) 436 3.529773e+03 1.686570e+04 9.260854e+04\n",
- " 5 Group 5: District SNAP Household Count (436 targets) 436 1.156792e+04 4.687966e+04 1.735910e+05\n"
- ]
- }
- ],
- "source": [
- "target_groups, group_info = create_target_groups(targets_df)\n",
- "\n",
- "records = []\n",
- "for gid, info in enumerate(group_info):\n",
- " mask = target_groups == gid\n",
- " vals = targets_df.loc[mask, \"value\"]\n",
- " records.append(\n",
- " {\n",
- " \"group_id\": gid,\n",
- " \"description\": info,\n",
- " \"n_targets\": mask.sum(),\n",
- " \"min_value\": vals.min(),\n",
- " \"median_value\": vals.median(),\n",
- " \"max_value\": vals.max(),\n",
- " }\n",
- " )\n",
- "\n",
- "group_df = pd.DataFrame(records)\n",
- "print(group_df.to_string(index=False))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1.6 Tracing a household across clones\n",
- "\n",
- "One CPS record appears once per clone (N_CLONES column positions). Each clone places it in a different census block/CD/state, so it contributes to different geographic targets depending on the clone."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Example SNAP-receiving household: record index 23\n",
- "SNAP value: $70\n",
- "\n",
- "Column positions across 3 clones:\n",
- " col 23: TX (state=48, CD=4829) — 3 non-zero rows\n",
- " col 12022: IL (state=17, CD=1707) — 3 non-zero rows\n",
- " col 24021: CA (state=6, CD=611) — 3 non-zero rows\n"
- ]
- }
- ],
- "source": [
- "snap_values = sim.calculate(\"snap\", map_to=\"household\").values\n",
- "hh_ids = sim.calculate(\"household_id\", map_to=\"household\").values\n",
- "example_hh_idx = int(np.where(snap_values > 0)[0][0])\n",
- "print(f\"Example SNAP-receiving household: record index {example_hh_idx}\")\n",
- "print(f\"SNAP value: ${snap_values[example_hh_idx]:,.0f}\")\n",
- "\n",
- "clone_cols = [c * n_records + example_hh_idx for c in range(N_CLONES)]\n",
- "print(f\"\\nColumn positions across {N_CLONES} clones:\")\n",
- "for col in clone_cols:\n",
- " state = geography.state_fips[col]\n",
- " cd = geography.cd_geoid[col]\n",
- " col_vec = X_sparse[:, col]\n",
- " nnz = col_vec.nnz\n",
- " abbr = STATE_CODES.get(state, \"??\")\n",
- " print(f\" col {col}: {abbr} (state={state}, CD={cd}) — {nnz} non-zero rows\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1.7 Sparsity analysis"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Total cells: 50,791,767\n",
- "Non-zero entries: 27,035\n",
- "Density: 0.000532\n",
- "Sparsity: 99.9468%\n",
- "\n",
- "Non-zeros per row:\n",
- " min: 0\n",
- " median: 10\n",
- " mean: 19\n",
- " max: 4,241\n",
- "\n",
- "By geographic level:\n",
- " National : n= 1, median nnz= 4,241, range=[4,241, 4,241]\n",
- " State : n= 102, median nnz= 68, range=[5, 502]\n",
- " District : n=1308, median nnz= 10, range=[0, 21]\n"
- ]
- }
- ],
- "source": [
- "total_cells = X_sparse.shape[0] * X_sparse.shape[1]\n",
- "density = X_sparse.nnz / total_cells\n",
- "print(f\"Total cells: {total_cells:,}\")\n",
- "print(f\"Non-zero entries: {X_sparse.nnz:,}\")\n",
- "print(f\"Density: {density:.6f}\")\n",
- "print(f\"Sparsity: {1 - density:.4%}\")\n",
- "\n",
- "nnz_per_row = np.diff(X_sparse.indptr)\n",
- "print(f\"\\nNon-zeros per row:\")\n",
- "print(f\" min: {nnz_per_row.min():,}\")\n",
- "print(f\" median: {int(np.median(nnz_per_row)):,}\")\n",
- "print(f\" mean: {nnz_per_row.mean():,.0f}\")\n",
- "print(f\" max: {nnz_per_row.max():,}\")\n",
- "\n",
- "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
- "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
- "print(\"\\nBy geographic level:\")\n",
- "for level in [0, 1, 2]:\n",
- " mask = (geo_levels == level).values\n",
- " if mask.any():\n",
- " vals = nnz_per_row[mask]\n",
- " print(\n",
- " f\" {level_names[level]:10s}: \"\n",
- " f\"n={mask.sum():>4d}, \"\n",
- " f\"median nnz={int(np.median(vals)):>7,}, \"\n",
- " f\"range=[{vals.min():,}, {vals.max():,}]\"\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1.8 Dropping target groups and achievable targets\n",
- "\n",
- "Some target groups are redundant after hierarchical uprating. For example, state-level SNAP Household Count may be redundant with district-level SNAP Household Count once the districts were reconciled to sum to the state totals.\n",
- "\n",
- "A target is achievable if at least one household can contribute to it (row sum > 0). Rows with sum = 0 are impossible constraints that the optimizer cannot satisfy."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Matrix before: 1411 rows\n",
- " DROPPING Group 1: State SNAP Household Count (51 targets) (51 rows)\n",
- "\n",
- " KEEPING Group 0: National ACA PTC Person Count (1 target, value=19,743,689) (1 rows)\n",
- " KEEPING Group 2: State Snap (51 targets) (51 rows)\n",
- " KEEPING Group 3: District Aca Ptc (436 targets) (436 rows)\n",
- " KEEPING Group 4: District ACA PTC Tax Unit Count (436 targets) (436 rows)\n",
- " KEEPING Group 5: District SNAP Household Count (436 targets) (436 rows)\n",
- "\n",
- "Matrix after: 1360 rows\n",
- "\n",
- "Achievable targets: 1339\n",
- "Impossible targets: 21\n",
- "\n",
- "Final matrix shape: (1339, 35997)\n",
- "Final non-zero entries: 22,186\n",
- "This is what the optimizer receives.\n"
- ]
- }
- ],
- "source": [
- "GROUPS_TO_DROP = [\n",
- " (\"SNAP Household Count\", \"State\"),\n",
- "]\n",
- "\n",
- "targets_filtered, X_filtered = drop_target_groups(\n",
- " targets_df, X_sparse, target_groups, group_info, GROUPS_TO_DROP\n",
- ")\n",
- "\n",
- "row_sums = np.array(X_filtered.sum(axis=1)).flatten()\n",
- "achievable_mask = row_sums > 0\n",
- "n_achievable = achievable_mask.sum()\n",
- "n_impossible = (~achievable_mask).sum()\n",
- "\n",
- "print(f\"\\nAchievable targets: {n_achievable}\")\n",
- "print(f\"Impossible targets: {n_impossible}\")\n",
- "\n",
- "X_final = X_filtered[achievable_mask, :]\n",
- "print(f\"\\nFinal matrix shape: {X_final.shape}\")\n",
- "print(f\"Final non-zero entries: {X_final.nnz:,}\")\n",
- "print(f\"This is what the optimizer receives.\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Matrix summary\n",
- "\n",
- "The calibration matrix pipeline has five steps:\n",
- "\n",
- "1. **Clone + assign** — `assign_random_geography()` creates N clones of each CPS record, each with a random census block (and derived CD/state).\n",
- "2. **Build** — `UnifiedMatrixBuilder.build_matrix()` queries targets, applies hierarchical uprating, simulates each clone with its assigned geography, and assembles the sparse CSR matrix.\n",
- "3. **Groups** — `create_target_groups()` partitions rows for balanced loss weighting.\n",
- "4. **Sparsity** — Most of the matrix is zero. District-level targets confine non-zeros to clones assigned to that district; national targets span all clones.\n",
- "5. **Filter** — Remove impossible targets (row sum = 0) before handing to the optimizer."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "---\n",
- "# Part 2: Hierarchical Uprating\n",
- "\n",
- "Calibration targets in `policy_data.db` come from different sources, at different geographic levels, and from different time periods. Before we can use them, two adjustments are needed:\n",
- "\n",
- "1. **Uprating factor (UF)**: Bridges the time gap between the source data's period and the calibration year. For most domains, dollar-valued targets use CPI and count targets use population growth. For **ACA PTC**, we use real state-level enrollment and average APTC changes from CMS/KFF data, giving each state its own UF.\n",
- "\n",
- "2. **Hierarchy inconsistency factor (HIF)**: Corrects for the fact that district-level totals from one source may not sum to the state-level total from another. This is a pure base-year geometry correction with no time dimension.\n",
- "\n",
- "These two factors are **separable by linearity**. For each congressional district row:\n",
- "\n",
- "$$\\text{value} = \\text{original\\_value} \\times \\text{HIF} \\times \\text{UF}$$\n",
- "\n",
- "where $\\text{HIF} = S_{\\text{base}} \\;/\\; \\sum_i CD_{i,\\text{base}}$ and the sum constraint holds:\n",
- "\n",
- "$$\\sum_i (CD_i \\times \\text{HIF} \\times \\text{UF}) = \\text{UF} \\times S_{\\text{base}} = S_{\\text{uprated}}$$\n",
- "\n",
- "Two example domains:\n",
- "- **ACA PTC** (IRS data): Districts sum exactly to state totals, so HIF = 1.0 everywhere. The UF varies by state, reflecting real enrollment and APTC changes between 2022 and 2024.\n",
- "- **SNAP** (USDA data): District household counts substantially undercount the state administrative totals, so HIF > 1 (often 1.2 to 1.7). The SNAP data is already at the target period, so UF = 1.0."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2.1 Raw targets and generic uprating"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
domain_variable
\n",
- "
geo_level
\n",
- "
variable
\n",
- "
period
\n",
- "
count
\n",
- "
total_value
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
0
\n",
- "
aca_ptc
\n",
- "
district
\n",
- "
aca_ptc
\n",
- "
2022
\n",
- "
436
\n",
- "
1.419185e+10
\n",
- "
\n",
- "
\n",
- "
1
\n",
- "
aca_ptc
\n",
- "
district
\n",
- "
tax_unit_count
\n",
- "
2022
\n",
- "
436
\n",
- "
6.848330e+06
\n",
- "
\n",
- "
\n",
- "
2
\n",
- "
aca_ptc
\n",
- "
national
\n",
- "
aca_ptc
\n",
- "
2022
\n",
- "
1
\n",
- "
1.419185e+10
\n",
- "
\n",
- "
\n",
- "
3
\n",
- "
aca_ptc
\n",
- "
national
\n",
- "
person_count
\n",
- "
2024
\n",
- "
1
\n",
- "
1.974369e+07
\n",
- "
\n",
- "
\n",
- "
4
\n",
- "
aca_ptc
\n",
- "
national
\n",
- "
tax_unit_count
\n",
- "
2022
\n",
- "
1
\n",
- "
6.848330e+06
\n",
- "
\n",
- "
\n",
- "
5
\n",
- "
aca_ptc
\n",
- "
state
\n",
- "
aca_ptc
\n",
- "
2022
\n",
- "
51
\n",
- "
1.419185e+10
\n",
- "
\n",
- "
\n",
- "
6
\n",
- "
aca_ptc
\n",
- "
state
\n",
- "
tax_unit_count
\n",
- "
2022
\n",
- "
51
\n",
- "
6.848330e+06
\n",
- "
\n",
- "
\n",
- "
7
\n",
- "
snap
\n",
- "
district
\n",
- "
household_count
\n",
- "
2024
\n",
- "
436
\n",
- "
1.563268e+07
\n",
- "
\n",
- "
\n",
- "
8
\n",
- "
snap
\n",
- "
state
\n",
- "
household_count
\n",
- "
2024
\n",
- "
51
\n",
- "
2.217709e+07
\n",
- "
\n",
- "
\n",
- "
9
\n",
- "
snap
\n",
- "
state
\n",
- "
snap
\n",
- "
2024
\n",
- "
51
\n",
- "
9.365787e+10
\n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " domain_variable geo_level variable period count total_value\n",
- "0 aca_ptc district aca_ptc 2022 436 1.419185e+10\n",
- "1 aca_ptc district tax_unit_count 2022 436 6.848330e+06\n",
- "2 aca_ptc national aca_ptc 2022 1 1.419185e+10\n",
- "3 aca_ptc national person_count 2024 1 1.974369e+07\n",
- "4 aca_ptc national tax_unit_count 2022 1 6.848330e+06\n",
- "5 aca_ptc state aca_ptc 2022 51 1.419185e+10\n",
- "6 aca_ptc state tax_unit_count 2022 51 6.848330e+06\n",
- "7 snap district household_count 2024 436 1.563268e+07\n",
- "8 snap state household_count 2024 51 2.217709e+07\n",
- "9 snap state snap 2024 51 9.365787e+10"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "DOMAINS = [\"aca_ptc\", \"snap\"]\n",
- "\n",
- "raw = builder._query_targets({\"domain_variables\": DOMAINS})\n",
- "\n",
- "summary = (\n",
- " raw.groupby([\"domain_variable\", \"geo_level\", \"variable\", \"period\"])\n",
- " .agg(count=(\"value\", \"size\"), total_value=(\"value\", \"sum\"))\n",
- " .reset_index()\n",
- ")\n",
- "summary"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 2022 -> 2024 (cpi): 1.101889\n",
- " 2022 -> 2024 (pop): 1.020415\n",
- " 2023 -> 2024 (cpi): 1.035512\n",
- " 2023 -> 2024 (pop): 1.010947\n",
- " 2025 -> 2024 (cpi): 0.970879\n",
- " 2025 -> 2024 (pop): 0.990801\n"
- ]
- }
- ],
- "source": [
- "params = sim.tax_benefit_system.parameters\n",
- "uprating_factors = builder._calculate_uprating_factors(params)\n",
- "\n",
- "for (yr, kind), f in sorted(uprating_factors.items()):\n",
- " if f != 1.0:\n",
- " print(f\" {yr} -> 2024 ({kind}): {f:.6f}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2.2 Hierarchical reconciliation\n",
- "\n",
- "For each (state, variable) pair within a domain:\n",
- "\n",
- "- **HIF** = `state_original / sum(cd_originals)` — pure base-year correction\n",
- "- **UF** = state-specific uprating factor:\n",
- " - For **ACA PTC**: loaded from `aca_ptc_multipliers_2022_2024.csv` (CMS/KFF enrollment data)\n",
- " - For other domains: national CPI/pop factors as fallback"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "raw[\"original_value\"] = raw[\"value\"].copy()\n",
- "raw[\"uprating_factor\"] = raw.apply(\n",
- " lambda r: builder._get_uprating_info(r[\"variable\"], r[\"period\"], uprating_factors)[\n",
- " 0\n",
- " ],\n",
- " axis=1,\n",
- ")\n",
- "raw[\"value\"] = raw[\"original_value\"] * raw[\"uprating_factor\"]\n",
- "\n",
- "result = builder._apply_hierarchical_uprating(raw, DOMAINS, uprating_factors)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ACA PTC (HIF=1.0, state-varying UF):\n",
- " CA aca_ptc hif=1.000000 uprating=1.209499 sum(CDs)= 3,332,007,010\n",
- " CA tax_unit_count hif=1.000000 uprating=1.055438 sum(CDs)= 1,302,653\n",
- " TX aca_ptc hif=1.000000 uprating=1.957664 sum(CDs)= 2,270,594,110\n",
- " TX tax_unit_count hif=1.000000 uprating=1.968621 sum(CDs)= 1,125,834\n",
- " NY aca_ptc hif=1.000000 uprating=1.343861 sum(CDs)= 2,049,797,288\n",
- " NY tax_unit_count hif=1.000000 uprating=1.075089 sum(CDs)= 593,653\n",
- "\n",
- "SNAP (HIF>1, UF=1.0):\n",
- " CA household_count hif=1.681273 uprating=1.000000 sum(CDs)= 3,128,640\n",
- " TX household_count hif=1.244524 uprating=1.000000 sum(CDs)= 1,466,107\n",
- " NY household_count hif=1.344447 uprating=1.000000 sum(CDs)= 1,707,770\n"
- ]
- }
- ],
- "source": [
- "sample_states = {6: \"CA\", 48: \"TX\", 36: \"NY\"}\n",
- "\n",
- "\n",
- "def show_reconciliation(result, raw, domain, sample_states):\n",
- " domain_rows = result[result[\"domain_variable\"] == domain]\n",
- " cd_domain = domain_rows[domain_rows[\"geo_level\"] == \"district\"]\n",
- " if cd_domain.empty:\n",
- " print(\" (no district rows)\")\n",
- " return\n",
- " for fips, abbr in sample_states.items():\n",
- " cd_state = cd_domain[\n",
- " cd_domain[\"geographic_id\"].apply(\n",
- " lambda g, s=fips: int(g) // 100 == s if g not in (\"US\",) else False\n",
- " )\n",
- " ]\n",
- " if cd_state.empty:\n",
- " continue\n",
- " for var in sorted(cd_state[\"variable\"].unique()):\n",
- " var_rows = cd_state[cd_state[\"variable\"] == var]\n",
- " hif = var_rows[\"hif\"].iloc[0]\n",
- " uf = var_rows[\"state_uprating_factor\"].iloc[0]\n",
- " cd_sum = var_rows[\"value\"].sum()\n",
- " print(\n",
- " f\" {abbr} {var:20s} \"\n",
- " f\"hif={hif:.6f} \"\n",
- " f\"uprating={uf:.6f} \"\n",
- " f\"sum(CDs)={cd_sum:>14,.0f}\"\n",
- " )\n",
- "\n",
- "\n",
- "print(\"ACA PTC (HIF=1.0, state-varying UF):\")\n",
- "show_reconciliation(result, raw, \"aca_ptc\", sample_states)\n",
- "print(\"\\nSNAP (HIF>1, UF=1.0):\")\n",
- "show_reconciliation(result, raw, \"snap\", sample_states)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2.3 Verification: sum(CDs) == uprated state\n",
- "\n",
- "The core invariant: for every (state, variable) pair that has district rows, the sum of reconciled district values must equal the uprated state total."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 153 checks across 2 domains: ALL PASSED\n"
- ]
- }
- ],
- "source": [
- "all_ok = True\n",
- "checks = 0\n",
- "for domain in DOMAINS:\n",
- " domain_result = result[result[\"domain_variable\"] == domain]\n",
- " cd_result = domain_result[domain_result[\"geo_level\"] == \"district\"]\n",
- " if cd_result.empty:\n",
- " continue\n",
- " for fips, abbr in sorted(STATE_CODES.items()):\n",
- " cd_rows = cd_result[\n",
- " cd_result[\"geographic_id\"].apply(\n",
- " lambda g, s=fips: int(g) // 100 == s if g not in (\"US\",) else False\n",
- " )\n",
- " ]\n",
- " if cd_rows.empty:\n",
- " continue\n",
- " for var in cd_rows[\"variable\"].unique():\n",
- " var_rows = cd_rows[cd_rows[\"variable\"] == var]\n",
- " cd_sum = var_rows[\"value\"].sum()\n",
- " st = raw[\n",
- " (raw[\"geo_level\"] == \"state\")\n",
- " & (raw[\"geographic_id\"] == str(fips))\n",
- " & (raw[\"variable\"] == var)\n",
- " & (raw[\"domain_variable\"] == domain)\n",
- " ]\n",
- " if st.empty:\n",
- " continue\n",
- " state_original = st[\"original_value\"].iloc[0]\n",
- " state_uf = var_rows[\"state_uprating_factor\"].iloc[0]\n",
- " expected = state_original * state_uf\n",
- " ok = np.isclose(cd_sum, expected, rtol=1e-6)\n",
- " checks += 1\n",
- " if not ok:\n",
- " print(\n",
- " f\" FAIL [{domain}] {abbr} {var}: \"\n",
- " f\"sum(CDs)={cd_sum:.2f} != \"\n",
- " f\"expected={expected:.2f}\"\n",
- " )\n",
- " all_ok = False\n",
- "\n",
- "print(\n",
- " f\" {checks} checks across {len(DOMAINS)} domains: \"\n",
- " + (\"ALL PASSED\" if all_ok else \"SOME FAILED\")\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "---\n",
- "# Part 3: H5 Builder Reference\n",
- "\n",
- "`build_h5` is the single function that produces all local-area H5 datasets (national, state, district, city). It lives in `policyengine_us_data/calibration/publish_local_area.py`."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3.1 Signature\n",
- "\n",
- "```python\n",
- "def build_h5(\n",
- " weights: np.ndarray,\n",
- " geography: GeographyAssignment,\n",
- " dataset_path: Path,\n",
- " output_path: Path,\n",
- " cd_subset: List[str] = None,\n",
- " county_filter: set = None,\n",
- " takeup_filter: List[str] = None,\n",
- ") -> Path:\n",
- "```\n",
- "\n",
- "## 3.2 Parameter Semantics\n",
- "\n",
- "| Parameter | Type | Purpose |\n",
- "|---|---|---|\n",
- "| `weights` | `np.ndarray` | Clone-level weight vector, shape `(n_clones * n_hh,)` |\n",
- "| `geography` | `GeographyAssignment` | Geography assignment from `assign_random_geography` |\n",
- "| `dataset_path` | `Path` | Path to base dataset H5 file |\n",
- "| `output_path` | `Path` | Where to write the output H5 file |\n",
- "| `cd_subset` | `List[str]` | If provided, only include clones for these CDs |\n",
- "| `county_filter` | `set` | If provided, scale weights by P(target counties \\| CD) for city datasets |\n",
- "| `takeup_filter` | `List[str]` | List of takeup variables to re-randomize |\n",
- "\n",
- "## 3.3 How `cd_subset` Controls Output Level\n",
- "\n",
- "- **National** (`cd_subset=None`): All CDs included — produces a full national dataset.\n",
- "- **State** (`cd_subset=[CDs in state]`): Filter to CDs whose FIPS prefix matches the state.\n",
- "- **District** (`cd_subset=[single_cd]`): Single CD — produces a district dataset.\n",
- "- **City** (`cd_subset=[NYC CDs]` + `county_filter=NYC_COUNTIES`): Multiple CDs with county filtering. The `county_filter` scales weights by the probability that a household in each CD falls within the target counties."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3.4 Internal Pipeline\n",
- "\n",
- "1. **Load base simulation** — One `Microsimulation` loaded from `dataset_path`. Entity arrays and membership mappings extracted.\n",
- "2. **Reshape weights** — The flat weight vector is reshaped to `(n_clones, n_hh)`.\n",
- "3. **CD subset filtering** — Clones for CDs not in `cd_subset` are zeroed out.\n",
- "4. **County filtering** — If `county_filter` is set, each clone's weight is scaled by `P(target_counties | CD)` via `get_county_filter_probability()`.\n",
- "5. **Identify active clones** — `np.where(W > 0)` finds all nonzero entries. Each represents a distinct household clone.\n",
- "6. **Clone entity arrays** — Entity arrays (household, person, tax_unit, spm_unit, family, marital_unit) are cloned using fancy indexing on the base simulation arrays.\n",
- "7. **Reindex entity IDs** — All entity IDs are reassigned to be globally unique. Cross-reference arrays (e.g., `person_household_id`) are updated accordingly.\n",
- "8. **Derive geography** — Block GEOIDs are mapped to state FIPS, county, tract, CBSA, etc. via `derive_geography_from_blocks()`. Unique blocks are deduplicated for efficiency.\n",
- "9. **Recalculate SPM thresholds** — SPM thresholds are recomputed using `calculate_spm_thresholds_vectorized()` with the clone's CD-level geographic adjustment factor.\n",
- "10. **Rerandomize takeup** (optional) — If enabled, takeup booleans are redrawn per census block using `apply_block_takeup_to_arrays()`.\n",
- "11. **Write H5** — All variable arrays are written to the output file."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3.5 Usage Examples\n",
- "\n",
- "### National\n",
- "```python\n",
- "build_h5(\n",
- " weights=w,\n",
- " geography=geography,\n",
- " dataset_path=Path(\"base.h5\"),\n",
- " output_path=Path(\"national/US.h5\"),\n",
- ")\n",
- "```\n",
- "\n",
- "### State\n",
- "```python\n",
- "state_cds = [cd for cd in geography.cd_geoid if int(cd) // 100 == 6]\n",
- "build_h5(\n",
- " weights=w,\n",
- " geography=geography,\n",
- " dataset_path=Path(\"base.h5\"),\n",
- " output_path=Path(\"states/CA.h5\"),\n",
- " cd_subset=list(set(state_cds)),\n",
- ")\n",
- "```\n",
- "\n",
- "### District\n",
- "```python\n",
- "build_h5(\n",
- " weights=w,\n",
- " geography=geography,\n",
- " dataset_path=Path(\"base.h5\"),\n",
- " output_path=Path(\"districts/CA-12.h5\"),\n",
- " cd_subset=[\"0612\"],\n",
- ")\n",
- "```\n",
- "\n",
- "### City (NYC)\n",
- "```python\n",
- "from policyengine_us_data.calibration.publish_local_area import (\n",
- " NYC_COUNTIES, NYC_CDS,\n",
- ")\n",
- "\n",
- "build_h5(\n",
- " weights=w,\n",
- " geography=geography,\n",
- " dataset_path=Path(\"base.h5\"),\n",
- " output_path=Path(\"cities/NYC.h5\"),\n",
- " cd_subset=NYC_CDS,\n",
- " county_filter=NYC_COUNTIES,\n",
- ")\n",
- "```"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "pe3.13 (3.13.0)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.13.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/docs/data.md b/docs/data.md
index 96ae1d129..8007a5934 100644
--- a/docs/data.md
+++ b/docs/data.md
@@ -2,6 +2,17 @@
Our methodology combines two primary data sources with calibration targets from administrative sources.
+## Data vintages
+
+| Dataset | Vintage | Notes |
+|---------|---------|-------|
+| CPS ASEC | 2024 (income year 2023) | Base microdata; pipeline ages values to target policy year |
+| IRS PUF | 2015 | Pipeline ages values to target policy year using income growth indices |
+| ACS | 2022 | Provides rent and real estate tax imputation targets |
+| SCF | 2022 | Provides wealth and debt variable imputation targets |
+| SIPP | 2023 | Provides tip income and asset imputation targets |
+| Calibration targets | Primarily 2023–2024 | Varies by source; see calibration data sources below |
+
## Primary Data Sources
### Current Population Survey (CPS)
@@ -20,6 +31,8 @@ The PUF provides tax-related variables drawn from filed tax returns. It provides
The PUF has limitations for policy analysis. The file contains minimal demographic information, limited to filing status and exemptions claimed. The IRS removes geographic identifiers to protect taxpayer privacy, which prevents state-level analysis. The population excludes non-filers. The PUF lacks household structure, preventing analysis of how tax policies interact with transfer programs that operate at the household level.
+The enhanced dataset retains the full CPS population, including non-filers. CPS copy-1 records keep original CPS income values for all households, covering non-filers and low-income households absent from PUF. Only copy-2 records replace income variables with PUF imputations and represent the filer population. This structure preserves non-filer coverage while enriching the tax detail for filers.
+
## Additional Data Sources for Imputation
Beyond the PUF, we incorporate data from three additional surveys to impute specific variables missing from the CPS:
diff --git a/docs/internals/README.md b/docs/internals/README.md
new file mode 100644
index 000000000..3e4a99bdd
--- /dev/null
+++ b/docs/internals/README.md
@@ -0,0 +1,231 @@
+# Pipeline internals — developer reference
+
+Internal notebooks for the policyengine-us-data calibration pipeline. Not published in the Jupyter Book. Use these when debugging a wrong aggregate, understanding an implementation choice, or extending the pipeline.
+
+---
+
+## Notebooks
+
+| Notebook | Stages | Required files / inputs |
+|---|---|---|
+| [`data_build_internals.ipynb`](data_build_internals.ipynb) | Stage 1: build_datasets | donor QRF cells need ACS/SIPP/SCF files |
+| [`calibration_package_internals.ipynb`](calibration_package_internals.ipynb) | Stage 2: build_package | Part 1 uses a toy sparse matrix; Parts 2–5 use static excerpts or toy demos |
+| [`local_dataset_assembly_internals.ipynb`](local_dataset_assembly_internals.ipynb) | Stages 3–4: fit_weights, publish_and_stage | L0 toy run; diagnostic cells need a completed run's CSV output |
+
+### Which notebook to open
+
+**Wrong value in an individual record** → `data_build_internals.ipynb`
+The record value is set in Stage 1 and never changed by calibration. The problem is in clone creation, source imputation, or PUF imputation.
+
+**Wrong weighted aggregate despite correct record values** → `calibration_package_internals.ipynb`
+The calibration matrix determines which records contribute to which targets. Check matrix assembly, domain constraints, and takeup randomization.
+
+**Calibration converged but aggregate still off, or H5 values unexpected** → `local_dataset_assembly_internals.ipynb`
+The optimizer may have failed to match a target, or the weight expansion step may be applying incorrect geographic filtering. Check L0 diagnostics and weight expansion.
+
+---
+
+## Pipeline orchestration reference
+
+The pipeline runs on [Modal](https://modal.com) via `modal_app/pipeline.py`. It chains five steps under a single **run ID**, with resume support and per-step checkpointing.
+
+### Run ID format
+
+```
+{version}_{sha[:8]}_{timestamp}
+```
+
+Example: `1.23.0_a3f1b2c4_20260315_142037`
+
+- `version`: package version from `pyproject.toml` as baked into the container image
+- `sha[:8]`: first 8 characters of the branch tip SHA at orchestrator start
+- `timestamp`: UTC datetime in `YYYYMMDD_HHMMSS`
+
+The SHA is pinned at orchestrator start. If the branch moves mid-run, intermediate artifacts may come from different commits — the pipeline warns but does not abort.
+
+### Step dependency graph
+
+```
+Step 1: build_datasets → produces source_imputed_*.h5, policy_data.db
+ ↓
+Step 2: build_package → produces calibration_package.pkl (the calibration matrix)
+ ↓
+Step 3: fit_weights → regional and national fits run in parallel
+ ↓ produces calibration_weights.npy
+Step 4: publish_and_stage → builds H5 files per area, validates, stages to HuggingFace
+ ↓
+Step 5: promote → moves staged H5s to production (no new computation)
+```
+
+Step 3's regional and national fits spawn concurrently (`regional_handle.spawn()` / `national_handle.spawn()`). The orchestrator waits for both before advancing to Step 4.
+
+Default hyperparameters passed in `run_pipeline()`:
+- Regional: `beta=0.65`, `lambda_l0=1e-7`, `lambda_l2=1e-8`, 1,000 epochs, T4 GPU
+- National: `beta=0.65`, `lambda_l0=1e-4`, `lambda_l2=1e-12`, 4,000 epochs, T4 GPU
+
+### Modal volumes
+
+Two Modal volumes back the pipeline:
+
+| Volume name | Mount path | Purpose |
+|---|---|---|
+| `pipeline-artifacts` | `/pipeline` | Run metadata, calibration artifacts, diagnostics |
+| `local-area-staging` | `/staging` | Intermediate H5 files during publish step |
+
+Directory layout inside `pipeline-artifacts`:
+
+```
+/pipeline/
+ runs/
+ {run_id}/
+ meta.json ← run metadata (status, step timings, validation summary)
+ diagnostics/
+ calibration_log.csv
+ unified_diagnostics.csv
+ unified_run_config.json
+ national_calibration_log.csv
+ national_unified_diagnostics.csv
+ validation_results.csv
+ national_validation.txt
+ artifacts/
+ {run_id}/
+ calibration_package.pkl
+ calibration_weights.npy
+ national_calibration_weights.npy
+ source_imputed_*.h5
+ policy_data.db
+```
+
+### `meta.json` structure
+
+```json
+{
+ "run_id": "1.23.0_a3f1b2c4_20260315_142037",
+ "branch": "main",
+ "sha": "a3f1b2c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0",
+ "version": "1.23.0",
+ "start_time": "2026-03-15T14:20:37+00:00",
+ "status": "running | completed | failed | promoted",
+ "step_timings": {
+ "build_datasets": {
+ "start": "2026-03-15T14:20:40+00:00",
+ "end": "2026-03-15T16:45:12+00:00",
+ "duration_s": 8672.1,
+ "status": "completed"
+ },
+ "build_package": { "...": "..." },
+ "fit_weights": { "...": "..." },
+ "publish": { "...": "..." },
+ "validation": {
+ "total_targets": 3842,
+ "sanity_failures": 12,
+ "mean_rel_abs_error": 0.0231,
+ "worst_areas": [...]
+ }
+ },
+ "error": null
+}
+```
+
+### Resume logic
+
+The orchestrator auto-resumes if it finds a run with the same `branch` + `sha` and `status == "running"` in the pipeline volume. Resume skips any step whose `step_timings[step]["status"] == "completed"`.
+
+If the branch has moved since the run started (SHA mismatch), the orchestrator raises a `RuntimeError` and requires starting a fresh run.
+
+To force a resume of a specific run:
+```bash
+modal run --detach modal_app/pipeline.py::main \
+ --action run --resume-run-id 1.23.0_a3f1b2c4_20260315_142037
+```
+
+To start fresh (ignore resumable runs):
+```bash
+modal run --detach modal_app/pipeline.py::main \
+ --action run --branch main
+```
+
+### HuggingFace artifact paths
+
+All artifacts land in `policyengine/policyengine-us-data` (model repo). Promotable artifacts sit under the `staging/` prefix until promoted; run diagnostics upload directly to their permanent `calibration/runs/{run_id}/` paths (see table below).
+
+| Artifact | HF path (staging) | HF path (production after promote) |
+|---|---|---|
+| source_imputed H5s | `staging/calibration/source_imputed_*.h5` | `calibration/source_imputed_*.h5` |
+| policy_data.db | `staging/calibration/policy_data.db` | `calibration/policy_data.db` |
+| Calibration log | `calibration/runs/{run_id}/diagnostics/calibration_log.csv` | — (never promoted) |
+| Unified diagnostics | `calibration/runs/{run_id}/diagnostics/unified_diagnostics.csv` | — (never promoted) |
+| Validation results | `calibration/runs/{run_id}/diagnostics/validation_results.csv` | — (never promoted) |
+| Local area H5s | `staging/` (area-specific paths) | final dataset paths |
+
+Diagnostics are never promoted — they remain under `calibration/runs/{run_id}/` permanently.
+
+To fetch a diagnostic file from a known run ID:
+```python
+from huggingface_hub import hf_hub_download
+
+path = hf_hub_download(
+ repo_id="policyengine/policyengine-us-data",
+ repo_type="model",
+ filename=f"calibration/runs/{run_id}/diagnostics/unified_diagnostics.csv",
+)
+```
+
+### Checking pipeline status
+
+```bash
+modal run modal_app/pipeline.py::main --action status
+```
+
+This reads `meta.json` for all runs in the pipeline volume and prints step completion status and timings.
+
+### Promoting a completed run
+
+```bash
+modal run modal_app/pipeline.py::main \
+ --action promote --run-id 1.23.0_a3f1b2c4_20260315_142037
+```
+
+Promote moves staged H5s to their production paths on HuggingFace. It does not re-run any computation. After promotion, the run's `status` in `meta.json` changes to `"promoted"`.
+
+---
+
+## File reference
+
+> **Note:** This reference reflects the codebase as of the time of writing. File responsibilities may shift as the pipeline evolves — use this as a starting point, then read the file to confirm.
+
+### `policyengine_us_data/calibration/`
+
+| File | Purpose |
+|---|---|
+| `unified_calibration.py` | Main calibration entry point: clones CPS, assigns geography, builds matrix, runs L0 optimizer, saves weights. Start here for the end-to-end flow. |
+| `unified_matrix_builder.py` | Builds the sparse calibration matrix. Per-state simulation, clone loop, domain constraints, takeup re-randomization, COO assembly. |
+| `clone_and_assign.py` | Clones CPS records N times, assigns each clone a random census block with no-CD-collision constraint and AGI-conditional routing. |
+| `block_assignment.py` | Per-CD block assignment and geographic variable derivation (county, tract, CBSA, SLDU, SLDL, place, PUMA, VTD, ZCTA) from block GEOIDs. |
+| `county_assignment.py` | Legacy/fallback: assigns counties within CDs using P(county \| CD). Only called by `block_assignment.py::_generate_fallback_blocks()` when a CD is missing from the pre-computed block distribution (primarily in tests). Not used in production pipeline runs. |
+| `puf_impute.py` | PUF cloning: doubles the dataset, imputes 70+ tax variables via sequential QRF, reconciles Social Security sub-components. |
+| `source_impute.py` | Re-imputes housing and asset variables from ACS, SIPP, and SCF donor surveys using QRF. |
+| `create_source_imputed_cps.py` | Standalone script that runs `source_impute.py` on the stratified extended CPS to produce the dataset used by calibration. |
+| `create_stratified_cps.py` | Creates a stratified CPS sample preserving all high-income households while maintaining low-income diversity. |
+| `publish_local_area.py` | Builds per-area H5 files (states, districts, cities) from calibrated weights. Weight expansion, entity cloning, geography override, SPM recalculation, takeup draws. |
+| `calibration_utils.py` | Shared utilities: state mappings, SPM threshold calculation, geographic adjustment factors, target group functions, initial weight computation. |
+| `target_config.yaml` | Include rules that gate which DB targets enter calibration (applied post-matrix-build). The training config. |
+| `target_config_full.yaml` | Broader include rules used for validation — includes targets not in the training set for holdout evaluation. |
+| `validate_staging.py` | Validates built H5 files by running `sim.calculate()` and comparing weighted aggregates against DB targets. Produces `validation_results.csv`. |
+| `validate_national_h5.py` | Validates the national `US.h5` against known national totals and runs structural sanity checks. |
+| `validate_package.py` | Validates a calibration package (matrix + targets) before uploading to Modal — checks structure, achievability, and provenance. |
+| `sanity_checks.py` | Structural integrity checks on H5 files: weights, monetary variable ranges, takeup booleans, entity ID consistency. |
+| `check_staging_sums.py` | Standalone CLI utility (not part of the automated pipeline): sums key variables across all 51 state H5 files and compares to national references. Run manually via `make check-staging` or `python -m ...`. |
+| `promote_local_h5s.py` | Standalone CLI utility (not part of the automated pipeline): promotes locally-built H5 files to production via HuggingFace staging and GCS upload. Used for manual local builds outside Modal. |
+
+### `modal_app/`
+
+| File | Purpose |
+|---|---|
+| `pipeline.py` | End-to-end pipeline orchestrator: chains dataset build → matrix build → weight fitting → H5 publish → promote. Manages run IDs, resume, and diagnostics upload. |
+| `data_build.py` | Modal app for Stage 1: parallel dataset building (CPS extraction, PUF cloning, source imputation) with checkpoint persistence. |
+| `remote_calibration_runner.py` | Modal app for Stages 2–3: builds calibration package and/or runs L0 optimizer on GPU. Supports `build_package` and `fit_from_package` workflows. |
+| `local_area.py` | Modal app for Stage 4: parallel H5 building with distributed worker coordination, LPT scheduling, and validation aggregation. |
+| `worker_script.py` | Subprocess worker called by `local_area.py` to build individual H5 files. Runs in a separate process to avoid import conflicts. |
+| `images.py` | Defines pre-baked Modal container images with source code, dependencies, and Git metadata for reproducibility. |
+| `resilience.py` | Retry and resume utilities for Modal workflows (exponential backoff, idempotent step execution). |
diff --git a/docs/internals/calibration_package_internals.ipynb b/docs/internals/calibration_package_internals.ipynb
new file mode 100644
index 000000000..195fb7da6
--- /dev/null
+++ b/docs/internals/calibration_package_internals.ipynb
@@ -0,0 +1,1586 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Calibration Pipeline Internals\n",
+ "\n",
+ "Internal reference for debugging and development of the calibration pipeline.\n",
+ "\n",
+ "**Requirements:** `policy_data.db`, `block_cd_distributions.csv.gz`, and the source-imputed stratified CPS H5 file in `STORAGE_FOLDER`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "# Part 1: Anatomy of the calibration matrix \n",
+ "\n",
+ "This section demonstrates the structure of the calibration matrix so you can inspect it. We build toward a crucial idea: understand your matrix and the values constraining your problem before you optimize for it.\n",
+ "\n",
+    "We build a small toy matrix that mirrors the structure produced in production by `UnifiedMatrixBuilder` with clone-based geography from `assign_random_geography`, then inspect its structure: what rows and columns represent, how target groups partition the loss function, and where sparsity patterns emerge.\n",
+ "\n",
+ "**Column layout:** `col = clone_idx * n_records + record_idx`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.1 Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import scipy.sparse as sp\n",
+ "from dataclasses import dataclass\n",
+ "from policyengine_us_data.calibration.calibration_utils import (\n",
+ " create_target_groups,\n",
+ " drop_target_groups,\n",
+ " get_geo_level,\n",
+ " STATE_CODES,\n",
+ ")\n",
+ "\n",
+ "# Toy parameters — used in place of a real CPS H5 to avoid the multi-minute\n",
+ "# runtime of build_matrix() (runs PolicyEngine per state, per clone).\n",
+ "N_CLONES = 3\n",
+ "n_records = 8"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Records: 8, Clones: 3, Columns: 24\n",
+ "Matrix shape: (12, 24)\n",
+ "Non-zeros: 150\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Geography: each of the N_CLONES * n_records columns gets a state + CD.\n",
+ "# The real assign_random_geography() enforces the no-CD-collision invariant;\n",
+ "# here we hand-assign a small example satisfying the same property.\n",
+ "\n",
+ "np.random.seed(42)\n",
+ "\n",
+ "# state_fips and cd_geoid per column (clone * n_records + record)\n",
+ "_state_per_record = np.array([6, 48, 36, 6, 48, 17, 36, 6])\n",
+ "_cd_per_record = np.array([601, 4801, 3601, 602, 4802, 1701, 3602, 603])\n",
+ "\n",
+ "\n",
+ "@dataclass\n",
+ "class ToyGeography:\n",
+ " state_fips: np.ndarray\n",
+ " cd_geoid: np.ndarray\n",
+ " block_geoid: np.ndarray\n",
+ "\n",
+ "\n",
+ "# Replicate across clones with different block assignments\n",
+ "_sf = np.tile(_state_per_record, N_CLONES)\n",
+ "_cd = np.tile(_cd_per_record, N_CLONES)\n",
+ "_bk = np.array([f\"{_cd[i]}B{i:04d}\" for i in range(len(_sf))])\n",
+ "\n",
+ "geography = ToyGeography(\n",
+ " state_fips=_sf,\n",
+ " cd_geoid=np.array([str(c) for c in _cd]),\n",
+ " block_geoid=_bk,\n",
+ ")\n",
+ "\n",
+ "# Toy targets.\n",
+ "# geographic_id: 'US' = national, str(FIPS < 100) = state, str(geoid >= 100) = district.\n",
+ "_rows = [\n",
+ " # domain_variable variable geographic_id value\n",
+ " (\"snap\", \"snap\", \"US\", 15_000_000_000),\n",
+ " (\"snap\", \"snap\", \"6\", 5_000_000_000),\n",
+ " (\"snap\", \"snap\", \"48\", 3_000_000_000),\n",
+ " (\"snap\", \"snap\", \"36\", 2_000_000_000),\n",
+ " (\"snap\", \"snap\", \"17\", 1_000_000_000),\n",
+ " (\"snap\", \"snap_household_count\", \"US\", 5_000_000),\n",
+ " (\"snap\", \"snap_household_count\", \"6\", 1_500_000),\n",
+ " (\"aca_ptc\", \"aca_ptc\", \"US\", 8_000_000_000),\n",
+ " (\"aca_ptc\", \"aca_ptc\", \"6\", 3_000_000_000),\n",
+ " (\"aca_ptc\", \"aca_ptc\", \"601\", 400_000_000),\n",
+ " (\"employment_income\", \"employment_income\", \"US\", 9_000_000_000_000),\n",
+ " (\"employment_income\", \"employment_income\", \"6\", 3_000_000_000_000),\n",
+ "]\n",
+ "targets_df = pd.DataFrame(\n",
+ " _rows, columns=[\"domain_variable\", \"variable\", \"geographic_id\", \"value\"]\n",
+ ")\n",
+ "targets_df[\"uprating_factor\"] = 1.02\n",
+ "target_names = [\n",
+ " f\"{r.domain_variable}_{r.geographic_id}\" for _, r in targets_df.iterrows()\n",
+ "]\n",
+ "n_targets = len(targets_df)\n",
+ "\n",
+ "# Build toy sparse matrix.\n",
+ "# X[i, j] = variable value for column j, or 0 if geography doesn't match target i.\n",
+ "rng = np.random.default_rng(0)\n",
+ "_sr, _sc, _sv = [], [], []\n",
+ "for t_idx, row in targets_df.iterrows():\n",
+ " geo = row[\"geographic_id\"]\n",
+ " for col in range(N_CLONES * n_records):\n",
+ " state = str(geography.state_fips[col])\n",
+ " cd = geography.cd_geoid[col]\n",
+ " if geo == \"US\" or geo == state or geo == cd:\n",
+ " _sr.append(t_idx)\n",
+ " _sc.append(col)\n",
+ " _sv.append(rng.uniform(0.5, 5.0))\n",
+ "\n",
+ "X_sparse = sp.csr_matrix((_sv, (_sr, _sc)), shape=(n_targets, N_CLONES * n_records))\n",
+ "\n",
+ "print(f\"Records: {n_records}, Clones: {N_CLONES}, Columns: {N_CLONES * n_records}\")\n",
+ "print(f\"Matrix shape: {X_sparse.shape}\")\n",
+ "print(f\"Non-zeros: {X_sparse.nnz}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.2 Matrix overview"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Targets: 12\n",
+ "Columns: 24 (3 clones x 8 records)\n",
+ "Non-zeros: 150\n",
+ "Density: 0.520833\n",
+ " National: 4 targets\n",
+ " State: 7 targets\n",
+ " District: 1 targets\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"Targets: {X_sparse.shape[0]}\")\n",
+ "print(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\n",
+ "print(f\"Non-zeros: {X_sparse.nnz:,}\")\n",
+ "print(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n",
+ "\n",
+ "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
+ "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
+ "for level in [0, 1, 2]:\n",
+ " n = (geo_levels == level).sum()\n",
+ " if n > 0:\n",
+ " print(f\" {level_names[level]}: {n} targets\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.3 Anatomy of a row\n",
+ "\n",
+ "Each row is one calibration target — a known aggregate (dollar total, household count, person count) that the optimizer tries to match. The row vector's non-zero entries identify which cloned records can contribute to that target."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Row 6: snap_6\n",
+ " variable: snap_household_count\n",
+ " geographic_id: 6\n",
+ " geo_level: 1\n",
+ " target value: 1,500,000\n",
+ " uprating_factor: 1.02\n"
+ ]
+ }
+ ],
+ "source": [
+ "mid_row = X_sparse.shape[0] // 2\n",
+ "row = targets_df.iloc[mid_row]\n",
+ "print(f\"Row {mid_row}: {target_names[mid_row]}\")\n",
+ "print(f\" variable: {row['variable']}\")\n",
+ "print(f\" geographic_id: {row['geographic_id']}\")\n",
+ "print(f\" geo_level: {get_geo_level(row['geographic_id'])}\")\n",
+ "print(f\" target value: {row['value']:,.0f}\")\n",
+ "print(f\" uprating_factor: {row.get('uprating_factor', 'N/A')}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Row 6 has 9 non-zero columns\n",
+ " Spans 3 clone(s)\n",
+ " Spans 3 unique record(s)\n",
+ "\n",
+ "First non-zero column (0):\n",
+ " clone_idx: 0\n",
+ " record_idx: 0\n",
+ " state_fips: 6\n",
+ " cd_geoid: 601\n",
+ " value: 2.48\n"
+ ]
+ }
+ ],
+ "source": [
+ "row_vec = X_sparse[mid_row, :]\n",
+ "nz_cols = row_vec.nonzero()[1]\n",
+ "print(f\"Row {mid_row} has {len(nz_cols):,} non-zero columns\")\n",
+ "\n",
+ "if len(nz_cols) > 0:\n",
+ " clone_indices = nz_cols // n_records\n",
+ " record_indices = nz_cols % n_records\n",
+ " print(f\" Spans {len(np.unique(clone_indices))} clone(s)\")\n",
+ " print(f\" Spans {len(np.unique(record_indices))} unique record(s)\")\n",
+ "\n",
+ " first_col = nz_cols[0]\n",
+ " print(f\"\\nFirst non-zero column ({first_col}):\")\n",
+ " print(f\" clone_idx: {first_col // n_records}\")\n",
+ " print(f\" record_idx: {first_col % n_records}\")\n",
+ " print(f\" state_fips: {geography.state_fips[first_col]}\")\n",
+ " print(f\" cd_geoid: {geography.cd_geoid[first_col]}\")\n",
+ " print(f\" value: {X_sparse[mid_row, first_col]:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.4 Anatomy of a column\n",
+ "\n",
+ "Each column represents one (record, clone) pair. Columns are organized in clone blocks: the first `n_records` columns belong to clone 0, the next to clone 1, and so on. The block formula is:\n",
+ "\n",
+ "$$\\text{column\\_idx} = \\text{clone\\_idx} \\times n_{\\text{records}} + \\text{record\\_idx}$$"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Column 11:\n",
+ " clone_idx: 1\n",
+ " record_idx: 3\n",
+ " state_fips: 6\n",
+ " cd_geoid: 602\n",
+ " block_geoid: 602B0011\n",
+ "\n",
+ "This column has non-zero values in 8 target rows\n",
+ "First 5 target rows:\n",
+ " row 0: snap (geo=US, val=0.51)\n",
+ " row 1: snap (geo=6, val=3.58)\n",
+ " row 5: snap_household_count (geo=US, val=0.73)\n",
+ " row 6: snap_household_count (geo=6, val=3.29)\n",
+ " row 7: aca_ptc (geo=US, val=0.57)\n"
+ ]
+ }
+ ],
+ "source": [
+ "col_idx = 1 * n_records + 3 # clone 1, record 3\n",
+ "clone_idx = col_idx // n_records\n",
+ "record_idx = col_idx % n_records\n",
+ "print(f\"Column {col_idx}:\")\n",
+ "print(f\" clone_idx: {clone_idx}\")\n",
+ "print(f\" record_idx: {record_idx}\")\n",
+ "print(f\" state_fips: {geography.state_fips[col_idx]}\")\n",
+ "print(f\" cd_geoid: {geography.cd_geoid[col_idx]}\")\n",
+ "print(f\" block_geoid: {geography.block_geoid[col_idx]}\")\n",
+ "\n",
+ "col_vec = X_sparse[:, col_idx]\n",
+ "nz_rows = col_vec.nonzero()[0]\n",
+ "print(f\"\\nThis column has non-zero values in {len(nz_rows)} target rows\")\n",
+ "if len(nz_rows) > 0:\n",
+ " print(\"First 5 target rows:\")\n",
+ " for r in nz_rows[:5]:\n",
+ " row = targets_df.iloc[r]\n",
+ " print(\n",
+ " f\" row {r}: {row['variable']} \"\n",
+ " f\"(geo={row['geographic_id']}, \"\n",
+ " f\"val={X_sparse[r, col_idx]:.2f})\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.5 Target groups and loss weighting\n",
+ "\n",
+    "Target groups partition the rows by (domain, variable, geographic level). This grouping can be used to make each group contribute equally to the loss function, so hundreds of district-level rows don't drown out a single national row. However, group weighting is not currently part of the pipeline: all targets contribute equally to the loss function."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "=== Creating Target Groups ===\n",
+ "\n",
+ "National targets:\n",
+ " Group 0: Aca Ptc = 8,000,000,000\n",
+ " Group 1: Employment Income = 9,000,000,000,000\n",
+ " Group 2: Snap = 15,000,000,000\n",
+ " Group 3: SNAP Snap Household Count = 5,000,000\n",
+ "\n",
+ "State targets:\n",
+ " Group 4: Aca Ptc = 3,000,000,000\n",
+ " Group 5: Employment Income = 3,000,000,000,000\n",
+ " Group 6: Snap (4 targets)\n",
+ " Group 7: SNAP Snap Household Count = 1,500,000\n",
+ "\n",
+ "District targets:\n",
+ " Group 8: Aca Ptc = 400,000,000\n",
+ "\n",
+ "Total groups created: 9\n",
+ "========================================\n",
+ " group_id description n_targets min_value median_value max_value\n",
+ " 0 Group 0: National Aca Ptc (1 target, value=8,000,000,000) 1 8000000000 8.000000e+09 8000000000\n",
+ " 1 Group 1: National Employment Income (1 target, value=9,000,000,000,000) 1 9000000000000 9.000000e+12 9000000000000\n",
+ " 2 Group 2: National Snap (1 target, value=15,000,000,000) 1 15000000000 1.500000e+10 15000000000\n",
+ " 3 Group 3: National SNAP Snap Household Count (1 target, value=5,000,000) 1 5000000 5.000000e+06 5000000\n",
+ " 4 Group 4: State Aca Ptc (1 target, value=3,000,000,000) 1 3000000000 3.000000e+09 3000000000\n",
+ " 5 Group 5: State Employment Income (1 target, value=3,000,000,000,000) 1 3000000000000 3.000000e+12 3000000000000\n",
+ " 6 Group 6: State Snap (4 targets) 4 1000000000 2.500000e+09 5000000000\n",
+ " 7 Group 7: State SNAP Snap Household Count (1 target, value=1,500,000) 1 1500000 1.500000e+06 1500000\n",
+ " 8 Group 8: District Aca Ptc (1 target, value=400,000,000) 1 400000000 4.000000e+08 400000000\n"
+ ]
+ }
+ ],
+ "source": [
+ "target_groups, group_info = create_target_groups(targets_df)\n",
+ "\n",
+ "records = []\n",
+ "for gid, info in enumerate(group_info):\n",
+ " mask = target_groups == gid\n",
+ " vals = targets_df.loc[mask, \"value\"]\n",
+ " records.append(\n",
+ " {\n",
+ " \"group_id\": gid,\n",
+ " \"description\": info,\n",
+ " \"n_targets\": mask.sum(),\n",
+ " \"min_value\": vals.min(),\n",
+ " \"median_value\": vals.median(),\n",
+ " \"max_value\": vals.max(),\n",
+ " }\n",
+ " )\n",
+ "\n",
+ "group_df = pd.DataFrame(records)\n",
+ "print(group_df.to_string(index=False))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.6 Tracing a household across clones\n",
+ "\n",
+ "One CPS record appears once per clone (N_CLONES column positions). Each clone places it in a different census block/CD/state, so it contributes to different geographic targets depending on the clone."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Example SNAP-contributing household: base record index 0\n",
+ "\n",
+ "Column positions across 3 clones:\n",
+ " col 0: CA (state=6, CD=601) — 9 non-zero rows\n",
+ " col 8: CA (state=6, CD=601) — 9 non-zero rows\n",
+ " col 16: CA (state=6, CD=601) — 9 non-zero rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Trace one SNAP-receiving household across all clones.\n",
+ "# In production this uses sim.calculate('snap', ...); here we find a record\n",
+ "# that contributes to at least one SNAP row in the toy matrix.\n",
+ "\n",
+ "snap_mask = targets_df[\"domain_variable\"] == \"snap\"\n",
+ "snap_rows = np.where(snap_mask)[0]\n",
+ "\n",
+ "# Find the first record (base index) with non-zero SNAP contributions\n",
+ "snap_sub = X_sparse[snap_rows, :] # sub-matrix: SNAP rows only\n",
+ "col_nnz = np.diff(snap_sub.tocsc().indptr) # nnz per column\n",
+ "example_hh_idx = int(np.where(col_nnz > 0)[0][0]) % n_records\n",
+ "\n",
+ "print(f\"Example SNAP-contributing household: base record index {example_hh_idx}\")\n",
+ "\n",
+ "clone_cols = [c * n_records + example_hh_idx for c in range(N_CLONES)]\n",
+ "print(f\"\\nColumn positions across {N_CLONES} clones:\")\n",
+ "for col in clone_cols:\n",
+ " state = geography.state_fips[col]\n",
+ " cd = geography.cd_geoid[col]\n",
+ " nnz = X_sparse[:, col].nnz\n",
+ " abbr = STATE_CODES.get(state, \"??\")\n",
+ " print(f\" col {col}: {abbr} (state={state}, CD={cd}) — {nnz} non-zero rows\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.7 Sparsity analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total cells: 288\n",
+ "Non-zero entries: 150\n",
+ "Density: 0.520833\n",
+ "Sparsity: 47.9167%\n",
+ "\n",
+ "Non-zeros per row:\n",
+ " min: 3\n",
+ " median: 9\n",
+ " mean: 12\n",
+ " max: 24\n",
+ "\n",
+ "By geographic level:\n",
+ " National : n= 4, median nnz= 24, range=[24, 24]\n",
+ " State : n= 7, median nnz= 9, range=[3, 9]\n",
+ " District : n= 1, median nnz= 3, range=[3, 3]\n"
+ ]
+ }
+ ],
+ "source": [
+ "total_cells = X_sparse.shape[0] * X_sparse.shape[1]\n",
+ "density = X_sparse.nnz / total_cells\n",
+ "print(f\"Total cells: {total_cells:,}\")\n",
+ "print(f\"Non-zero entries: {X_sparse.nnz:,}\")\n",
+ "print(f\"Density: {density:.6f}\")\n",
+ "print(f\"Sparsity: {1 - density:.4%}\")\n",
+ "\n",
+ "nnz_per_row = np.diff(X_sparse.indptr)\n",
+ "print(f\"\\nNon-zeros per row:\")\n",
+ "print(f\" min: {nnz_per_row.min():,}\")\n",
+ "print(f\" median: {int(np.median(nnz_per_row)):,}\")\n",
+ "print(f\" mean: {nnz_per_row.mean():,.0f}\")\n",
+ "print(f\" max: {nnz_per_row.max():,}\")\n",
+ "\n",
+ "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
+ "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
+ "print(\"\\nBy geographic level:\")\n",
+ "for level in [0, 1, 2]:\n",
+ " mask = (geo_levels == level).values\n",
+ " if mask.any():\n",
+ " vals = nnz_per_row[mask]\n",
+ " print(\n",
+ " f\" {level_names[level]:10s}: \"\n",
+ " f\"n={mask.sum():>4d}, \"\n",
+ " f\"median nnz={int(np.median(vals)):>7,}, \"\n",
+ " f\"range=[{vals.min():,}, {vals.max():,}]\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.8 Dropping targets: groups vs domain filtering\n",
+ "\n",
+ "There are two mechanisms for excluding targets from calibration. They operate at different stages and serve different purposes.\n",
+ "\n",
+ "### Group-based filtering (not used in production)\n",
+ "\n",
+    "`drop_target_groups()` removes entire groups created by `create_target_groups()` from the matrix after it has been built. This was designed for cases where a group is redundant after hierarchical uprating — for example, state-level SNAP Household Count becomes redundant once district-level targets have been reconciled to sum to the state totals.\n",
+ "\n",
+ "This mechanism is **not currently used in the pipeline** (`target_groups=None` is passed to the optimizer). It is preserved for potential future use with group-weighted loss balancing. The example below demonstrates how it works.\n",
+ "\n",
+ "### Domain filtering via `target_config.yaml` (used in production)\n",
+ "\n",
+ "In production, target filtering is driven by `target_config.yaml`, the authoritative gating list. It is applied by `apply_target_config()` after the full matrix has been built but before optimization: any target row not matching an `include` rule is dropped from `targets_df` and `X_sparse` before fitting (see section 3.1). This is how targets are added or removed from calibration — by editing the YAML config, not by hand-dropping groups from the matrix.\n",
+ "\n",
+ "The key difference: group-based filtering removes hand-picked named groups and is not wired into the pipeline, while `target_config.yaml` filtering is rule-based (matching on `variable`, `geo_level`, and optionally `domain_variable`) and runs automatically on every calibration run.\n",
+ "\n",
+ "### Achievable targets\n",
+ "\n",
+ "Regardless of which filtering approach is used, a target is achievable only if at least one household can contribute to it (row sum > 0). Rows with sum = 0 are impossible constraints that the optimizer cannot satisfy and are removed before fitting."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Matrix before: 12 rows\n",
+ " DROPPING Group 7: State SNAP Snap Household Count (1 target, value=1,500,000) (1 rows)\n",
+ "\n",
+ " KEEPING Group 0: National Aca Ptc (1 target, value=8,000,000,000) (1 rows)\n",
+ " KEEPING Group 1: National Employment Income (1 target, value=9,000,000,000,000) (1 rows)\n",
+ " KEEPING Group 2: National Snap (1 target, value=15,000,000,000) (1 rows)\n",
+ " KEEPING Group 3: National SNAP Snap Household Count (1 target, value=5,000,000) (1 rows)\n",
+ " KEEPING Group 4: State Aca Ptc (1 target, value=3,000,000,000) (1 rows)\n",
+ " KEEPING Group 5: State Employment Income (1 target, value=3,000,000,000,000) (1 rows)\n",
+ " KEEPING Group 6: State Snap (4 targets) (4 rows)\n",
+ " KEEPING Group 8: District Aca Ptc (1 target, value=400,000,000) (1 rows)\n",
+ "\n",
+ "Matrix after: 11 rows\n",
+ "\n",
+ "Achievable targets: 11\n",
+ "Impossible targets: 0\n",
+ "\n",
+ "Final matrix shape: (11, 24)\n",
+ "Final non-zero entries: 141\n",
+ "This is what the optimizer receives.\n"
+ ]
+ }
+ ],
+ "source": [
+ "GROUPS_TO_DROP = [\n",
+ " (\"SNAP Household Count\", \"State\"),\n",
+ "]\n",
+ "\n",
+ "targets_filtered, X_filtered = drop_target_groups(\n",
+ " targets_df, X_sparse, target_groups, group_info, GROUPS_TO_DROP\n",
+ ")\n",
+ "\n",
+ "row_sums = np.array(X_filtered.sum(axis=1)).flatten()\n",
+ "achievable_mask = row_sums > 0\n",
+ "n_achievable = achievable_mask.sum()\n",
+ "n_impossible = (~achievable_mask).sum()\n",
+ "\n",
+ "print(f\"\\nAchievable targets: {n_achievable}\")\n",
+ "print(f\"Impossible targets: {n_impossible}\")\n",
+ "\n",
+ "X_final = X_filtered[achievable_mask, :]\n",
+ "print(f\"\\nFinal matrix shape: {X_final.shape}\")\n",
+ "print(f\"Final non-zero entries: {X_final.nnz:,}\")\n",
+ "print(f\"This is what the optimizer receives.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "part4-header",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "# Part 2: Calibration matrix assembly — per-state simulation\n",
+ "\n",
+ "The previous section showed *what* the matrix contains. This section explains *how* `UnifiedMatrixBuilder` fills it in — specifically the per-state simulation pass, domain constraints that gate matrix rows, and the special treatment of county-dependent variables.\n",
+ "\n",
+ "Because the matrix builder runs inside worker processes and cannot share live Python objects across process boundaries, all simulation is done with picklable top-level functions rather than methods."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "part4-workers",
+ "metadata": {},
+ "source": [
+ "## 2.1 Per-state simulation with parallel workers\n",
+ "\n",
+ "Before the clone loop begins, the builder dispatches one simulation job per unique state FIPS that appears across all clone assignments. Each job runs in a `ProcessPoolExecutor` worker and calls `_compute_single_state()` — a module-level function (not a method) so it is picklable.\n",
+ "\n",
+ "Inside `_compute_single_state()`, the worker:\n",
+ "\n",
+ "1. Creates a fresh `Microsimulation` from the base CPS H5 file.\n",
+ "2. Overwrites every household's `state_fips` with a uniform array set to the target state.\n",
+ "3. Invalidates cached downstream variables with `delete_arrays()` so PolicyEngine recomputes them under the new state.\n",
+ "4. Calculates each target variable mapped to the `household` entity.\n",
+ "5. Calculates each constraint variable mapped to the `person` entity (constraints need person-level resolution).\n",
+ "\n",
+ "```python\n",
+ "# From unified_matrix_builder.py — _compute_single_state()\n",
+ "state_sim = Microsimulation(dataset=dataset_path)\n",
+ "\n",
+ "state_sim.set_input(\n",
+ " \"state_fips\",\n",
+ " time_period,\n",
+ " np.full(n_hh, state, dtype=np.int32),\n",
+ ")\n",
+ "for var in get_calculated_variables(state_sim):\n",
+ " state_sim.delete_arrays(var)\n",
+ "\n",
+ "hh = {}\n",
+ "for var in target_vars:\n",
+ " if var.endswith(\"_count\"):\n",
+ " continue\n",
+ " hh[var] = state_sim.calculate(var, time_period, map_to=\"household\").values.astype(np.float32)\n",
+ "\n",
+ "person = {}\n",
+ "for var in constraint_vars:\n",
+ " person[var] = state_sim.calculate(var, time_period, map_to=\"person\").values.astype(np.float32)\n",
+ "```\n",
+ "\n",
+ "The return value is a `(state_fips, {\"hh\": ..., \"person\": ..., \"entity\": ..., \"entity_wf_false\": ...})` tuple. After all state workers finish, the builder collects results into a `state_values` dict keyed by FIPS."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "part4-clone-loop",
+ "metadata": {},
+ "source": [
+ "## 2.2 Clone loop: slicing and assembling\n",
+ "\n",
+ "With precomputed state values in hand, `_process_single_clone()` runs once per clone (also in worker processes, sharing read-only data via `_init_clone_worker()`). Each call:\n",
+ "\n",
+ "1. Reads its slice of the geography arrays: `clone_states = geo_states[col_start:col_end]`.\n",
+ "2. Calls `_assemble_clone_values_standalone()`, which fans out state-level arrays into a per-record array by applying per-state masks:\n",
+ "\n",
+ "```python\n",
+ "# From _assemble_clone_values_standalone()\n",
+ "arr = np.empty(n_records, dtype=np.float32)\n",
+ "for state in unique_clone_states:\n",
+ " mask = state_masks[int(state)] # records assigned to this state\n",
+ " arr[mask] = state_values[int(state)][\"hh\"][var][mask]\n",
+ "hh_vars[var] = arr\n",
+ "```\n",
+ "\n",
+ "3. Evaluates domain constraints (§2.3) and writes non-zero COO entries for every target row.\n",
+ "\n",
+ "Column layout: `col_idx = clone_idx * n_records + record_idx`, so `col_start = clone_idx * n_records` and `col_end = col_start + n_records`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "part4-constraints",
+ "metadata": {},
+ "source": [
+ "## 2.3 Domain constraints: gating which records contribute\n",
+ "\n",
+ "Each target row in the matrix can have a *domain constraint* — a predicate that must be true for a record to contribute a non-zero value to that row. A few common examples:\n",
+ "\n",
+ "| Target | Constraint variable | Constraint | Meaning |\n",
+ "|---|---|---|---|\n",
+ "| `aca_ptc` (national, IRS) | `aca_ptc` | `> 0` | Only tax units that receive ACA PTC count |\n",
+ "| `refundable_ctc` (national) | `refundable_ctc` | `> 0` | Only households with positive refundable CTC |\n",
+ "| `self_employment_income` (national) | `self_employment_income` | `> 0` | Only households with SE income |\n",
+ "\n",
+ "Constraints come from the `stratum_constraints` table in `policy_data.db`. Each target belongs to a stratum, and each stratum can have one or more constraints stored as `(constraint_variable, operation, value)` rows. The ETL scripts (`db/etl_*.py`) populate these constraints when they insert targets. The matrix builder retrieves them via `_get_stratum_constraints(stratum_id)` for each target, separates geographic constraints (like `state_fips` and `congressional_district_geoid`) from non-geographic ones, and passes the non-geo constraints into the clone loop.\n",
+ "\n",
+ "During the clone loop, `_evaluate_constraints_standalone()` applies the predicate at person level and aggregates to household level via `.any()`. A record that does *not* satisfy the constraint contributes 0 to that matrix row — even if it has a non-zero value for the target variable. For the IRS filer-count targets, `tax_unit_is_filer` plays a similar role: only filer tax units appear in those rows.\n",
+ "\n",
+ "```python\n",
+ "# From _evaluate_constraints_standalone()\n",
+ "person_mask = np.ones(n_persons, dtype=bool)\n",
+ "for c in constraints:\n",
+ " vals = person_vars[c[\"variable\"]]\n",
+ " person_mask &= apply_op(vals, c[\"operation\"], c[\"value\"])\n",
+ "\n",
+ "# Aggregate: a household satisfies the constraint if any of its members do\n",
+ "df[\"satisfies\"] = person_mask\n",
+ "hh_mask = df.groupby(\"household_id\")[\"satisfies\"].any()\n",
+ "```\n",
+ "\n",
+ "The final matrix entry is:\n",
+ "\n",
+ "```python\n",
+ "# From _calculate_target_values_standalone()\n",
+ "vals = hh_vars.get(target_variable)\n",
+ "return (vals * mask).astype(np.float32) # mask = 0 for records failing constraint\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.4 Takeup re-randomization in the clone loop\n",
+ "\n",
+ "Several programs in PolicyEngine require a *take-up* draw: a stochastic binary decision representing whether an eligible household actually participates. The draws must be **reproducible and consistent** across the matrix builder and the H5 builder — if the two builds use different draws, the matrix rows target a different subpopulation than what ends up in the H5, breaking calibration.\n",
+ "\n",
+ "### Why takeup lives in the matrix builder\n",
+ "\n",
+ "Takeup draws depend on geography — each entity's take-up rate is resolved from the state FIPS of the census block assigned to that clone. Since different clones of the same household land in different states, they can have different take-up rates and therefore different draws. This per-clone variation is what makes takeup a matrix-building concern rather than a dataset-building concern.\n",
+ "\n",
+ "### `SIMPLE_TAKEUP_VARS` and `TAKEUP_AFFECTED_TARGETS`\n",
+ "\n",
+ "`SIMPLE_TAKEUP_VARS` (in `utils/takeup.py`) is the canonical list of takeup variables. Each entry is a dict with four keys:\n",
+ "\n",
+ "| Key | Meaning |\n",
+ "|---|---|\n",
+ "| `variable` | PolicyEngine boolean input variable (e.g., `takes_up_snap_if_eligible`) |\n",
+ "| `entity` | Native entity of the variable (`spm_unit`, `tax_unit`, or `person`) |\n",
+ "| `rate_key` | Key used to look up the take-up rate from the parameters store |\n",
+ "| `target` | Corresponding calibration target variable, or `None` for non-target draws |\n",
+ "\n",
+ "Current entries cover: `snap` (spm_unit), `aca_ptc` (tax_unit), `dc_property_tax_credit` (tax_unit), `head_start` (person), `early_head_start` (person), `ssi` (person), `medicaid` (person), `tanf` (spm_unit), and `would_file_taxes_voluntarily` (tax_unit, no target).\n",
+ "\n",
+ "`TAKEUP_AFFECTED_TARGETS` is derived automatically from `SIMPLE_TAKEUP_VARS` — it maps each calibration target that has a takeup variable to `{takeup_var, entity, rate_key}`. Before the clone loop begins, the matrix builder matches `TAKEUP_AFFECTED_TARGETS` against the actual target variables to build `affected_target_info`, which tells each clone worker which target rows need takeup blending.\n",
+ "\n",
+ "### Step 1: State precomputation (`_compute_single_state`)\n",
+ "\n",
+ "When `rerandomize_takeup=True`, each state's precomputation runs PolicyEngine multiple times:\n",
+ "\n",
+ "1. **Baseline simulation** — computes household and person values with the dataset's original takeup values. These populate `hh` (for non-takeup-affected targets) and `person` (for constraint evaluation).\n",
+ "\n",
+ "2. **All-takeup-true simulation** — sets every `SIMPLE_TAKEUP_VARS` entry to `True`, clears formula caches, then recalculates each takeup-affected target at entity level. Stored in `entity_vals`. This gives the \"what would this entity's value be if it participated in every program?\" answer.\n",
+ "\n",
+ "3. **Would-file-false simulation** (tax_unit targets only) — sets `would_file_taxes_voluntarily = False`, clears caches, recalculates tax_unit targets. Stored in `entity_wf_false`. This gives the alternative value for non-filers.\n",
+ "\n",
+ "These precomputed values are shared read-only data for all clone workers.\n",
+ "\n",
+ "### Step 2: Clone loop — drawing and blending\n",
+ "\n",
+ "Within `_process_single_clone()`, takeup runs in two phases after assembling per-clone values:\n",
+ "\n",
+ "**Phase 1 — Non-target draws (line 703):** Draws for variables where `target is None` — currently just `would_file_taxes_voluntarily`. These are computed via `compute_block_takeup_for_entities()` and stored in `wf_draws`, keyed by entity. They must be drawn before Phase 2 because tax_unit targets depend on them.\n",
+ "\n",
+ "**Phase 2 — Target value assembly (line 729):** For each takeup-affected target (e.g., `snap`, `aca_ptc`):\n",
+ "1. Retrieves the precomputed `entity_vals[tvar]` (all-takeup-true eligible value) from the state sim.\n",
+ "2. For tax_unit targets: uses `np.where(wf_draws[\"tax_unit\"], entity_vals[tvar], entity_wf_false[tvar])` to select between the all-true value and the would-file-false value, based on the Phase 1 `would_file` draw.\n",
+ "3. Draws a per-entity program takeup boolean via `compute_block_takeup_for_entities()`.\n",
+ "4. Multiplies: `entity_value = eligible_value * takeup_draw`.\n",
+ "5. Aggregates to household level via `np.add.at()` and overwrites `hh_vars[tvar]`.\n",
+ "\n",
+ "### Seeding strategy and the correctness invariant\n",
+ "\n",
+ "`compute_block_takeup_for_entities()` uses a `(variable_name, household_id, clone_idx)` triple as the RNG seed:\n",
+ "\n",
+ "```python\n",
+ "for hh_id in np.unique(entity_hh_ids):\n",
+ " hh_mask = entity_hh_ids == hh_id\n",
+ " for ci in np.unique(entity_clone_indices[hh_mask]):\n",
+ " ci_mask = hh_mask & (entity_clone_indices == ci)\n",
+ " n_ent = int(ci_mask.sum())\n",
+ " rng = seeded_rng(var_name, salt=f\"{int(hh_id)}:{int(ci)}\")\n",
+ " draws[ci_mask] = rng.random(n_ent)\n",
+ "```\n",
+ "\n",
+ "This guarantees:\n",
+ "\n",
+ "- **Same `(var_name, hh_id, clone_idx)`** → same RNG seed → same draw, regardless of call order or process.\n",
+ "- **Different households** → different seeds → independent draws.\n",
+ "- **Different clones of the same household** → different seeds → independent assignments across clones.\n",
+ "\n",
+ "**Critical correctness invariant:** The matrix builder and the H5 builder (`publish_local_area.py`) call `compute_block_takeup_for_entities()` independently, but they must pass the same `var_name`, the same `entity_hh_ids`, and the same `clone_idx` for each record. If either side passes a different `(hh_id, clone_idx)` combination, the draws diverge, and the matrix rows target a different subpopulation than what ends up in the H5.\n",
+ "\n",
+ "### Rate resolution\n",
+ "\n",
+ "`_resolve_rate()` handles both scalar and state-keyed rates. State FIPS is derived from the first two characters of the census block GEOID, so each entity's rate reflects the state it was assigned to in that clone."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Determinism demo\n",
+ "\n",
+ "The following cell verifies that `compute_block_takeup_for_entities()` produces identical draws regardless of call order — the invariant that makes the matrix builder and H5 builder consistent. The matrix builder calls it once per clone (passing a single clone's entities), while the H5 builder may call it with all clones at once. Both must produce the same draws."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Full call draws: [1, 1, 1, 0, 1, 1, 0, 0]\n",
+ "Split call draws: [1, 1, 1, 0, 1, 1, 0, 0]\n",
+ "Match: True\n",
+ "\n",
+ "Per (hh_id, clone_idx):\n",
+ " hh=101, clone=0 -> takeup=1\n",
+ " hh=101, clone=1 -> takeup=1\n",
+ " hh=202, clone=0 -> takeup=1\n",
+ " hh=202, clone=1 -> takeup=0\n",
+ " hh=303, clone=0 -> takeup=1\n",
+ " hh=303, clone=1 -> takeup=1\n",
+ " hh=404, clone=0 -> takeup=0\n",
+ " hh=404, clone=1 -> takeup=0\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "from policyengine_us_data.utils.takeup import compute_block_takeup_for_entities\n",
+ "\n",
+ "# Fake household IDs and clone indices\n",
+ "# 4 households, 2 clones each -> 8 (hh, clone) pairs\n",
+ "# Each household has 1 spm_unit for simplicity\n",
+ "hh_ids = np.array([101, 101, 202, 202, 303, 303, 404, 404], dtype=np.int64)\n",
+ "clone_idxs = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.int64)\n",
+ "# Census block GEOIDs — first two digits = state FIPS\n",
+ "# Using state 06 (CA) for all, so rate is resolved from state code 'CA'\n",
+ "blocks = np.array([\"060750001001001\"] * 8)\n",
+ "\n",
+ "# Scalar take-up rate of 0.75\n",
+ "rate = 0.75\n",
+ "var = \"takes_up_snap_if_eligible\"\n",
+ "\n",
+ "# First call: all 8 entities together (as H5 builder would)\n",
+ "draws_full = compute_block_takeup_for_entities(var, rate, blocks, hh_ids, clone_idxs)\n",
+ "\n",
+ "# Second call: split by clone, then concatenate (as matrix builder would)\n",
+ "mask_c0 = clone_idxs == 0\n",
+ "mask_c1 = clone_idxs == 1\n",
+ "\n",
+ "draws_c0 = compute_block_takeup_for_entities(\n",
+ " var, rate, blocks[mask_c0], hh_ids[mask_c0], clone_idxs[mask_c0]\n",
+ ")\n",
+ "draws_c1 = compute_block_takeup_for_entities(\n",
+ " var, rate, blocks[mask_c1], hh_ids[mask_c1], clone_idxs[mask_c1]\n",
+ ")\n",
+ "\n",
+ "# Reconstruct in original order\n",
+ "draws_split = np.empty(8, dtype=bool)\n",
+ "draws_split[mask_c0] = draws_c0\n",
+ "draws_split[mask_c1] = draws_c1\n",
+ "\n",
+ "print(\"Full call draws: \", draws_full.astype(int).tolist())\n",
+ "print(\"Split call draws: \", draws_split.astype(int).tolist())\n",
+ "print(\"Match:\", np.array_equal(draws_full, draws_split))\n",
+ "\n",
+ "# Show that clone 0 and clone 1 of the same household differ\n",
+ "print(\"\\nPer (hh_id, clone_idx):\")\n",
+ "for i, (hh, ci, d) in enumerate(zip(hh_ids, clone_idxs, draws_full)):\n",
+ " print(f\" hh={hh}, clone={ci} -> takeup={int(d)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "part4-county",
+ "metadata": {},
+ "source": [
+ "## 2.5 County-dependent variables (e.g., `aca_ptc`)\n",
+ "\n",
+ "`COUNTY_DEPENDENT_VARS = {\"aca_ptc\"}` marks variables whose simulated value depends on county-level premium data. ACA PTC eligibility uses county-level benchmark plan premiums, so a household in LA County gets a different premium slot than the same household in Sacramento County, even if the state is the same.\n",
+ "\n",
+ "The builder handles this with a separate `_compute_single_state_group_counties()` pass that runs one simulation per county (reusing a single `Microsimulation` instance per state for efficiency):\n",
+ "\n",
+ "```python\n",
+ "# From _compute_single_state_group_counties()\n",
+ "state_sim = Microsimulation(dataset=dataset_path)\n",
+ "state_sim.set_input(\"state_fips\", time_period, np.full(n_hh, state_fips, dtype=np.int32))\n",
+ "\n",
+ "for county_fips in counties:\n",
+ " county_idx = get_county_enum_index_from_fips(county_fips)\n",
+ " state_sim.set_input(\"county\", time_period, np.full(n_hh, county_idx, dtype=np.int32))\n",
+ " # delete cached downstream arrays (excluding county and zip_code)\n",
+ " for var in get_calculated_variables(state_sim):\n",
+ " if var not in (\"county\", \"zip_code\"):\n",
+ " state_sim.delete_arrays(var)\n",
+ "\n",
+ " hh[var] = state_sim.calculate(var, time_period, map_to=\"household\").values\n",
+ "```\n",
+ "\n",
+ "During clone assembly, `_assemble_clone_values_standalone()` checks whether each target variable is in `COUNTY_DEPENDENT_VARS`. If it is, the assembler fills records using county-keyed results rather than state-keyed results:\n",
+ "\n",
+ "```python\n",
+ "if var in cdv and county_values and clone_counties is not None:\n",
+ " for county in unique_counties:\n",
+ " mask = county_masks[county]\n",
+ " county_hh = county_values.get(county, {}).get(\"hh\", {})\n",
+ " if var in county_hh:\n",
+ " arr[mask] = county_hh[var][mask]\n",
+ " else:\n",
+ " # Fall back to state-level if this county wasn't simulated\n",
+ " st = int(county[:2])\n",
+ " arr[mask] = state_values[st][\"hh\"][var][mask]\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.6 COO assembly: from clone entries to sparse matrix\n",
+ "\n",
+ "After state precomputation, takeup, and constraint evaluation, each clone worker builds its contribution to the calibration matrix as COO (coordinate) entries — a list of `(row, col, value)` triples. This allows efficiently storing non-zero values without having to store a large number of zero-valued matrix cells.\n",
+ "\n",
+ "### Per-clone entry generation\n",
+ "\n",
+ "Inside `_process_single_clone()`, the final loop iterates over every target row:\n",
+ "\n",
+ "1. **Geographic filtering** — determines which columns (records) are relevant. District targets use `cd_to_cols`, state targets use `state_to_cols`, national targets use all columns. Only columns belonging to the current clone slice (`col_start..col_end`) are retained.\n",
+ "\n",
+ "2. **Value computation** — for count targets (`*_count`), calls `_calculate_target_values_standalone()` which counts entities satisfying the constraint. For dollar targets, multiplies the household variable value by the domain constraint mask. Both use caches (`count_cache`, `mask_cache`) keyed by constraint tuple to avoid redundant computation.\n",
+ "\n",
+ "3. **Nonzero filtering** — only entries where `value != 0` are emitted. This is where the matrix gets its sparsity: records outside the target's geography, failing the domain constraint, or with zero variable value produce no entry.\n",
+ "\n",
+ "```python\n",
+ "# Simplified from _process_single_clone()\n",
+ "for row_idx in range(n_targets):\n",
+ " # Geographic filter: which columns belong to this target's area?\n",
+ " if geo_level == \"district\":\n",
+ " clone_cols = cd_to_cols[geo_id] # columns in this CD\n",
+ " elif geo_level == \"state\":\n",
+ " clone_cols = state_to_cols[geo_id]\n",
+ " else:\n",
+ " clone_cols = all_columns\n",
+ " clone_cols = clone_cols[(clone_cols >= col_start) & (clone_cols < col_end)]\n",
+ "\n",
+ " # Value: variable * constraint mask (or entity count)\n",
+ " values = source_vars[variable] * constraint_mask\n",
+ "\n",
+ " # Only emit nonzero entries\n",
+ " vals = values[clone_cols - col_start]\n",
+ " nonzero = vals != 0\n",
+ " rows_list.append(np.full(nonzero.sum(), row_idx))\n",
+ " cols_list.append(clone_cols[nonzero])\n",
+ " vals_list.append(vals[nonzero])\n",
+ "```\n",
+ "\n",
+ "Each clone worker writes its COO entries to a compressed `.npz` file (when using parallel workers) or appends to in-memory lists.\n",
+ "\n",
+ "### Final assembly\n",
+ "\n",
+ "After all clones finish, the builder concatenates every clone's `(rows, cols, vals)` arrays and constructs a single Compressed Sparse Row (CSR) with `scipy.sparse.csr_matrix`:\n",
+ "\n",
+ "```python\n",
+ "# From build_matrix(), step 6\n",
+ "for ci in range(n_clones):\n",
+ " data = np.load(clone_dir / f\"clone_{ci:04d}.npz\")\n",
+ " all_r.append(data[\"rows\"])\n",
+ " all_c.append(data[\"cols\"])\n",
+ " all_v.append(data[\"vals\"])\n",
+ "\n",
+ "X_csr = sparse.csr_matrix(\n",
+ " (np.concatenate(all_v), (np.concatenate(all_r), np.concatenate(all_c))),\n",
+ " shape=(n_targets, n_total),\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "The resulting matrix has shape `(n_targets, n_records * n_clones)` — rows are calibration targets, columns are cloned household records. This is what the L0 optimizer receives."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "# Part 3: Configuring calibration targets after matrix build\n",
+ "\n",
+ "## 3.1 `target_config.yaml` include/exclude rules\n",
+ "\n",
+ "`target_config.yaml` controls which targets from the built matrix are passed to the L0 optimizer. It is applied **after** the matrix is built, not during construction — the full unfiltered calibration package is saved first (Step 6b in `run_calibration`), then `apply_target_config()` filters targets for fitting (Step 6c). This design lets the same expensive matrix package be reused with different configs without rebuilding.\n",
+ "\n",
+ "The filtering works by matching rows in `targets_df` against include/exclude rules. If `include` rules are present, only matching targets survive. If `exclude` rules are also present, they remove from the included set. The corresponding rows in `X_sparse` and `target_names` are dropped in sync.\n",
+ "\n",
+ "```python\n",
+ "# From run_calibration(), Step 6c\n",
+ "if target_config:\n",
+ " targets_df, X_sparse, target_names = apply_target_config(\n",
+ " targets_df, X_sparse, target_names, target_config\n",
+ " )\n",
+ "```\n",
+ "\n",
+ "Each rule in `target_config.yaml` has three fields:\n",
+ "\n",
+ "| Field | Required | Meaning |\n",
+ "|---|---|---|\n",
+ "| `variable` | yes | PolicyEngine variable name |\n",
+ "| `geo_level` | yes | One of `district`, `state`, `national` |\n",
+ "| `domain_variable` | no | If present, only matches targets whose stratum has this constraint variable in the DB |\n",
+ "\n",
+ "The `domain_variable` field here is a **row-matching filter**, not a constraint definition. It narrows which rows in the already-built matrix are kept for optimization. The actual domain constraints that gate which records contribute to each matrix row are stored in `stratum_constraints` in `policy_data.db` and are applied during matrix construction (see section 2.3).\n",
+ "\n",
+ "Example entries from `target_config.yaml`:\n",
+ "\n",
+ "```yaml\n",
+ "include:\n",
+ " # ACA PTC at district level\n",
+ " - variable: aca_ptc\n",
+ " geo_level: district\n",
+ "\n",
+ " # ACA PTC at national level — only the domain-constrained target\n",
+ " - variable: aca_ptc\n",
+ " geo_level: national\n",
+ " domain_variable: aca_ptc\n",
+ "\n",
+ " # SNAP at state level\n",
+ " - variable: snap\n",
+ " geo_level: state\n",
+ "```\n",
+ "\n",
+ "The inline comments in `target_config.yaml` document why specific targets were removed (e.g., `# REMOVED: state_income_tax — ETL hardcodes $0 for WA and NH`).\n",
+ "\n",
+ "### Distinction from `target_filter`\n",
+ "\n",
+ "The matrix builder's `_query_targets()` accepts a separate `target_filter` dict (populated from `--domain-variables` CLI args) that filters targets at DB query time — *during* the build. In the default pipeline this filter is empty, so all active targets from `policy_data.db` enter the matrix. `target_config.yaml` then selects the subset to optimize against."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.2 Achievable target filtering\n",
+ "\n",
+ "After `target_config.yaml` filtering, a target is **achievable** only if at least one record in the matrix can contribute to it — i.e., the row sum is positive. Targets with row sum = 0 are impossible constraints: no combination of weights can make the weighted sum match a nonzero target value if every entry in that row is zero.\n",
+ "\n",
+ "This check runs immediately before the optimizer call in `run_calibration()` (Step 7):\n",
+ "\n",
+ "```python\n",
+ "row_sums = np.array(X_sparse.sum(axis=1)).flatten()\n",
+ "achievable = row_sums > 0\n",
+ "```\n",
+ "\n",
+ "The `achievable` boolean array is passed to `fit_l0_weights()`, which uses it to exclude impossible targets from the loss function. This prevents the optimizer from wasting gradient signal on targets it can never satisfy.\n",
+ "\n",
+ "Common causes of unachievable targets:\n",
+ "- A geographic target (e.g., a specific CD) where no clones landed in that area after the no-collision constraint\n",
+ "- A domain-constrained target where the constraint variable is zero for all records (e.g., a program that no CPS respondent participates in)\n",
+ "- A target that was active in the DB but whose variable isn't computed by the current version of PolicyEngine"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "# Part 4: Hierarchical uprating\n",
+ "\n",
+ "Calibration targets in `policy_data.db` come from different sources, at different geographic levels, and from different time periods. Before we can use them in calibration, two adjustments are needed:\n",
+ "\n",
+ "1. **Uprating factor (UF)**: Bridges the time gap between the source data's period and the calibration year. For most domains, dollar-valued targets use CPI and count targets use population growth. For **ACA PTC**, we use real state-level enrollment and average APTC changes from CMS/KFF data, giving each state its own UF.\n",
+ "\n",
+ "2. **Hierarchy inconsistency factor (HIF)**: Corrects for the fact that district-level totals from one source may not sum to the state-level total from another. This is a pure base-year geometry correction with no time dimension.\n",
+ "\n",
+ "These two factors are **separable by linearity**. For each congressional district row:\n",
+ "\n",
+ "$$\\text{value} = \\text{original\\_value} \\times \\text{HIF} \\times \\text{UF}$$\n",
+ "\n",
+ "where $\\text{HIF} = S_{\\text{base}} \\;/\\; \\sum_i CD_{i,\\text{base}}$ and the sum constraint holds:\n",
+ "\n",
+ "$$\\sum_i (CD_i \\times \\text{HIF} \\times \\text{UF}) = \\text{UF} \\times S_{\\text{base}} = S_{\\text{uprated}}$$\n",
+ "\n",
+ "Two example domains:\n",
+ "- **ACA PTC** (IRS data): Districts sum exactly to state totals, so HIF = 1.0 everywhere. The UF varies by state, reflecting real enrollment and APTC changes between 2022 and 2024.\n",
+ "- **SNAP** (USDA data): District household counts substantially undercount the state administrative totals, so HIF > 1 (often 1.2 to 1.7). The SNAP data is already at the target period, so UF = 1.0."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.1 Raw targets and generic uprating"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ─── Part 4 setup ──────────────────────────────────────────────────────────\n",
+ "# The uprating walkthrough below requires a fully initialized UnifiedMatrixBuilder\n",
+ "# and a Microsimulation. These load the full source-imputed H5 and policy_data.db.\n",
+ "#\n",
+ "# from policyengine_us import Microsimulation\n",
+ "# from policyengine_us_data import EnhancedCPS_2024\n",
+ "# from policyengine_us_data.storage import STORAGE_FOLDER\n",
+ "# from policyengine_us_data.calibration.unified_matrix_builder import UnifiedMatrixBuilder\n",
+ "\n",
+ "# sim = Microsimulation(dataset=EnhancedCPS_2024)\n",
+ "# db_path = str(STORAGE_FOLDER / \"calibration\" / \"policy_data.db\")\n",
+ "# builder = UnifiedMatrixBuilder(\n",
+ "# db_uri=f\"sqlite:///{db_path}\",\n",
+ "# time_period=2024,\n",
+ "# dataset_path=str(EnhancedCPS_2024.file_path),\n",
+ "# )\n",
+ "#\n",
+ "# Without the above, Part 4 cells print a skip message and set outputs to None.\n",
+ "\n",
+ "try:\n",
+ " builder\n",
+ " sim\n",
+ "except NameError:\n",
+ " builder = None\n",
+ " sim = None\n",
+ " print(\"Part 4: builder/sim not initialized — cells will skip gracefully.\")\n",
+ " print(\"Uncomment the setup lines above and re-run to execute the full walkthrough.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>domain_variable</th>\n",
+ "      <th>geo_level</th>\n",
+ "      <th>variable</th>\n",
+ "      <th>period</th>\n",
+ "      <th>count</th>\n",
+ "      <th>total_value</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>district</td>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>2022</td>\n",
+ "      <td>436</td>\n",
+ "      <td>1.419185e+10</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>district</td>\n",
+ "      <td>tax_unit_count</td>\n",
+ "      <td>2022</td>\n",
+ "      <td>436</td>\n",
+ "      <td>6.848330e+06</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>national</td>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>2022</td>\n",
+ "      <td>1</td>\n",
+ "      <td>1.419185e+10</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>national</td>\n",
+ "      <td>person_count</td>\n",
+ "      <td>2024</td>\n",
+ "      <td>1</td>\n",
+ "      <td>1.974369e+07</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>4</th>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>national</td>\n",
+ "      <td>tax_unit_count</td>\n",
+ "      <td>2022</td>\n",
+ "      <td>1</td>\n",
+ "      <td>6.848330e+06</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>5</th>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>state</td>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>2022</td>\n",
+ "      <td>51</td>\n",
+ "      <td>1.419185e+10</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>6</th>\n",
+ "      <td>aca_ptc</td>\n",
+ "      <td>state</td>\n",
+ "      <td>tax_unit_count</td>\n",
+ "      <td>2022</td>\n",
+ "      <td>51</td>\n",
+ "      <td>6.848330e+06</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>7</th>\n",
+ "      <td>snap</td>\n",
+ "      <td>district</td>\n",
+ "      <td>household_count</td>\n",
+ "      <td>2024</td>\n",
+ "      <td>436</td>\n",
+ "      <td>1.563268e+07</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>8</th>\n",
+ "      <td>snap</td>\n",
+ "      <td>state</td>\n",
+ "      <td>household_count</td>\n",
+ "      <td>2024</td>\n",
+ "      <td>51</td>\n",
+ "      <td>2.217709e+07</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>9</th>\n",
+ "      <td>snap</td>\n",
+ "      <td>state</td>\n",
+ "      <td>snap</td>\n",
+ "      <td>2024</td>\n",
+ "      <td>51</td>\n",
+ "      <td>9.365787e+10</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " domain_variable geo_level variable period count total_value\n",
+ "0 aca_ptc district aca_ptc 2022 436 1.419185e+10\n",
+ "1 aca_ptc district tax_unit_count 2022 436 6.848330e+06\n",
+ "2 aca_ptc national aca_ptc 2022 1 1.419185e+10\n",
+ "3 aca_ptc national person_count 2024 1 1.974369e+07\n",
+ "4 aca_ptc national tax_unit_count 2022 1 6.848330e+06\n",
+ "5 aca_ptc state aca_ptc 2022 51 1.419185e+10\n",
+ "6 aca_ptc state tax_unit_count 2022 51 6.848330e+06\n",
+ "7 snap district household_count 2024 436 1.563268e+07\n",
+ "8 snap state household_count 2024 51 2.217709e+07\n",
+ "9 snap state snap 2024 51 9.365787e+10"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "DOMAINS = [\"aca_ptc\", \"snap\"]\n",
+ "\n",
+ "if builder is not None:\n",
+ " raw = builder._query_targets({\"domain_variables\": DOMAINS})\n",
+ " summary = (\n",
+ " raw.groupby([\"domain_variable\", \"geo_level\", \"variable\", \"period\"])\n",
+ " .agg(count=(\"value\", \"size\"), total_value=(\"value\", \"sum\"))\n",
+ " .reset_index()\n",
+ " )\n",
+ " display(summary)\n",
+ "else:\n",
+ " raw = None\n",
+ " print(\"Skipping — builder not initialized.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " 2022 -> 2024 (cpi): 1.101889\n",
+ " 2022 -> 2024 (pop): 1.020415\n",
+ " 2023 -> 2024 (cpi): 1.035512\n",
+ " 2023 -> 2024 (pop): 1.010947\n",
+ " 2025 -> 2024 (cpi): 0.970879\n",
+ " 2025 -> 2024 (pop): 0.990801\n"
+ ]
+ }
+ ],
+ "source": [
+ "if builder is not None:\n",
+ " params = sim.tax_benefit_system.parameters\n",
+ " uprating_factors = builder._calculate_uprating_factors(params)\n",
+ " for (yr, kind), f in sorted(uprating_factors.items()):\n",
+ " if f != 1.0:\n",
+ " print(f\" {yr} -> 2024 ({kind}): {f:.6f}\")\n",
+ "else:\n",
+ " uprating_factors = None\n",
+ " print(\"Skipping — builder not initialized.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.2 Hierarchical reconciliation\n",
+ "\n",
+ "For each (state, variable) pair within a domain:\n",
+ "\n",
+ "- **HIF** = `state_original / sum(cd_originals)` — pure base-year correction\n",
+ "- **UF** = state-specific uprating factor:\n",
+ " - For **ACA PTC**: loaded from `aca_ptc_multipliers_2022_2024.csv` (CMS/KFF enrollment data)\n",
+ " - For other domains: national CPI/pop factors as fallback"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if builder is not None and raw is not None:\n",
+ " raw[\"original_value\"] = raw[\"value\"].copy()\n",
+ " raw[\"uprating_factor\"] = raw.apply(\n",
+ " lambda r: builder._get_uprating_info(\n",
+ " r[\"variable\"], r[\"period\"], uprating_factors\n",
+ " )[0],\n",
+ " axis=1,\n",
+ " )\n",
+ " raw[\"value\"] = raw[\"original_value\"] * raw[\"uprating_factor\"]\n",
+ " result = builder._apply_hierarchical_uprating(raw, DOMAINS, uprating_factors)\n",
+ "else:\n",
+ " result = None\n",
+ " print(\"Skipping — builder not initialized.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ACA PTC (HIF=1.0, state-varying UF):\n",
+ " CA aca_ptc hif=1.000000 uprating=1.209499 sum(CDs)= 3,332,007,010\n",
+ " CA tax_unit_count hif=1.000000 uprating=1.055438 sum(CDs)= 1,302,653\n",
+ " TX aca_ptc hif=1.000000 uprating=1.957664 sum(CDs)= 2,270,594,110\n",
+ " TX tax_unit_count hif=1.000000 uprating=1.968621 sum(CDs)= 1,125,834\n",
+ " NY aca_ptc hif=1.000000 uprating=1.343861 sum(CDs)= 2,049,797,288\n",
+ " NY tax_unit_count hif=1.000000 uprating=1.075089 sum(CDs)= 593,653\n",
+ "\n",
+ "SNAP (HIF>1, UF=1.0):\n",
+ " CA household_count hif=1.681273 uprating=1.000000 sum(CDs)= 3,128,640\n",
+ " TX household_count hif=1.244524 uprating=1.000000 sum(CDs)= 1,466,107\n",
+ " NY household_count hif=1.344447 uprating=1.000000 sum(CDs)= 1,707,770\n"
+ ]
+ }
+ ],
+ "source": [
+ "sample_states = {6: \"CA\", 48: \"TX\", 36: \"NY\"}\n",
+ "\n",
+ "\n",
+ "def show_reconciliation(result, raw, domain, sample_states):\n",
+ " domain_rows = result[result[\"domain_variable\"] == domain]\n",
+ " cd_domain = domain_rows[domain_rows[\"geo_level\"] == \"district\"]\n",
+ " if cd_domain.empty:\n",
+ " print(\" (no district rows)\")\n",
+ " return\n",
+ " for fips, abbr in sample_states.items():\n",
+ " cd_state = cd_domain[\n",
+ " cd_domain[\"geographic_id\"].apply(\n",
+ " lambda g, s=fips: int(g) // 100 == s if g not in (\"US\",) else False\n",
+ " )\n",
+ " ]\n",
+ " if cd_state.empty:\n",
+ " continue\n",
+ " for var in sorted(cd_state[\"variable\"].unique()):\n",
+ " var_rows = cd_state[cd_state[\"variable\"] == var]\n",
+ " hif = var_rows[\"hif\"].iloc[0]\n",
+ " uf = var_rows[\"state_uprating_factor\"].iloc[0]\n",
+ " cd_sum = var_rows[\"value\"].sum()\n",
+ " print(\n",
+ " f\" {abbr} {var:20s} \"\n",
+ " f\"hif={hif:.6f} \"\n",
+ " f\"uprating={uf:.6f} \"\n",
+ " f\"sum(CDs)={cd_sum:>14,.0f}\"\n",
+ " )\n",
+ "\n",
+ "\n",
+ "if result is not None:\n",
+ " print(\"ACA PTC (HIF=1.0, state-varying UF):\")\n",
+ " show_reconciliation(result, raw, \"aca_ptc\", sample_states)\n",
+ " print(\"\\nSNAP (HIF>1, UF=1.0):\")\n",
+ " show_reconciliation(result, raw, \"snap\", sample_states)\n",
+ "else:\n",
+ " print(\"Skipping — result not available.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.3 Verification: sum(CDs) == uprated state\n",
+ "\n",
+ "The core invariant: for every (state, variable) pair that has district rows, the sum of reconciled district values must equal the uprated state total."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " 153 checks across 2 domains: ALL PASSED\n"
+ ]
+ }
+ ],
+ "source": [
+ "if result is not None and raw is not None:\n",
+ " all_ok = True\n",
+ " checks = 0\n",
+ " for domain in DOMAINS:\n",
+ " domain_result = result[result[\"domain_variable\"] == domain]\n",
+ " cd_result = domain_result[domain_result[\"geo_level\"] == \"district\"]\n",
+ " if cd_result.empty:\n",
+ " continue\n",
+ " for fips, abbr in sorted(STATE_CODES.items()):\n",
+ " cd_rows = cd_result[\n",
+ " cd_result[\"geographic_id\"].apply(\n",
+ " lambda g, s=fips: int(g) // 100 == s if g not in (\"US\",) else False\n",
+ " )\n",
+ " ]\n",
+ " if cd_rows.empty:\n",
+ " continue\n",
+ " for var in cd_rows[\"variable\"].unique():\n",
+ " var_rows = cd_rows[cd_rows[\"variable\"] == var]\n",
+ " cd_sum = var_rows[\"value\"].sum()\n",
+ " st = raw[\n",
+ " (raw[\"geo_level\"] == \"state\")\n",
+ " & (raw[\"geographic_id\"] == str(fips))\n",
+ " & (raw[\"variable\"] == var)\n",
+ " & (raw[\"domain_variable\"] == domain)\n",
+ " ]\n",
+ " if st.empty:\n",
+ " continue\n",
+ " state_original = st[\"original_value\"].iloc[0]\n",
+ " state_uf = var_rows[\"state_uprating_factor\"].iloc[0]\n",
+ " expected = state_original * state_uf\n",
+ " ok = np.isclose(cd_sum, expected, rtol=1e-6)\n",
+ " checks += 1\n",
+ " if not ok:\n",
+ " print(\n",
+ " f\" FAIL [{domain}] {abbr} {var}: \"\n",
+ " f\"sum(CDs)={cd_sum:.2f} != \"\n",
+ " f\"expected={expected:.2f}\"\n",
+ " )\n",
+ " all_ok = False\n",
+ "\n",
+ " print(\n",
+ " f\" {checks} checks across {len(DOMAINS)} domains: \"\n",
+ " + (\"ALL PASSED\" if all_ok else \"SOME FAILED\")\n",
+ " )\n",
+ "else:\n",
+ " print(\"Skipping — result not available.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "# Part 5: Calibration package and initial weights\n",
+ "\n",
+ "## 5.1 Calibration package serialization\n",
+ "\n",
+ "Building the calibration matrix is the most expensive step in the pipeline — it requires running PolicyEngine simulations for every state across all clones. To avoid rebuilding when experimenting with different `target_config.yaml` settings or hyperparameters, `run_calibration()` saves the **full unfiltered** matrix as a calibration package (Step 6b) before applying any config filtering.\n",
+ "\n",
+ "`save_calibration_package()` serializes:\n",
+ "\n",
+ "| Field | Contents |\n",
+ "|---|---|\n",
+ "| `X_sparse` | The full sparse matrix (all targets, all records) |\n",
+ "| `targets_df` | Target metadata DataFrame (variable, value, geo_level, domain, etc.) |\n",
+ "| `target_names` | Human-readable target name strings |\n",
+ "| `initial_weights` | Population-proportional starting weights (see 5.2) |\n",
+ "| `cd_geoid` | Congressional district GEOID per record (from geography assignment) |\n",
+ "| `block_geoid` | Census block GEOID per record |\n",
+ "| `metadata` | Provenance: git SHA, dataset path, checksums, creation timestamp |\n",
+ "\n",
+ "The package is a pickle file. `load_calibration_package()` restores it and runs provenance checks:\n",
+ "- Prints the git SHA and dataset path from `metadata`\n",
+ "- Warns if the package is stale (created more than 7 days ago or from a different git SHA than the current checkout)\n",
+ "\n",
+ "The `fit_from_package_*` Modal functions in `remote_calibration_runner.py` use this workflow: build the package once with `build_package_remote()`, then call `fit_from_package_*()` repeatedly with different configs/hyperparameters.\n",
+ "\n",
+ "## 5.2 Initial weight computation\n",
+ "\n",
+ "The L0 optimizer needs a starting weight for each record. Rather than uniform initialization, `compute_initial_weights()` in `unified_calibration.py` uses **age-bin population targets** to set per-CD proportional weights:\n",
+ "\n",
+ "1. Find all `person_count` targets where `domain_variable == \"age\"` and `geo_level == \"district\"`.\n",
+ "2. For each congressional district, sum the age-bin target values to get the district's total population.\n",
+ "3. Identify which matrix columns (records) have nonzero entries in those target rows — these are the records active in that CD.\n",
+ "4. Set each record's initial weight to `district_population / n_active_records`.\n",
+ "\n",
+ "```python\n",
+ "# From compute_initial_weights()\n",
+ "for cd_id, group in cd_groups:\n",
+ " cd_pop = group[\"value\"].sum()\n",
+ " # Find columns with nonzero entries in this CD's age rows\n",
+ " col_set = set()\n",
+ " for ri in group.index:\n",
+ " col_set.update(X_sparse[ri].indices)\n",
+ " w = cd_pop / len(col_set)\n",
+ " for c in col_set:\n",
+ " initial_weights[c] = w\n",
+ "```\n",
+ "\n",
+ "This gives the optimizer a head start: records in high-population districts begin with higher weights, and records in small districts begin with lower weights. Without this, the optimizer would spend early epochs just learning the population scale of each district.\n",
+ "\n",
+ "If no age-bin district targets are found (e.g., when running with a minimal config), the function falls back to uniform weights of 100."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "After Part 5, the matrix, filtered targets, initial weights, and achievable mask are ready. See `optimization_and_local_dataset_assembly_internals.ipynb` for how the L0 optimizer uses these inputs to find calibrated weights and how the final H5 datasets are assembled."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "pe3.13 (3.13.0)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/internals/data_build_internals.ipynb b/docs/internals/data_build_internals.ipynb
new file mode 100644
index 000000000..7cf5ab95c
--- /dev/null
+++ b/docs/internals/data_build_internals.ipynb
@@ -0,0 +1,954 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000001",
+ "metadata": {},
+ "source": [
+ "# Data build internals\n",
+ "\n",
+ "Internal reference for Stage 1 of the calibration pipeline: clone creation, source variable imputation, and PUF cloning.\n",
+ "\n",
+ "**Requires:** `block_cd_distributions.csv.gz` in storage (from `make data`) for the clone creation cells. PUF cloning cells (Part 2) use toy DataFrames. Source imputation cells (Part 3) need ACS/SIPP/SCF donor files; those cells demonstrate the QRF concept with small synthetic data instead. \n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000002",
+ "metadata": {},
+ "source": [
+ "## Pipeline overview\n",
+ "\n",
+ "Stage 1 of the calibration pipeline transforms a raw CPS extract into an expanded dataset ready for the matrix-building step. The sequence is:\n",
+ "\n",
+ "1. **PUF clone** — double the record count; impute 70+ tax variables on the second half from the PUF (`puf_impute.py` via `extended_cps.py`).\n",
+ "2. **Clone and assign geography** — replicate each record N times and give each clone a random census block drawn from a population-weighted distribution, with AGI-conditional routing for high-income households (`clone_and_assign.py`).\n",
+ "3. **Source imputation** — re-impute housing and asset variables from ACS, SIPP, and SCF donor surveys (`source_impute.py` via `create_source_imputed_cps.py`).\n",
+ "\n",
+ "Because PUF cloning currently runs before geography assignment, the PUF QRF does not condition on state — both CPS and PUF halves receive geography only after doubling. `double_geography_for_puf()` exists for a planned future change where geography is assigned first, enabling state-conditional PUF imputation for richer geographic variation in tax variables.\n",
+ "\n",
+ "See `calibration_package_internals.ipynb` for how these expanded records become columns in the calibration matrix."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000003",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## Part 1: Clone creation and geography assignment\n",
+ "\n",
+ "### How cloning works\n",
+ "\n",
+ "`assign_random_geography()` in `clone_and_assign.py` is the primary entry point. It does not literally duplicate rows in a DataFrame — it produces a `GeographyAssignment` object that indexes into the expanded record space. The caller is responsible for repeating the CPS arrays N times before writing them to the output dataset.\n",
+ "\n",
+ "The function takes `n_records` (number of households in the base CPS) and `n_clones` and returns arrays of length `n_records * n_clones`. Index `i` in each array corresponds to clone `i // n_records`, record `i % n_records`.\n",
+ "\n",
+ "### The `GeographyAssignment` dataclass\n",
+ "\n",
+ "```python\n",
+ "@dataclass\n",
+ "class GeographyAssignment:\n",
+ " block_geoid: np.ndarray # 15-char census block GEOIDs\n",
+ " cd_geoid: np.ndarray # congressional district GEOIDs\n",
+ " county_fips: np.ndarray # 5-char county FIPS codes\n",
+ " state_fips: np.ndarray # 2-digit state FIPS integers\n",
+ " n_records: int\n",
+ " n_clones: int\n",
+ "```\n",
+ "\n",
+ "- `block_geoid` is the primary key; `county_fips` is the first 5 characters, `state_fips` is the first 2 characters parsed as an integer.\n",
+ "- `cd_geoid` uses the format `state_fips * 100 + district_number`. At-large districts (district 00) and DC (district 98) are normalized to district 01.\n",
+ "\n",
+ "### The no-collision constraint\n",
+ "\n",
+ "The algorithm samples blocks independently per clone, but enforces that **the same CPS record cannot land in the same congressional district in two different clones**. Without this constraint, a high-weight record in a small district could dominate the calibration target for that district across multiple clones.\n",
+ "\n",
+ "Implementation:\n",
+ "- Clone 0 draws freely.\n",
+ "- Each subsequent clone checks for collisions against all previous clones and resamples the colliding records, up to 50 retries.\n",
+ "- Residual collisions after 50 retries are accepted (very rare with large block distributions).\n",
+ "\n",
+ "### AGI-conditional geographic assignment\n",
+ "\n",
+ "When `household_agi` and `cd_agi_targets` are provided, `assign_random_geography()` uses a two-distribution sampling strategy:\n",
+ "\n",
+ "1. **Identify extreme households** — those at or above the `agi_threshold_pctile` (default 90th percentile) of household AGI.\n",
+ "2. **Build AGI-weighted block probabilities** — `_build_agi_block_probs()` multiplies population block probabilities by CD-level AGI targets: `P_agi(block) = P_pop(block) * AGI_target(CD) / Z`. This makes blocks in high-AGI districts more likely for extreme households.\n",
+ "3. **Split sampling** — extreme households draw from `P_agi`; all other households draw from the standard population-weighted `P_pop`.\n",
+ "4. **Collision resampling respects the split** — when retrying collisions, extreme households resample from `P_agi` and normal households from `P_pop`.\n",
+ "\n",
+ "**Why this matters:** Without AGI-conditional assignment, a high-AGI household could land in a low-AGI district by chance. The L0 optimizer would then zero that record's weight to match the district's low AGI target — destroying population targets in the process. By routing high-income households toward high-AGI districts, the initial placement is more compatible with what calibration needs, and the optimizer can retain these records without sacrificing other targets.\n",
+ "\n",
+ "The CD-level AGI targets are loaded from `policy_data.db` in `run_calibration()` (in `unified_calibration.py`), which queries the `targets` table for active district-level `adjusted_gross_income` targets.\n",
+ "\n",
+ "### Geography is rederived, not persisted\n",
+ "\n",
+ "The `GeographyAssignment` is held in memory and passed through function calls during a pipeline run. It is **not** serialized to disk — each worker process calls `assign_random_geography()` with the same deterministic seed, so reproducibility is guaranteed without saving state. This avoids stale `.npz` files drifting out of sync with the block distribution data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000004",
+ "metadata": {},
+ "source": [
+ "### Demonstration: geography assignment on 20 households, 3 clones\n",
+ "\n",
+ "The real `assign_random_geography()` requires `block_cd_distributions.csv.gz` in the storage folder. Here we call it directly (it will load the distribution from storage). The call below also shows the AGI-conditional parameters; we pass `None` for both since CD AGI targets require a populated `policy_data.db`, but the function signature is demonstrated."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1b2c3d4-0001-0000-0000-000000000005",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from policyengine_us_data.calibration.clone_and_assign import (\n",
+ " GeographyAssignment,\n",
+ " assign_random_geography,\n",
+ " double_geography_for_puf,\n",
+ ")\n",
+ "\n",
+ "N_RECORDS = 20 # 20 base CPS households\n",
+ "N_CLONES = 3 # 3 geographic replicas\n",
+ "# In production: N_RECORDS ~12,000, N_CLONES=430 -> ~5.2M matrix columns.\n",
+ "\n",
+ "# In production, run_calibration() computes household AGI and loads\n",
+ "# CD AGI targets from policy_data.db, then passes them here:\n",
+ "# household_agi=base_agi, # np.ndarray of per-household AGI\n",
+ "# cd_agi_targets=cd_agi_targets, # dict mapping CD GEOID str -> AGI $\n",
+ "# Here we pass None for both (falls back to uniform population-weighted sampling).\n",
+ "\n",
+ "geo = assign_random_geography(\n",
+ " n_records=N_RECORDS,\n",
+ " n_clones=N_CLONES,\n",
+ " seed=42,\n",
+ " household_agi=None,\n",
+ " cd_agi_targets=None,\n",
+ ")\n",
+ "\n",
+ "print(f\"n_records={geo.n_records}, n_clones={geo.n_clones}\")\n",
+ "print(f\"Total column positions: {geo.n_records * geo.n_clones}\")\n",
+ "print(f\"\\nFirst 4 column positions:\")\n",
+ "for i in range(4):\n",
+ " print(\n",
+ " f\" col {i}: block={geo.block_geoid[i]} cd={geo.cd_geoid[i]} state={geo.state_fips[i]}\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a1b2c3d4-0001-0000-0000-000000000006",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " record clone_0_cd clone_0_state clone_1_cd clone_1_state clone_2_cd clone_2_state\n",
+ " 0 4213 42 4206 42 2501 25\n",
+ " 1 2502 25 1804 18 4803 48\n",
+ " 2 4814 48 5309 53 3907 39\n",
+ " 3 3906 39 4829 48 1703 17\n",
+ " 4 621 6 4215 42 4802 48\n",
+ " 5 5401 54 904 9 4507 45\n",
+ " 6 4207 42 2605 26 2101 21\n",
+ " 7 4401 44 503 5 1310 13\n",
+ " 8 635 6 647 6 3714 37\n",
+ " 9 2507 25 3714 37 640 6\n",
+ " 10 1902 19 4106 41 1101 11\n",
+ " 11 5101 51 5307 53 104 1\n",
+ " 12 3623 36 1709 17 4402 44\n",
+ " 13 4707 47 1902 19 3706 37\n",
+ " 14 2504 25 2606 26 3909 39\n",
+ " 15 1212 12 902 9 4216 42\n",
+ " 16 3301 33 636 6 2602 26\n",
+ " 17 607 6 2609 26 3405 34\n",
+ " 18 4709 47 1211 12 640 6\n",
+ " 19 3618 36 3709 37 629 6\n",
+ "\n",
+ "Records with same CD in any two clones: 0 (should be 0)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Show the assignment as a table: rows = records, columns = clones\n",
+ "rows = []\n",
+ "for rec in range(N_RECORDS):\n",
+ " row = {\"record\": rec}\n",
+ " for clone in range(N_CLONES):\n",
+ " flat_idx = clone * N_RECORDS + rec\n",
+ " row[f\"clone_{clone}_cd\"] = geo.cd_geoid[flat_idx]\n",
+ " row[f\"clone_{clone}_state\"] = geo.state_fips[flat_idx]\n",
+ " rows.append(row)\n",
+ "\n",
+ "df_geo = pd.DataFrame(rows)\n",
+ "print(df_geo.to_string(index=False))\n",
+ "\n",
+ "# Verify the no-collision property: no record shares the same CD across two clones\n",
+ "cd_matrix = geo.cd_geoid.reshape(N_CLONES, N_RECORDS)\n",
+ "collision_check = np.zeros(N_RECORDS, dtype=bool)\n",
+ "for c1 in range(N_CLONES):\n",
+ " for c2 in range(c1 + 1, N_CLONES):\n",
+ " collision_check |= cd_matrix[c1] == cd_matrix[c2]\n",
+ "print(\n",
+ " f\"\\nRecords with same CD in any two clones: {collision_check.sum()} (should be 0)\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000009",
+ "metadata": {},
+ "source": [
+ "### `double_geography_for_puf()`\n",
+ "\n",
+ "After `puf_clone_dataset()` doubles the record count (one CPS half, one PUF half), the geometry must also double. `double_geography_for_puf()` iterates over each clone's slice and concatenates it with itself:\n",
+ "\n",
+ "```\n",
+ "Clone c before PUF doubling: [rec_0, rec_1, ..., rec_{n-1}]\n",
+ "Clone c after PUF doubling: [rec_0, ..., rec_{n-1}, rec_0, ..., rec_{n-1}]\n",
+ " \\________CPS half________/ \\______PUF half______/\n",
+ "```\n",
+ "\n",
+ "The CPS half and its PUF copy share the identical geographic assignment. `n_records` doubles from `n` to `2n`; `n_clones` stays the same."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "a1b2c3d4-0001-0000-0000-000000000010",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Before doubling: n_records=20\n",
+ "After doubling: n_records=40\n",
+ "Total positions before: 60\n",
+ "Total positions after: 120\n",
+ "\n",
+ "CPS and PUF halves share identical geography in all clones.\n"
+ ]
+ }
+ ],
+ "source": [
+ "geo_doubled = double_geography_for_puf(geo)\n",
+ "\n",
+ "print(f\"Before doubling: n_records={geo.n_records}\")\n",
+ "print(f\"After doubling: n_records={geo_doubled.n_records}\")\n",
+ "print(f\"Total positions before: {geo.n_records * geo.n_clones}\")\n",
+ "print(f\"Total positions after: {geo_doubled.n_records * geo_doubled.n_clones}\")\n",
+ "\n",
+ "# Invariant: for every clone, CPS half and PUF half share the same geography.\n",
+ "for c in range(N_CLONES):\n",
+ " start = c * geo_doubled.n_records\n",
+ " cps_cds = geo_doubled.cd_geoid[start : start + N_RECORDS]\n",
+ " puf_cds = geo_doubled.cd_geoid[start + N_RECORDS : start + geo_doubled.n_records]\n",
+ " assert np.array_equal(cps_cds, puf_cds), f\"Clone {c}: CPS/PUF geography mismatch\"\n",
+ "print(\"\\nCPS and PUF halves share identical geography in all clones.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000017",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## Part 2: PUF cloning and tax variable imputation*\n",
+ "\n",
+ "*Currently happens before geography assignment.\n",
+ "\n",
+ "`puf_clone_dataset()` in `puf_impute.py` doubles the record count. After this step the dataset has `2 * n_records` total person records: one CPS half and one PUF half. This runs before geography assignment, so the PUF QRF currently conditions only on demographics and income — not on state.\n",
+ "\n",
+ "### The 2-clone structure\n",
+ "\n",
+ "| Half | Records | Key properties |\n",
+ "|------|---------|----------------|\n",
+ "| CPS half | indices `0 .. n_cps - 1` | Retains original CPS values for most variables; `OVERRIDDEN_IMPUTED_VARIABLES` are re-imputed from PUF for both halves |\n",
+ "| PUF half | indices `n_cps .. 2*n_cps - 1` | All `IMPUTED_VARIABLES` replaced with PUF QRF predictions; **all weight arrays zeroed** |\n",
+ "\n",
+ "The logic inside `puf_clone_dataset()` selects the treatment for each variable:\n",
+ "\n",
+ "```python\n",
+ "if variable in OVERRIDDEN_IMPUTED_VARIABLES:\n",
+ " # Both halves get PUF predictions\n",
+ " new_data[variable] = concatenate([pred, pred])\n",
+ "elif variable in IMPUTED_VARIABLES:\n",
+ " # CPS half keeps original; PUF half gets predictions\n",
+ " new_data[variable] = concatenate([values, pred])\n",
+ "elif \"_id\" in variable:\n",
+ " # IDs must be unique: PUF IDs are offset by max(CPS IDs)\n",
+ " new_data[variable] = concatenate([values, values + values.max()])\n",
+ "elif \"_weight\" in variable:\n",
+ " # PUF half starts with zero weight\n",
+ " new_data[variable] = concatenate([values, values * 0])\n",
+ "else:\n",
+ " # Default: duplicate unchanged\n",
+ " new_data[variable] = concatenate([values, values])\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000018",
+ "metadata": {},
+ "source": [
+ "### Why the PUF half starts with zero weight\n",
+ "\n",
+ "The calibration matrix optimizer assigns weights to every record across all clones. At initialization, giving the PUF half zero weight means it contributes nothing to population totals until the optimizer decides it should. This prevents PUF records — which are imputed rather than survey-observed — from biasing the initial distribution before calibration has had a chance to learn from targets.\n",
+ "\n",
+ "The zeroing is a single line: `values * 0` applied to any variable whose name contains `\"_weight\"`. After calibration the PUF records receive positive weights wherever they improve the fit to tax-variable targets (e.g., total capital gains, interest income)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "a1b2c3d4-0001-0000-0000-000000000019",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "half household_id household_weight employment_income social_security age\n",
+ " CPS 101 1.2 45000.0 0.0 38.0\n",
+ " CPS 102 3.4 0.0 18000.0 72.0\n",
+ " CPS 103 2.1 80000.0 0.0 45.0\n",
+ " CPS 104 0.8 30000.0 12000.0 68.0\n",
+ " CPS 105 1.9 60000.0 0.0 52.0\n",
+ " PUF 206 0.0 52000.0 0.0 38.0\n",
+ " PUF 207 0.0 1000.0 18000.0 72.0\n",
+ " PUF 208 0.0 90000.0 0.0 45.0\n",
+ " PUF 209 0.0 28000.0 12000.0 68.0\n",
+ " PUF 210 0.0 71000.0 0.0 52.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Toy 2-clone structure: 5 households, a subset of variables\n",
+ "n_cps = 5\n",
+ "time_period = 2024\n",
+ "\n",
+ "toy_data = {\n",
+ " \"household_id\": {time_period: np.array([101, 102, 103, 104, 105])},\n",
+ " \"person_id\": {time_period: np.array([201, 202, 203, 204, 205])},\n",
+ " \"household_weight\": {time_period: np.array([1.2, 3.4, 2.1, 0.8, 1.9])},\n",
+ " \"employment_income\": {\n",
+ " time_period: np.array([45_000, 0, 80_000, 30_000, 60_000], dtype=np.float32)\n",
+ " },\n",
+ " \"social_security\": {\n",
+ " time_period: np.array([0, 18_000, 0, 12_000, 0], dtype=np.float32)\n",
+ " },\n",
+ " \"age\": {time_period: np.array([38, 72, 45, 68, 52], dtype=np.float32)},\n",
+ "}\n",
+ "\n",
+ "# Simulate PUF QRF predictions for IMPUTED_VARIABLES (here just employment_income)\n",
+ "puf_employment_preds = np.array(\n",
+ " [52_000, 1_000, 90_000, 28_000, 71_000], dtype=np.float32\n",
+ ")\n",
+ "\n",
+ "IMPUTED_VARIABLES_TOY = {\"employment_income\"}\n",
+ "\n",
+ "new_data = {}\n",
+ "for variable, time_dict in toy_data.items():\n",
+ " values = time_dict[time_period]\n",
+ " if variable in IMPUTED_VARIABLES_TOY:\n",
+ " pred = puf_employment_preds\n",
+ " new_data[variable] = {time_period: np.concatenate([values, pred])}\n",
+ " elif \"_id\" in variable:\n",
+ " new_data[variable] = {\n",
+ " time_period: np.concatenate([values, values + values.max()])\n",
+ " }\n",
+ " elif \"_weight\" in variable:\n",
+ " new_data[variable] = {time_period: np.concatenate([values, values * 0])}\n",
+ " else:\n",
+ " new_data[variable] = {time_period: np.concatenate([values, values])}\n",
+ "\n",
+ "summary = pd.DataFrame(\n",
+ " {\n",
+ " \"half\": [\"CPS\"] * n_cps + [\"PUF\"] * n_cps,\n",
+ " \"household_id\": new_data[\"household_id\"][time_period],\n",
+ " \"household_weight\": new_data[\"household_weight\"][time_period],\n",
+ " \"employment_income\": new_data[\"employment_income\"][time_period],\n",
+ " \"social_security\": new_data[\"social_security\"][time_period],\n",
+ " \"age\": new_data[\"age\"][time_period],\n",
+ " }\n",
+ ")\n",
+ "print(summary.to_string(index=False))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000020",
+ "metadata": {},
+ "source": [
+ "The CPS half retains its original `employment_income`; the PUF half receives QRF-imputed values. Household IDs are offset in the PUF half by the maximum CPS ID to prevent collisions (`household_id` goes from 101–105 to 206–210). Weights are 0.0 across the entire PUF half."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000021",
+ "metadata": {},
+ "source": [
+ "### The 70+ tax variables imputed from PUF\n",
+ "\n",
+ "`IMPUTED_VARIABLES` contains 70 variables covering the main tax return line items. `OVERRIDDEN_IMPUTED_VARIABLES` (a subset of 44 variables) are imputed for **both** CPS and PUF halves — these are variables where the CPS estimate is considered unreliable enough that the PUF imputation is preferred even for records that stay in the CPS half.\n",
+ "\n",
+ "A representative selection from `IMPUTED_VARIABLES`:\n",
+ "- Income: `employment_income`, `self_employment_income`, `partnership_s_corp_income`, `rental_income`, `farm_income`, `estate_income`, `alimony_income`\n",
+ "- Capital: `long_term_capital_gains`, `short_term_capital_gains`, `qualified_dividend_income`, `non_qualified_dividend_income`, `taxable_interest_income`, `tax_exempt_interest_income`\n",
+ "- Deductions: `charitable_cash_donations`, `charitable_non_cash_donations`, `deductible_mortgage_interest`, `student_loan_interest`, `health_savings_account_ald`\n",
+ "- Social Security: `social_security` (total; sub-components reconciled separately)\n",
+ "- Retirement: `taxable_ira_distributions`, `taxable_pension_income`, `tax_exempt_pension_income`\n",
+ "- Credits: `foreign_tax_credit`, `american_opportunity_credit`, `savers_credit`, `general_business_credit`\n",
+ "- Qualified business income: `w2_wages_from_qualified_business`, `unadjusted_basis_qualified_property`, `qualified_reit_and_ptp_income`, `qualified_bdc_income`\n",
+ "\n",
+ "### Stratified subsampling of PUF training data\n",
+ "\n",
+ "`_stratified_subsample_index()` in `_run_qrf_imputation()` selects training records from the PUF:\n",
+ "- Keeps **all** records with AGI at or above the 99.5th percentile (top 0.5%, `PUF_TOP_PERCENTILE = 99.5`)\n",
+ "- Randomly samples the remainder to reach a target of 20,000 total (`PUF_SUBSAMPLE_TARGET = 20_000`)\n",
+ "\n",
+ "This preserves the extreme tail of the AGI distribution in the training set, which matters for rare but high-impact income types (capital gains, partnership income)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000022",
+ "metadata": {},
+ "source": [
+ "### Sequential QRF: preserving the joint distribution\n",
+ "\n",
+ "`_sequential_qrf()` calls `microimpute.QRF.fit_predict()`, which imputes each variable **conditioned on all previously imputed variables** in the `output_vars` list order. This preserves the joint distribution across the 70 tax variables — for example, records with high `employment_income` will tend to receive plausible values for `pre_tax_contributions` because the model sees both."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "a1b2c3d4-0001-0000-0000-000000000023",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " age employment_income capital_gains dividends\n",
+ " 30 40000 3143.0 1168.0\n",
+ " 45 80000 3564.0 767.0\n",
+ " 60 20000 1584.0 770.0\n",
+ " 35 55000 1713.0 488.0\n",
+ " 55 95000 2673.0 881.0\n",
+ " 70 5000 1497.0 1035.0\n",
+ "\n",
+ "Donor corr(capital_gains, dividends): 0.941\n",
+ "Imputed corr(capital_gains, dividends): 0.281\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "\n",
+ "# Minimal illustration of sequential conditioning:\n",
+ "# Impute capital_gains conditioned on employment_income,\n",
+ "# then impute dividends conditioned on both.\n",
+ "\n",
+ "rng = np.random.default_rng(1)\n",
+ "\n",
+ "# PUF-like donor: employment correlated with capital gains; gains correlated with dividends\n",
+ "n_donor = 300\n",
+ "emp = rng.exponential(50_000, n_donor)\n",
+ "age = rng.integers(25, 80, n_donor).astype(float)\n",
+ "cg = np.maximum(0, 0.05 * emp + rng.normal(0, 5_000, n_donor))\n",
+ "div = np.maximum(0, 0.3 * cg + rng.normal(0, 500, n_donor))\n",
+ "\n",
+ "donor = pd.DataFrame(\n",
+ " {\"age\": age, \"employment_income\": emp, \"capital_gains\": cg, \"dividends\": div}\n",
+ ")\n",
+ "\n",
+ "# CPS receiver: only demographics known initially\n",
+ "n_recv = 6\n",
+ "recv = pd.DataFrame(\n",
+ " {\n",
+ " \"age\": [30, 45, 60, 35, 55, 70],\n",
+ " \"employment_income\": [40_000, 80_000, 20_000, 55_000, 95_000, 5_000],\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "# Step 1: impute capital_gains from demographics alone\n",
+ "rf1 = RandomForestRegressor(n_estimators=50, random_state=0)\n",
+ "rf1.fit(donor[[\"age\", \"employment_income\"]], donor[\"capital_gains\"])\n",
+ "recv[\"capital_gains\"] = rf1.predict(recv[[\"age\", \"employment_income\"]]).round(0)\n",
+ "\n",
+ "# Step 2: impute dividends conditioning on capital_gains (now available)\n",
+ "rf2 = RandomForestRegressor(n_estimators=50, random_state=0)\n",
+ "rf2.fit(donor[[\"age\", \"employment_income\", \"capital_gains\"]], donor[\"dividends\"])\n",
+ "recv[\"dividends\"] = rf2.predict(\n",
+ " recv[[\"age\", \"employment_income\", \"capital_gains\"]]\n",
+ ").round(0)\n",
+ "\n",
+ "print(recv.to_string(index=False))\n",
+ "print(\n",
+ " f\"\\nDonor corr(capital_gains, dividends): {donor['capital_gains'].corr(donor['dividends']):.3f}\"\n",
+ ")\n",
+ "print(\n",
+ " f\"Imputed corr(capital_gains, dividends): {recv['capital_gains'].corr(recv['dividends']):.3f}\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000024",
+ "metadata": {},
+ "source": [
+ "By conditioning each variable on those already imputed, the sequential approach carries donor correlations into the imputed values. Note that the mean-predicting random forest in this toy attenuates the correlation (0.281 imputed vs 0.941 in the donor); the production QRF samples from conditional quantile distributions rather than predicting means, which preserves the donor joint distribution far better."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000025",
+ "metadata": {},
+ "source": [
+ "### Social Security sub-component reconciliation\n",
+ "\n",
+ "The PUF records a total `social_security` amount but not its breakdown by reason code. `reconcile_ss_subcomponents()` derives the four sub-components for every PUF record with positive SS:\n",
+ "\n",
+ "```\n",
+ "SS_SUBCOMPONENTS = [\n",
+ " \"social_security_retirement\",\n",
+ " \"social_security_disability\",\n",
+ " \"social_security_survivors\",\n",
+ " \"social_security_dependents\",\n",
+ "]\n",
+ "```\n",
+ "\n",
+ "**Step 1 — QRF share prediction:** `_qrf_ss_shares()` trains on CPS records where all four sub-components are known. It converts each sub-component to a share of the total (`sub_value / ss_total`), trains a QRF on these shares using `SS_SPLIT_PREDICTORS` (`age`, `is_male`, `tax_unit_is_joint`, `is_tax_unit_head`, `is_tax_unit_dependent`), and predicts shares for PUF records. Predicted shares are clipped to [0, 1] and renormalized to sum to 1.\n",
+ "\n",
+ "**Step 2 — Age heuristic fallback:** If the QRF fails (fewer than 100 CPS training records or a runtime exception), `_age_heuristic_ss_shares()` assigns:\n",
+ "- Age ≥ 62 (`MINIMUM_RETIREMENT_AGE`): 100% retirement\n",
+ "- Age < 62: 100% disability\n",
+ "- If age is unavailable: 100% retirement for all\n",
+ "\n",
+ "**Step 3 — Scale to imputed total:** Each sub-component is set to `ss_total * share`. PUF records with `social_security == 0` get all sub-components set to 0.\n",
+ "\n",
+ "The function modifies `data` in place, only touching the PUF half (indices `n_cps .. 2*n_cps`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "a1b2c3d4-0001-0000-0000-000000000026",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " age ss_total ss_retirement ss_disability ss_survivors ss_dependents check_sum\n",
+ " 70 24000 21600.0 1200.0 720.0 480.0 24000.0\n",
+ " 55 14400 0.0 13680.0 432.0 288.0 14400.0\n",
+ " 68 9600 8160.0 960.0 288.0 192.0 9600.0\n",
+ " 45 18000 0.0 16200.0 900.0 900.0 18000.0\n",
+ "\n",
+ "All check sums match ss_total: True\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Toy SS reconciliation: 4 PUF records with positive SS\n",
+ "# Predicted shares come from the QRF (here we supply them directly).\n",
+ "\n",
+ "puf_ss_total = np.array([24_000, 14_400, 9_600, 18_000]) # imputed totals\n",
+ "ages = np.array([70, 55, 68, 45]) # ages for heuristic\n",
+ "\n",
+ "# Suppose QRF predicted these normalized shares for each record\n",
+ "qrf_shares = {\n",
+ " \"social_security_retirement\": np.array([0.90, 0.00, 0.85, 0.00]),\n",
+ " \"social_security_disability\": np.array([0.05, 0.95, 0.10, 0.90]),\n",
+ " \"social_security_survivors\": np.array([0.03, 0.03, 0.03, 0.05]),\n",
+ " \"social_security_dependents\": np.array([0.02, 0.02, 0.02, 0.05]),\n",
+ "}\n",
+ "\n",
+ "# Renormalize (shares already sum to ~1 here; shown for completeness)\n",
+ "total_shares = sum(qrf_shares.values())\n",
+ "for k in qrf_shares:\n",
+ " qrf_shares[k] = np.where(total_shares > 0, qrf_shares[k] / total_shares, 0.0)\n",
+ "\n",
+ "# Scale to imputed total\n",
+ "result = pd.DataFrame({\"age\": ages, \"ss_total\": puf_ss_total})\n",
+ "for sub, shares in qrf_shares.items():\n",
+ " result[sub.replace(\"social_security_\", \"ss_\")] = (puf_ss_total * shares).round(0)\n",
+ "\n",
+ "result[\"check_sum\"] = (\n",
+ " result[\"ss_retirement\"]\n",
+ " + result[\"ss_disability\"]\n",
+ " + result[\"ss_survivors\"]\n",
+ " + result[\"ss_dependents\"]\n",
+ ")\n",
+ "print(result.to_string(index=False))\n",
+ "print(\n",
+ " \"\\nAll check sums match ss_total:\",\n",
+ " np.allclose(result[\"check_sum\"], result[\"ss_total\"], atol=1),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000027",
+ "metadata": {},
+ "source": [
+ "### Age heuristic fallback: numeric example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "a1b2c3d4-0001-0000-0000-000000000028",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " age ss_total is_retirement_age ss_retirement ss_disability ss_survivors ss_dependents\n",
+ " 70 24000 True 24000.0 0.0 0.0 0.0\n",
+ " 55 14400 False 0.0 14400.0 0.0 0.0\n",
+ " 68 9600 True 9600.0 0.0 0.0 0.0\n",
+ " 45 18000 False 0.0 18000.0 0.0 0.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "MINIMUM_RETIREMENT_AGE = 62\n",
+ "\n",
+ "ages = np.array([70, 55, 68, 45])\n",
+ "puf_ss_total = np.array([24_000, 14_400, 9_600, 18_000])\n",
+ "\n",
+ "is_old = ages >= MINIMUM_RETIREMENT_AGE\n",
+ "\n",
+ "heuristic_shares = {\n",
+ " \"social_security_retirement\": is_old.astype(float),\n",
+ " \"social_security_disability\": (~is_old).astype(float),\n",
+ " \"social_security_survivors\": np.zeros(len(ages)),\n",
+ " \"social_security_dependents\": np.zeros(len(ages)),\n",
+ "}\n",
+ "\n",
+ "heuristic_result = pd.DataFrame(\n",
+ " {\"age\": ages, \"ss_total\": puf_ss_total, \"is_retirement_age\": is_old}\n",
+ ")\n",
+ "for sub, shares in heuristic_shares.items():\n",
+ " heuristic_result[sub.replace(\"social_security_\", \"ss_\")] = (\n",
+ " puf_ss_total * shares\n",
+ " ).round(0)\n",
+ "\n",
+ "print(heuristic_result.to_string(index=False))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000029",
+ "metadata": {},
+ "source": [
+ "When the age heuristic fallback applies (i.e. when the QRF path fails), the two youngest recipients (ages 55 and 45) have their entire SS imputed as disability; the two older recipients get it all as retirement. Survivors and dependents receive nothing under the heuristic."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000030",
+ "metadata": {},
+ "source": [
+ "### Additional PUF-half imputations\n",
+ "\n",
+ "Beyond the 70 IMPUTED_VARIABLES, two further QRF passes run on the PUF half:\n",
+ "\n",
+ "**`_impute_weeks_unemployed()`** — Trains on CPS records where `weeks_unemployed` is observed. The test predictor set uses PUF-imputed `taxable_unemployment_compensation` as an additional feature (when available), ensuring the imputed weeks are consistent with the PUF unemployment income values. Predictions are clipped to [0, 52]; records with zero unemployment compensation receive zero weeks.\n",
+ "\n",
+ "**`_impute_retirement_contributions()`** — Imputes `traditional_401k_contributions`, `roth_401k_contributions`, `traditional_ira_contributions`, `roth_ira_contributions`, and `self_employed_pension_contributions` for the PUF half. The test data uses CPS demographic predictors (`RETIREMENT_DEMOGRAPHIC_PREDICTORS`) combined with PUF-imputed income (`RETIREMENT_INCOME_PREDICTORS`). After prediction, year-specific IRS contribution limits are applied:\n",
+ "- 401k: capped at the annual limit + catch-up allowance for age ≥ 50\n",
+ "- IRA: capped at the annual IRA limit + catch-up for age ≥ 50\n",
+ "- SE pension: capped at `min(25% of SE income, dollar limit)` from `imputation_parameters.yaml`\n",
+ "- All contributions zeroed for records without the corresponding income type"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000031",
+ "metadata": {},
+ "source": [
+ "### End-to-end record count accounting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "a1b2c3d4-0001-0000-0000-000000000032",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Stage Household records Primary function\n",
+ " Base CPS 100000 -\n",
+ " After clone x10 1000000 assign_random_geography()\n",
+ " After PUF doubling 2000000 puf_clone_dataset()\n",
+ "After double_geography 2000000 double_geography_for_puf()\n",
+ "\n",
+ "Final n_records in GeographyAssignment: 200000\n",
+ "Final n_clones in GeographyAssignment: 10\n",
+ "Total flat length of each geo array: 2,000,000\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "n_base_cps = 100_000 # typical household count in base CPS\n",
+ "n_clones = 10\n",
+ "\n",
+ "stages = [\n",
+ " (\"Base CPS\", n_base_cps, \"-\"),\n",
+ " (\"After clone x10\", n_base_cps * n_clones, \"assign_random_geography()\"),\n",
+ " (\"After PUF doubling\", n_base_cps * n_clones * 2, \"puf_clone_dataset()\"),\n",
+ " (\"After double_geography\", n_base_cps * n_clones * 2, \"double_geography_for_puf()\"),\n",
+ "]\n",
+ "\n",
+ "df_stages = pd.DataFrame(\n",
+ " stages, columns=[\"Stage\", \"Household records\", \"Primary function\"]\n",
+ ")\n",
+ "print(df_stages.to_string(index=False))\n",
+ "print(f\"\\nFinal n_records in GeographyAssignment: {n_base_cps * 2}\")\n",
+ "print(f\"Final n_clones in GeographyAssignment: {n_clones}\")\n",
+ "print(f\"Total flat length of each geo array: {n_base_cps * 2 * n_clones:,}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000033",
+ "metadata": {},
+ "source": [
+ "### Summary of variable treatment in `puf_clone_dataset()`\n",
+ "\n",
+ "| Variable class | Examples | CPS half value | PUF half value |\n",
+ "|----------------|----------|---------------|----------------|\n",
+ "| `IMPUTED_VARIABLES` | `employment_income`, `social_security`, capital gains | Original CPS value | QRF prediction from PUF |\n",
+ "| `OVERRIDDEN_IMPUTED_VARIABLES` | `partnership_s_corp_income`, `deductible_mortgage_interest` | PUF QRF prediction | PUF QRF prediction |\n",
+ "| ID variables (`*_id`) | `household_id`, `person_id` | Original | Offset by `max(CPS IDs)` |\n",
+ "| Weight variables (`*_weight`) | `household_weight` | Original | **0.0** |\n",
+ "| `SS_SUBCOMPONENTS` | `social_security_retirement` | Original CPS split | Reconciled from imputed total |\n",
+ "| `CPS_RETIREMENT_VARIABLES` | `traditional_401k_contributions` | Original CPS value | QRF from CPS trained on retirement income |\n",
+ "| `weeks_unemployed` | — | Original CPS value | QRF conditioned on PUF unemployment comp |\n",
+ "| All other variables | `age`, `rent`, `state_fips` | Original | Duplicated unchanged |\n",
+ "\n",
+ "---\n",
+ "\n",
+ "See `calibration_package_internals.ipynb` for how these expanded, imputed records are assembled into the calibration matrix and how weights are optimized across all clones."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000011",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## Part 3: Source variable imputation (ACS/SIPP/SCF)\n",
+ "\n",
+ "After PUF cloning and geography assignment, `impute_source_variables()` in `source_impute.py` re-imputes 9 variables that the CPS measures poorly. All three donor surveys use Quantile Random Forest (QRF) from the `microimpute` package. Because geography is already assigned at this point, the ACS QRF can condition on `state_fips`.\n",
+ "\n",
+ "### Variables and donor surveys\n",
+ "\n",
+ "| Survey | Variables imputed | State predictor? |\n",
+ "|--------|------------------|------------------|\n",
+ "| ACS 2022 | `rent`, `real_estate_taxes` | **Yes** (`state_fips`) |\n",
+ "| SIPP 2023 | `tip_income`, `bank_account_assets`, `stock_assets`, `bond_assets` | No |\n",
+ "| SCF 2022 | `net_worth`, `auto_loan_balance`, `auto_loan_interest` | No |\n",
+ "\n",
+ "### Why only ACS includes a state predictor\n",
+ "\n",
+ "The ACS is a large household survey with published state identifiers, so the donor file includes `state_fips` for every record. The QRF trained on ACS data therefore learns state-level differences in rent levels and property tax burdens, and propagates those differences to the CPS clones (which now carry `state_fips` from the geography assignment step).\n",
+ "\n",
+ "SIPP and SCF do not publish state identifiers in their public-use files. Their QRF imputations condition only on demographic and financial predictors, so the imputed values vary across individuals but not across states. This is a deliberate simplification: the pipeline accepts state-blind asset imputations because assets are less geographically concentrated than housing costs.\n",
+ "\n",
+ "### Predictor sets\n",
+ "\n",
+ "**ACS predictors** (+ `state_fips`):\n",
+ "`is_household_head`, `age`, `is_male`, `tenure_type`, `employment_income`, `self_employment_income`, `social_security`, `pension_income`, `household_size`\n",
+ "\n",
+ "**SIPP tip predictors:**\n",
+ "`employment_income`, `age`, `count_under_18`, `count_under_6`\n",
+ "\n",
+ "**SIPP asset predictors:**\n",
+ "`employment_income`, `age`, `is_female`, `is_married`, `count_under_18`\n",
+ "\n",
+ "**SCF predictors:**\n",
+ "`age`, `is_female`, `cps_race`, `is_married`, `own_children_in_household`, `employment_income`, `interest_dividend_income`, `social_security_pension_income`\n",
+ "\n",
+ "### Training sample sizes\n",
+ "\n",
+ "- ACS: 10,000 household heads sampled from ACS 2022\n",
+ "- SIPP tips: up to 10,000 records sampled with probability proportional to `WPFINWGT`\n",
+ "- SIPP assets: up to 20,000 records sampled with probability proportional to `WPFINWGT`\n",
+ "- SCF: 50% random sample of SCF 2022"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000012",
+ "metadata": {},
+ "source": [
+ "### Toy QRF imputation: state-aware vs state-blind\n",
+ "\n",
+ "The following cell builds a minimal synthetic example showing how including `state_fips` in the predictor set causes the QRF to produce state-differentiated predictions, while omitting it collapses predictions to demographic-only variation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "a1b2c3d4-0001-0000-0000-000000000013",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "state_label age income pred_state_aware pred_state_blind\n",
+ " AL 35.0 30000 282.0 1151.0\n",
+ " AL 45.0 60000 599.0 1200.0\n",
+ " AL 55.0 80000 590.0 692.0\n",
+ " AL 30.0 25000 259.0 905.0\n",
+ " CA 35.0 30000 1358.0 1151.0\n",
+ " CA 45.0 60000 1618.0 1200.0\n",
+ " CA 55.0 80000 1524.0 692.0\n",
+ " CA 30.0 25000 1213.0 905.0\n",
+ " NY 35.0 30000 1112.0 1151.0\n",
+ " NY 45.0 60000 1199.0 1200.0\n",
+ " NY 55.0 80000 1270.0 692.0\n",
+ " NY 30.0 25000 1147.0 905.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "\n",
+ "rng = np.random.default_rng(0)\n",
+ "\n",
+ "# Synthetic donor (ACS-like): 200 records across 4 states.\n",
+ "# Rent in CA (state 6) is structurally higher than in AL (state 1).\n",
+ "n_donor = 200\n",
+ "states_donor = rng.choice([1, 6, 12, 36], size=n_donor) # AL, CA, FL, NY\n",
+ "age_donor = rng.integers(25, 75, size=n_donor).astype(float)\n",
+ "income_donor = rng.exponential(40_000, size=n_donor)\n",
+ "state_premium = np.where(\n",
+ " states_donor == 6,\n",
+ " 1200, # CA\n",
+ " np.where(\n",
+ " states_donor == 36,\n",
+ " 900, # NY\n",
+ " np.where(\n",
+ " states_donor == 12,\n",
+ " 400, # FL\n",
+ " 200,\n",
+ " ),\n",
+ " ),\n",
+ ") # AL\n",
+ "rent_donor = state_premium + 0.005 * income_donor + rng.normal(0, 100, n_donor)\n",
+ "rent_donor = np.clip(rent_donor, 0, None)\n",
+ "\n",
+ "donor_df = pd.DataFrame(\n",
+ " {\n",
+ " \"age\": age_donor,\n",
+ " \"income\": income_donor,\n",
+ " \"state_fips\": states_donor.astype(float),\n",
+ " \"rent\": rent_donor,\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "# Synthetic receiver: 12 CPS records, 4 per state\n",
+ "receiver_df = pd.DataFrame(\n",
+ " {\n",
+ " \"age\": [35.0, 45.0, 55.0, 30.0] * 3,\n",
+ " \"income\": [30_000, 60_000, 80_000, 25_000] * 3,\n",
+ " \"state_fips\": [1.0] * 4 + [6.0] * 4 + [36.0] * 4,\n",
+ " \"state_label\": [\"AL\"] * 4 + [\"CA\"] * 4 + [\"NY\"] * 4,\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "# State-AWARE model\n",
+ "rf_aware = RandomForestRegressor(n_estimators=100, random_state=0)\n",
+ "rf_aware.fit(donor_df[[\"age\", \"income\", \"state_fips\"]], donor_df[\"rent\"])\n",
+ "receiver_df[\"pred_state_aware\"] = rf_aware.predict(\n",
+ " receiver_df[[\"age\", \"income\", \"state_fips\"]]\n",
+ ").round(0)\n",
+ "\n",
+ "# State-BLIND model (same demographics, no state_fips)\n",
+ "rf_blind = RandomForestRegressor(n_estimators=100, random_state=0)\n",
+ "rf_blind.fit(donor_df[[\"age\", \"income\"]], donor_df[\"rent\"])\n",
+ "receiver_df[\"pred_state_blind\"] = rf_blind.predict(\n",
+ " receiver_df[[\"age\", \"income\"]]\n",
+ ").round(0)\n",
+ "\n",
+ "print(\n",
+ " receiver_df[\n",
+ " [\"state_label\", \"age\", \"income\", \"pred_state_aware\", \"pred_state_blind\"]\n",
+ " ].to_string(index=False)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4-0001-0000-0000-000000000014",
+ "metadata": {},
+ "source": [
+ "The state-aware predictions separate AL from CA and NY households with the same age and income. The state-blind predictions are identical for those records — any geographic variation in rent then depends entirely on which clones happen to overlap with which calibration targets, rather than on the underlying rent distribution."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "96822918",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "After Part 3, the base datasets are ready. Sometimes these are used by themselves, for microsimulation use or for other branches of the data pipeline. However, their biggest use is as a base for national and local area calibration. See `calibration_package_internals.ipynb` for an explanation of how the calibration problem is set up to build on them."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "pe3.13 (3.13.0)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/internals/optimization_and_local_dataset_assembly_internals.ipynb b/docs/internals/optimization_and_local_dataset_assembly_internals.ipynb
new file mode 100644
index 000000000..e6ba5bb2f
--- /dev/null
+++ b/docs/internals/optimization_and_local_dataset_assembly_internals.ipynb
@@ -0,0 +1,1144 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7fb27b941602401d91542211134fc71a",
+ "metadata": {},
+ "source": [
+ "# Weight optimization and local dataset assembly internals\n",
+ "\n",
+ "Internal reference for stages 3–4 of the calibration pipeline: L0 weight optimization, weight expansion, and H5 assembly.\n",
+ "\n",
+ "**Requires:** calibration package (`.pkl`) and optionally diagnostic CSVs from a completed run.\n",
+ "\n",
+ "**Pipeline stages recap:**\n",
+ "1. Load CPS dataset → get n_records \n",
+ "2. Clone ×N, assign random geography (census block) \n",
+ "3. Build sparse calibration matrix (clone-by-clone) \n",
+ "4. **L0-regularized optimization → calibrated weights** ← Part 1 \n",
+ "5. **Expand weights → per-clone records → H5 file** ← Part 2 \n",
+ "6. **Read diagnostic CSVs to assess convergence** ← Part 3\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "acae54e37e7d407bbb7b55eff062a284",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## Part 1: L0 weight optimization\n",
+ "\n",
+ "This is the core algorithmic innovation of the pipeline. The optimizer simultaneously calibrates a weight vector to match target statistics **and** drives most weights to exactly zero via Hard Concrete gates — producing a sparse, interpretable output.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9a63283cbaf04dbcab1f6479b197f3a8",
+ "metadata": {},
+ "source": [
+ "### 1.1 The loss function\n",
+ "\n",
+ "The total loss combines a **calibration term** and an **L0 sparsity penalty**:\n",
+ "\n",
+ "$$\\mathcal{L} = \\underbrace{\\sum_j \\left(\\frac{\\hat{y}_j - y_j}{y_j}\\right)^2}_{\\text{relative calibration loss}} + \\lambda_{L0} \\underbrace{\\sum_i P(z_i > 0)}_{\\text{L0 penalty}} + \\lambda_{L2} \\|w\\|^2$$\n",
+ "\n",
+ "where:\n",
+ "- $\\hat{y}_j = \\sum_i X_{ji} w_i z_i$ is the weighted estimate for target $j$\n",
+ "- $X$ is the sparse calibration matrix (targets × records)\n",
+ "- $w_i$ is the continuous weight for record $i$\n",
+ "- $z_i \\in [0, 1]$ is the Hard Concrete gate for record $i$\n",
+ "- $\\lambda_{L0}$ controls the sparsity pressure (see presets below)\n",
+ "- $\\lambda_{L2} = 10^{-12}$ provides mild weight regularization\n",
+ "\n",
+ "The calibration loss uses **relative** errors (`loss_type=\"relative\"` in `model.fit()`), so a 1% miss on a small target counts the same as a 1% miss on a large target. This prevents large-population targets from dominating.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8dd0d8092fe74a7c96281538738b07e2",
+ "metadata": {},
+ "source": [
+ "### 1.2 Hard Concrete gates — how they work\n",
+ "\n",
+ "Hard Concrete (Louizos et al., 2018) approximates the discrete L0 count $\\sum_i \\mathbf{1}[w_i \\neq 0]$ with a differentiable surrogate. Each record $i$ has a learned logit $\\alpha_i$ (`qz_logits`). During training the gate is sampled:\n",
+ "\n",
+ "$$u \\sim \\text{Uniform}(\\epsilon, 1-\\epsilon)$$\n",
+ "$$s = \\sigma\\!\\left(\\frac{\\log u - \\log(1-u) + \\alpha_i}{\\beta}\\right)$$\n",
+ "$$\\bar{s} = s \\cdot (\\zeta - \\gamma) + \\gamma$$\n",
+ "$$z_i = \\text{clip}(\\bar{s},\\, 0,\\, 1)$$\n",
+ "\n",
+ "The `clip` maps the stretched sigmoid onto $[0, 1]$, placing probability mass at exactly 0 and exactly 1. At inference (`model.eval()`), the stochastic sample is replaced by the deterministic mean:\n",
+ "\n",
+ "$$z_i^{\\text{det}} = \\text{clip}\\!\\left(\\sigma(\\alpha_i)(\\zeta - \\gamma) + \\gamma,\\; 0,\\; 1\\right)$$\n",
+ "\n",
+ "The expected number of nonzero gates — used as the L0 penalty — is:\n",
+ "\n",
+ "$$P(z_i > 0) = \\sigma\\!\\left(\\alpha_i - \\beta \\log\\frac{-\\gamma}{\\zeta}\\right)$$\n",
+ "\n",
+ "This is computed in `HardConcrete.get_penalty()` in `utils/l0.py`.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "72eea5119410473aa328ad9291626812",
+ "metadata": {},
+ "source": [
+ "### 1.3 Hyperparameter values (source: `unified_calibration.py`)\n",
+ "\n",
+ "| Parameter | Name in code | Value | Role |\n",
+ "|-----------|-------------|-------|------|\n",
+ "| $\\beta$ | `BETA` | 0.35 | Gate temperature — lower = sharper 0/1 transition |\n",
+ "| $\\gamma$ | `GAMMA` | −0.1 | Left stretch of the sigmoid (enables exact-zero gates) |\n",
+ "| $\\zeta$ | `ZETA` | 1.1 | Right stretch (enables exact-one gates) |\n",
+ "| Initial keep probability | `INIT_KEEP_PROB` | 0.999 | All records start nearly fully active |\n",
+ "| Weight jitter SD | `LOG_WEIGHT_JITTER_SD` | 0.05 | Log-space noise on weights at init |\n",
+ "| Logit jitter SD | `LOG_ALPHA_JITTER_SD` | 0.01 | Log-space noise on gate logits at init |\n",
+ "| $\\lambda_{L2}$ | `LAMBDA_L2` | $10^{-12}$ | Weight decay |\n",
+ "| Learning rate | `LEARNING_RATE` | 0.15 | Adam optimizer step size |\n",
+ "| Default epochs | `DEFAULT_EPOCHS` | 100 | Training iterations |\n",
+ "| Default clones | `DEFAULT_N_CLONES` | 430 | CPS copies before optimization |\n",
+ "\n",
+ "$\\gamma = -0.1$ and $\\zeta = 1.1$ are the standard \"stretch\" values from the original Hard Concrete paper. They place $\\approx 9\\%$ of the sigmoid's mass below 0 and $\\approx 9\\%$ above 1, which is what allows `clip` to produce exact zeros and ones.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8edb47106e1a46a883d545849b8ab81b",
+ "metadata": {},
+ "source": [
+ "### 1.4 Jitter and initialization\n",
+ "\n",
+ "The `SparseCalibrationWeights` model is initialized with:\n",
+ "- **`init_keep_prob = 0.999`**: gate logits are set so $P(z_i > 0) \\approx 99.9\\%$. Nearly every record starts active, so the optimizer begins from a well-calibrated (dense) starting point and only prunes as the L0 penalty accumulates.\n",
+ "- **`log_weight_jitter_sd = 0.05`**: small Gaussian noise in log-weight space breaks symmetry between duplicate CPS records.\n",
+ "- **`log_alpha_jitter_sd = 0.01`**: small Gaussian noise on gate logits similarly breaks symmetry.\n",
+ "\n",
+ "When `initial_weights` is not supplied externally, `compute_initial_weights()` derives them from age-target rows in `targets_df`, giving the optimizer a demographically grounded starting point rather than uniform weights.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10185d26023b46108eb7d9f57d49d2b3",
+ "metadata": {},
+ "source": [
+ "### 1.5 Preset selection: `local` vs `national`\n",
+ "\n",
+ "```python\n",
+ "# From unified_calibration.py\n",
+ "PRESETS = {\n",
+ " \"local\": 1e-8, # ~3–4 M retained records\n",
+ " \"national\": 1e-4, # ~50 K retained records\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "The only difference between the two presets is $\\lambda_{L0}$:\n",
+ "\n",
+ "| Preset | $\\lambda_{L0}$ | Retained records | Use case |\n",
+ "|--------|--------------|-----------------|----------|\n",
+ "| `local` | $10^{-8}$ | ~3–4 M | Local area H5 datasets (state, district, city) |\n",
+ "| `national` | $10^{-4}$ | ~50 K | Web app national dataset |\n",
+ "\n",
+ "A larger $\\lambda_{L0}$ increases the gradient signal pushing gate logits below zero, so more records are pruned. The `local` preset applies almost no sparsity pressure — it retains geographic resolution at the cost of a larger output file. The `national` preset aggressively prunes, producing a compact dataset suitable for in-browser simulation.\n",
+ "\n",
+ "To override a preset, pass `--lambda-l0` directly:\n",
+ "\n",
+ "```bash\n",
+ "python -m policyengine_us_data.calibration.unified_calibration \\\n",
+ " --dataset cps_2024.h5 \\\n",
+ " --preset local \\\n",
+ " --lambda-l0 1e-6 \\\n",
+ " --epochs 100\n",
+ "```\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8763a12b2bbd4a93a75aff182afb95dc",
+ "metadata": {},
+ "source": [
+ "### 1.6 Toy example: Hard Concrete gate behavior\n",
+ "\n",
+ "The cell below creates a minimal `HardConcrete` gate, samples it at various logit values, and shows how `get_penalty()` relates to $P(z > 0)$. This uses only the `HardConcrete` class from `utils/l0.py` — no calibration matrix needed.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "7623eae2785240b9bd12b16a66d81610",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAA90AAAGGCAYAAABmGOKbAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjYsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvq6yFwwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAgvxJREFUeJzt3Qd4U9X7B/Bv926htJS9yt57K6DIUEEUERAEUVCWouhfwZ+CigoOEEQBBQEVENyAgKhsZEPZe89Cy2ihpTP5P+8JKWlJ27RNejO+n+dJ781txsnJzT33vWe56fV6PYiIiIiIiIjI6tyt/5JERERERERExKCbiIiIiIiIyIZY001ERERERERkIwy6iYiIiIiIiGyEQTcRERERERGRjTDoJiIiIiIiIrIRBt1ERERERERENsKgm4iIiIiIiMhGGHQTERERERER2QiDbqIcuLm54d1332UemXj22WdRoUKFjPunT59W+fTZZ58VSj7J9yHvR86x/xARFfTYYO1ygccp63C2c6iC7mfbt29Hy5YtERAQoF5n9+7dVk0f2TcG3XSPuXPnqoPBjh07cs2db7/9FjVq1ICvry+qVKmCqVOnOnWObtq0SR10b9y4AUeXmJioPsvatWthb+w5bUREti5/jTcpW0uVKoWOHTviiy++wM2bN/P92gcPHlTHVblQSsDFixdVfjha4MPy0XoK8zeRmpqKHj164Nq1a/j888/xww8/oHz58jZ/X7IfDLop377++msMHDgQtWrVUsF2ixYt8PLLL+Pjjz926qD7vffec5qgWz5LXgPbmTNn4siRI9AqbW+//TZu375t0/cnItLS+++/r07Kp0+fjpdeeklte+WVV1CnTh3s3bs33wGGHFftNeguSNmSn3JBgm7JD3NBd2GUc4VddlPBfxMFOf84ceIEzpw5g9dffx0vvPAC+vbti6JFi/JrcSGeWieAHJMcdP73v//hkUcewS+//KK2DRo0CDqdDuPGjVMHFB5MnEtCQoJqEuXl5aVpOjw9PdXN2fPZUklJSfD29oa7u31cQ7W39BA5os6dO6Nx48YZ90ePHo3Vq1fj0UcfRdeuXXHo0CH4+fnBEY9Z2SlI2WLtckHrco7sU0H2sytXrqhlkSJF7O63R4WDZ0WUL2vWrMHVq1cxdOjQTNuHDRumDgLLli2zqF/M4cOH8dRTTyE4OBjFihXDiBEj1El7VvPmzUOjRo3USUZoaCh69eqFc+fOZXpM27ZtUbt2bXXlsl27dvD390fp0qXxySefZHpcSkoKxowZo14vJCREHbDuu+8+9ZlyS/P//d//qfWKFStmNP+TK6Rt2rRBvXr1zD6vWrVqqmlgblasWKFeJygoSOVHkyZNsGDBgkyP+fnnnzPyISwsTF0pvXDhwj190QIDA9X2bt26qfXw8HB1dTU9PV09RtIs24Rc5TV+FmPfK+NryJXZhx9+WKWpT58+ufa7kyZT0lxK0iefZf/+/fd8R3LLyvQ1c0ubuT5VaWlp6mJPZGQkfHx81Gu99dZbSE5OzvQ42S4nrRs3bkTTpk1V081KlSrh+++/z/X7Me27ntvnFLJvP/nkk2p/lfeRE+glS5aYbUq6bt069VsqXrw4ypQpk20apGZDHr9w4UJ1xV32b9nP4+Pj1f+3bt2KTp06qf1atkva/vvvv3teR/aN559/XjVblfyS/XnIkCHqt2F08uRJ1RRO0i+v1bx583t+17ml548//lC/Sfn8svz9999zzWciMu+BBx7AO++8o2rLpEzMy/FGjjXyexZSPhqPq6a1pVIGSVkoZaIc8+Wi+oEDBzK9T05lg7ze8OHDVTlVs2ZNdXyUFnD79u3LaB1XuXJllT4pB7LWLuY0Xsg333yTcXyXslH6xpoyVy78888/aN26tQpyJM1SFku5IORzy+uIAQMGZOSH5JO5tAipVJgyZYpqbSCfQcopOd5a0hXvq6++UmWN5ImUPRs2
bLinPLTk3CS38tHSssdc02d5vORFVnI8l9eRcwhL05md7M4fsusrbcm5nznyG5EyVb5zea6cX8r+b7rPWfKbsCSdxv3eWN7JPiotQP/6669Mn1vKYyHvKc8x/e6tcb6Ql99vTueHed3f8/sduSLnrS4im4qKilJL0yvxQn54UsMl/5eAMDcScMsBePz48diyZYvqs3b9+vVMQdCHH36oTjTksdKcPSYmRjVnv//++9X7mF41lOfKQeGJJ55Qj5da+DfffFMdNKTmwFiAzJo1C71791a189JHTvqmS2C8bds21K9f32xa5TWPHj2KH3/8UQVdEvQKORA988wz6rUk+JKDrpGcGMhzJCDJiRxMn3vuOXWglhoN+Uzy2eSg/fTTT2c8RgpEOVGQ/Lp8+bI6IEpQlTUf5OApn6dZs2bqhOXff//FxIkT1UmLBFeSZmm2KOuPP/64+myibt26mQJZeQ05aZHXkGAqJ/KdSV7KhRe5cCJpk5NEOeGKiIiApSxJW1ayX3z33Xeq0HrttddU8Cl5JLVBWQO948ePq8dJ0Nm/f3/Mnj1bFUSy70r+58aSzykFXatWrVQQOmrUKFUI/vTTT6qQ+/XXX9XnMiUFqHxuOZGRi1a5kQsMUpssBaVcWJB1qQWTfVw+x9ixY9XvcM6cOSptcoInJ3rGJpWyLl0kpEVK9erVVQEsvxVptiivJfuWDPYi96XLiJywSP5KDZs8Lmv6zaXn77//Rvfu3dXJt3wXcpFO9t+cLioQUc6krJHAUX5fUuZYeryR8lJ+y1LGyvNlLBZhXEpTdjkeyjFfuojJb1+Ow3L8l/LFNFDKqWyQY40EC3J8FPLblwudb7zxBqZNm6aOdVJOy8VwKfPkuJUbufgsx9wXX3xRBR3yXCkX5MJgdjXSkifyvlJuSFN9CYTk2G+8CCmfW7bLMVeOgxKsCDnuZUfKDCmH5TgrZY7kg3xeOXfJei5kSvJRgjJ5j1dffVUFfvLdSGtA0+OhJecmuZWPeS17jCQf5X+//fabujgix3AjCSbluC7BlKXptIa8nPtlJede0h1Q0ix5LHku+SaBrlTMyD6b228iL+RCvuSd7N8S8MprSvl39uxZVX7KvivfyUcffaTeU87jrHm+kJffb27nh3nZ3wvyHbkkPVEWc+bM0cuusX379mzzZtiwYXoPDw+z/wsPD9f36tUrx3wdO3aseo+uXbtm2j506FC1fc+ePer+6dOn1ft8+OGHmR63b98+vaenZ6btbdq0Uc/9/vvvM7YlJyfrS5Qooe/evXvGtrS0NLXd1PXr1/URERH65557LtN2eT1Jq9Gnn36qtp06dSrT427cuKH39fXVv/nmm5m2v/zyy/qAgAD9rVu3ss0LeW5QUJC+WbNm+tu3b2f6n06nU8uUlBR98eLF9bVr1870mD///FOlZ8yYMRnb+vfvr7a9//77mV6rQYMG+kaNGmXcj4mJuefzZX2NUaNGmf1f+fLlM+5LXshj/fz89OfPn8/YvnXrVrX91VdfzfQdyS2318wpbcZ9x2j37t3q/sCBAzM97vXXX1fbV69enbFN3kO2rV+/PmPblStX9D4+PvrXXntNn5O8fM4HH3xQX6dOHX1SUlKm77Jly5b6KlWq3PNba926tdovc7NmzRr1+EqVKukTExMzvba8bseOHTP2GSGPqVixov6hhx7K2NavXz+9u7u72d+38bmvvPKKep8NGzZk/O/mzZvqtSpUqKBPT0/PMT2ifv36+pIlS6r92+jvv/9Wjzf9rokob+VvSEiIOp7n9Xjz888/q9eW360p+W0XKVJEP2jQoEzbo6Oj1XuZbs+pbJDtciw1LR+//vprtV3K4fj4+Izto0ePvqcsza5sKVasmP7atWsZ2xcvXqy2L126NNty4fPPP1f3pSzJjuSxPEbyPKusaZFyRB4rZXpWpsfcrORcQ9LfpEkTfWpqasb2uXPnqtczLQ8tPTfJqXy0dF8wZ+XKlffkq3j4
4YfVMT6v6RRZ05k1X7P7/vJy7mdO1vJIbN68+Z5zxOx+E9nJmk7jZ/T29tYfP348Y5ucw8r2qVOnZmwzlpfynqYKer6Qn99vbueHluzvBf2OXBGbl1O++3SbXgk1Jc1QLB1owng13Mg4YMzy5cvVUq4cShMXuYoWGxubcStRooQaLT1rcyZpKmNawy5plFo9uSJu5OHhkZF2eW0ZSVKu4MmVu127diE/pInVY489pmrBDcdgw9XERYsWqauVOfW5kSZwcqVYrnBK3pkyNmOS5jzSH0iucJo+RpoPSU2lueb8gwcPznRfrrKb5oMlTK965kY+p1ypNZJ8lyupxu/SVoyvP3LkyEzbpcZbZM0bqXk11moIuWIsTdAszZvcPqfsT1J7I/usfK/GfVZqeuXq8rFjx+7pEiC1BbJfWkquaJv255SBgOR1pVWEvI/xPeUq+IMPPoj169erfV1uUmvRpUsXszUzxv1NPot8LrlKbvrbkhohqTGQmoKc0nPp0iWVJtkuvw2jhx56SOU/EeWf/BaNo5jn53hjrgySli9Sc2lazsoxSY5t5poNZ1c2yPHGtFZNni+k1k9qALNut+S427Nnz0xjxBiP3zk911jDtnjxYnXcKyipcZTjo7QiyiqnKaSk7JbvQo7xpn2BpUl+1nFvCnpuUtB9QVpFSQs+OW8xklYJsn/Id2CtdFoir+d+WZmWR9J0XvJAujbIfmGtNJpq3769qik2kpYH0k0wt/3bGucL+fn95nZ+aMn+XtDvyBWxeTnlixzQTPt/mpImt5YO8CI/TFNy0JJmscZ+N3LAkSA26+OMsjYtk2ZEWQtAKdiyjvYqTWWlOY30o5EDspH0bc2vfv36qcJKmt9I0xppsiPNdKU5YE6kb5wwbZZurn+SkOAwKwm6pWmTKWP/m6z5IAWopeQEIS9Ngc19R1WrVlXNpGxJ8kb2GSlQTcmBXwpYY94ZlStX7p7XyEve5PY5pQmj7LPS5Epu5sgFFNPAPa/7XdbHy+9ESJCbnbi4OPWblaaBOe1rQvLMeFJsytjsTv5v+hpZ02PMc3N5JfuwLU56iFzFrVu3VH/O/B5vsjIePyToMkeCB0vLhqzHV+NFt7Jly5rdbslxN+trGoPVnJ4rQaI0gZYmr3JBWy4GSFNs6VqUn0EepZyWMTCkz2peGI+FWcsnyUNzfZsLcm5S0H1B0iQXR6Q5vzQnlyb5ElhJOkyD7oKm0xJ5PffLSip+pGuDdLGSoNVYGWIsC60tv+cV1jhfyOvv15LzQ0v294J+R66IQTflS8mSJVVNrhwMjIW/kJN6uUInP9b8yBowy1U02SYDRJirCZQr/qayqy00PeDKoA/Sh1dqLGVgNEm/PE8O0MYAOD/kqqT00ZHXl6BblhL4yRXQwpaXWtPsSIFr7RGo5bs0/S6Msg7gkd/XtoQl+0hBGGtVpH9zdgPoZT0By+soxFkfb3zPTz/9NNv+dPJbkavqtmAvoygTObvz58+roMF4DMnP8SYr42tIv1Aps7LKOlpzTmVDdsfXghx38/NcOSZJCx+pbZPWTjI+ilwUl8BE+sNbo4y0toKem1hjX5A+0NKnW865JB1yMVku7JsOFFuQdGZXTpsbxCsv535ZSatJCbhlmj0ZzE8u8sjryeezRssHa+3f1jhfyOvv11r7fkG/I1fEoJvyxXhiL02nZARTI7kvP0RLB9KQK2WmV+3kqp8833gFWGq+5aAlj5HaRGuQgaBkFFG5gmtaAJhrRpOXwE4OOtK8VwaekIEspBmvJc2GjU2SZBC27ApEGSlbyLyhWa9myjbj/20RpFrKeLXVlAwiZ3o1X66mmmtulbU2Oi9pk88u+4y8v+kAKNLKQJpc5SdvCvI5Zd8yXuUtrAsuxn1Irmjn9J5ydVseY260dVOSZ+bmqJVaDeP/c3t+dnllr3PfEjkCObEWxhP0vBxvsjuuGo8fEjxpcZHYVuTCgNRwy23SpElqECuZ6lQCcfmceSln
JI9WrlypLlzmpbbbeCyUcxsZIdtImmNLiz7TAUItPTfJLt3WKHukwkAqVeQChXQvkqbPkmfWOoeScwApl3M7ByjouZ+kUVp+SW28aSvMrO9t7fOgvLLGd2aL368l+7stzs+dHft0U75I4Cc/RBkd0ZTcl1Ehpa+xJWQaDVMy6qEwjjQuzcEkaJWpMbJeMZT7UqueV8Yg2PT1ZLTrzZs35/pcY99sc4WGkKbk0kRHRqqUJoCWjODeoUMH1ddNrhJnnS7NmEbpKyUH1BkzZmSaBkuuMMoI3ZbmtynjiLPZfZa8kosMpn2PZBRTyVfjd2k8SEvgJiNcGu3Zs+eeaa3ykjbjRZ/Jkydn2i4nWSI/eVOQzynfk4yQKrUF0rc5K9PPbi0yYrnkrYxEKvtddu8pJ6FSO7F06VKz09wY9zfJU/lcpr8J6R8u0/bIxYXc+mXLSZtceJMmiKZN+aTvWdb+4ERkGQmAZKYAOck1TtOVl+NNduWXBPByMU6CUtOmwuZew1GYa9VjrAwwlqG5leempNm1HB/lXCQvtZlSdsvo1TNnzlSBttH8+fPvaXps6blJduWjNcoeKSOkCb6UEXKBR9KctWl5Qc6hpJySMsG0y5+kNessIwU995PnZn2enF9mrVHPyz5gC9b4zmzx+7Vkf7fF+bmzY003ZUumUjKdZ9BI5tKWIFEKfxkITeYclB+99GWWZkcyhYClV4JPnTqlpiGSab7kgC3Pl9piY1MmOUB/8MEHahot4zQb8t7yPDlIy8BOxrkjLSXTiMgVWpmGQQIyeS0JZiWQMBewZA1uhFz5lWZKcnVSBqUyHrgbNGig+rrKPKVS69qwYcNc0yMHS5mCTPqeyTQS8vnlarAEozLtgwQu8j5Sey5TLslcjzJghnHKMAmCZBqSvJImSvKZ5Yq2XKWU70zSnlt/3+xILb1cGZcBduSkRoJgOdmQqWKMZIoYCYZlf5HpKKR7guS9TNVlnNs5r2mTfUWuaEtAKAWn5I8EjJJvsr+Y1i5YgyWfUy4myWNkqjpp7SBXs+X7kn1cmofKd2tNcqIk/Rcl8Je8lP1E+oDJxQGp1ZF9TE6ihBTM0rxS8kl+P7KfSmEv+6yMDSD94KUPpAwKKK8n05tI/kt+ym9FBlixpNuBXESS35fkg3zvchIsJz2Svtx+Z0SuTi6oygVKCXrk2CEBt1y0kppTmZLLdEBNS483EnTKSbKUJRL4SDNxuYAuJ/5ywVwuGkuZJWWbtIqR6Y6kabZMZ/Tll1/Ckch0YNK8XI5BkmdS1siUZdIX3ThApJxfyPFOyiA5r5ByXMayMNcvWcoRyR+ZCkpa8Mg5i7SwkvMe+Z9MCWaODDgmcztLc2fJaxl0Ss5lpEWcvL9pTaul5yY5lY/WKHskyJZjtdRcy+tknUKrIOdQsm/JNK7yXClbjFNbyecwHeujoOd+kka5aCDNyiVd8vllnB0pq03l9JsoLAX9zqR8t/bv15L93Rbn505P6+HTyf4YpyXI7nbu3LmMx37zzTf6atWqqekSIiMj1TQdOU2fkXXahYMHD+qffPJJNWVW0aJF9cOHD79n2izx66+/qmkSZPotuVWvXl1NW3bkyJGMx8jUG7Vq1brnuVmnqJD0ffTRR2qbTG8iUyXI1FvmprIwNy3HuHHj9KVLl1bTLpmbPuyTTz5R2+U98mLJkiVqigiZkio4OFjftGlT/Y8//pjpMYsWLVLplXSHhobq+/Tpk2n6KuPnlTyyZKqLTZs2qWki5Psz/azZvUZO07rIdGoTJ07Uly1bVqXvvvvuy5j6zdS8efPU9CPynjKtlExTYi7vs0ubuc8hU7G89957akorLy8vlQaZksZ0Cg4h7/HII4/ck6bspjIzldfPeeLECTU9l0yVI2mSfebRRx/V//LLL3maHshUdlOOGEVFRemfeOIJNUWNpE0+71NPPaVftWpVpsedOXNGpU2m95PHyfchvyfTaWAk/fLblKlIZDo82R/ld5KX
9MjvtkaNGuo9atasqf/tt9+ynTKGiO4tf+X4J8cQmfZvypQpmabdyuvxRsycOVP93mWqn6xTJcm6TDso0wzJb17K9GeffVa/Y8eOjMfkVDbI68lxJLvjZm7HjpzKFnPvZVo2Zy0X5Jj32GOP6UuVKqXyUJa9e/fWHz16NNPryPRjcmySKY5Mpw8zd5ySaZokLXL+Ia8px8/OnTvrd+7cqc/NF198kXHOIcfS//77T5VvnTp1yte5SXblY172hexIOqR8k9f94IMPzP6/IOdQMnWkTH8qaZfzRzknMFeuW3ruZ45MYTZgwAB9WFiYPjAwUO3Xhw8fVumTdFr6m7B0yrCs+73I+l45lZfWOF8oyO/X3OeydH/P73fkitzkj9aBP7keufIrTVKk2YtMUeFMpPZZap7lyp+5ES3JMcn3KTUgMlgZr94SEVF+Sa2h1EZKE11pek5Ezo99uomsSK5hffvtt6rpLgNuIiIi1yZjtWSt3/r+++9Vlxvpz0tEroF9uomsQAaZkn520n923759WLx4MfOViIjIxW3ZskW1fpPxb6RPsfRdlovz0gdbthGRa2DQTWQF0kxeBkCTQVneeustNTgcERERuTYZ7LRs2bJqUCrjFEz9+vXDhAkT1EBrROQa2KebiIiIiIiIyEbYp5uIiIiIiIjIRhh0ExEREREREdmIpytO03Dx4kU1gbubm5vWySEiIrqHjHZ88+ZNlCpVCu7urnt9nGU2ERE5Q3ntckG3BNwyoAUREZG9O3fuHMqUKQNXxTKbiIicobx2uaBbariNGRMcHKx1cogcXnJyMr788ksMHz4cPj4+WieHyCnEx8erC8TGMstVscwmIiJnKK9dLug2NimXgJtBN5F1gm5pTiO/JwbdRNbl6t2gWGYTEZEzlNeu21GMiIiIiIiIyMYYdBMRERERERHZCINuIioQLy8vDBkyRC2JiIiIiCgzBt1EVOA+LCEhIS7f95SIiIiIyBwG3URUICkpKZgwYYJaEhERERFRZgy6iYiIiIiIiJwx6F6/fj26dOmCUqVKqaapf/zxR67PWbt2LRo2bKimJqpcuTLmzp1bKGklIiIiIiIicqigOyEhAfXq1cNXX31l0eNPnTqFRx55BO3atcPu3bvxyiuvYODAgVi5cqXN00pEROTKeKGciIgofzyhoc6dO6ubpWbMmIGKFSti4sSJ6n6NGjWwceNGfP755+jYsaMNU0pE2fH29saoUaPUkoicl/FC+XPPPYcnnnjC4gvlgwcPxvz587Fq1Sp1obxkyZIss4mIyKVoGnTn1ebNm9G+fftM2yTYlhpvItKGXq9HXFwcwsLCOII5kRPjhXIiIiIXCLqjo6MRERGRaZvcj4+Px+3bt+Hn53fPc5KTk9XNSB5LRNaTmpqK6dOnq9puGWuByFp0Oj2S03RISdMhOS1drcsyKVWWJttS766npuuQlq5Xy3SdHmlyS5el/E+P9DtLuS//N2wzPN7wOMP/jM/R6YB0veExcoHJsG5Im6zLUndnuzx23sBmqBgWwJ0gnxfKsyuzTbe7u7vDy8tLHXt0kul3eHh4wNPTU82kIN+VkWyT/2XdLq8hr2X6fsbtMs5M1hkZpDWPPF/e15Qc9yQdptvl+fL49PR0pKWl3bNdtsn/jPiZ+D1x37PT35Mcf3bMB85sBKL3A2mJ8EIa3GT2lixhlDfSIM9OzbLdxy0NOr0bUuFxN+3Qw9stHel6N6SZ2Z6md0e6SS9gd+jg5aZDql7W7m73gA6ebjqk6D2gV6m685mQDg83/T3bveRV3fRI1mdOo0t+psgOQMePCrzvZS1DnCLozo/x48fjvffe0zoZREROTwLfuNupiL+ddmeZivikVNxKTkNicjoSUtKQmJKOxJTM9xOS7yzvbDcG1BIQOxq5QED5v1CeXZk9adIk+Pr6qvUGDRqga9euWLFiBaKiojIe06ZNG7Rt2xY//fQTTpw4kbFdBmyVAVhnzZqFmJiYjO19
+vRRA7LKa5sG2EOGDEFISIiaCtGUXFiUVj1ykdFITrpGjx6NkydPqib0RuHh4Rg6dCj27NmDpUuXZmyPjIxE3759Vde4devWZWznZ+L3xH3PDn9PT3VH5d0fYtKRikhxKwVAbsAQ/XcIwU1McBue+Rih/xJxCMJ0t/53jxH6ZIzGVziJ8pjv1u3uZ9LHYii+xx7UxlK3Dnc/k/40+uI3bEQLrHNrcfcz6fehK/7BCjyEKLc6dz+TfjPaYjN+whM44Vbh7mfS/42G2I9Z6IcYt7C7n0n/KyrjLCZhGFLc7laUuORnigoGoiYUeN+zdGwxN73pJR0NyRWD33//Hd263f3ysrr//vvVj2Ly5MkZ2+bMmaOumktBaOlV87Jly6rHBwcHW/lTELke+X3JySlrup2L1OxeT0zB1VtyS0ZsQgpibybjakKy2hZ7KxnXEw2BddydmwTLtuLuBvh6ecDH0x0+nh7w8XK/uy5LL3d4e7jD687N08MNHu5u8HI3rHu6u8HTw2Rdtt/Z5nXnsWrd/c7zPNzVUm7uboalh7uhrPJwy3571Ygg+HnfvcKfX1JWSeBnr2WVJWV21apVMWDAAHUiY7R8+XLVzzsxMdHi1mlSZl+5ciUjH1grzFphwRYJdlgr7EwtR26ch9evz8L9yl4kuwcCzV4AyjQB/ELh5ekJNzcgJTUt82fy8oQ8PdUkjeozeXsZPlNaeua0e3kiXbWyune7bJP/ZfpMnh7qNTJ/JinXPFRaMn0mDw/1v6zb5TXU95SSOd9d8jN5BQChlQq8712/fh3FixfPtbx2qJruFi1aqALb1D///KO2Z0cykE1eiWyLg6g5FmlKHR2XhEvqdhsXb2ReXo5PwrWEFOjycUlWCrggH0+E+Hsh2NdwC/T1RIC3B/x97iy9PRHgk2Xp7Ql/te4B34yg2hhkSwGs6WQblA8lSpTA5cuXM22T+3JSYi7gzqnMNrddTqrzcjzKbnt25wjmtsuJlrntcsJnbrucyMstKznJl1tW/Ez8nrjv2cHv6dx2YOHTQMIVIKA4fHotAMo2uefx5o4cbtlsd89mu3wac5dpPbMJ0sx/Imkynbft2XUGdPnP5J73fc/SOFPToPvWrVs4fvx4ppFOZSqw0NBQlCtXTl0dv3DhAr7//nv1fxkB9csvv8Qbb7yhRk9dvXq1aiKybNkyDT8FkWuTg41pTRbZB2mefe7abZyOTcDpq3dusYk4FZuAi3G31VVeSwLoov7eKBbgjWKBcvNBeKDPnfs+KOrvhRA/LwT73V1KwO0u1dLk8vJzoZyISFOHlwO/DADSkoCIOkDvH4EiZfmlUIFpGnTv2LFDzbltNHLkSLXs378/5s6di0uXLuHs2bMZ/5fpwiTAfvXVVzFlyhSUKVNG9b/gdGFE2pGmONIHplKlSurqNBV+rbUE0oejb+JIdDyOyPLyTVy4fjvHmmpvT3eUDPFVt1IhfihZRNb9UKqIL0oE+yEsyBuh/t6sYaYMvFBORE5t94/A4mGAPh2o2gno/i3gE6h1qshJ2E2f7sJi7/3kiBwN+3QXnrR0nQqo956Pw55zN9Ty+JVbSDHpI2VKmnKXLxagRtOuEOaPCsVkGYDyxfxVjbU0lSX7ZI9l1dq1azNdKDcyXih/9tlncfr0afU40+fIhfKDBw+qC+XvvPOOepwj5wMROaHNXwEr3zKs13sa6DoV8HCoXrikEUvLKe5NRER2Ki4xFdtOX8PWk1cRde4GDlyMU9NlZRXo44mqEYGoViII1SKCUK1EMCKLBzCwJquSUX9zuk4vgbe555iOHkxEZFfkmLb6A2DDZ4b7LYYDD42TjuVap4ycDINuIiI7CrI3n7yKraeuYuvJazgUHX9P32vpM123bAjqlimCemVCUKtUCMoU9WOtNRERUV5IAbvqfWDjJMP9B8cCrV81DGhCZGUMuomoQKSJssxjyKbKeSe1hkcv38Lqw1ew5vAV7Dx7XU3VZapSeACaVSyG
JhWKol7ZIqhYLIADlRERERXU2vF3A+7OnxqmBSOyEQbdRFQgMsXG0KFDmYsWkqB684mr+OvAJaw5HIMLN25n+n9keABaRBZTgXaziqEoHuzLvCUiIrKmtR8D6z42rHeawICbbI5BNxEVSHp6Ovbs2YN69eqZncOQZIR3PaLOXceS3RexbF80Ym8lZ2SLzEHdMrIYHqheHG2rFUfZUH9mGRERka1smASs/ciw3uFDoPkQ5jXZHINuIiqQtLQ0LF26FLVq1WLQncXxKzfx887z+HPPpUw12jK/dafaJdC+RgRaRobBz5sXK4iIiGxu53fAqvcM6+3fA1oOZ6ZToWDQTURkRUmp6Vi+7xJ+3HYW209fzzR9V8daJdClfim0rhwGLw+OjEpERFRoDi8H/nzFsH7f60DrO+tEhYBBNxGRFVy8cRvfbTqtgu34pDS1zcPdDe2qFccTDUur5uO+XqzRJiIiKnRntwK/DAD0OqBBX+CBt/klUKFi0E1EBSKjlkdGRrrs6OW7z93ArA0nsWJ/dMbI46WL+KFXk7Lo0bgsSoRwIDQiIiLNxBwBfuwJpCUBVTsBj07htGBU6Bh0E1GBRy/v27evy+Xi1pNX8cXqY/jv+NWMbS0qFcPzrSuiXfXiqpabiIiINJRwFZjfA7h9HSjTBHhyDuDB8IcKH/c6IirwQGobN25E69at4enp/IeUTSdiMeXfY9h66pq67+nuhq71S6lgu1apEK2TR0RERCI9FfipH3DjDFC0ItB7EeDNGUJIG85/hkxENp8ybN26dWjRooVTB90HL8Zjwl+Hsf5ojLrv5eGmmo8PbRuJMkVZiBMREdmVFW8CZzYC3kFA74VAQDGtU0QuzHnPkImIrDRA2sS/j+K3qPPQ6w3Bdq8m5TCkbSRKFfFjHhMREdmb7bOAHd/KyDNA91lA8epap4hcHINuIiIzktPSMXP9SUxdfRzJaTq17dG6JfF/HauhfLEA5hkREZE9OrXBUMst2o8FqnXSOkVEDLqJqGDc3d3RoEEDtXQWG47FYOziAzgZm6DuN60YirceroH6ZYtonTQiIiLKzs1ow9RgujSgTg+gFefiJvvAmm4iKhAvLy907drVKXLx6q1kjF1yAH/uvaTuhwf54O1HaqBrvVIuOyUaERGRQ0hPA355HkiIASJqA12ncmowshsMuomoQFJTU7FixQp07txZBeCOavm+S3jnj/24mpACme2rf8sKePWhqgj2ddzPRERE5DLWjr8zcFog0OM7wIvjrpD9YNBNRAWi0+kQFRWFjh07OmROXktIwZjF+zNqt6uXCMJnPeqhdmlO/0VEROQQjv0LbPjMsN71CyCsstYpIsqEQTcRuaxNx2MxYtFuxNxMhoe7m5r+66UHqsDb03n6pxMRETm1uAvAb4MM642fB2p31zpFRPdg0E1ELictXYcvVh3D1DXH1TRglYsHYtJT9VC3DAdKIyIichg6HfD7i8Dta0CJukDHj7ROEZFZDLqJqEA8PDzQpk0btXQE0XFJeHlhFLaduqbu92xcFu92rQU/b8dIPxEREd2xZRpwegPgFQD0mAt4+TJryC4x6Caigh1EPD3Rtm1bh8jFHaevYfC8XYi9lYwAbw989EQdPFa/tNbJIiIiory6fBBY9Z5hveOHQLFI5iHZLXZcJKICSUlJwbx589TSni3afha9Z25RAbcMlvbny/cx4CYiInJEacnAby8A6SlAlY5Ao2e1ThFRjljTTUQFotfrceLECbW0R6npOny47BDmbjqt7neuXQITn6oHf28e/oiIiBx2erDL+wD/YpyPmxwCzzqJyGndSk7DkHk7seFYrLo/8qGqGN6uMtxlIm4iIiJyPGe3ABsnG9a7TAGCIrROEVGuGHQTkVOSacAGzN2G/Rfi4eflgcm96qNjrRJaJ4uIiIjyKzUJWDxc2tkB9fsANbowL8khMOgmooIdRDw90aVLF7W0F6diE9B/9jacvZaIYgHemP1sE9Qr
y+nAiIiIHNr6T4Grx4DAEpwejByK/ZwlE5FDkqnCGjZsCHux9/wNPDtnO64lpKBcqD++f64pKoQFaJ0sIiIiKojo/cB/d5qVP/IZ4MeL6eQ4OHo5ERWIjFo+bdo0uxi9fNfZ6+gzc6sKuGuXDsavQ1oy4CYiInJ0unRgyXBAl2ZoUs5m5eRgWNNNRAUio5bHxMRoPnr59tPXMGDOdjV4WtOKoapJeaAPD3FEREQOb8t04GIU4BMCPPyZ1qkhyjOekRKRw9ty8iqem7sdiSnpaBlZDLP6N+aUYERERM7g+mlg9QeG9Q7jgCAOikqOh0E3ETm0zSeuqlHKk1J1uK9KGL55pjH8vD20ThYRERFZw4pRQNptoMJ9QMN+zFNySAy6iahAvLy80KdPH7UsbHvO3cDA77argLtttXDM6NsIvl4MuImIiJzC0ZXA0RWAuyfwyCTAzU3rFBHlC4NuIioQd3d3VK5cudBz8djlm+g/ZxsS7jQpZ8BNRETkZHNyr3jDsN58KBBeVesUEeUbRy8nogJJTk7G+PHj1bKwnLuWiL7fbsWNxFQ1//Y3/RqzhpuIiMiZbPrC0J87qCTQ5k7wTeSgGHQTUYEV5nRhV24mqYD7cnwyqkYEYi5HKSciInIu188AGyYa1jt8APgEaZ0iogJh0E1EDuN2SjoGfrcDZ64momyoH354vhmKBnhrnSwiIiKyppVvAWlJhsHTandn3pLDY9BNRA4hXafHK4uisPd8HIr6e+H755ohIthX62QRERGRNR1fBRz+0zB42sOfcvA0cgoMuomoQGTU8iFDhth89PIJKw5h5YHL8PZwx8x+jVExLMCm70dERESFTJcO/P2OYb3JIKB4DX4F5BQYdBNRgbi5uSEkJEQtbeWHLWcwc8Mptf5pj7poXCHUZu9FREREGtk9H7hyAPAtwsHTyKkw6CaiAg+iNmHCBJsNprbxWCzeXXJArb/eoSoeq1/aJu9DREREGkq+Baz+wLAuo5X78wI7OQ8G3URkt2RqsOE/7lL9ubs3LINh7Qp/PnAiIiIqBJumArcuA0UrAE0GMsvJqTDoJiK7lJSajsHzdhrm4i4Tgg8fr23TJuxERESkkfhLhnm5Rfv3AE8ffhXkVBh0E5Hd0ev1GP3bPhy4GI9iAd6Y3rcRfL08tE4WERER2YI0K09NBMo2B2o+xjwmp8Ogm4gKxNvbG6NGjVJLa5m76TR+j7oAD3c3fPl0Q5Qq4me11yYiIiI7cvmAYQA10fFDThFGTknzoPurr75ChQoV4Ovri2bNmmHbtm05Pn7y5MmoVq0a/Pz8ULZsWbz66qtISkoqtPQS0b210nFxcWppDTvPXMOHyw6p9dGdq6NFZDFmOZGdYJlNRFa3+kM5mwBqdgPKNGYGk1PSNOhetGgRRo4cibFjx2LXrl2oV68eOnbsiCtXrph9/IIFC1SNmjz+0KFD+Pbbb9VrvPXWW4WediIySE1NxfTp09WyoOISU/Hyj7uRptPj0bol8XzrisxmIjvBMpuIrO78DuDIMsDNHXjgbWYwOS1Ng+5JkyZh0KBBGDBgAGrWrIkZM2bA398fs2fPNvv4TZs2oVWrVnj66adV7XiHDh3Qu3fvXGvHicj+SU35m7/uxYUbt1G+mD/GP1GHA6cR2RGW2URkdaveNyzrPw2EVWEGk9PSLOiWOX137tyJ9u3b302Mu7u6v3nzZrPPadmypXqOMcg+efIkli9fjocffrjQ0k1EtjFv61n8dSAaXh5umNq7AYJ8vZjVRHaCZTYRWd3JtcCpdYC7F9DmTWYwOTVPrd44NjYW6enpiIiIyLRd7h8+fNjsc6SGW57XunVrVSuWlpaGwYMH59i8PDk5Wd2M4uPjrfgpiEgUdBC1Q5fiMe7Pg2r9zU7VUbdMEWYskR1hmU1EViXjwKwaZ1hv/BxQpBwzmJya5gOp5cXatWvx0UcfYdq0aaoP+G+//YZly5Zh3Lg7P1ozxo8fj5CQkIybDL5G
RNbj4+OD0aNHq2V+3E5Jx/AFu5CSpsOD1YuzHzeRk2CZTUTZOrICuLAD8PIH7nuNGUVOT7OgOywsDB4eHrh8+XKm7XK/RIkSZp/zzjvv4JlnnsHAgQNRp04dPP744yoIl8Bap9OZfY4EAzKysvF27tw5m3weIlclv73jx49n+xvMzcd/HcaJmAREBPvg0x712I+byA6xzCYiq5HzBZmXWzQbDARlbvVK5IzctWyO2qhRI6xatSpjm5y0y/0WLVqYfU5iYqLq921KAneR3XRFUvsWHByc6UZE1iOjls+fPz9fo5f/dzxWzcktPnmyHkIDrDfXNxFZD8tsIrKaQ0uAKwcAnxCg1cvMWHIJmvXpFjJdWP/+/dG4cWM0bdpUzcGdkJCgRjMX/fr1Q+nSpVVNtujSpYsaPbVBgwZqTm+pXZPab9luDL6JyDHEJ6Xi/37eo9b7Ni+HNlXDtU4SEeWAZTYRWaWWe/2nhvXmQwC/osxUcgmaBt09e/ZETEwMxowZg+joaNSvXx9//fVXxuBqZ8+ezVSz/fbbb6ump7K8cOECwsPDVcD94YcfavgpiCg/3ltyEBfjktT0YG89XIOZSGTnWGYTUYHJnNyX9wM+wUDzwcxQchlu+uzaZTspGb1cBlST/t1sak5knamEZs2apcZasHQU87/2R2PwvJ1wdwN+erEFGlcI5VdBxLKKZTaRM5OQ4+v7gOh9wP3/BzzwttYpIiq02FLTmm4icnwSaA8dOtTix19PSMH/ft+n1l+4P5IBNxERkauMWC4Bt3cg0Nzy8wYiZ+BQU4YRkf1JT09XU/jJ0hLjlh3E1YQUVI0IxKsPVbF5+oiIiMgOarnXfWxYbzoI8GcLN3ItDLqJqEDS0tKwdOlStczN+qMx+G3XBbi5ARO614WPJwdAJCIicnrH/gYu7TbMy91iuNapISp0DLqJqFAkpqThrTvNyvu3qICG5ThiKRERkUvVcjcZCASEaZ0iokLHoJuICsWkv4/i/PXbKF3ED693rMZcJyIicgWn1gMXdgKevkDLl7RODZEmGHQTUYHINH6RkZFqmZ09525g9n+n1PoHj9dGoA/HcCQiInIJGz83LBs8AwQW1zo1RJrgmS8RFXj08r59+2b7/7R0HUb9tg86PdCtfim0q8YCl4iIyCVcjAJOrgHcPICW7MtNros13URUIDKA2tq1a7MdSO37zWdw6FI8ivh74Z1HazK3iYiIXMV/UwzL2t2BohW0Tg2RZhh0E1GByFRh69atMztl2JX4JHz+z1G1/kbH6igW6MPcJiIicgVXTwAHFxvWW7+idWqINMWgm4hs5qPlh3AzOQ31yoSgZ5OyzGkiIiJXsekLQK8DqnQEImppnRoiTTHoJiKb2HLyKv7YfVHNyT2uW214uGc/0BoRERE5kZvRwO4FhvXWr2qdGiLNMegmooIdRNzd0aBBA7U0Sk3XYczi/Wr96ablULdMEeYyERGRq9gyDUhPAco2B8q30Do1RJrj6OVEVCBeXl7o2rVrpm1z/zuNo5dvITTAG//HObmJiIhcR1I8sGOOYZ19uYkU1nQTUYGkpqZiyZIlailibiZjyqpjan1Up+oo4u/NHCYiInIVUT8AyfFAWFVDf24iYtBNRAWj0+kQFRWllmLSP0dwKzkNdcuE4MlGZZi9REREriI9Ddgyw7DefKj0QdM6RUR2gb8EIrKaAxfjsHD7ObU+5tGacOfgaURERK7j8FIg7izgXwyo10vr1BDZDQbdRGQVer0e7y89CL0eeLRuSTSuEMqcJSIichVyArDpS8N6k4GAl5/WKSKyGwy6iahAPDw80KZNG6w+chVbT12Dj6c7RnWuzlwlIiJyJee2ARd2AB4+hqCbiDIw6CaiAvH09ESL1vdhwsqj6v4L91dCmaL+zFUiIiJXsnmqYVn3KSCwuNapIbIrDLqJqEBSUlIwadq3uHjtJooH+WBwm0jmKBERkSu5dhI49KdhvcUwrVNDZHcYdBNR
gVy7lYyU65fgBqg5uQN8PJmjRERErmTr19KpG6jcHiheQ+vUENkdnh0TUYF8s+GkWlaNCMQTDTlFGJE9SU5OxtatW3HmzBkkJiYiPDwcDRo0QMWKFbVOGhE5i6R4IGre3WnCiOgeDLqJKN/OXUvEwm3n0NMbGNmhGjw4RRiRXfjvv/8wZcoULF26FKmpqQgJCYGfnx+uXbumAvFKlSrhhRdewODBgxEUFKR1conIke35EUi5BYRVBSIf0Do1RHaJzcuJKN8m/n0ESel6RBephTbVIpiTRHaga9eu6NmzJypUqIC///4bN2/exNWrV3H+/HlV233s2DG8/fbbWLVqFapWrYp//vlH6yQTkaPS6YBt3xjWm74AuElnMyLKijXdRJQv+y/E4Y/dF9W1u6HdH1KjmBOR9h555BH8+uuv8PLyMvt/qeWWW//+/XHw4EFcunSp0NNIRE7i5Grg6nHAOwio10vr1BDZLdZ0E1G+fPzXYbXsWqc4NiyZr0YxJyLtvfjii9kG3FnVrFkTDz74oM3TREROauudWu4GfQAfdlUhyg6DbiLKs/VHY7DhWCy8PNww4sHKiImJgV6vZ04S2Znt27ergdSykm07duzQJE1E5ETThB3727DeZJDWqSGyawy6iShPJLj+dOURtd63eXmUKerPHCSyU8OGDcO5c+fu2X7hwgX1PyKifNs26+40YWGVmZFEOWDQTUR58vfBy9h3IQ7+3h4Y1o6FLJE9kz7bDRs2vGe7TBsm/yMiypfkW3enCWv6IjORKBcMuonIYjqdHpP+PqrWB7SqgLBAH9V3tE+fPhb3ISWiwuPj44PLly/fs10GT+Pgh0SUb3sXAclxQGglQ003EeWIQTcRWezPfZdw5PJNBPl64oX7Ig0HEXd3VK5cWS2JyL506NABo0ePRlxcXMa2Gzdu4K233sJDDz2kadqIyEHJGC7bZt7ty83ynyhXPEsmIoukpesw+R9DLfeg+yohxN9Qs52cnIzx48erJRHZl88++0z16S5fvjzatWunbhUrVkR0dDQmTpyodfKIyBGdWg/EHAK8AgyjlhNRrjixLhFZ5PeoCzgZm4Ci/l54rnXFTP/jdGFE9ql06dLYu3cv5s+fjz179sDPzw8DBgxA79692SWEiPJn251pwmRebt8Q5iKRBRh0E1GuUtJ0mLLqmFof0jYSgT48dBA5ioCAALzwwgtaJ4OInMGNs8CR5Yb1pjyuEFmKZ85ElKufdpzD+eu3ER7kg2eaV2COEdmxJUuWoHPnzqomW9Zz0rVr10JLFxE5ge3fAnodULENULy61qkhchgMuokoR0mp6Zi62lDLPbxdZfh5e2T6v5zYDxkyhE1ViexEt27dVJ/t4sWLq/XsuLm5IT09vVDTRkQOLC3ZZJqwQVqnhsihMOgmohzN23IGl+OTUSrEF72aljV74h4SEqKWRKQ9nU5ndp2IqEAOLQUSY4GgkkDVzsxMojzg6OVElK2E5DRMX3tCrb/8YBX4eGau5TYOojZhwgQOpkZkh77//nuzMwvI71b+R0RksR2zDcuG/QEP1tsR5QWDbiLK1vebz+BqQgrKF/NH90ZlmFNEDkZGKjedo9vo5s2b6n9ERBa5chg48x/g5g407MdMI8ojBt1EZFZiShpmbjip1l96oAq8PHi4IHI0er3ebNeP8+fPq24hREQW2TnHsJRm5SGlmWlEecS2IURk1oKtZ3EtIQXlQv3xWP1SzCUiB9KgQQMVbMvtwQcfhKfn3eJeBk87deoUOnXqpGkaichBpCQCu380rDd+TuvUEDkkBt1EZHbE8hnrDLXcw9pF5ljL7e3tjVGjRqklEdkH46jlu3fvRseOHREYGJjxP/mtVqhQAd27d9cwhUTkMA78BiTHAUXKA5EPaJ0aIofEoJuI7rFw21nE3kpG6SJ+eLxBmVybr0qf0bCwMI5gTmQnxo4dq5YSXPfq1Qs+Pj5aJ4mIHH0AtcYDAHd2NSPKD/5yiOie
Wu7p6wwjlg9pGwlvz5wPE6mpqZg+fbpaEpF9eeCBBxATE5Nxf9u2bXjllVfwzTffaJouInIQF3cDF3YC7l5A/b5ap4bIYTHoJqJMft55Xs3LXSLYFz0ac8RyIkf29NNPY82aNWo9Ojoa7du3V4H3//73P7z//vt5fr2vvvpK1Z77+vqiWbNm6rVycuPGDQwbNgwlS5ZUte1Vq1bF8uXL8/15iEijWu6aXYHAcGY/kaMG3SzAiexHSpoOM+7Myz24TSWz83ITkePYv38/mjZtqtZ/+ukn1KlTB5s2bcL8+fMxd+7cPL3WokWLMHLkSNV0fdeuXahXr57qL37lyhWzj5e5wB966CGcPn0av/zyC44cOYKZM2eidGmOfEzkEJLigH2/GNY5gBqR4/bpNhbgM2bMUFfMJ0+erApwKZiLFy+ebQEu/5MCXAruM2fOoEiRIpqkn8jZ/LbrPC7cuI3wIB/0alrO4udxEDUi+yTdPoz9uf/991907dpVrVevXh2XLl3K02tNmjQJgwYNypjfW8ruZcuWYfbs2Wowxaxk+7Vr11SQ7+XlpbZJLTkROYi9PwGpCUBYVaB8K61TQ+TQNK3pNi3Aa9asqQpwf39/VVCbYyzA//jjD7Rq1UoV3m3atFFX24moYFLTdfhq7XG1/uL9leDrZVktt5zQjx49mgM1EdmhWrVqqbJ1w4YN+OeffzKmCbt48SKKFStm8evIRe+dO3eq5ulG7u7u6v7mzZvNPmfJkiVo0aKFal4eERGB2rVr46OPPlJTlhGRndPrgR1z7tZyu7lpnSIih6ZZ0F1YBXhycjLi4+Mz3YjoXot3X8S5a7dRLMAbfZqVtziLdDodjh8/rpZEZF8+/vhjfP3112jbti169+6dcZFaylNjs3NLxMbGqrJWyl5Tcl/6iptz8uRJ1SpNnif9uN955x1MnDgRH3zwQbbvwzKbyE6c2wZcOQB4+gH1emmdGiKHp1nQXVgF+Pjx4xESEpJxK1u2rNU/C5GjS9fp8dUaQy33oPsrwc/b8r7c0nxV+ody9HIi+yPBtpS3cjNtRfbCCy+oGnBbkgtx0h1MRkpv1KgRevbsqQZwy+l9WWYT2Ymdd8Z8qP0E4FdU69QQuV6f7lOnTqlmatKXOjExEeHh4WjQoIGqgZbRTAurAPfw8FCF+IULF/Dpp59mzEmalTR7lX7jRlLTzcCbKLNl+y7hVGwCivh7oW9zy2u5icj+SXlZtGjmk+a89q0OCwtTr3P58uVM2+V+iRIlzD5HRiyXvtzyPKMaNWqoC+vS2s3cWBAss4nsZAC1A78b1hv21zo1RK4VdEtN1pQpU7Bjxw5VG12qVCn4+fmpPtYnTpxQAXefPn3w5ptvonz58nZTgEt/U+MgMkR0L71ej+l3Riwf0LIiAn00HV+RiAqoYcOGWLVqlQq05aK4Ww59MWUUcktI+SoXuuV1u3XrlnEhXO4PHz7c7HNk7JUFCxaox0n3MXH06FFVlmc3+CLLbCI7ICOWp90GwqsDZS3vhkJE2bPo7FoKbSkgn332Wfz666/31BRLHyzph71w4UI0btwY06ZNQ48ePeyiACeinK0/FotDl+Lh7+2Bfi3yXsstJ/TS4iWnE3siKjyPPfZYxsVmY/lqDdJqrH///qqcl/7gMuNIQkJCxmjm/fr1U7OKSBNxMWTIEHz55ZcYMWIEXnrpJRw7dkyNw/Lyyy9bLU1EZAO7vjMsG/bjAGpEVuKml2quXKxcuVJN5WWJq1evqjk5JaC2ZMowKcBlkBdjAS7ziB4+fFjVpmctwM+dO6dGYpXnGAvw5557ThXg0k/MEtK8XPp2x8XFITg42KLnEDmzXt9sxpaT1/B864p459GaWieHiOy4rJIgWrp0SQuz+vXr44svvlBTfhr7j0uzddP5v+WC/Kuvvordu3er8vz5559XLeJMW6w5Yj4QOa2Lu4Fv2gAe3sDIw0CA5bMcELmieAvLKYtqui0NuIVMQWLpNCQy
qEpMTAzGjBmTUYD/9ddfGYOrnT17NqNGW0gNu1wAkAK8bt26qgCXK+hSgBNR3u06e10F3F4ebhh4X8V8ZaEMbLhnzx41KrKlJ9JEVLikC9aVK1fumWWgXLlyeXodaYmWXWu0tWvX3rNNxnvZsmVLHlNLRJqJ+sGwrP4oA24iK8pz500Jjrdu3Zoxwrj0v5ar3Nn1w84NC3Ai7cy405e7W/3SKBnil6/XSEtLw9KlS1UrFAbdRPZFumBJ7fKmTZsybZdGbtIlhHNmE1GGlERg7893m5YTUeEH3dJv68UXX1T9tqWgDg0NVdtlIDUpvGX+T2km7u/vb73UEZHNHL9yE38fNAxk+GKbSsxpIick/a09PT3x559/qvFPOPYCEWXr4GIgOQ4oUh6o2IYZRaRF0C3NuLdt24Zly5ahffv2GTVacpVcBj+TPtbymJkzZ1ozfURkI1+vO6mWHWpGoHLxIOYzkROSvtQ7d+5E9erVtU4KEdm7Xd8blg2fAUy6dxJRwVn8i5JRy2VwFOnfbdqEVNY7dOiA2bNn45dffrFCkojI1i7euI0/dl9Q64PbRhbotaTmLDIykjVoRHaoZs2aiI2N1ToZRGTvYo8BZzcBbu5A/T5ap4bIdYNuGXwlp2m55H9ZB2ghIvv07cZTSE3Xo3mlUDQsV7RAryW//b59+3LaPiI79PHHH+ONN95Qg5zJ7CIyyqrpjYgo0zRhVToAwaWYKURaBd2PPvooXnjhBURFRd3zP9km83F26dLF2ukjIiu7kZiCH7edVetD2lYu8OvJQGpyQi9LIrIv0h1MRg9/8MEHUbx4cRQtWlTdihQpopZEREhLAXb/aMgIDqBGpG2fbpmb8+mnn1bzb0tBLYW3kClIbty4oZqdy2OIyL59v/kMElPSUbNkMO6vElbg15NxHdatW6emBpIBm4jIfqxZs0brJBCRvTu6AkiMBQJLAFUsnyaYiCxn8RmyBNorVqzA4cOHsXnz5kxThsnJNgdpIbJ/iSlpmPPfqYy+3BzJmMi5tWnDEYiJKBc77zQtr/804MGL50S2kOdflgTXDLCJHNNP28/hemIqyoX64+HaJbRODhHZwN69e1G7dm24u7ur9ZzUrVuX3wGRK7txFjix+u6o5URkE7ycReQiUtN1mLnBUMv9wv2V4OlhnelA5MS+QYMGaklE2qtfv75qjSbdwGRdWrTo9fp7HifbpXsIEbmwqPkA9EDF+4HQSlqnhshpMegmchFL91zEhRu3ERbogycblbHa63p5eaFr165Wez0iKphTp04hPDw8Y52IyCxdOhA1z7DesD8ziciGGHQTuQCdTo8Z606o9edaV4Cvl4fVXjs1NVWN99C5c2cVgBORtsqXL292nYgoE2lWHn8e8CsKVH+UmUNkQwy6iVzAmiNXcPTyLQT6eKJPM+uehOt0OjVtoMxgQET25+LFi9i4caOabUR+r6ZefvllzdJFRHYyN3fdXoCXr9apIXJqDLqJXMD0tYZa7j7NyyHEj7XRRK5i7ty5ePHFF+Ht7Y1ixYplmrFA1hl0E7moW1eAIysM6xxAjcjm8j3y0YEDB9CwYUPs2LHDuikiIqvafvoadpy5Dm8PdzzfqiJzl8iFvPPOOxgzZgzi4uJw+vRp1cfbeDt58qTWySMirez5EdClAaUbAxG1+D0Q2WvQLVfP9+zZg9mzZ1s3RURkk1ru7o3KoHiw9ZuPeXh4qLmAZUlE9iUxMRG9evXi7AJEdJfMZrDre8N6Iw6gRmS3Qbf0CZs/fz5effVVLFq0CCkpKdZPGREV2OHoeKw+fAXubsCL99tmKhBPT0+0bdtWLYnIvjz//PP4+eeftU4GEdmTM5uAq8cB70Cg1hNap4bIJeTrLPmvv/5Sc3uOHz8ev/76KxYvXowePXpYP3VEVCBfrzM0H+1cuyQqhAXYJDflottPP/2Ep556SvUbJSL7IeX0o48+qsrtOnXq3DPDwKRJkzRLGxFpxFjLXfsJwCeQ
XwORvQbd3333HXr27KkK7z59+qim5gy6iezLuWuJWLLnolof3CbSZu+j1+tx4sQJtSQi+wu6V65ciWrVqqn7WQdSIyIXc/s6cPAPw3rDZ7VODZHLyHPQfePGDSxduhTr169X95955hl88sknuHz5MiIiImyRRiLKh1kbTiJdp8d9VcJQp0wI85DIBU2cOFGNvfLsszy5JiIA+34B0pKA4rWA0g2ZJUT22qd74cKFqFixIho3bqzuy9VzGcV83rx5tkgfEeXD1VvJWLTjnFofYsNabiKybz4+PmjVqpXWySAieyAt0nbemZu7YT9p7qJ1iohchmd+mpZL7bapvn374uuvv8Zrr70GR5GcnKxuwt3dXTWVT01NVYPEGclozDI4lPRZNW06K9vkf1m3y2vIaxlf13S7NOPLOuCc9H+V58v7Zj1JknSYbpfny+OlL31aWto922Wb/M+In8m1v6e5648hPTUV9UoXRfNKofd8Vmt+JtG5c2f1f3kf7nuuve/xM1nne8qan/k1YsQITJ06FV988YVVXo+IHNil3cDlfYCHD1D3Ka1TQ+RS8hR0nzt3TjUjzxp09+7dG5MnT8bRo0dRtWpVOAIZPMbX1zB9UoMGDdC1a1esWLECUVFRGY+RaZBkVGYZJEr6rBp16dJF1e7PmjULMTExGdulf3vlypXVa5uePA8ZMgQhISGYMGFCpjSMGjVKzZ06ffr0jG1y0jV69Gg1f6qMEG8UHh6OoUOHqmnapHm/UWRkpLrosXHjRqxbty5jOz8Tv6e+fkDthk+qE35b7nubN29W+578frjv8RjB4551juVJSUmwhm3btmH16tX4888/UatWrXsGUvvtt9+s8j5E5ACMtdw1uwL+oVqnhsiluOldbPSj+Ph4FQBfuXIFwcHBahtrfFiL5Uy1jd9tOoVPVx5FhWL+WP5qO3h6uNv0M8k8wHPmzEG/fv3Uff6e+Htypt+TVp9JyqrixYuri2PGsio/BgwYkOP/5bfrCGV2QfOByOWlJACfVQNSbgL9lwIV73f5LCEqzHLKZSfWlZMhuZnKWgNglN00SNltz/q6OW2XEy1z2+WEz9x2OUGUW1Zy8mhunmR+Jtf6nuDhiW83n0cqPDCwbVV4eXrY/DPJttjYWJUm0+dx33OtfY+fybrfU3b5mRcSwLdr1w4dOnRAiRIlCvx6ROTADvxhCLiLVgTKt9Y6NUQuJ88DqRGR/VocdRGX45MREeyDbg1Ka50cItKQBO+DBw+2Wv9wInJgu0wGUHPn6T9RYeOvjshJ6HR6zFhvGHtgYOtK8LlTy01Erqtp06aZxiohIhd05TBwbivg5gHUf1rr1BC5JJdtXk7kbP4+eBknYxIQ7OuJ3s3KFdr7SvNkGUQwu2bKRKQdGbRNZhY5f/48GjVqhICAgEz/r1u3rmZpI6JCEvWDYVm1ExDEriZEWmDQTeQEZBCn6esMtdz9WlRAoE/h/bSlz6qM2k9E9qdXr15q+fLLL2caJ0COGbI0HYiOiJxQWjKw50fDeqP+WqeGyGXluXn5c889p+bqNjdym/yPiArf5pNXsefcDfh4uuPZVhUK9b2lv+j48ePZb5TIDp06deqem0xjZlwSkZM7vAxIvAoElQIiH9Q6NUQuK8/VYXPnzsWiRYuwc+dONTe31HKJ27dvq2B89uzZtkgnEeVgxjrDyXPPJmURFljwUY/zKusUSkRkH8qXL691EojIHgZQa9BXzXBCRA40kNqyZcuwfPlydOzYEdevX7d+qojIYvsvxGH90Rh4uLth0H2VmHNElMkPP/yAVq1aoVSpUjhz5ozaJhfNFy9ezJwicmbXTgEn10qnEkPQTUSOFXTXrFkTW7duRWpqqhoZ9dChQ9ZPGRFZZMadvtxd6pZE2VB/5hoRZZg+fTpGjhyJhx9+GDdu3Mjow12kSBEVeBORCwygFtkOKMpWL0QOFXTLwCuiWLFi+Pfff9GmTRu0aNECS5YssUX6iCgHp2IT
sHzfJbX+YptITfJKRi0fMmQIRy8nskNTp07FzJkz8b///Q8eHnenEWzcuDH27dunadqIyIbS04Co+Yb1hhxAjUhree7cISOeZjzZ0xOzZs1SNd8yLQkRFa6v152ATg88UL04apQM1iT75UJcSEhIxgU5IrIfMmBagwYN7tnu4+ODhIQETdJERIXg2ErgVjTgHwZUe5hZTuRoNd1r1qxBaGhopm3SdG3FihUYM2aMNdNGRDmIjkvCr7vOq/WhbbWp5TYOojZhwgQOpkZkhypWrIjdu3ffs/2vv/5CjRo1NEkTERWCnXcGUKvfG/D0ZpYTOVpNtzQnN6d9+/bqRkSFY9aGk0hN16NphVA0rpD5QhgRkfGi+LBhw5CUlKRaqm3btg0//vijmuZPWqoRkROKuwAc/8ewzqblRI4TdEst1ogRI+Dn55frY2WAtdjYWDzyyCPWSB8RmXE9IQULtp1V60PbaVfLTUT2beDAgarsfvvtt5GYmIinn35ajWI+ZcoU9OrVS+vkEZEt7J4P6HVA+VZAWBXmMZGjBN0HDx5EuXLl0KNHD3Tp0kUNwBIeHq7+l5aWpv6/ceNGzJs3DxcvXsT3339v63QTubTvNp9GYko6apUKRpuqht8iEZE5ffr0UTcJum/duoXixYszo4iclU4H7LozajlruYkcq0+3BNEyUrlMESZXyUuUKAFvb28EBQWpwVhkkJbZs2ejX79+OHz4MO6//37bp5zIRSUkp2HuptNqfUjbSM0HMJNjwahRo9SSiOzLAw88oKYKE/7+/hkBd3x8vPofETmZk6uBuLOAbwhQs6vWqSGivPbprlevnpp25Ouvv8aePXtw9uxZ3L59G2FhYahfv75aEpHt/bjtLG4kpqJiWAA61y6peZZLP9G4uDh1DND6AgARZbZ27VqzgxxKH+8NGzYwu4icdQC1uj0Br9y7hRKRnQ6k5u7urmq2zU1BQkS2lZyWjlkbTqn1F++vBA937YNcaQEzffp0VdstLV+ISHt79+7NWJcuYNHR0Rn309PT1ejlpUuX1ih1RGQTt64AR5Yb1tm0nMgxg24ppD/77DMsWbJEXTV/8MEHMXbsWIsGVyMi6/h91wVExychItgHjzfkCTMRmSct0KTlidzMNSOXsnvq1KnMPiJnsnsBoEsDSjcCStTWOjVElJ+g+6OPPsK7776rpgWTwlpGPr1y5Yrqy01Etpeu0+Pr9SfV+qD7KsHH04PZTkRmnTp1SnX9qFSpkpomzDj4qZDxF6Rvt4cHjyFETkOvB3bdGciYtdxEjht0y2Bq06ZNw4svvqjuy8BqMi2YzPMpTc6JyLZW7L+EU7EJKOLvhd5Ny9lVdnMQNSL7Ur58ebXUyUjGROT8zvwHXDsBeAcCtbtrnRoiysLiaFkGTnv44Ycz7kuNtzRbkynCiMi2pMZq2poTar1/iwoI8MnzcAw2I/24R48ezf7cRHbqhx9+QKtWrdT83GfOnFHbPv/8cyxevDjPr/XVV1+hQoUK8PX1RbNmzVQtuiUWLlyozhm6deuW5/ckojwMoFb7CcAnkFlG5KhBt8zHLYWsKS8vLzWIUkGxECfK2bqjMTh4KR7+3h54tmUFu8ouqUk7fvw4a9SI7JAMcjhy5Eh10VymDpPxWUTRokUxefLkPL3WokWL1GvJeC67du1Ss5p07NhRdTXLyenTp/H666/jvvvuK9BnIaJsJF4DDt65iNbwWWYTkSMH3VLT9uyzz+KJJ57IuMmUI4MHD860La9YiBPlbtpaQy33003LoWiAfc2HLRfe5s+fb5ULcERkXTJYmkz3+b///S9TH+7GjRtj3759eXqtSZMmYdCgQRgwYABq1qyJGTNmqLm/cxrbRYL8Pn364L333lP9y4nIBvb+BKQnAxG1gdINmcVEjhx09+/fXw28EhISknHr27evaq5mui2vWIgT5WzH6WvYduoavDzcMPA+nrQSUd4GVDM3xad0C0lISLD4dWTWkp07d6qu
ZUYynovc37x5c7bPe//999W5w/PPP2/R+yQnJyM+Pj7TjYhyG0Dtu7sDqLlpP5UoEd3L4o6hc+bMgbUZC3HpD5rfQnzDhg25FuByM2IBTo7myzXH1bJ7wzIoEZK5iwcRUU4qVqyI3bt3ZwysZiTzdNeoUcPizIuNjVW11hEREZm2y/3Dhw+bfc7GjRvx7bffqve31Pjx41WtOBFZ6NxW4MpBwNMPqNuD2UZkpzQdjakwCnEW4OTI9p6/gbVHYuDh7oYhbSNhj2RwJJmOSJZEZF+kD/awYcNUdzDpJiYDn/3444+qbJTZR2zl5s2beOaZZ1TT9rCwMIufJxfhJc2mF8rLli1ro1QSOYEdd7p3yIjlfkW1Tg0RZcN+hkC2USHOApwc2RerDLXcj9UvhfLFAmCPZLqwoUOHap0MIjJj4MCB8PPzw9tvv43ExEQ8/fTTqlvYlClT0KtXL4vzTMpc6RN++fLlTNvlfokSJe55/IkTJ9QAal26dMnYZpy+zNPTE0eOHEFkZKTZZu9yIyILJFwFDvxhWG/8HLOMyI5pGnQXRiHOApwc1f4Lcfj30GW4uwHD2lXWOjnZktYqe/bsUSMZmw7URET2QQYyk5sE3bdu3VLds/Jzca1Ro0ZYtWpVxrRfUv7K/eHDh9/z+OrVq98zUJsE/nLxXAJ+1l4TWcGeBYYB1ErU5QBqRHZO06CbhThR9r5cbajl7lKvFCLD7XfOTZlOcOnSpahVqxaDbiI7JiONyy2/pNm3DKoqI583bdpUTTkmg7HJaOaiX79+KF26tGq6LlOM1q5dO9PzixQpopZZtxNRPkil0445d2u52cWLyK5p3rychTjRvQ5Hx+OvA9GqDB1ux7XcRGR/ZLRyS8dYkPm2LdWzZ0/ExMRgzJgxiI6ORv369dWAbMZxWc6ePasGQyWiQnB6PXDtBOAdBNThAGpE9k7zoJuFONG9pt6p5X64TklUiQhiFhGRxYzNv4UMoDZt2jQ1r3aLFi3Uti1btuDAgQP5GotBmpKba04u1q5dm+Nz586dm+f3I6JsbP/WsKzXE/Cx39ZwRGQnQbdgIU5017HLN7F83yW1/tID9l/LLTVqMpYCRy8nsg9jx47NNJDayy+/jHHjxt3zmHPnzmmQOiIqsPhLwOFlhvVGhu4dRGTf2A6MyA7n5dbrgY61IlC9RDDsnYzN0LdvX7UkIvvy888/q77WWclv9tdff9UkTURUQFHzAH06ULYZUIJjJBA5AgbdRHbkZMwtLN1zUa2/9EAVOAIZSE2alcqSiOyLTBf233//3bNdtslgZ0TkYHTpwM47XTU4TRiRw7CL5uVEZPDVmhPQ6YH2NYqjdukQh8gWmTJs3bp1qr+oTN1HRPbjlVdewZAhQ9SAaTLiuNi6dStmz56Nd955R+vkEVFeHfsHiD8P+BUFat4dv4GI7BvPkInsxJmrCfhj9wWHquUmIvs2atQoVKpUSc2NPW/ePLWtRo0amDNnDp566imtk0dEebVjtmFZvw/gxdYqRI6CQTeRnZiy6hjSdXq0qRqOemUN89kSERWUBNcMsImcwI2zwLG/DescQI3IobBPN5EdOH7lJv6IMtRyj3yoKhyJzMsr8wJzfl4i+6CXkRiJyPmovtx6oGIbIMz+ZzchorsYdBPZgc//Oab6cneoGeFwtdxeXl7o2rWrWhKR9mrVqoWFCxciJSUlx8cdO3ZM9feeMGFCoaWNiPIpNQnY+Z1hnQOoETkcNi8n0tiBi3FYtu8S3NyAkR0cq5ZbpKamYsWKFejcuTMDbyI7MHXqVLz55psYOnQoHnroITRu3BilSpVSo5Vfv34dBw8exMaNG3HgwAEMHz5cBd5EZOcO/gEkxgJBpYDqj2idGiLKIwbdRBqb9PdRtexSt5RDzMudlU6nQ1RUFDp27Kh1UogIwIMPPogdO3aowHrRokWYP38+zpw5g9u3byMsLEx1B5G5u/v06YOiRYsyz4gcwdavDcsmzwEebFlG
5GgYdBNpaNfZ61h1+Ao83N3wSnuOWE5E1tO6dWt1IyIHd34HcHEX4OENNHxW69QQUT6wTzeRhib+fUQtuzcsjUrhgfwuiIiIyHwtd+3uQGA4c4fIATHoJtLIphOx+O/4VXh5uDn0vNweHh5o06aNWhIREZEV3bwMHPjdsN70BWYtkYNi83Iijab0mXinL3fvpuVQNtTfYb8HT09PtG3bVutkEBEROec0YbpUoEwToHRDrVNDRPnEmm4iDaw9GoOdZ67Dx9Mdw9o59lybMi3RvHnzcp2eiIiIiPIgLQXYMduw3vRFZh2RA2PQTaRJLbehL3f/lhUQEezr8J/nxIkTaklERERWcmgJcCsaCIwAaj7GbCVyYAy6iQrZ8n3R2H8hHgHeHnjx/krMfyKymQceeADvvffePdtlvm75HxHZsW3fGJaNBgCe3lqnhogKgH26iQpRSpoOn6w8rNYH3lcJxQJ9mP9EZDNr167Fvn37EBUVpebrDggIMByLUlKwbt065jyRvbq4Gzi3FXD3BBoP0Do1RFRArOkmKkQ/bjuLM1cTERbog0FOUsstA6l16dJFLYnI/vz777+Ijo5G8+bNcfr0aa2TQ0R5qeWu2Q0IKsE8I3JwDLqJCsnNpFRMWXVMrb/SvgoCfZwjSJWpwho2bMgpw4jsVMmSJVWtdp06ddCkSRNV+01EdiwhFtj3i2G9GQdQI3IGDLqJCsnX607iWkIKKoUHoGeTsk6T79JMddq0aRy9nMgOubm5qaWPjw8WLFiAESNGoFOnTuo3S0R2atd3QHoyULK+YaowInJ4zlHVRmTnouOSMGvjSbX+Zqfq8PJwnutdMmp5TEwMRy8nskNZZxV4++23UaNGDfTv31+zNBFRLtOEbZt5t5b7zoUzInJsDLqJCsHkf48iKVWHxuWLokPNCOY5ERWKU6dOITw8PNO27t27o3r16tixYwe/BSJ7c+A34OYlILAEUPtJrVNDRFbCoJvIxg5Hx+OnHefU+uiHq2c09yQisrXy5cub3V6rVi11IyI7Ii1TNn1pWG/2AqcJI3IiztPGlchOm3aO+/MgdHqgc+0SaFQ+FM7Gy8sLffr0UUsiIiLKp1Prgcv7AC9/w9zcROQ0WNNNZEP/HLyM/45fhbenO956uIZT5rW7uzsqV66sdTKIiIgc2+avDMv6fQB/57tIT+TKWNNNZCPJaen4cPkhtT6wdUWUDfV3yrxOTk7G+PHj1ZKIiIjyIeYIcGylzDkANB/CLCRyMgy6iWxk7n+nceZqIsKDfDC0nXPXBMu0YURERJRPW+5M41f9EaBYJLORyMkw6CaygZibyZi6+rhaf6NjNQT6sCcHERERmZEQC+xZaFhvMYxZROSEGHQT2cCkf47gVnIa6pYJQfeGZZjHREREZJ7My52WBJRqCJRrwVwickIMuomsbO/5G1i43TBF2JhHa8Ld3bmnCJNRy4cMGcLRy4mIiPIqJQHY9rVhveVLAKcVJXJKDLqJrChdp8c7f+xXU212q18KjSs4/+ijMu94SEgI5x8nIiLKq13fA7evA6GVgJqPMf+InBSDbiIr+nHbWew5H4cgH0+89YhzThFmbhC1CRMmcDA1IiKivEhLATZ9aVhv+TLg7sH8I3JSDLqJrOTqrWR8uvKIWn+tQ1UUD/Jl3hIREZF5+38B4s8DgRFAvd7MJSInxqCbyEomrDiMuNupqFkyGH2bl2e+EhERkXk6HbBxsmG9+VDAixfqiZwZg24iK9hx+hp+3nlerY/rVhueHvxpERERUTaOrgBijwA+IUDj55hNRE6OkQFRAaWm6/D2H/vVes/GZdGofFGXylNvb2+MGjVKLYmIiCgXMtrqhkmG9SbPA77BzDIiJ8egm6iAvl53Aoejb6Kovxfe7Fzd5fJTr9cjLi5OLYmIiCgXpzcCF3YAHj5A8yHMLiIXwKCbqACOX7mFL1YdV+tju9RCaIDr1fampqZi+vTpaklERES5WP+JYdmgLxBY
nNlF5AIYdBPlk06nx+jf9iIlXYe21cLxWP1SzEsiIiLK3pnNwKn1gLsX0PpV5hSRi2DQTZRP87eewfbT1xHg7YEPH68DNzc35iURERFlb93HhmWDPkCRsswpIhfBoJsoHy7euK2mCBNvdKqO0kX8XDofOYgaERFRLs5tA06uAdw9gdYjmV1ELsRT6wQQORoZMOx/v+9DQkq6Gqn8GRefk9vHxwejR4/WOhlERESOUctdrzdQ1LXPHYhcDWu6ifJo4fZzWHMkBt6e7vi4ex24u7t2s3KdTofjx4+rJREREZlxfidw/F/AzQO4j7XcRK6GQTdRHpy5moBxfx5U6290rIbKxYNcPv9k1PL58+dz9HIiF/DVV1+hQoUK8PX1RbNmzbBt27ZsHztz5kzcd999KFq0qLq1b98+x8cTucSI5XV7AqGVtE4NEbli0M1CnBxBuk6P137ag8SUdDSrGIrnWlXUOklERIVm0aJFGDlyJMaOHYtdu3ahXr166NixI65cuWL28WvXrkXv3r2xZs0abN68GWXLlkWHDh1w4cIFfmvkWi7sAo7+Bbi5A/e/rnVqiMgVg24W4uQovll/EjvOXEegjyc+61HP5ZuVE5FrmTRpEgYNGoQBAwagZs2amDFjBvz9/TF79myzj5cWMEOHDkX9+vVRvXp1zJo1S3VDWbVqVaGnnUhTqz8wLOs8BRSL5JdB5II0D7pZiJMjOHQpHpP+OaLWx3SpibKh/lonyW7IVGnh4eGcMo3IiaWkpGDnzp2qibiRu7u7ui+12JZITExU3VBCQ0NtmFIiO3N6I3BilWFe7nYcdJTIVWkadLMQJ0dwOyUdIxZGITVdj4dqRqBHozJaJ8nupguT2ixOG0bkvGJjY5Geno6IiIhM2+V+dHS0Ra/x5ptvolSpUpkC96ySk5MRHx+f6UbksPR6YNU4w3qj/kDRClqniIhcMegujEKcBTgV1HtLD+Do5VsID/LB+CfqsEY3C/kNS/9OWRIRmTNhwgQsXLgQv//+uxqELTvjx49HSEhIxk36gRM5rGP/AOe2AJ5+wP3/p3VqiMiVm5fbuhBnAU4FsXj3BTVFmJsbMKVnfYQF+jBDs0hLS8PSpUvVkoicU1hYGDw8PHD58uVM2+V+iRIlcnzuZ599psrrv//+G3Xr1s3xsaNHj0ZcXFzG7dy5c1ZJP1Ghk2k0V79vWG/2AhCU8++EiJybu7MX4izAKb9OxSbgrd/2qfWXHqiClpXDmJlE5JKk+0ijRo0yDYJmHBStRYsW2T7vk08+wbhx4/DXX3+hcePGub6Pj48PgoODM92IHNLBP4DofYBPMNDqFa1TQ0SuHHQXRiHOApzyIzktHcMX7EJCSjqaVgzFyw9UZkYSkUuT6cJk7u3vvvsOhw4dwpAhQ5CQkKBGMxf9+vVTF7qNPv74Y7zzzjtqdHOZ21u6jcnt1q1bGn4KokKQngas+dCw3vIlwJ+DBxK5Ok97KMT79++vguemTZti8uTJ9xTipUuXVs3EjYX4mDFjsGDBgoxCXAQGBqobkTWM+/MgDlyMR1F/L3zRqwE8PRy6J4bNRy+PjIxkX3ciJ9ezZ0/ExMSoMljKXpkKTC5+G8dlOXv2rBrR3Gj69OlqwNQnn3wy0+vIPN/vvvtuoaefqNDsmgtcPQ74FwOaD2HGE5H2QTcLcbI3P20/h3lbzqp+3JOeqo8SIdkP+kOGFit9+/ZlVhC5gOHDh6ubOWvXrs10//Tp04WUKiI7khQPrDFUFKHtaMAnSOsUEZEd0DzoFizEyV7sPncDb/+xX62/2r4q2lUvrnWS7J4MoLZx40a0bt0anp52cUghIiLSxsbPgcRYoFgVoNGz/BaISGGbWaI7Ym4mY/APO5GSrkOHmhEY3o79uC0hU4WtW7eOU4YREZFru3EO2DLNsP7Q+4CHl9YpIiI7waCbCEBqug7D5u9CdHwSIsMDMPGpenB3d2PeEBERkWVWjwPSkoDyrYFqnZlrRJSB
QTe5PL1ejzGLD2Db6WsI9PHEN/0aI8iXV6eJiIjIQhd2AXsXGdY7fiCjjDLriCgDg25yeV+vP4kftxkGTpvcsz4iwzkKfl7IaMUNGjTINGoxERGRy9Drgb/fNqzX7QWUaqB1iojIznDUI3Jpf+69iAkrDqv1MY/WRPuahqlvyHJeXl7o2rUrs4yIiFzTgd+AM/8Bnr7Ag+9onRoiskOsmiKXtfPMNYz8aY9aH9CqAga0qqh1khxSamoqlixZopZEREQuJfkWsPJOLXfrkUBIGa1TRER2iEE3uaRTsQkY+N0OpKTp8FDNCLz9SE2tk+SwdDodoqKi1JKIiMilrP8EuHkRKFoBaDVC69QQkZ1i0E0u58KN2+g7ayuuJ6aibpkQTOlVHx4cqZyIiIjyIuYosPkrw3qnjwEvX+YfEZnFoJtcbi7uZ2ZtVYF3pfAAzH62Cfy9ObQBERER5XHwtBX/B+jSgKqdgGqdmH1ElC0G3eQy4hJT0W/2NpyMTUDpIn6Y93wzhAX6aJ0sh+fh4YE2bdqoJRERkUs4uBg4uRbw8AE6TdA6NURk51jFRy7hVnIaBszdhkOX4lWgPW9gM5Qq4qd1spyCp6cn2rZtq3UyiIiICkdSPLDyLcN661eAUA7ESkQ5Y003Ob34pFT0+3Yrdp29gRA/L8wb2BQVwwK0TpbTSElJwbx589SSiIjI6a16D4i/YBg8rfWrWqeGiBwAa7rJBZqUb8We83EI9vXED883RfUSwVony6no9XqcOHFCLYmIiJza2S3A9lmG9S5fAF5sNUdEuWPQTU7rWkKKGqX84KV4FPWXGu5mqFUqROtkERERkSNKTQKWvGRYb9AXqNRG6xQRkYNg0E1O6XJ8Evp9uw1HLt9EWKA35g9sjmolgrROFhERETmqDROB2KNAQHGgwwdap4aIHAiDbnI6xy7fxLNztqtpwYoH+WDBoOaoXDxQ62Q59UBqXbp0UUsiIiKndPkAsHGSYf3hTwG/olqniIgcCM+SyalsP30Nz8/djvikNFQKC8B3zzVF2VB/rZPl1GSqsIYNG2qdDCIiIttITwUWDzPMyV3tEaDmY8xpIsoTjl5OTuOv/ZfQZ9ZWFXA3KFcEvwxpyYC7EMio5dOmTePo5URE5JzWfwpcjAJ8Q4BHPgPc3LROERE5GNZ0k8PT6fT4as1xTPr3KGQA7fY1IjC1dwP4eXtonTSXIKOWx8TEcPRyIiJyPud3AOs/M6w/MgkILqV1iojIATHoJoeWkJyG13/egxX7o9X9/i3K451Ha8LTg404iIiIqABSEoDfXgD06UDtJ4E6TzI7iShfGHSTwzp7NREv/LADh6NvwsvDDeMeq41eTctpnSwiIiJyBn+/A1w7AQSVMjQrJyLKJwbd5LD9t9/4Za/qvx0e5IMZfRuiUflQrZPlkry8vNCnTx+1JCIicgrH/gF2fGtY7zaNo5UTUYEw6CaHkpSajg+XHcIPW86o+/XLFsGMvo1QIsRX66S5LHd3d1SuXFnrZBAREVlH/CXg98GG9WZDgMh2zFkiKhB2fCWHcfzKLXT76r+MgPvFNpXw8+AWDLg1lpycjPHjx6slERGRQ0tPA355DkiMBSLqAO3Hap0iInICrOkmu5eu0+PbjScx8e+jSE7TISzQGxOfqo82VcO1ThqZTBtGRETk8NZ8AJzdBHgHAU99B3j5aZ0iInICDLrJ7mu3/++XPYg6e0Pdv79qOD7rURfFg9icnIiIiKzo6N/Axs8N612/AIpFMnuJyCoYdJNdSknTYeaGk5iy6phaD/LxxNuP1sBTjcvCzc1N6+QRERGRM4k7D/z+gmG9yUCg9hNap4iInAiDbrI7G47FYOySAzgZk5BRuz3hiTooVYRNvOyRjFo+ZMgQjl5ORESOKfU2sKgvcPs6ULIe0PEjrVNERE6GQTfZjQs3buOjZYewbN8ldT8s0AejO1fHEw1Ls3bbjknLg5CQEH5HRETkePR6YMlLwMUowC8U
6PEd4OmjdaqIyMkw6CbN3UhMwbS1JzB302nVlNzdDejfsgJefagqgn0597MjDKI2YcIEjBo1Cj4+PFEhIiIHIn249/0MuHsCT30PhFbUOkVE5IQYdJNmbqek4/vNp/HVmuOIT0pT25pXCsWYR2uhZqlgfjNERERkO0dWAKveN6x3/hioeB9zm4hsgkE3FbqE5DTM23JGDZQWe8sw1VT1EkF4s3N1tK0azmbKREREZFuXDwK/DpL25UDj5wyDpxER2QiDbio0cYmp+GHLaXy78RSuJ6aqbWWK+uGV9lXxeIPS8JB25UREREQ2PSE5D8x/Eki5CVS4D+j8CfObiGyKQTfZ3ImYW5j732n8svM8bqemq20VivljWLvK6NagNLw83PktODBvb2/Vn1uWREREdk1GKJ/XHYi/AIRVM/Tj9uD4MURkWwy6ySbS0nVYcyQG87eewdojMRnbpRn54DaReLRuSXgy2HYKer0ecXFxCAsLY9cAIiKy76nBfnwaiDkMBJUE+v4K+IdqnSoicgEMusmqzl5NxKIdZ/HzjvO4cjNZbXNzAx6sHoHnW1dUA6XJFFPkPFJTUzF9+nSOXk5ERPZLlw78OhA4uwnwCTEE3EXKap0qInIRDLqpwC7HJ2HZ3ktYuvcios7eyNheLMAb3RuVwdNNy6FCWABzmoiIiLQJuBcPAw7/CXj4AL0XABG1+E0QUaFh0E35cvVWMlbsj8bSPRex7fQ16PWG7VKJ3bpyGHo1KYeHakbA25P9tYmIiEgjOh2w5GVgz4+Amwfw5LdAhdb8OoioUDHoJovodHocuBiPNUeuYPXhK9hz/kZGoC0alS+KLnVL4uE6JVE82Je56mI4iBoREdllwP3nCGD3PMDNHeg+C6jRRetUEZELYtBN2boSn4Qtp65hw9EYrD0ag5g7fbSNapcORpe6pfBI3ZIoU9SfOemifHx8MHr0aK2TQURElDngXv46sOt7Q8D9xEyg9hPMISLSBINuynDhxm1sPXkVW09ew9ZTV3H6amKm3PH39lBNx9tVL4521YqjRAhrtEnOa3Q4efIkKlWqBHd3dicgIiKNpacBS14C9iyQjm9AtxlAnSe1ThURuTAG3S7qRmIK9p6Pw97zN7DnzvJyfOaabOmfXbNkMJpXKqaC7CYVi8LH00OzNJP9jl4+f/58jl5ORET2MS3YL88BR5Yb+nB3mwbU66l1qojIxTHodnIpaTqcik3A0cs3M26Ho2/iTJZabOHh7obapUPQrGKoujWuEIoQPy9N0k1ERESUJ0lxhnm4z2wEPH2BHnOBap2ZiUSkOQbdTkCv1yPmVjLOXUtUwbTcjl+5hSOXb+J0bALSdCYjnpkoX8wfdcsUQb0yIWpZq1QwAny4SxARkXlfffUVPv30U0RHR6NevXqYOnUqmjZtmm12/fzzz3jnnXdw+vRpVKlSBR9//DEefvhhZi9ZX/wlYMFTQPRewCcY6L0QqNCKOU1EdoERlgNITdepQcyi45NwOS4Jl+KScP76bZy9lqgCbVneTk3P9vlBPp6oEhGIqhFBqBIRhGoRQWoQtCL+3oX6Ocg5ubm5ITw8XC2JyHktWrQII0eOxIwZM9CsWTNMnjwZHTt2xJEjR1C8ePF7Hr9p0yb07t0b48ePx6OPPooFCxagW7du2LVrF2rXrq3JZyAndTEK+LE3cPMSEBAO9P0VKFlP61QREWVw00s1qQtdOY+Pj0dISAji4uIQHBwMrSSnpeN6QiquJiTjWkJKptvVhBQVZF+ONwTYsbeSM03PZY67G1AyxE/VXpcL9UdkeCCqlghC1YhAlAj2ZUBERORA7KWsMiWBdpMmTfDll19mDKJYtmxZvPTSS2pMh6x69uyJhIQE/Pnnnxnbmjdvjvr166vA3VHzgezMgT+A3wcDabeB8OqGGu7QilqniohcRLyF5ZTmNd2OfOX8VnKaCoxvJqXhZlJqpmV8lm3xt9NwM9mwlMBanpsXnu5uiAj2RUSw
jxo1vHQRP5QrFqACbLnJfW9PjhxNhS89PR179uxRF8w8PDjQHpEzSklJwc6dOzNNDyizFbRv3x6bN282+xzZLuW7KSnf//jjD2ji6glAl32rMHIkeuDGOeD4P8DWOxdwKrcHnpwN+IZonTgiIvsLuidNmoRBgwZhwIAB6r4E38uWLcPs2bPNXjmfMmUKOnXqhP/7v/9T98eNG4d//vlHXXm39Mq5tfy26zzGLD6Q7+fLwGVF/b1RLMAboXIL9Eaov2E9LMhH1VCrW4iveoy7VGcT2Zm0tDQsXboUtWrVYtBN5KRiY2PVBbaIiIhM2+X+4cOHzT5HWq+Ze7xsz05ycrK6mdYgZN0uwb6Xl5eaOUFq243kop+np6e6QGDaiE+2yf9S5nSD/tbd9/ZCKtyhRzIyd7WS7W7QIyXLdm+kQA83pCLzAKM+SIEuy3Z5vjdSkQ53pJmcarlBB2+kIQ0eSMfdi5Tu0MELaUiFJ3S4ewFdHuWJdKTAE3qT7Z7qFXRIgZdKk8t/psYDgQfeBtw84a3Xq+9f9o9M35OPj9pfTLdLtyhvb2+1b0tZlnW7bJP/ZXxP+d33smyX15DXMt3XjdvlveXxmfY9b29+Jn5P3Pfs9PeU9X3tMugujCvn2RXg1iAjewf5eiLY17A03MyteyH4zjZ5rATVxQJ81H0G0kRERAbSiu29994ze4He19dXrTdo0ABdu3bFihUrEBUVlfGYNm3aoG3btvjpp59w4sSJjO1dunRBw4YNMSupI2LcgjK29/FaicruFzApuW+mAHuI128IcUvAhJRnMqVhlPcPiNMHYHrqE5kC8dE+83BSVxrzUztmbA93u46h3r9jT3pVLE1rnbE90u08+nr/jY1pDbAuvUHG9gbuR9DV6z+sSG2FKF21u5/JIwptPaPwU0oHnNCXufuZPDeiocdRzEp5HDH6oq75mYJXI6R0NUw4VBrYCWDnZ4bPNGqUauY5ffr0u5/J21uda548eVJNcZnxmcLDMXToUNVaSy4eZ3ymyEj07dsXGzduxLp16+5+pvzue7NmISYm5u5n6tMHlStXVvu1aUAwZMgQ1Ux1woQJmb8nfiZ+T9z3YK+/p5UrV8Lu+3RfvHgRpUuXVk3GW7RokbH9jTfeUAe5rVu33vMcyZTvvvtONTE3mjZtmiqkL1++fM/j3333XbMFOPuHEVmHXNSSA5ocxOQqIREVnL31ZZYTGX9/f/zyyy+qS5dR//79cePGDSxevPie55QrV05dJH/llVcyto0dO1ZdJJcTGEsvlEu/8StXrmTkA2sbWYMqWCvM2nu2SGArC3uo6b5+/brqEm33fbptTa5amNaMGwtwIrIOOfDIFT+OXk7kvOTkolGjRli1alVG0C0nJ3J/+PDhZp8jF9Pl/6ZBt3QHM73InpWc9Ji7eGduu5xQZZfWvGzP7mKhue1ynDO3XU74zG2XppDmxrqQZpJyy4qfid8T9z3+nniMcKxjuaUVTpoG3WFhYeoDZK2hlvslSpQw+xzZnpfHZ1eAE5F1yMFPmtgQkXOTC9hSs924cWM1w4gMfCqjkxvHZOnXr59qvSZNxMWIESNUs9uJEyfikUcewcKFC7Fjxw588803Gn8SIiKiwuVuL1fOjYxXzrO7Em68cm4qtyvnRGQ70rRm7dq1mZrdEJHzkSnAPvvsM4wZM0ZN+7V792789ddfGYOlnT17FpcuXcp4fMuWLdUMIxJky+wG0jRdmpZzjm4iInI1ms/TLVOGyZXzr7/+OuPKuQxEIaOhSkGe9cq59P+WK+fSh9R45fyjjz6yeMowe+snR+To2KebyPpYVjEfiIjI/jnMPN1y5VxGoJMr5zKNiFw9z3rlXNrXZ71y/vbbb+Ott95ClSpVeOWciIiIiIiI7JLmQbeQQViyG4hFmq1m1aNHD3UjIiIiIiIismea9ukmIscnLVFk/kjTFilERERERGRHNd1E5LhkipuuXbtqnQwiIiIiIrvEqikiKpDU
1FQsWbJELYmIiIiIKDMG3URUIDLNX1RUlFoSEREREVFmDLqJiIiIiIiIbMTl+nQbpyWXOdWIyDrzdCclJanflI+PD7OUyAqMZZSxzHJVLLOJiMgZyms3vYuV6OfPn0fZsmW1TgYREVGuzp07hzJlyrhsTrHMJiIiZyivXS7oln6nFy9eRFBQENzc3KxydUOCeMno4OBguBpX//zC1fPA1T+/YB4wD6y9D0jRfPPmTZQqVcqlp+OzZpnN3ynzgPsA9wHuA9wHrL0fWFpeu1zzcskMW9QayBfmqgGHcPXPL1w9D1z98wvmAfPAmvtASEgIXJ0tymz+TpkH3Ae4D3Af4D5gzf3AkvLadS+fExEREREREdkYg24iIiIiIiIiG2HQXUAyWvPYsWNddtRmV//8wtXzwNU/v2AeMA+4D9g/fkfMA+4D3Ae4D3Af0Go/cLmB1IiIiIiIiIgKC2u6iYiIiIiIiGyEQTcRERERERGRjTDoJiIiIiIiIrIRBt1WtmzZMjRr1gx+fn4oWrQounXrBleUnJyM+vXrw83NDbt374YrOH36NJ5//nlUrFhRff+RkZFqkIaUlBQ4s6+++goVKlSAr6+v2ve3bdsGVzF+/Hg0adIEQUFBKF68uPq9HzlyBK5qwoQJ6jf/yiuvwJVcuHABffv2RbFixdRvv06dOtixY4fWyaJcsLx23fLaVctsltcsr41YXhcr9PKaQbcV/frrr3jmmWcwYMAA7NmzB//99x+efvppuKI33ngDpUqVgis5fPgwdDodvv76axw4cACff/45ZsyYgbfeegvOatGiRRg5cqQ6Udm1axfq1auHjh074sqVK3AF69atw7Bhw7Blyxb8888/SE1NRYcOHZCQkABXs337drXv161bF67k+vXraNWqFby8vLBixQocPHgQEydOVBddyX6xvHbt8toVy2yW1yyvjVhee2lTXsvo5VRwqamp+tKlS+tnzZrl8tm5fPlyffXq1fUHDhyQkfH1UVFRLpsnn3zyib5ixYp6Z9W0aVP9sGHDMu6np6frS5UqpR8/frzeFV25ckXt8+vWrdO7kps3b+qrVKmi/+eff/Rt2rTRjxgxQu8q3nzzTX3r1q21TgblAcvru1heu06ZzfI6M5bXLK8LG2u6rURq+aSJobu7Oxo0aICSJUuic+fO2L9/P1zJ5cuXMWjQIPzwww/w9/eHq4uLi0NoaCickTTB27lzJ9q3b5+xTfZ/ub9582a46vctnPU7z47U9j/yyCOZ9gVXsWTJEjRu3Bg9evRQXQzk+D9z5kytk0U5YHltwPLadcpsltf3YnnN8rpBIZfXDLqt5OTJk2r57rvv4u2338aff/6pmiu0bdsW165dgyuQKd+fffZZDB48WJ2Eurrjx49j6tSpePHFF+GMYmNjkZ6ejoiIiEzb5X50dDRcjTRTlL7M0tS4du3acBULFy5UQYz0b3fVY//06dNRpUoVrFy5EkOGDMHLL7+M7777TuukUTZYXrO8drUym+V1ZiyvWV6v1KC8ZtCdi1GjRqnBRXK6GfsFif/973/o3r07GjVqhDlz5qj///zzz3CFPJDC6ubNmxg9ejSciaWf35S0eujUqZOq/ZKaf3KN2l5p2SJBqKs4d+4cRowYgfnz56uB9FyRHPsbNmyIjz76SF01f+GFF9RvXvqGUuFiec3yOi/7gSmW2a6F5TXL6wYalNeehfIuDuy1115Ttbc5qVSpEi5duqTWa9asmbHdx8dH/e/s2bNwhTxYvXq1alYsn9uU1Hr36dPHYWt+LP38RhcvXkS7du3QsmVLfPPNN3BWYWFh8PDwUE0UTcn9EiVKwJUMHz5ctW5Zv349ypQpA1ch3Qtk0DwJOo2k9YPkw5dffqlGRZZ9xJlJVyLT476oUaOGGqiLChfLa5bXedkPXKnMZnl9F8trltdaldcMunMRHh6ubrmRmm0JNmW6oNatW6ttMpKxTElRvnx5uEIefPHFF/jggw8yFWQy
krWMmClTSTn75zdeLZfC29jSQfo4Oytvb2/1OVetWpUxNZ7U+sl9KdRcpUvFSy+9hN9//x1r165VU8+4kgcffBD79u3LtE1mb6hevTrefPNNpw+4hXQnyDpN3NGjRx3+uO+IWF6zvM7LfuBKZTbLa5bXLK+heXnNoNtKgoODVV9mmTqpbNmy6gv89NNP1f+kibErKFeuXKb7gYGBailzX7pC7Z8U3tKHX777zz77DDExMRn/c9aaX5kurH///qo1Q9OmTTF58mQ1XZYEXq7SRG3BggVYvHixmqvb2Jc9JCREzf/o7OQzZ+2/HhAQoOardpV+7a+++qqqIZPm5U899ZSap15qy5y1xswZsLxmee2KZTbLa5bXLK9f1ba8LvTx0p1YSkqK/rXXXtMXL15cHxQUpG/fvr1+//79eld16tQpl5oybM6cOerzmrs5s6lTp+rLlSun9/b2VlOSbNmyRe8qsvu+ZV9wVa42ZZhYunSpvnbt2nofHx81XeI333yjdZIoFyyvXbu8dtUym+U1y2tTLK+rF2p57SZ/Cie8JyIiIiIiInItztl5hYiIiIiIiMgOMOgmIiIiIiIishEG3UREREREREQ2wqCbiIiIiIiIyEYYdBMRERERERHZCINuIiIiIiIiIhth0E1ERERERERkIwy6iYiIiIiIiGyEQTcR5Zubmxv++OMPq+Zg27Zt8corr1j1NYmIiFwdy2wi7TDoJnIQ0dHRGDFiBCpXrgxfX19ERESgVatWmD59OhITEy1+nblz56JIkSKwV7/99hvGjRuXcb9ChQqYPHmypmkiIiLKC5bZRGTKM9M9IrJLJ0+eVAG2BMsfffQR6tSpAx8fH+zbtw/ffPMNSpcuja5du8IZhIaGap0EIiKifGOZTUT30BOR3evYsaO+TJky+lu3bpn9v06ny1ifOHGivnbt2np/f3/1nCFDhuhv3ryp/rdmzRq9/OxNb2PHjlX/S0pK0r/22mv6UqVKqec2bdpUPT4n8vzff/894/7evXv17dq10/v6+upDQ0P1gwYNynhvkZqaqn/ppZf0ISEh6v9vvPGGvl+/fvrHHnss4zFt2rTRjxgxImM9a3qzc+TIEfXePj4++qpVq+r/+usv/fz58/WtW7e2IIeJiIisg2U2y2yirNi8nMjOXb16FX///TeGDRuGgICAbPtpGbm7u+OLL77AgQMH8N1332H16tV444031P9atmypmmoHBwfj0qVL6vb666+r/w0fPhybN2/GwoULsXfvXvTo0QOdOnXCsWPHLEpnQkICOnbsiKJFi2L79u34+eef8e+//6rXNfr4448xf/58zJkzB//99x/i4+Nz7BMuTc3LlCmD999/PyO95kjz+vbt26v33r17N/r3748+ffpgwYIFePzxxy1KPxERUUGxzGaZTWTWPWE4EdmVLVu2qBre3377LdP2YsWK6QMCAtRNaoyz8/PPP6vHGs2ZM0fVNJs6c+aM3sPDQ3/hwoVM2x988EH96NGjLarp/uabb/RFixbNVBu/bNkyvbu7uz46Olrdj4iI0H/66acZ/09LS9OXK1cu25puUb58ef3nn3+uz8kvv/yi3uf8+fMZNf8lS5ZU6Ttx4kSOzyUiIrIWltkss4nMYZ9uIge1bds26HQ6VaObnJycsV1ql8ePH4/Dhw+rmuS0tDQkJSWp2mB/f3+zryV9w9PT01G1atVM2+V1ixUrZlF6Dh06hHr16mWqjZd+6JLGI0eOqMHfLl++jKZNm2b838PDA40aNVKPKQipjS9btqzq226s+W/SpAlOnz6NSpUqFei1iYiICopl9l0ss8kVMegmsnMyWrkEkRK4mjIGk35+fhnbJMh89NFHMWTIEHz44YdqULKNGzfi+eefR0pKSrZB961bt1QAvHPnTrU0FRgYCHsng8p5e3tn2hYeHp4RhBMRERUGltm5Y5lNroh9uonsnNQ0P/TQQ/jyyy9Vv+mcSNAstcYTJ05E8+bNVc31
xYsXMz1GglOp1TbVoEEDte3KlSvqhMH0VqJECYvSWaNGDezZsydTGqXftvQxr1atGkJCQtQ0Z9Lf20jec9euXTm+rrn0ZhUZGYkLFy7g9u3bGa+7fPlydRGCiIiosLDMZplNZA6DbiIHMG3aNNVMvHHjxli0aJFqyi013/PmzVPNyI210xIkp6amYurUqWrKkh9++AEzZszI9Foy77XUbK9atQqxsbGq2bkE59JMvV+/fmrwslOnTqmmcNJMfdmyZRalUZ4vTchlELP9+/djzZo1eOmll/DMM8+oYFvIfXnNxYsXq/TLvOPXr1/PNBBcVpLe9evXq6Ba0muODPgmg6jJa0tXc5lGTfJEgn55fSIiosLCMptlNtE9zPb0JiK7c/HiRf3w4cP1FStW1Ht5eekDAwPVtF4yMFlCQkLG4yZNmqQGEfPz81PTlnz//fdqQLHr169nPGbw4MFqcDXTKcNSUlL0Y8aM0VeoUEG9vrzG448/rqYBs+aUYfIZgoOD1aBrb775pr5Hjx76Xr16ZTuQ2ubNm/V169ZVU4HldMjaunWrvlGjRup9ZfC1Xbt2qenJwsPD9QcOHMhzfhMREeUXy2yW2USm3OTPvaE4EZHtSVN4aZb+1FNPYdy4ccxyIiIiO8Uymyj/OJAaERWaM2fOqDnH27Rpo0ZGl37q0pT96aef5rdARERkR1hmE1kP+3QTUaGRQdXmzp2rpvOS6cRkqjKZ4kxqu4mIiMh+sMwmsh42LyciIiIiIiKyEdZ0ExEREREREdkIg24iIiIiIiIiG2HQTURERERERGQjDLqJiIiIiIiIbIRBNxEREREREZGNMOgmIiIiIiIishEG3UREREREREQ2wqCbiIiIiIiIyEYYdBMRERERERHBNv4fYV4z6rYKq0YAAAAASUVORK5CYII=",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "alpha=-3: P(active)=0.1033, gate=0.0000\n",
+ "alpha= 0: P(active)=0.6983, gate=0.5000\n",
+ "alpha=+3: P(active)=0.9789, gate=1.0000\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "import math\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# Replicate the HardConcrete math from utils/l0.py\n",
+ "# (same constants used in unified_calibration.py)\n",
+ "BETA = 0.35\n",
+ "GAMMA = -0.1\n",
+ "ZETA = 1.1\n",
+ "\n",
+ "\n",
+ "def hc_prob_active(alpha, beta=BETA, gamma=GAMMA, zeta=ZETA):\n",
+ " \"\"\"P(z > 0) for a Hard Concrete gate with logit alpha.\"\"\"\n",
+ " shifted = alpha - beta * math.log(-gamma / zeta)\n",
+ " return torch.sigmoid(torch.tensor(shifted)).item()\n",
+ "\n",
+ "\n",
+ "def hc_deterministic_gate(alpha, beta=BETA, gamma=GAMMA, zeta=ZETA):\n",
+ " \"\"\"Deterministic gate value (used at inference).\"\"\"\n",
+ " prob = torch.sigmoid(torch.tensor(alpha)).item()\n",
+ " s = prob * (zeta - gamma) + gamma\n",
+ " return max(0.0, min(1.0, s))\n",
+ "\n",
+ "\n",
+ "alphas = np.linspace(-6, 6, 200)\n",
+ "probs = [hc_prob_active(a) for a in alphas]\n",
+ "gates = [hc_deterministic_gate(a) for a in alphas]\n",
+ "\n",
+ "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))\n",
+ "\n",
+ "ax1.plot(alphas, probs)\n",
+ "ax1.axhline(0.5, color=\"gray\", linestyle=\"--\", linewidth=0.8)\n",
+ "ax1.axvline(0, color=\"gray\", linestyle=\"--\", linewidth=0.8)\n",
+ "ax1.set_xlabel(\"Gate logit α\")\n",
+ "ax1.set_ylabel(\"P(z > 0)\")\n",
+ "ax1.set_title(\"L0 penalty contribution per record\")\n",
+ "\n",
+ "ax2.plot(alphas, gates, color=\"tab:orange\")\n",
+ "ax2.axhline(0.0, color=\"gray\", linestyle=\"--\", linewidth=0.8)\n",
+ "ax2.axhline(1.0, color=\"gray\", linestyle=\"--\", linewidth=0.8)\n",
+ "ax2.set_xlabel(\"Gate logit α\")\n",
+ "ax2.set_ylabel(\"z (deterministic)\")\n",
+ "ax2.set_title(\"Deterministic gate value at inference\")\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n",
+ "\n",
+ "# The stretch gamma/zeta means z saturates to exactly 0 or 1 at moderate |alpha|\n",
+ "print(\n",
+ " f\"alpha=-3: P(active)={hc_prob_active(-3):.4f}, gate={hc_deterministic_gate(-3):.4f}\"\n",
+ ")\n",
+ "print(\n",
+ " f\"alpha= 0: P(active)={hc_prob_active(0):.4f}, gate={hc_deterministic_gate(0):.4f}\"\n",
+ ")\n",
+ "print(\n",
+ " f\"alpha=+3: P(active)={hc_prob_active(+3):.4f}, gate={hc_deterministic_gate(+3):.4f}\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7cdc8c89c7104fffa095e18ddfef8986",
+ "metadata": {},
+ "source": [
+ "The right panel shows the key property: once $\\alpha \\lesssim -2$, the stretched-and-clipped gate is **exactly 0** at inference, so those records contribute zero to all estimates and can be dropped from the H5 file. Conversely, once $\\alpha \\gtrsim +2$, the gate is exactly 1 and the full weight $w_i$ is applied.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b118ea5561624da68c537baed56e602f",
+ "metadata": {},
+ "source": [
+ "### 1.7 Toy example: sparsity effect of λ\n",
+ "\n",
+ "This cell constructs a small calibration problem (8 targets, 200 records) and runs gradient descent manually to show how $\\lambda_{L0}$ drives sparsity. With 200 records for only 8 targets, most records are redundant — the optimizer can satisfy the targets with a subset. Higher $\\lambda_{L0}$ increases the pressure to prune.\n",
+ "\n",
+ "It does **not** use the full `SparseCalibrationWeights` model — it implements the loss directly so the mechanics are visible."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "938c804e27f84196a10c8828c723f798",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total records: 200\n",
+ "Total targets: 8\n",
+ "\n",
+ "λ=1e-01: 3/200 records retained at inference\n",
+ "λ=1e-02: 21/200 records retained at inference\n",
+ "λ=1e-04: 172/200 records retained at inference\n",
+ "λ=1e-08: 195/200 records retained at inference\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "import numpy as np\n",
+ "\n",
+ "torch.manual_seed(0)\n",
+ "\n",
+ "# --- Problem setup ---\n",
+ "# 200 records for 8 targets give enough redundancy for pruning to work.\n",
+ "# With only as many records as targets (e.g. 8 records for 8 targets),\n",
+ "# every record is needed and no lambda value can prune anything.\n",
+ "N_TARGETS = 8\n",
+ "N_RECORDS = 200\n",
+ "\n",
+ "# Calibration matrix: each row is a target, each column is a record\n",
+ "X = torch.rand(N_TARGETS, N_RECORDS) * 10 # shape: (targets, records)\n",
+ "\n",
+ "# True population targets (what we want the weighted sum to equal)\n",
+ "true_weights = torch.rand(N_RECORDS) + 0.5\n",
+ "y = X @ true_weights # shape: (targets,)\n",
+ "\n",
+ "BETA = 0.35\n",
+ "GAMMA = -0.1\n",
+ "ZETA = 1.1\n",
+ "\n",
+ "\n",
+ "def run_sparse_calibration(lambda_l0, epochs=800, lr=0.15):\n",
+ " # Learnable parameters: log-weight and gate logit per record\n",
+ " log_w = nn.Parameter(torch.zeros(N_RECORDS))\n",
+ " alpha = nn.Parameter(torch.full((N_RECORDS,), 3.0)) # start: P(active) ~ high\n",
+ " opt = torch.optim.Adam([log_w, alpha], lr=lr)\n",
+ "\n",
+ " history = []\n",
+ "\n",
+ " for epoch in range(epochs):\n",
+ " opt.zero_grad()\n",
+ "\n",
+ " # --- Hard Concrete gate (training: stochastic) ---\n",
+ " u = torch.zeros_like(alpha).uniform_(1e-8, 1 - 1e-8)\n",
+ " s = torch.log(u) - torch.log(1 - u) + alpha\n",
+ " s = torch.sigmoid(s / BETA)\n",
+ " s = s * (ZETA - GAMMA) + GAMMA\n",
+ " z = torch.clamp(s, 0.0, 1.0)\n",
+ "\n",
+ " w = torch.exp(log_w) * z\n",
+ "\n",
+ " # --- Calibration loss: sum of squared relative errors ---\n",
+ " y_hat = X @ w\n",
+ " rel = (y_hat - y) / (y.abs() + 1e-8)\n",
+ " cal_loss = (rel**2).sum()\n",
+ "\n",
+ " # --- L0 penalty: expected number of active gates ---\n",
+ " shift = alpha - BETA * np.log(-GAMMA / ZETA)\n",
+ " p_active = torch.sigmoid(shift)\n",
+ " l0_loss = p_active.sum()\n",
+ "\n",
+ " loss = cal_loss + lambda_l0 * l0_loss\n",
+ " loss.backward()\n",
+ " opt.step()\n",
+ "\n",
+ " if epoch % 100 == 0:\n",
+ " with torch.no_grad():\n",
+ " det_z = torch.clamp(torch.sigmoid(alpha) * (ZETA - GAMMA) + GAMMA, 0, 1)\n",
+ " active = (det_z > 0).sum().item()\n",
+ " history.append(\n",
+ " {\n",
+ " \"epoch\": epoch,\n",
+ " \"cal_loss\": cal_loss.item(),\n",
+ " \"active\": active,\n",
+ " \"lambda_l0\": lambda_l0,\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " with torch.no_grad():\n",
+ " det_z = torch.clamp(torch.sigmoid(alpha) * (ZETA - GAMMA) + GAMMA, 0, 1)\n",
+ " final_active = (det_z > 0).sum().item()\n",
+ " return history, final_active\n",
+ "\n",
+ "\n",
+ "print(f\"Total records: {N_RECORDS}\")\n",
+ "print(f\"Total targets: {N_TARGETS}\")\n",
+ "print()\n",
+ "\n",
+ "for lam in [1e-1, 1e-2, 1e-4, 1e-8]:\n",
+ " hist, n_active = run_sparse_calibration(lam)\n",
+ " print(f\"λ={lam:.0e}: {n_active}/{N_RECORDS} records retained at inference\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "504fb2a444614c0babb325280ed9130a",
+ "metadata": {},
+ "source": [
+ "Higher $\\lambda_{L0}$ prunes more aggressively — with $\\lambda = 10^{-1}$ the optimizer retains only a handful of records, while $\\lambda = 10^{-8}$ preserves nearly all of them. The 8 targets can be satisfied by a small subset of the 200 records; the L0 penalty determines how much redundancy is tolerated.\n",
+ "\n",
+ "In production with 430 clones × ~60 K CPS households = ~26 M total records:\n",
+ "- `national` ($\\lambda = 10^{-4}$) → ~50 K nonzero records\n",
+ "- `local` ($\\lambda = 10^{-8}$) → ~3–4 M nonzero records"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "59bbdb311c014d738909a11f9e486628",
+ "metadata": {},
+ "source": [
+ "### 1.8 `fit_l0_weights` — function signature and epoch logging\n",
+ "\n",
+ "```python\n",
+ "# From unified_calibration.py\n",
+ "def fit_l0_weights(\n",
+ " X_sparse, # scipy sparse (targets x records)\n",
+ " targets: np.ndarray, # 1-D array of target values\n",
+ " lambda_l0: float,\n",
+ " epochs: int = 100,\n",
+ " device: str = \"cpu\",\n",
+ " verbose_freq: int = None, # default: epochs // 10\n",
+ " beta: float = BETA, # 0.35\n",
+ " lambda_l2: float = LAMBDA_L2, # 1e-12\n",
+ " learning_rate: float = LEARNING_RATE, # 0.15\n",
+ " log_freq: int = None, # epochs between CSV log entries\n",
+ " log_path: str = None, # path for calibration_log.csv\n",
+ " target_names: list = None,\n",
+ " initial_weights: np.ndarray = None,\n",
+ " targets_df: pd.DataFrame = None,\n",
+ " achievable: np.ndarray = None,\n",
+ ") -> np.ndarray: # weight vector, shape (n_records,)\n",
+ "```\n",
+ "\n",
+ "When `log_freq` and `log_path` are both provided, the function writes per-target error rows to `calibration_log.csv` every `log_freq` epochs. The CSV header is:\n",
+ "\n",
+ "```\n",
+ "target_name,estimate,target,epoch,error,rel_error,abs_error,rel_abs_error,loss,achievable\n",
+ "```\n",
+ "\n",
+ "At each logging checkpoint the function also prints a weight distribution summary:\n",
+ "```\n",
+ "Epoch 50: mean_error=0.8312%, max_error=4.2%, total_loss=0.031,\n",
+ " active=2150000/26000000 (91.7% sparse)\n",
+ " Weight dist: [<0.01: 0.2%, 0.01-0.1: 1.4%, 0.1-1: 12.3%, ...]\n",
+ "```\n",
+ "\n",
+ "The `achievable` flag (one bool per target) is set to `True` when the target row sum in $X$ is nonzero — meaning at least one record can contribute to that target. Targets with `achievable=False` are not learnable given the current clone geography and should be investigated.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b43b363d81ae4b689946ece5c682cd59",
+ "metadata": {},
+ "source": [
+ "### 1.9 Convergence criteria\n",
+ "\n",
+ "There is no automatic early-stopping criterion in the current implementation — `fit_l0_weights` runs for exactly `epochs` iterations. Convergence is assessed post-hoc from the diagnostic outputs:\n",
+ "\n",
+ "- **Mean absolute relative error < 1%** across all achievable targets is a reasonable threshold for the `national` preset.\n",
+ "- **Max absolute relative error < 5%** per-target is a reasonable upper bound for the `local` preset.\n",
+ "- Residual error concentrated on a small number of targets (identifiable from `unified_diagnostics.csv`) indicates the matrix is under-determined for those specific targets, not a convergence failure.\n",
+ "\n",
+ "If mean error has not flattened by epoch 50–60, increasing `--learning-rate` or `--epochs` may help. If error plateaus above 2%, examine whether the problematic targets are marked `achievable=False`.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8a65eabff63a45729fe45fb5ade58bdc",
+ "metadata": {},
+ "source": [
+ "### 1.10 Summary of the optimization loop\n",
+ "\n",
+ "```\n",
+ "Initialize SparseCalibrationWeights:\n",
+ " - log_w_i ~ N(log(initial_weights), 0.05^2) [weight jitter]\n",
+ " - alpha_i ~ N(logit(0.999), 0.01^2) [gate logit jitter]\n",
+ "\n",
+ "For each epoch:\n",
+ " 1. Sample Hard Concrete gates z_i (stochastic during training)\n",
+ " 2. Compute effective weights: w_i^eff = exp(log_w_i) * z_i\n",
+ " 3. Compute predictions: y_hat_j = sum_i X_ji * w_i^eff\n",
+ " 4. Calibration loss: L_cal = sum_j ((y_hat_j - y_j) / y_j)^2\n",
+ " 5. L0 penalty: L_l0 = sum_i P(z_i > 0) = sum_i sigmoid(alpha_i - beta*log(-gamma/zeta))\n",
+ " 6. Total loss: L = L_cal + lambda_l0 * L_l0 + lambda_l2 * ||w||^2\n",
+ " 7. Adam step on {log_w_i, alpha_i}\n",
+ "\n",
+ "At inference:\n",
+ " - Replace stochastic z_i with deterministic: clip(sigmoid(alpha_i)*(zeta-gamma)+gamma, 0, 1)\n",
+ " - Records where deterministic z_i = 0 are dropped from the H5 file\n",
+ "```\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3933fab20d04ec698c2621248eb3be0",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## Part 2: Weight expansion and H5 assembly\n",
+ "\n",
+ "`fit_l0_weights` returns a flat 1-D weight vector of length `n_clones_total * n_hh`. Stage 4 of the pipeline (`publish_local_area.py::build_h5`) expands this into a full H5 dataset by cloning every entity in every nonzero-weight record.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "639e942b",
+ "metadata": {},
+ "source": [
+ "### 2.1 Internal pipeline in `build_h5`\n",
+ "\n",
+ "1. **Load base simulation** — one `Microsimulation` loaded from `dataset_path`. Entity arrays, household IDs, and membership mappings extracted.\n",
+ "2. **Reshape weights** — the flat weight vector is reshaped to `(n_clones, n_hh)`.\n",
+ "3. **CD subset filtering** — if `cd_subset` is set, clones for CDs not in the subset are zeroed out.\n",
+ "4. **County filtering** — if `county_fips_filter` is set, clones whose county FIPS is not in the set are zeroed out.\n",
+ "5. **Identify active clones** — `np.where(W > 0)` finds all nonzero entries. Each represents a distinct (clone, household) pair. Block GEOIDs and CD GEOIDs are extracted for the active set.\n",
+ "6. **Build entity membership maps** — for each household, records which person indices belong to it (`hh_to_persons`) and which sub-entity indices belong to it (`hh_to_entity` for tax_unit, spm_unit, family, marital_unit). These maps drive coherent cloning.\n",
+ "7. **Clone entity arrays** — for each active (clone, household) pair, the function looks up and concatenates the person and sub-entity index arrays. This produces `hh_clone_idx`, `person_clone_idx`, and `entity_clone_idx[ek]`.\n",
+ "8. **Reindex entity IDs** — all entity IDs are reassigned to globally unique values (`0..n-1`). Cross-reference arrays (e.g., `person_household_id`, `person_tax_unit_id`) are rebuilt using a `(clone_id * offset + old_entity_id)` key for searchsorted-based remapping.\n",
+ "9. **Derive geography** — block GEOIDs are deduplicated, then `derive_geography_from_blocks()` maps unique blocks to state FIPS, county, tract, CBSA, SLDU, SLDL, place, VTD, PUMA, and ZCTA. Results are broadcast back via inverse index.\n",
+ "10. **Determine variables to save** — the set includes all `sim.input_variables` plus geographic additions (`block_geoid`, `tract_geoid`, `cbsa_code`, `sldu`, `sldl`, `place_fips`, `vtd`, `puma`, `zcta`), `county`, `spm_unit_spm_threshold`, and `congressional_district_geoid`.\n",
+ "11. **Clone variable arrays** — for each variable in the save set, the base simulation's arrays are indexed with the appropriate clone index map (`hh_clone_idx` for household vars, `person_clone_idx` for person vars, `entity_clone_idx[ek]` for sub-entities).\n",
+ "12. **Override entity IDs and weights** — entity IDs are replaced with the new globally unique values. Only `household_weight` is written; sub-entity weights (`tax_unit_weight`, `spm_unit_weight`, etc.) are formula variables in policyengine-us that derive from `household_weight` at runtime.\n",
+ "13. **Override geography** — `state_fips`, `county`, `county_fips`, `congressional_district_geoid`, and all sub-state geographic variables are overwritten from the derived geography. LA County clones (FIPS 06037) get `zip_code = \"90001\"` for ACA rating area resolution.\n",
+ "14. **Recalculate SPM thresholds** — SPM thresholds depend on geography, so they must be recomputed for each clone's CD. `load_cd_geoadj_values()` fetches the geographic adjustment factor per CD, and `calculate_spm_thresholds_vectorized()` recalculates thresholds using cloned person ages, SPM tenure types, and the CD-level geoadj.\n",
+ "15. **Apply calibration takeup draws** — `apply_block_takeup_to_arrays()` redraws takeup booleans per clone using the same seeded RNG as the matrix builder (see `calibration_package_internals.ipynb` section 2.4). The `takeup_filter` parameter controls which takeup variables are drawn.\n",
+ "16. **Write H5** — all variable arrays are written to the output file as `{variable}/{period}` datasets.\n",
+ "\n",
+ "The sections below expand on several of these steps in more detail."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4dd4641cc4064e0191573fe9c69df29b",
+ "metadata": {},
+ "source": [
+ "### 2.2 Reshaping the flat weight vector\n",
+ "\n",
+ "The weight vector is first reshaped into a 2-D matrix:\n",
+ "\n",
+ "```python\n",
+ "# weights has shape (n_clones_total * n_hh,)\n",
+ "W = weights.reshape(n_clones_total, n_hh).copy() # shape: (n_clones_total, n_hh)\n",
+ "```\n",
+ "\n",
+ "Row $c$ of `W` is the weight for clone $c$ applied to each household. Most entries are 0 (pruned by the L0 gate). The nonzero entries identify which (clone, household) pairs to include in the output file:\n",
+ "\n",
+ "```python\n",
+ "active_geo, active_hh = np.where(W > 0) # indices into (clone, household)\n",
+ "clone_weights = W[active_geo, active_hh] # scalar weight for each active clone\n",
+ "```\n",
+ "\n",
+ "`n_clones = len(active_geo)` is the number of output records. For the `local` preset this is typically 3–4 M; for `national` it is ~50 K.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8309879909854d7188b41380fd92a7c3",
+ "metadata": {},
+ "source": [
+ "### 2.3 Geographic filtering before expansion\n",
+ "\n",
+ "`build_h5` supports two geographic filters applied to `W` before finding active entries:\n",
+ "\n",
+ "**CD subset filter** — zero out clone rows whose congressional district is not in the target set:\n",
+ "\n",
+ "```python\n",
+ "if cd_subset is not None:\n",
+ " cd_subset_set = set(cd_subset)\n",
+ " cd_mask = np.vectorize(lambda cd: cd in cd_subset_set)(clone_cds_matrix)\n",
+ " W[~cd_mask] = 0\n",
+ "```\n",
+ "\n",
+ "**County FIPS filter** — zero out clones not in the target county set. Used for city datasets (e.g., NYC = 5 borough county FIPS codes):\n",
+ "\n",
+ "```python\n",
+ "# PR #671 implementation (replaces probabilistic scaling)\n",
+ "if county_fips_filter is not None:\n",
+ " fips_array = np.asarray(geography.county_fips).reshape(n_clones_total, n_hh)\n",
+ " fips_mask = np.isin(fips_array, list(county_fips_filter))\n",
+ " W[~fips_mask] = 0\n",
+ "```\n",
+ "\n",
+ "NYC is defined as `NYC_COUNTY_FIPS = {\"36005\", \"36047\", \"36061\", \"36081\", \"36085\"}` (Bronx, Kings, New York, Queens, Richmond). On the current branch, the older probabilistic approach scales weights by `P(target_counties | CD)` via `get_county_filter_probability()` — PR #671 replaces this with the simpler hard binary filter shown above."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3ed186c9a28b402fb0bc4494df01f08d",
+ "metadata": {},
+ "source": [
+ "### 2.4 Entity membership preservation\n",
+ "\n",
+ "Each CPS household contains persons, which belong to sub-entities: `tax_unit`, `spm_unit`, `family`, `marital_unit`. The H5 assembly must clone all of these coherently.\n",
+ "\n",
+ "The function builds membership maps from the base simulation before doing any cloning:\n",
+ "\n",
+ "```python\n",
+ "SUB_ENTITIES = [\"tax_unit\", \"spm_unit\", \"family\", \"marital_unit\"]\n",
+ "\n",
+ "# persons → household\n",
+ "hh_to_persons = defaultdict(list) # hh_idx -> [person_idx, ...]\n",
+ "\n",
+ "# household → sub-entity indices (all sub-entities a household's persons belong to)\n",
+ "hh_to_entity = {} # entity_key -> {hh_idx: [entity_idx, ...]}\n",
+ "```\n",
+ "\n",
+ "For each active (clone, household) pair, `build_h5` looks up:\n",
+ "- which person rows belong to that household (`hh_to_persons[hh_idx]`)\n",
+ "- which sub-entity rows belong to those persons (`hh_to_entity[ek][hh_idx]`)\n",
+ "\n",
+ "These index arrays are concatenated across all active clones to build the output:\n",
+ "\n",
+ "```python\n",
+ "# Clone person rows\n",
+ "person_parts = [np.array(hh_to_persons.get(h, []), dtype=np.int64) for h in active_hh]\n",
+ "person_clone_idx = np.concatenate(person_parts)\n",
+ "\n",
+ "# Clone sub-entity rows (same pattern for each entity key)\n",
+ "for ek in SUB_ENTITIES:\n",
+ " parts = [np.array(hh_to_entity[ek].get(h, []), dtype=np.int64) for h in active_hh]\n",
+ " entity_clone_idx[ek] = np.concatenate(parts)\n",
+ "```\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cb1e1581032b452c9409d6c6813c49d1",
+ "metadata": {},
+ "source": [
+ "### 2.5 Entity ID reassignment\n",
+ "\n",
+ "After cloning, IDs must be unique across all output records. The function assigns new sequential IDs and remaps cross-entity references:\n",
+ "\n",
+ "```python\n",
+ "new_hh_ids = np.arange(n_clones, dtype=np.int32)\n",
+ "new_person_ids = np.arange(n_persons, dtype=np.int32)\n",
+ "new_person_hh_ids = np.repeat(new_hh_ids, persons_per_clone)\n",
+ "```\n",
+ "\n",
+ "For sub-entity cross-references, a compound key `clone_id * offset + old_entity_id` uniquely identifies each (clone, original entity) pair. Binary search (`np.searchsorted`) then maps each person's old sub-entity ID to the new sequential ID:\n",
+ "\n",
+ "```python\n",
+ "offset = int(old_eids.max()) + 1\n",
+ "entity_keys = clone_ids_e * offset + old_eids # unique per (clone, entity)\n",
+ "person_keys = clone_ids_for_persons * offset + p_old_eids\n",
+ "positions = np.searchsorted(sorted_keys, person_keys)\n",
+ "new_person_entity_ids[ek] = sorted_new[positions]\n",
+ "```\n",
+ "\n",
+ "This avoids a Python loop over millions of records and runs in $O(n \\log n)$.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "379cbbc1e968416e875cc15c1202d7eb",
+ "metadata": {},
+ "source": [
+ "### 2.6 Geographic variables and the block deduplication optimization\n",
+ "\n",
+ "Each active clone receives a census block GEOID from the `geography` object (a `GeographyAssignment`). Geographic variables — `block_geoid`, `tract_geoid`, `cbsa_code`, `sldu`, `sldl`, `place_fips`, `vtd`, `puma`, `zcta`, `county`, `congressional_district_geoid` — are derived from this block GEOID.\n",
+ "\n",
+ "Because many clones share the same block (especially in dense urban areas), `build_h5` deduplicates before calling `derive_geography_from_blocks`:\n",
+ "\n",
+ "```python\n",
+ "unique_blocks, block_inv = np.unique(active_blocks, return_inverse=True)\n",
+ "# Derives {var_name: array} for each unique block only\n",
+ "unique_geo = derive_geography_from_blocks(unique_blocks)\n",
+ "# Then broadcasts back to all clones via block_inv\n",
+ "clone_geo = {k: v[block_inv] for k, v in unique_geo.items()}\n",
+ "```\n",
+ "\n",
+ "The variable data is then written to H5 by looping over `sim.input_variables` plus the geographic additions, indexing each variable array with the appropriate clone index map (`hh_clone_idx`, `person_clone_idx`, or `entity_clone_idx[ek]`).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d64e6ebf",
+ "metadata": {},
+ "source": [
+ "### 2.7 SPM threshold recalculation\n",
+ "\n",
+ "The Supplemental Poverty Measure threshold is one of the few formula variables that `build_h5` explicitly recomputes rather than leaving to the engine at simulation time. This is necessary because the threshold depends on **geography** — specifically a geographic adjustment factor derived from local rent levels — and each clone's geography is different from the base dataset's.\n",
+ "\n",
+ "The computation has three inputs per SPM unit:\n",
+ "\n",
+ "| Input | Source |\n",
+ "|---|---|\n",
+ "| Geographic adjustment factor | `load_cd_geoadj_values()` — median 2BR rent in the clone's CD vs national median, from `national_and_district_rents_2023.csv` |\n",
+ "| Household composition | Person ages from the cloned person array, counted into adults (age >= 18) and children per SPM unit |\n",
+ "| Tenure type | `spm_unit_tenure_type` from the base simulation, cloned via `entity_clone_idx[\"spm_unit\"]` |\n",
+ "\n",
+ "The formula is:\n",
+ "\n",
+ "$$\\text{threshold} = \\text{base\\_threshold}(\\text{tenure}) \\times \\text{equivalence\\_scale}(n_{\\text{adults}}, n_{\\text{children}}) \\times \\text{geoadj}(\\text{CD})$$\n",
+ "\n",
+ "`calculate_spm_thresholds_vectorized()` in `calibration_utils.py` implements this without a Microsimulation instance. It uses `np.add.at()` to count adults and children per SPM unit from the person-level age array, looks up base thresholds from `SPMCalculator` for the tax year, and multiplies through.\n",
+ "\n",
+ "If this step were skipped, every clone would inherit the base dataset's SPM threshold (computed for an arbitrary geography), and poverty calculations in the H5 would be wrong for any clone that landed in a different cost-of-living area."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d7caf09f",
+ "metadata": {},
+ "source": [
+ "### 2.8 Takeup consistency with the matrix builder\n",
+ "\n",
+ "The H5 builder must produce the **same takeup draws** as the matrix builder for every `(variable, household, clone)` triple. If they diverge, the matrix rows computed during calibration targeted a different subpopulation than what ends up in the H5 — the calibrated weights would be applied to the wrong set of participants.\n",
+ "\n",
+ "`apply_block_takeup_to_arrays()` in `utils/takeup.py` is the H5 builder's entry point. It iterates over `SIMPLE_TAKEUP_VARS` and calls `compute_block_takeup_for_entities()` for each — the same function the matrix builder's clone loop uses (see `calibration_package_internals.ipynb` section 2.4).\n",
+ "\n",
+ "The seeding invariant is maintained by passing:\n",
+ "- `hh_ids` — the **original** CPS household IDs (not the new reassigned IDs), via `household_ids[active_hh]`\n",
+ "- `hh_clone_indices` — the clone index from `active_geo`, matching the matrix builder's per-clone index\n",
+ "- `hh_blocks` — the census block GEOID for rate resolution by state\n",
+ "\n",
+ "Because `compute_block_takeup_for_entities()` seeds its RNG with `(var_name, original_hh_id, clone_idx)`, the draws are identical regardless of call order, process, or which side (matrix builder vs H5 builder) makes the call.\n",
+ "\n",
+ "The `takeup_filter` parameter allows the H5 builder to re-randomize only a subset of takeup variables. Variables not in the filter are set to `True` (all-takeup), which matches the matrix builder's all-takeup-true state simulation. This is used when a pipeline run only needs to refresh specific programs."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "277c27b1587741f2af2001be3712ef0d",
+ "metadata": {},
+ "source": [
+ "### 2.9 Output file structure\n",
+ "\n",
+ "The output H5 file mirrors the layout of the input CPS H5:\n",
+ "- One dataset per variable per period (e.g., `/household/household_weight/2024`)\n",
+ "- `household_weight` is set to `clone_weights` — the calibrated scalar for each active clone\n",
+ "- All other variables are cloned from the base simulation by indexing with the appropriate entity clone index array\n",
+ "- Enum/string variables are encoded as byte strings (`dtype='S'`)\n",
+ "\n",
+ "After writing, the H5 can be opened directly with `Microsimulation(dataset=output_path)` for simulation.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "db7b79bc585a40fcaf58bf750017e135",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## Part 3: Diagnostics — reading calibration output\n",
+ "\n",
+ "A completed pipeline run produces three diagnostic files:\n",
+ "\n",
+ "| File | Written by | Content |\n",
+ "|------|-----------|--------|\n",
+ "| `unified_diagnostics.csv` | `compute_diagnostics()` in `unified_calibration.py` | Final per-target error snapshot after optimization |\n",
+ "| `calibration_log.csv` | `fit_l0_weights()` in `unified_calibration.py` | Per-target error at every `log_freq` epoch during optimization |\n",
+ "| `validation_results.csv` | `validate_area()` in `validate_staging.py` | Per-target sim-vs-target comparison on built H5 files (Stage 4) |\n",
+ "\n",
+ "The first two are produced during the optimization step (Stage 3) when `--log-freq` is set. The third is produced during H5 assembly (Stage 4) and validates that the built datasets match the calibration targets. All three are uploaded to `calibration/runs/{run_id}/diagnostics/` on the HuggingFace model repo."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "916684f9a58a4a2aa5f864670399430d",
+ "metadata": {},
+ "source": [
+ "### 3.1 `unified_diagnostics.csv` — column reference\n",
+ "\n",
+ "Sample rows (from `national_unified_diagnostics.csv`):\n",
+ "\n",
+ "```\n",
+ "target,true_value,estimate,rel_error,abs_rel_error,achievable\n",
+ "cd_1000/household_count/[snap>0],41214.0,41526.207,0.00758,0.00758,True\n",
+ "cd_101/household_count/[snap>0],34742.0,35243.766,0.01444,0.01444,True\n",
+ "cd_102/household_count/[snap>0],55107.0,55872.1,0.01388,0.01388,True\n",
+ "cd_103/household_count/[snap>0],36374.0,37070.164,0.01914,0.01914,True\n",
+ "```\n",
+ "\n",
+ "| Column | Type | Description |\n",
+ "|--------|------|-------------|\n",
+ "| `target` | string | Target identifier in `geo_level/variable/[filter]` format |\n",
+ "| `true_value` | float | Population target value (from `targets_df`) |\n",
+ "| `estimate` | float | Weighted estimate: $\\hat{y} = X_{j\\cdot} \\cdot w$ |\n",
+ "| `rel_error` | float | $(\hat{y} - y) / \mid y\mid$ — signed relative error; equals the $(\hat{y} - y)/y$ definition used in `calibration_log.csv` whenever the target $y$ is positive |\n",
+ "| `abs_rel_error` | float | $\\mid \\hat{y} - y\\mid / \\mid y\\mid$ — unsigned relative error |\n",
+ "| `achievable` | bool | `True` if the target's row in $X$ has at least one nonzero entry |\n",
+ "\n",
+ "The target name format is `{geo_level}/{variable}/{filter_expression}`. For example, `cd_1000/household_count/[snap>0]` denotes the count of households receiving SNAP (`[snap>0]`) in congressional district 1000.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1671c31a24314836a5b85d7ef7fbf015",
+ "metadata": {},
+ "source": [
+ "### 3.2 `calibration_log.csv` — column reference\n",
+ "\n",
+ "Sample rows (from `national_calibration_log.csv` at epoch 500):\n",
+ "\n",
+ "```\n",
+ "target_name,estimate,target,epoch,error,rel_error,abs_error,rel_abs_error,loss,achievable\n",
+ "\"cd_1000/household_count/[snap>0]\",41487.3,41214.0,500,273.3,0.00663,273.3,0.00663,4.40e-05,True\n",
+ "\"cd_101/household_count/[snap>0]\",35073.8,34742.0,500,331.8,0.00955,331.8,0.00955,9.12e-05,True\n",
+ "```\n",
+ "\n",
+ "| Column | Type | Description |\n",
+ "|--------|------|-------------|\n",
+ "| `target_name` | string | Same target identifier as in diagnostics |\n",
+ "| `estimate` | float | Weighted estimate at this epoch |\n",
+ "| `target` | float | Population target value |\n",
+ "| `epoch` | int | Training epoch when this row was written |\n",
+ "| `error` | float | $\\hat{y} - y$ — signed absolute error |\n",
+ "| `rel_error` | float | $(\\hat{y} - y) / y$ — signed relative error |\n",
+ "| `abs_error` | float | $\\mid\\hat{y} - y\\mid$ — unsigned absolute error |\n",
+ "| `rel_abs_error` | float | $\\mid\\hat{y} - y\\mid / \\mid y \\mid$ — unsigned relative error |\n",
+ "| `loss` | float | $((\\hat{y} - y) / y)^2$ — per-target loss contribution |\n",
+ "| `achievable` | bool | Same meaning as in diagnostics |\n",
+ "\n",
+ "Note: `rel_abs_error` is exactly the absolute value of `rel_error` — the two agree in magnitude and differ only in sign. `loss` is the squared relative error — the quantity minimized by the optimizer for that target.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "33b0902fd34d4ace834912fa1002cf8e",
+ "metadata": {},
+ "source": [
+ "### 3.3 Reading and analyzing the diagnostic files\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f6fa52606d8c4a75a9b52967216f8f3f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Diagnostics: 12208 targets\n",
+ " Achievable: 12208\n",
+ " Not achievable: 0\n",
+ "\n",
+ "Among achievable targets:\n",
+ " Mean abs rel error: 5.3971%\n",
+ " Max abs rel error: 80.8059%\n",
+ " Median: 3.5038%\n",
+ " Targets > 5% error: 4401\n",
+ " Targets > 1% error: 10251\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from pathlib import Path\n",
+ "\n",
+ "# Adjust paths to match your run output\n",
+ "DIAG_PATH = Path(\"../../national_unified_diagnostics.csv\")\n",
+ "LOG_PATH = Path(\"../../national_calibration_log.csv\")\n",
+ "\n",
+ "if DIAG_PATH.exists():\n",
+ " diag = pd.read_csv(DIAG_PATH)\n",
+ " print(f\"Diagnostics: {len(diag)} targets\")\n",
+ " print(f\" Achievable: {diag['achievable'].sum()}\")\n",
+ " print(f\" Not achievable: {(~diag['achievable']).sum()}\")\n",
+ "\n",
+ " achievable = diag[diag[\"achievable\"]]\n",
+ " print(f\"\\nAmong achievable targets:\")\n",
+ " print(f\" Mean abs rel error: {achievable['abs_rel_error'].mean():.4%}\")\n",
+ " print(f\" Max abs rel error: {achievable['abs_rel_error'].max():.4%}\")\n",
+ " print(f\" Median: {achievable['abs_rel_error'].median():.4%}\")\n",
+ " print(f\" Targets > 5% error: {(achievable['abs_rel_error'] > 0.05).sum()}\")\n",
+ " print(f\" Targets > 1% error: {(achievable['abs_rel_error'] > 0.01).sum()}\")\n",
+ "else:\n",
+ " print(f\"File not found: {DIAG_PATH}\")\n",
+ " print(\"Run unified_calibration.py to generate diagnostic output.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f5a1fa73e5044315a093ec459c9be902",
+ "metadata": {},
+ "source": [
+ "### 3.4 Identifying targets that drive residual error\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "cdf66aed5cc84ca1b48e60bad68798a8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top 10 targets by absolute relative error:\n",
+ " target true_value estimate abs_rel_error\n",
+ " cd_4808/person_count/[age<40,age>34] 82852.000000 15902.670 80.81%\n",
+ " cd_4808/person_count/[age<45,age>39] 72130.000000 21323.390 70.44%\n",
+ " cd_4808/person_count/[age<10,age>4] 70636.000000 22059.459 68.77%\n",
+ " cd_4808/person_count/[age<15,age>9] 67675.000000 22953.785 66.08%\n",
+ " cd_4808/person_count/[age<35,age>29] 65916.000000 23415.785 64.48%\n",
+ " cd_4808/person_count/[age<20,age>14] 65705.000000 23428.840 64.34%\n",
+ "cd_4808/person_count/[adjusted_gross_income<200000,adjusted_gross_income>=100000,tax_unit_is_filer==1] 61919.224901 100059.586 61.60%\n",
+ " cd_4808/person_count/[age<55,age>49] 62492.000000 24135.570 61.38%\n",
+ " cd_1209/person_count/[adjusted_gross_income<50000,adjusted_gross_income>=25000,tax_unit_is_filer==1] 136540.910911 220156.020 61.24%\n",
+ " cd_4808/person_count/[age<50,age>44] 62451.000000 24325.008 61.05%\n",
+ "\n",
+ "Error by variable (top 10):\n",
+ " mean_error max_error n_targets\n",
+ "variable \n",
+ "person_count 5.54% 80.81% 11772\n",
+ "household_count 1.51% 7.52% 436\n"
+ ]
+ }
+ ],
+ "source": [
+ "if DIAG_PATH.exists():\n",
+ " # Parse target name into components for grouping\n",
+ " diag[[\"geo\", \"variable\", \"filter\"]] = diag[\"target\"].str.extract(\n",
+ " r\"^([^/]+)/([^/]+)/(.+)$\"\n",
+ " )\n",
+ "\n",
+ " # Worst 10 achievable targets by absolute relative error\n",
+ " worst = diag[diag[\"achievable\"]].nlargest(10, \"abs_rel_error\")[\n",
+ " [\"target\", \"true_value\", \"estimate\", \"abs_rel_error\"]\n",
+ " ]\n",
+ " worst[\"abs_rel_error\"] = worst[\"abs_rel_error\"].map(\"{:.2%}\".format)\n",
+ " print(\"Top 10 targets by absolute relative error:\")\n",
+ " print(worst.to_string(index=False))\n",
+ "\n",
+ " print()\n",
+ "\n",
+ " # Error grouped by variable (aggregated across geographies)\n",
+ " by_var = (\n",
+ " diag[diag[\"achievable\"]]\n",
+ " .groupby(\"variable\")[\"abs_rel_error\"]\n",
+ " .agg([\"mean\", \"max\", \"count\"])\n",
+ " .sort_values(\"mean\", ascending=False)\n",
+ " .head(10)\n",
+ " )\n",
+ " by_var.columns = [\"mean_error\", \"max_error\", \"n_targets\"]\n",
+ " by_var[\"mean_error\"] = by_var[\"mean_error\"].map(\"{:.2%}\".format)\n",
+ " by_var[\"max_error\"] = by_var[\"max_error\"].map(\"{:.2%}\".format)\n",
+ " print(\"Error by variable (top 10):\")\n",
+ " print(by_var.to_string())\n",
+ "else:\n",
+ " print(\"[see actual run output]\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "28d3efd5258a48a79c179ea5c6759f01",
+ "metadata": {},
+ "source": [
+ "### 3.5 Tracking convergence from the calibration log\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "3f9bc0b9dd2c44919cc8dcca39b469f8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAxYAAAGGCAYAAADmRxfNAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjYsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvq6yFwwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAXaFJREFUeJzt3QeYE1XbxvFnF5beey8ivTeV8ooKUvRVUSxYaVZAKVawYO9iwY6K+oryiQUrWEAUFBBQBARRUEFFitJ72fmu+4TJJiFZdsku2/6/6wpsJieTk5lJcp45zzmT4HmeZwAAAAAQh8R4ngwAAAAABBYAAAAAMgQ9FgAAAADiRmABAAAAIG4EFgAAAADiRmABAAAAIG4EFgAAAADiRmABAAAAIG4EFgAAAADiRmAB5GEnnHCCu/l+//13S0hIsJdffjm4rG/fvlasWDHLarfffrurG4CM+bynh74HatWqxaaPQtvlv//9L9sGILAAcpYVK1bYFVdcYUcddZQVKlTISpQoYR06dLDHH3/cdu7caTndjh07XAAxffr0rK4KcMQtWbLEHf8K8AEgJ8qf1RUAkDYfffSRnXPOOVawYEG75JJLrEmTJrZnzx6bOXOmXX/99fbjjz/a888/H9fmrFmzpgtQkpKSsiywuOOOO9zfkWdWb7nlFrvpppuypF7AkQosdPzr2M+M3oFPP/30sJ87duxYS05OztD6AMh9CCyAHOC3336z3r17u4b/tGnTrHLlysHHBg0aZMuXL3eBR7yUaqSekIyyb98+1xgpUKBA3OvKnz+/uyHj7Nq1y+2bxESyYo+U7du3W9GiReNej+d5bv8VLlw4zc+J53OYVScbAOQs/JoAOcCDDz5o27ZtsxdffDEsqPAdffTRNmTIkOD9cePG2UknnWQVKlRwPRyNGjWyZ5555pCvE22Mhe/XX3+1bt26uUZRlSpV7M4773SNm8jnPvzww/bYY49ZnTp13GvrLKx6Vm677TZr3bq1lSxZ0q3jP//5j33xxRdhzy9fvrz7W2dttS7dlBoSa4yFApe77ror+Fo6yzty5EjbvXt31Bxo9e4cc8wxLnhSOtmrr75qaaHgSOlmTZs2dc9VPbt3727z5s3LlLpovXqvr7zyykF1+eSTT9xjH374YXDZX3/9Zf3797eKFSu6127cuLG99NJLYc9TepmeN2HCBNf7U7VqVStSpIht2bLFPT5x4kR3nKg+6g179913o+bVa1to/+o1VFavqfS8jRs3HvY237Rpkw0bNsw9R/WvVq2a65X7559/gmW0HUeNGuWOdZWpXr263XDDDQdt31j0/nT8qSFerlw5u+iii9x28+m41fZZuXLlQc8dMWKEa5SHvsc5c+a4Y0DHs7Zjp06d7Ouvvw57nn/M6jNwwQUXWOnSpa1jx45R66fPnHok5cQTTwwe/35aoL89tf/btGnj3sdzzz2Xrs975BgL/5h488037Z577nHbXfupc+fO7mRFqMhjIfTzrp5S/7hv27atzZ07N+r2T8vxFcvkyZPdd4a+O4oXL26nnnqq66WNrKPGgx3qu8oP8K699lp3HKne9evXd+8lspy89tpr7hjWftY+PP7446P2/hzqWN+7d6/7bqtbt64rU7ZsWXc8fPbZZ2naBkCO4AHI9qpWreodddRRaS7ftm1br2/fvt6jjz7qjRkzxuvatat+Lb0nn3wyrFynTp3czffbb7+5cuPGjQsu69Onj1eoUCGvbt263sUXX+zW8d///teVu/XWWw96bqNGjVxd77//fvf6K1eu9NavX+9VrlzZGz58uPfMM894Dz74oFe/fn0vKSnJ+/77793zt23b5h7TOs4880zvf//7n7v98MMP7vFRo0a5x0Kpblp29tlne0899ZR3ySWXuPs9e/YMK1ezZk33ehUrVvRGjhzp3kOrVq28hIQEb/HixYfcntqWWm+PHj28xx57zHv44Ye9M844w23bzKqL
tuEpp5xyUF369evnlS5d2tuzZ4+7v2bNGq9atWpe9erVvTvvvNNtw9NPP929tra/74svvgjunxYtWnijR4/27rvvPm/79u3ehx9+6F6/WbNmbrn2q16jSZMmrr6hLr30Ui9//vzeZZdd5j377LPejTfe6BUtWtQdc36d0vM+t27d6l4nX758bp2q/1133eXW5x8b+/fvd8dwkSJFvKFDh3rPPfecN3jwYFcP7YdD0fGs9651apvcdNNNXuHChb1atWp5GzdudGV0nKpuOjYjaV+ceuqpwftTp071ChQo4LVr18575JFH3Dq17bRszpw5wXL+Mattrno+/fTT7tiIZsWKFd4111zjymt7+ce/9q+/PY8++mi3X1R/bXvt03g+7/4x0bJlS69169bu+bfffrvbzsccc0zYc3V8hx4L/uddz1W9HnjgAbftypUr547H0GMhPcdXNK+++qp7fvfu3d3702tp35UqVcrVI73fVcnJyd5JJ53k1qnjWeVOO+00V07HVyhtDy1v376999BDD3mPP/64d8EFF7jjPr3Huh7TMh3nY8eOdcfO+eef774rgdyCwALI5jZv3ux+2NLSgPLt2LHjoGXdunU7KDhJa2ChZVdffXXYD7MaWmpIKWgIfW6JEiW8devWhb3Ovn37vN27d4ctU4NOP8T9+/cPLtO6tA41yCJFBhYLFixw99UwCHXddde55dOmTQv74deyr776KrhMdSxYsKB37bXXeqnRevRcNfoiaTtkVl1GjBjhAq8NGzYEl2kbqjEVus0GDBjggrZ//vkn7LV79+7tlSxZMngs+I1IHQORx0fTpk1dY1CNfN/06dNd+dCG34wZM9yy8ePHhz1/ypQpBy1P6/u87bbbXLl33nkn5vZVAzsxMdG9fig1rvXcr7/+2otFDdwKFSq4RuzOnTvDGrt6rl7fp0BBDexQ3377rSunxq1fJzVc9Xny6yfaprVr1/ZOPvnkg45ZNR7TYuLEia68HzCE8rentnVGfd79Y6Jhw4Zhn081nrV80aJFhwwsypYtG3aMvvfee275Bx98kO7jKxo9R8e8GuOhFHDp+A5dntbvqkmTJrlyd999d9g6dVJADf/ly5e7+7/88os77nSiQ8FtqNB9n9ZjvXnz5mEBKpAbkQoFZHN+qoq6/9MqNO968+bNLqVEqRpKEdD9wzF48ODg30qB0H2lOH3++edh5Xr16hVMafLly5cvmN+tVJoNGza41CGldHz33XeHVZ+PP/7Y/T98+PCw5UpvkMgxJ0rDUCqFT3VU+oO2SWrefvtt936VhhPJT83KjLqcd955LnXinXfeCS5T+oXShvSY6OSQ6nfaaae5v7Wf/ZtSQbSvI7dvnz59wo6P1atX26JFi1zqUei0wjpelPoVmc6i1J+TTz457LWUYqTnhqa2pfV9qv7Nmze3M888M+b21es2bNjQGjRoEPa6Sv+RyNcNpbSydevW2cCBA8PGDymVRusL3TfarvPnz3ezr/n+7//+z6XKnHHGGe7+ggUL7JdffnGpTf/++2+wLkqtUQrRV199ddAg5yuvvNIyQu3atd1+zejPe79+/cLGX/j77FCfDX+bKT0o1nPTc3xFozQhHfPnn39+2L7Xd8qxxx4bdd8f6rtKn1c9/5prrjno86rPkdKuZNKkSW5fKo0zchxSZFpmWo71UqVKufQtHT9AbsVISCCb05SysnXr1jQ/R7neagjPmjXLzbQUSg0NNQ7TQz+qyhkOVa9ePfd/5NSYavxEo/ECjzzyiP3000+uwXyo8oeiXHjVSzn3oSpVquR+wCNz5WvUqHHQOtQgihwbEEmNTOVplylT5ojWRY1tNXzVsB0wYIBbpr81PsBvUK9fv941upTjHmtGMDWqQ0Vub79ukXX3l4UGJmoQ6fhRLn9aXist71PbV8FoavS6S5cuPShgjfW60d6fGnmRtH2VF+/TGAcFh9rOGh+jRqaCmh49egQ/h36jUAFaLNpGoY3t
wz3GI8VaT7yf98j95Nf9UJ+NtDw3PcdXNP729o/5SP5+Sc93leqkz3TkyRoFr6F11rGp9SloOJS0HOsa66EAVfXROBON0bn44outWbNmh1w/kFMQWADZnH449SO4ePHiNJXXj6HOnKrRNHr0aDc4UWcjdZbu0UcfzfQpI6PNUqPBjxpY2bNnTzc1rhqmOmN43333hZ0dPhxpvWieXi+aaIM1s0tddDZYg2p1hlaNoPfff9+dufVnx/L3pQYix2roRjZa0jOLUCS9nvbd+PHjoz4eracqI7a5Xldnt3U8R6NjPCPoc6azzhrMrMBi9uzZtmrVKnvggQfC6iIPPfSQtWjRIup6Ii8oGc82P9R6MuLzHs9+yuzPlV////3vfy5Qj5RdZopLy3bQoG/tr/fee8/1Pr7wwgtuHz377LN26aWXHsHaApkne3wiAaRKs8HojLTOSLZr1y7Vsh988IGbKUeN0NCzaKmli6Tlx11d+v6ZP/n555/d/2mZ1eWtt95yZxGV1hPa+I5ML0rPlbU19a7qpTOa/plGWbt2rTuLr8czgma70Uw8St+K1WuRWXVRYKFZZJQupNmXlBanaYdDG/IKOPbv329dunQ5rNfw6xY5C1C0ZdoWSifRRRkzqrGsdR4qaFaZH374wTWg03v1df/9LVu27KCz3loWuW+0zZU2pcfUc6GZgJRqFloXP+A/3G0ey+FcWT4zPu8ZKT3HVzT+9lZAm5btnZbvKtVJx7F6gUN7LdSbGlpnvbbWp1m9YgWR6aXvEKWe6aaZ/hRsaPYwAgvkFoyxAHIATaupqRP146PGaiSdBdN0qKFnzkLPlCkdQlNSxuPJJ58M/q11677mtldj71Ci1UnTdSpQCqVGnKgxfiinnHKK+19Tn4byz2orhz4jKE1H9fYv3BfKfz+ZVRcFKTpTrwaubppqWA2R0O2q+inwiNY4V6pUWs7SKy1DU2OqoeP78ssvXW58qHPPPdcFMZpWN5LGzKRlv0VS/RU0aPrRWNtXr6upYXWRtki6oKPGN8SicTxqlOqscOjUtMqjV3pV5L5RfbRd33jjDZcGpaA+9LoTGk+iBqemJg3dXunZ5rH4r5Oe7ZhZn/eMkp7jKxqNKVEQd++994alUKa2vQ/1XaXPq47j0HKi3gMFd0p9E/WwKhVKKUyRPT+H0yOjMTmRPVtKB0vrlMlATkCPBZADqCHz+uuvu7OpamyGXnn7m2++cQ0gpRpJ165dXSqEzrLq+gL6MVeDTI2rv//++7BeX4Nep0yZ4tJtNGBSjTINelW6SKy891BqnKm3QgN01ZDTBf/U0FPucmhjQ2fBtUyNaJ1x1Nk9vU/dImkMguqjnhw1xDQY9Ntvv3VjOdQg0LUAMoLWozzoJ554wvVIKC9ajYwZM2a4xzQwNDPron2uwaPaBxprETmI9P7773dnp7VfLrvsMrf91Lui3HWdldXfh6JGm3K/1ROhM6nKC1ejS9s9dP/ofemYUgqbBjHrWFODTdtFx6CC27PPPjtd70+pcerR0vgGXYtDDXfVWWfgdYxo22r7Kz1Jg6D1XlVPNQx1hlnL/Ws7RKP6KZVJ70v1VyqZgnPVVWewdf2MUPqcaH8pKNQZbX+gvE/bXyksanzqWh5ar64JosBHdVMjWL0Ih0NnxRUoqL4KDjRo3L8+RSyZ8XnPaGk9vqLR9tQ1OXQMtGrVyvXY6TtHKWr6DtI6QwOEtHxXaVtpH998881u3IWOMaUmKUVp6NChwV4SNfpVRoG0UuTOOusst090nQ4FTPocpIc+m7qOiI5xfbdpYgEd+6GDzYEcL6unpQKQdj///LObXlFzuGv6xOLFi3sdOnRwc7vv2rUrWO799993c8ZrTneV1bzvL730kpsSMXTe97RON6vrFGieff9aApomVlNphk7B6D9Xc71H0tSM9957r5uWUVMwau57TfcZOYWlfPPNN27KT72/0Klno13HYu/evd4dd9zhpvnU1Ky6loOmaQ3d
FqLXiDbNY+T7j0XT5ep9NWjQwNWrfPny7poW8+fPz/S6aMpLvW/dZs6cGbV+a9eu9QYNGuReU69dqVIlr3Pnzt7zzz9/0NSimtI0mgkTJrj3p/2jqVl1DPXq1csti6T1ah/pWhA6BjWd6A033OCtXr36sN7nv//+665Loeu1aPtqalIdG6FT6GraWB3HjRs3dnXUdRBUB21zTcl8KP/3f//njjs9t0yZMt6FF17o/fnnn1HL6hoD2lZ6b6FT1IbSNTbOOussN92q1qn3e+6557prXPj8Y9af5jQt9NqaJlbX9QidejbW9ozn8x7rmIj1PRBtutlon/doU0an5/iKRnXVFLqaYlbvs06dOu7aHfPmzUv3d5U/je2wYcO8KlWquM+MphDWewmdRtanbekfOzrutA0/++yzdB/rmt5W1wfR9Ln67Oi933PPPWHX/AByugT9k9XBDQAg+9EZdJ3l5crAyAnHl3pt1QNwqF4QAJmHMRYAkMcpd11jJEJNnz7djX1Q6gbA8QUgLRhjAQB5nMYHaMYdTVur3HGNXdD4Bk3vmVEXd0PexfEF5B0EFgCQx+lCXhpQqkHJmmVHsxNpkL0GhpctWzarq4ccjuMLyDsYYwEAAAAgboyxAAAAABA3AgsAAAAAcWOMhZm72NXq1autePHi7qqbAAAAAMxdaV4XDNXkHpEXaSWwiEJBRfXq1Tl2AAAAgCj++OMPq1atmqWGHgsz11Phb7ASJUqkusEAAACAvGLLli3uBLzfXk4NgYWmxjqQ/qSggsACAAAACJeW4QIM3gYAAAAQNwILAAAAAHEjsAAAAAAQN8ZYAACAXGn//v22d+/erK4GkK0lJSVZvnz5MmRdBBYAACDXzbu/Zs0a27RpU1ZXBcgRSpUqZZUqVYr7em4EFgAAIFfxg4oKFSpYkSJFuPgtkEoQvmPHDlu3bp27X7lyZYsHgQUAAMhV6U9+UFG2bNmsrg6Q7RUuXNj9r+BCn5t40qIYvA0AAHINf0yFeioApI3/eYl3TBKBBQAAyHXizRUH8pKEDPq8ZGlg8dVXX9lpp51mVapUcW9o0qRJB+V93XbbbS7fS900Xbp0sV9++SWszIYNG+zCCy90V8zWwJMBAwbYtm3bjvA7AQAAAPK2LA0stm/fbs2bN7ennnoq6uMPPvigPfHEE/bss8/anDlzrGjRotatWzfbtWtXsIyCih9//NE+++wz+/DDD12wcvnllx/BdwFkkHnjzF472+ynj82Sk9msAIAcq1atWvbYY49ldTXs9ttvtxYtWlhO9eKLL1rXrl3jWofa0TqRn+sDix49etjdd99tZ5555kGPqbdCB+Qtt9xiZ5xxhjVr1sxeffVVW716dbBnY+nSpTZlyhR74YUX7Nhjj7WOHTvamDFjbMKECa4ckK0peNi5MeV+nZPMVkwzm3C+2ZNtzOa+YLZnR1bWEABwBPXt29dlcFx55ZUHPTZo0CD3mMogb9i1a5fdeuutNmrUqOAynUivV6+ey9S5+OKLbc+ePcHHNm/e7B5buXJl2Hr69+9v3333nc2YMSPT65xtx1j89ttvbro4pT/5SpYs6QKIWbNmufv6X+lPbdq0CZZR+cTERNfDEcvu3btty5YtYTfgiNm/1+z78WZPH2v23uCU5UXLmbW/2qxgSbMNK8w+utbs0cZm0+4227qWHQQAeUD16tXdCdKdO3eGNTBff/11q1GjRpbWLTsJbVBnh9fWCfF9+/ale12pPe+tt95yAUSHDh3c/eTkZLvgggtc4Kk28Lx58+z5558Plr/pppvcYzVr1gxbT4ECBdzzlAWUZwMLBRVSsWLFsOW67z+m/zUtVqj8+fNbmTJlgmWiue+++1yQ4t/0IQYynXof5jxn9kRLs/cGmv3zs9nvM1N6LQoUNTv5DrPhS8y6P2BWqqbZzg1mXz1k9ngzs+3/sJMAIJdr1aqVa5e88847wWX6W0FFy5Ytw8qqoak2Te3atd1YVKWXqzEa
OvWuxp76j9evX98ef/zxsHWoB6Rnz5728MMPuzGtmqJXvSOpzQ60YsUKl02iNlmxYsWsbdu29vnnnx9UbuvWrXb++ee7VPaqVauGpb6rQa00Jb2vggULuvG211xzzSFTmpSlovdTqFAht1xTC1966aVWvnx51wg/6aST7IcffrD0WLx4scui0XvRe1JPwD//pPzmnnDCCTZ48GAbOnSolStXzqXlT58+3fUgTZ482Vq3bu3ew8yZM93Ja70PtU9VR2XTzJ07N7iuWM+LRgFmaAqT6qTbwIEDrXHjxnb66ae77B355ptv3OsMGTIk6rq0nvfffz8sYM1TgUVmGjFihOsu8m9//PFHVlcJudnOTWZfPWz2WFOzyTeYbf7DrGgFsy53mA1dZFa4dHj5gsXMjrvS7Jrvzc591azaMWZHnRjo0fCtWaxv5SP+VgAgx14EbM++LLnptdNLqSvjxo0L3n/ppZesX79+B5VTUKE0ceXQa7zpsGHD7KKLLrIvv/wyGHhUq1bNJk6caEuWLHET4owcOdLefPPNsPV88cUXLljQ/6+88oq9/PLL7haLJsk55ZRTbOrUqfb9999b9+7dXcN11apVYeUeeughF+yojM6mq9GrVB55++237dFHH7XnnnvOTcyjNPemTZumul2WL1/unqdAa8GCBW7ZOeec466/oIb6/PnzXWDWuXNnN7lPWigwUTCioE09AEqxX7t2rZ177rlh5bRddOb/66+/dtvbp/d1//33uwZ+s2bN7IYbbnB1VHmlHx199NEuEImsT+TzolHAEZqVo+BJwd+nn37qLmqn1CY9V0HgVVdd5bZlrGtQaD3qGUktoycjZNsL5Omy4qKdG3oVQN33B+GojH+lQJ82mnae//xoFB3qBhwRC//PbNpdgb9L1TDrMMSsxYVmSYEL0sSUmM+s0RmB257tKcs3rTJ77niz8vXN2g0ya3qOWX6OZwCIZefe/dbotk+yZAMtubObFSmQvuaWggOdBPVz5dWY1dlrne326cz4vffe63oK2rVr55YdddRRrjGqBmanTp0sKSnJ7rjjjuBzdKZfKTQKLEIbzqVLl7Ynn3zSNUobNGhgp556qgsaLrvssqj1U7Cgm++uu+6yd999150R15l9n1J41IAW5f7rfSiYOPnkk10QoraaUthVT/VcHHPMMYdMQVIgpQa26L1+++23ri3ot+vU86IgRT03aZnMR+9bQYW2ZWggp16jn3/+2dVb6tat6yYV8v3999/u/zvvvNO9H39SomeeecYFZeoBkbFjx7pgSoOwr7/++uDzQ58XK+DRyW/15PjU06F9pwBSQZqCOwWhClBOPPFE10Oiba5ejauvvjpsX+g6FcrSiRx/kWcCCx38OuB0YPuBhMZCKNJSVCb6IGnDK0JVd5JMmzbNRegaiwFkCTX8t603qxY4Jq3lxWZL3jdrdYlZk15m+Q7jY6c0Kd+aRWb5C5mtW2L23iCzz+8wO+Zyszb9zYpylVkAyOnUcFbjXg1U9Xjob6XgRJ6911nryMapGt+hKVNKP1JDWQ15pcHo8chZkpRWE3qmWyd0Fy1alGqPhVKTPvroI9fA1kldrTuyx8IPeELv+zNFqadBfysYUo+HGsnq9VBKeywaO+AHFaKUJ9Ul8grrqot6YNJC61BPjdKgImkdfmDhtzMjhfYorFixwvUe+GMiREGTAiY/ZSna86LxU5b8lC9fZGqVgh8FW+oVOv74413AoaCmSZMm7n5ob4jS4XTM5NrAQgeDPhihA7bVtaUxEopclcumWaMUJSrQ0Mh4RW7KBZSGDRu6g1ERtbqltDMVnfXu3TsswgOOiPXLzGY+ZrboTbOydc2u+sYsMdGsQBGzfh9l3Os0ODUwDuO7V8xmP2u2dbXZF3ebzXjErMX5ZieMNCuW8sULAHld4aR8rucgq177cOhMtH/GOdq0/P41u9S41/iFUP7Ze/VyXHfddfbII4+4Rn3x4sVdelJkOowav6F0
ZlwnaWPROnUWXr0DSvVRg/Xss89O14Bq9QgsW7bM9bhoXRo3oLopjSuyPj6N1YjcBgqCQntyfJrcJy20DgU0DzzwwEGPhWbMRL72oZYfyqGep2BJ+2HjxpDZI6O44oor3P7V/lJwoYBNvRPqsdK2DA0slNETGpjlusBCuWzquvENHz7c/d+nTx8XpStPTd1K6spSz4SiNOW+hUZv48ePdx885dNpNqhevXodkVHvQNBf35nNHG229ENl8gaWFasQGJSdWT0IhUsFUqqOG2j247tm34wxW7PQbOFEsy63s3MAIIQaaOlNR8pqOnGqhrrqrhz9SI0aNXIBhHoJ1IiMRqlH7du3d412X1rP5KdG69Wgb/9yAWqc//777weVmz179kH3dVLYp4BEjXrdNGBcaVjqKdE4ibRQOU3Wo14OXTfjcGgdGhOh56fWW5IWderUCY7D8Gdm0klv9TDoZHl6aD3axxobE+s6Fkqv0sl4DeL2AxB/0L3+1+D90P2u2cUiJwDIaFn6KdMo+9QGNenDpBw03WLRBtUUbMARt/p7s6l3Bq494WvwX7OOw1PSoDJbviSzZucGxlms/Nps4+9mhUoGHtNnS6lSGvjduGegLAAgR1Bqkp8+E21Arnof1HOgfHudrdbJV+Xkq1Gr2ZF0klYZH0qT+eSTT1zmx//+9z/XyNXf8dB6NYBaAYHaasooidbDobpoXIIyTdQroUHk6mERnUBWw1ep6zrD/tprr7lAI3Kq1NRofIZ6YrR+vY7SlnQdM72Ggp5DpRuJAhqNg9DsVTqhrXalsmnU26MZqGINho7VC3HVVVe5sRR+9o3qpfQjzc6VXgooNY4kWlCicSXK6tE29sfJKGhTepkCEQ0luPnmm4PlNdBbaWcKfjJTzgrfgexk+7+BoCIhX6Bh33GoWYWUMzFHVEKCWa2OgZtPgcaC8YHb56PMjr3CrFWfQG8HACDbU4CQGg2aVmqLZof69ddfXfqPzsBr5ic/TUbpMeedd54LANR4Vu+FZlCKx+jRo12qlnpDNPbjxhtvjHpNsGuvvdZlp2gAud6Lnuf3vqiuGnSsbBUFGJoR6oMPPjhovERq9J4+/vhj14DWrFnr169343M1tiDycgWxKHVejXO9BzXINShewY16jJQJk17333+/C7I0Za2m21Vwo8BODf/0UjCi5ytg1MDrUBpLoe0bmvqvYE0BpTJ3FNxoGmDfG2+8EXMwfkZK8A5nHrRcRh8G7TDtuEN9iJFH7d8XSDnas82szYEp//TRmfFwIKgofXhdsJlqxwazuS+affu82fYDs6cVKBYYTK7pbLNjnQEgTkr30JjN0GsdADnVOeec44JFzRJ2uDQVsabU1UDvyAAlLZ+b9LST8+R1LIA027vLbN5LZk+2Nnvn0sCZ/11bUnoJjr8++zbQi5Qx63R94FoZZzxlVqFRIDCa80zgIn1/zs/qGgIAgFRoQHu0GavSQzN3KSUuVlCRkUiFAqLZvdVs3jizWU+ZbTtwFfciZc2OvSoQUOQkSYXMWl4UuHbGiqmB97RxpVmVkAFc//xiVuaowLUzAABAtlCrVi13TYp4aCzKkUJgAURaNtns3SvNdm0K3C9R1az91YHrUIReTyKnUUB0dJfATb0ufu6oemXG9Qi8N80ypQBEV/8GAABIBwILwB8v4fdElKtntnuLWdmjzToMNWt2nln+ArlrOxUKyZFc/5NZ8r7AjFKTbzD74h6z1v0Cg71LcD0YAACQNgQWyNv+XWH29WNm+/aYnfVcYFnZOmb9PzGr2jpvpAZVaWE2bInZD6+bzXrabMOBbTLrycCVwk+4KZAmBQAAkAoGbyNvWrPI7K3+Zk+2MfvuVbOF/2e26Y+Ux6sfkzeCCp+uDt72UrPB88x6v2FWs0OgF0PbRalSAAAAh0CPBfKWVXPMZjxi9ssnKcvqdg1c1K5U9aysWfagcRcNTgncdEXxX6ebVWyU8vi0e8xKVDZrfr5ZUuGsrCkA
AMhmCCyQdyx80+ydAxeHSUg0a9TTrOMws8rNsrpm2VPVVoGbb8vfZjMfNUveazbt7kAPR9vLzIqVz8paAgCAbIJUKOReyfvNtqxOuV+/h1nRCoHZnZTyc844gor0KFjc7OQ7zUrWMNvxr9mXD5g92tjsvcFm637K+P0HAAByFAIL5D4aiP39a2ZPHWP2+rmBGZ/8hvHQhWanjwkM0Eb6aAradgPNrvne7JyXzaq2Mdu/2+z7/5k9fazZorfYogCQR66t8Nhjj2V1Nez222+3Fi1aWE714osvWteuXeNax7PPPmunnXaaZRcEFsg99uwwm/1s4KrS7w0y+3e52aZVZht/SynDuID45ctv1vhMs8ummvX/1KzhaWYFipvVOSmljLb7vt0Z8GIAkHf07dvXEhIS7MorrzzosUGDBrnHVAY5365du+zWW2+1UaNGBZd99tlnVq9ePStRooRdfPHFtmfPnuBjmzdvdo+tXLkybD39+/e37777zmbMmGHZAYEFcr6dm8y+esjssSZmU2402/KnWbGKgbSdYT8yVWpmqnGs2XmvmQ1bbFakTMryty81e6yZ2VcPm+3YkKlVAIDcpHr16jZhwgTbuXNnWCP09ddftxo1alheENqgzg6v7Xme7du3L93r8lJ53ltvveUCiA4dOrj7ycnJdsEFF7igctasWTZv3jx7/vnng+Vvuukm91jNmjXD1lOgQAH3vCeeeMKyAwIL5HyrZgUGEyvvv1RNs1NHmw1ZaNZhSCD9CZmvcKmUv7etD/RYbFtjNu2uwDiMj64LXDMEAJCqVq1aueDinXfeCS7T3woqWrZsGVZ2ypQp1rFjRytVqpSVLVvW/vvf/9qKFSnfta+++qoVK1bMfvnll+CygQMHWoMGDWzHjh1RX1/PP+OMM6xixYruuW3btrXPP//8oHJbt261888/34oWLWpVq1a1p556KqxBrTQl1blgwYJWpUoVu+aaaw6Z0vTCCy9Y7dq1rVChQm75pk2b7NJLL7Xy5cu7RvhJJ51kP/zwQ7qOoMWLF1uPHj3ce9F7Uk/AP//8E3z8hBNOsMGDB9vQoUOtXLly1q1bN5s+fbrrHZo8ebK1bt3avYeZM2fa7t273fuoUKGCq6O2/dy5c4Prmh7jedEoeAxNYVKddNP+ady4sZ1++um2dOlS99g333zjXmfIkCFR16X1vP/++2HBaFYhsEDOs3Gl2S8hX3J1uwVSc856wezq78zaDjBLCnwpIQtoligFdmc+b1apqdneHWZzx5qNaW024UKz1QvYLQCyxp7tsW+R1+xJtWxEAy5WucOk9JZx48YF77/00kvWr1+/g8pt377dhg8f7s5uT5061RITE+3MM890Z7/lkksusVNOOcUuvPBCd+b8o48+co338ePHW5EiRaK+9rZt29xztL7vv//eunfv7hquq1atCiv30EMPWfPmzV0ZnU1Xo1epPPL222/bo48+as8995wLaiZNmmRNmzZN9T0vX77cPU9B1IIFgd+Jc845x9atW+ca6vPnz3dBV+fOnW3DhrT1hCswUTCigEzbSIHY2rVr7dxzzw0r98orr7gz/19//bUbs+DT+7r//vtdA79Zs2Z2ww03uDqqvNKPjj76aBeIRNbnpojnRaOAo02bNsH7Cp4qV65sn376qQv6lNqk5+7du9euuuoqty3z5Yt+fS2tR/t3zpw5luU8eJs3b9boXvc/srG1Sz3vnSs87/bSnnd/Lc/bvS2ra4RDSU72vBXTPe+1czxvVInAbdHbbDcAmWbnzp3ekiVL3P8H8b+Hot1eOzu87N2VYpd96ZTwsg/Ujl4unfr06eOdccYZ3rp167yCBQt6v//+u7sVKlTIW79+vXtMZWJRGbVnFi1aFFy2YcMGr1q1at5VV13lVaxY0bvnnnvSXa/GjRt7Y8aMCd6vWbOm171797Ay5513ntejRw/39yOPPOLVq1fP27NnT5rWP2rUKC8pKcm9b9+MGTO8EiVK
eLt27QorW6dOHe+5554LPq958+Yx13vXXXd5Xbt2DVv2xx9/uG20bNkyd79Tp05ey5Ytw8p88cUXrsykSZOCy7Zt2+bqOH78+OAyvb8qVap4Dz74YMznRbNx40ZX7quvvgpbrvfcpk0br1atWt7AgQPd+u+8805vyJAh3uLFi7327du77Rq6L3ylS5f2Xn75ZS8zPjfpaSdzHQtkf3/NN5sx2uynD1OWVW4eSH0qUDQra4ZDSUgwO6pT4LZ+WWC2roanpzw+/xWz3VsCUwAXKsn2BIADZ69PPfVUe/nll11akf5Wmk4k9Qbcdttt7ky10mj8ngr1LjRp0sT9Xbp0aTf7kM6st2/f3p1NT416LJSapN6Nv//+250JV4pNZI9Fu3btDrrvzxSlngb9fdRRR7keD/WAqNcjf/7YzU6NHdD79inlSXVRilco1SU03Ss1WscXX3zh0qAiaR0aDC1KW4omtEdB5dV74I+JkKSkJDvmmGOCKUvRnheNn7Lkp3z5IlOrfv75Z5fOpl6h448/3vUKKa1L+1b3Q3tDChcuHDO97UgisED2tfZHs09GBq7+7GvwX7P/DDerGv1LANlY+fpmXe8KnxZ4+n1mW/82m/6AWes+ZsdeYVYqbwxOBJAFRoZc2yhSQkSayfXLUykbkUk+dJFlNKVDKfdfQscvhFJjXQ3ysWPHunEMCizU6IwcgPzVV1+5NBoFCkqfKl489vjD6667zqU0Pfzwwy7VRw3Ws88+O10DqjVGZNmyZW5shtalcQNKnfryyy9dYzwajdUIpaBCqUEatxBJY0rSQuvQNnrggQcOekzrjvXah1p+KEUP8TwFSxqLsXHjxlTLXXHFFfbII4+4/argQgGbUtg6derktmVoYKF0rNDALKswxgLZV2J+s1+/DHzZNz/fbOAcs97jCSpykxNGmJWrb7Znq9msJ80eb2E2sZ/Zn/OzumYAciP1cse6RY7NS7Vs4bSVjYPO9Ksxr7Pk6m2I9O+//7rG+y233OLGHTRs2DBqQ1UDf9Ww/uCDD9yZez9YiUXjDDSlrcZqaFxEpUqV7Pfffz+o3OzZsw+6rzr4FJCoUa/ZihQcaKajRYvSHoBpPMWaNWtcL4cCnNBbtN6bWOv48ccf3XU3IteR3qChTp06wXEYPu0b9TA0atQoXesqUKCAe86SJUtillEvU5kyZdwg7v379wdfz//fX+b3pmjmsMjB/VmBwALZw/59ZgvfNPvi3vAz3Kc9Frgg25nPmlVokJU1REbLXyDQSzFwttmFb5nV7mTm7Tf78R2zF04K9GIAQB6lHgal2KjxGW3QrlKcdOZbU5Jq4PO0adPcQO7ImZs0C5JmMlIKjQZt/9///Z+b6jSWunXrBgdQK5VIU5n6KVah1MB+8MEHXbqOelQmTpwYnLVIKVxqGGtGpl9//dVee+01F2hETpWami5durj0qp49e7oBzQpuFCTdfPPNbiB2WujaHzqTr9mrFACoAf7JJ5+4gfChDfO0UCCiQdTXX3+9GwSu/XLZZZe59KMBAwZYenXr1i3mjFEasH733XfbmDFjgvtaQZvSyxSgaWB9aEqWBnor7UzBT1YjsEDW0iwcc180G9PK7J3LAtc92BhyZqR1X7PSaf8iQg6UmGhW92SzPu+bXTkz0DuVmGRWL+QM3fZ/4pphBQByIk2xqls0mgFKU5ZqtiSlPw0bNsylG4VSQ18N4nvvDZy0Uw+E/laKzV9//RV1vaNHj3YNWY3HUI+DGsA68x/p2muvdQ18nSVXI1jP83tWlKqk9Cw1fpWuo5Qo9ZhEjpdIjVKFPv74YzeWQIGAxkP07t3bXSBO08amhdLDFAApiNAVrvX+Na2s6qftl16a6alXr14uWNM2UUCnQEXbK70GDBjg3p8ufBdJ+03bV/X3KVjT/taUwgpuNA2w74033nBBTnaQoBHclsdt2bLFSpYs6XZurA8wMtjurWbzXjKb9ZTZtrWBZUXKmh13ldkxlzOQ
N6/b/q9Z0ZAfoPevNlvyvlmb/oHjo0RKbiwAhFJKyG+//RZ2PQQgOzrnnHNcgDJixIjDXodSvTSlrnqO1JbNjM9NetrJDN7GkbfyG7M3epvtOhCll6hm1uEas5YXmxWIPq828pjQoEJpcn/MNdu1yWzmaLNvxpg1Pdus3aDAdTIAAMiBHnroIdeTEw8NyNfMUfEEFRmJHgt6LI6M5P1miQdyRBVQPNrErFhFs47DzJqeE8i3B1I7fpZ9HOjh0pXWfRqX8Z9rA9PZAgA9FsBhoccCOcO/K8xmPmr2zy9m/acErmug6xUM+NSsXL2UYANIjY6ThqcFbpoxatYYsyXvmf32pVmNdgQWAABkA6RCIXP8vTCQtqLGn3dgNom/vjOrduD6ExVSpqQD0kXH0Dkvm21cafbt82ZtL015bMUXZn/MCSwrmrbpCAEAQMYgsEDGWjnLbMYjZss/S1lWr7tZx+EpQQWQETRbWLd7wpfp2Pt9RqCXrHlvs+MGmZUPXFkVAABkLgILZBxdIfvVM1KuStr4rMAYikpN2MrIfJrgTtMT79lmtvp7s/kvB251uwUGetc+PpCKByBPiHbtBQCZ+3khsEB8A2o3/GZW7ujA/VrHm1Vsala1lVmHIWZls/5CLchDFDRotqgmvQIDvL95MjDg+5dPArdm55md9XxW1xJAJtNVjXWNgtWrV1v58uXdfV0TAcDBdNUJXeF9/fr17nOjz0s8CCyQfvv2mC2cYDbzscD1KIYuNEsqHLjQ2eXTzfJxWCELqQFRs33gpskDZj9t9v14szqdU8roYnv795gVTv9FjQBkb2ocaS5+TcOp4ALAoRUpUsRq1KhxWBcODMV0s0w3m3ZqjH33auA6AlsOXLGzUCmzi98xq8r4CWRjOzaYFSiWMq3xrKfNpt1t1vKiwEUZy9TO6hoCyIQzsfv27XNXXQYQW758+Sx//vwxe/a4QB4y1s5NZt+ODZz53bkhsKxYJbP2gwM57QWLs8WRvRUpE37/t6/M9m43+/Y5s7ljzRqcatbuarMax2ZVDQFkMDWSkpKS3A3AkUGPBT0Wh7b2R7Nn2gf+Ll3LrMNQs+bnmyWFX/IdyFEDvX/9InDBveWfpyyv1tas/dVmjQ5MQgAAQB63ZcsWd2XvzZs3W4kSJVItSzI8Drbxd7NVc8yanxe4X7GxWbvBZlVamjXqyRgK5Hzq7q1zUuC2bmkgwFj4f2Z/zjVb/DaBBQAAh4EeC3osUqiBpfn/F70VaHhds8CsVPXDOa6AnGfbOrO5L5jV65YyZkiDv+ePMzv2SrOS1bK6hgAAHHH0WCB9/pxnNmO02bKPUpYddZLZ3p1sSeQdxSqYnTgyfNnsZwJjMPR/4zMP9Ny1yKoaAgCQrZEKlZfpbOyHQwMDWZ0Es4anmf1neCDtCcjr6nc3W/9T4GreiyYGbjU7Bi64pyvKxzktHwAAuQmBRV6mOfz/nG+WmD9w8TANyi5fL6trBWQfR3cJ3FYvCIzD+PEds5UzA7eqbcwu/ZyreQMAcACBRV6xf29gUKrOvJ7xVMoUnLoSceVmZqVqZHUNgexL6U+9xpp1uT0wRe28l81q/yclqNAsU9v/MStWPqtrCgBAlmHwdm4fvK1xEt+/ZvbNE2abVgWW9ZscuCoxgMOze5tZ8j6zwqUC91dMM3v9PLOm5wbSpCo2YssCAHIFBm/DbNcWs3kvBq4wvH1dYIsUKWfWbmBg+lgAh69gsfD7y6ea7d9jtuC1wE3T2Gqgt/6PcSVTAAByG3oscmOPxdolZi91N9u9OXC/ZHWz9teYtbzIrECRrK4dkDv9Mdds1hizpR+YecmBZRUaBXowdEHJxHxZXUMAANKNHou8aN9us/wFA3+XqxcYP1G8olnHYWZNzzHLl5TVNQRyt+ptzaq/GrjA5Oxnzb571WzdErNvxpi1uDCrawcAQKajxyKn91j8s9zs60cDU8YOnpcSXGg8RYlqTIcJZJWdm8y+eyUwMYKu
gSF7tptNvcus7QCzcnXZNwCAbI8ei7zg7x8CF7Vb8p6mpAksW/65WYNTA38zyxOQtTSwu8OQ8GULXjeb80zgVq+HWfvBZjU7MA4DAJArMN1sTrPyG7MZjwSCCJ8aKLqoXfVjsrJmAA5FF57U5/XnySm3yi0CA70b9yRlEQHJyYFxOhqXw+B/ADlItk6F2r9/v91+++322muv2Zo1a6xKlSrWt29fu+WWWyzhwJetqj9q1CgbO3asbdq0yTp06GDPPPOM1a1bN/elQq1fZvbUgeAhIdGsSa/AGApmeQJyln9+MZv9dKAHY9+uwLKSNcwGzjp4xqnMpK9/NWB1S94faMT66ZSy/V8zb3/K467s/sDfSYXNSlRJKbtmcWAKXlfeCy9fsLhZ5ebhs2hpXNhB6042K1TKrF7XlLIL3zTbtTm8nv7f6hVq3TelrMa2bFub8rqh9ShU0uykm1PKTn/AbMOvIWVD1l2gmNlZz6WU/eTmQC+x214RZfMVMBvwSUrZD4YGrhcUWs/Q8tf9nBIsvHOF2dL3D96+vhF/BradvDcocLzou98SAv9rPf79IQvMilUIlP3sNrPv/negTEg5v2z/KWalqgfKfv242fxXDi7jP+/cV83K1gmU1bih0LLBcgfKnvqIWfn6gbLqTVf5yLq655nZiSNTfrtWfGG2YHz096W/j73CrFLTQNk/55n9MCF8ff469X+zc1PKrltqtvidKPU9ULZuN7NKTQJlNTbqp48itoF/SzSrflzKNNJb15qtmHrw+/fLVmySkuqolEidEAzbBq4Cgb9L1zIrUztQds+OwHF20DY4cDwUq2hWslrgb31+1v5oMRUtl5K5sH+f2drFqV8st3TNlKA2tbL6HPllZc2i2GX1OfLfm78//IktIiUVCS+7/ufwz0Ko/IXCy/67InCtrqhlC5iVOSrl/obfYpfNlz+87KY/Ats5msTE8LKb/0r5Lo/G/wzJ1jVme3dYTKVrp3xHbFtvtne7WcESgbGzWSTXpEI98MADLkh45ZVXrHHjxjZv3jzr16+fe3PXXHONK/Pggw/aE0884crUrl3bbr31VuvWrZstWbLEChUqZDmafmjWLAyc5RR9Ydc/JfDl0uGa8IMaQM6hRsd/HzU78RazeS+Zfft8oIHjBxXb1plN7JfSiA9r/CabNTrDrNP1gbI7NpiNPSmiIR3S+NX4jv+OTrmuzf01Ux6P/JFveLrZef9Luf9QKt8xuiL5RW+n3H+xa+AHMJoa7c36T065/87lZjv+iV5WAUhoYDHtrpRr8By0HeuFBxbzXzZbvzR6WY05Cw0sfvnE7K/5sRtaodTYU7AQTb6QQEy2rDb7d7nFpH3kNxr27069gRFsUYb0YsRqmIWW1VienRtSqUNIg00XdtywInbZ0IaVGk9/zUv9+i4+NdRDe9YjKVjwaXstmhi7bMPTUoKFf342mzs2dln9XvpldTLuqwdjl9VvqR9YqCH7ycjYZXs8lBJY/PuL2aSrYpfVhTR10k+0bSecH7tspxsDQZboOB/XPXZZ9Wx2uyfwtwLosSfGLttmQMrnXoH5851il9WsdWc+G/hbjePn/hO7bOR3xLMdY5c9+mSzi95KuT+2c+zvCKWE9vs45f64Hql8R7Qwu+LLlPv/65nKd0R9s8Hfptx/4/zY3xGaQXNYSFD15iVmq7+LXrZwGbMbf0u5/+4Vsb8j8hc2u2VNyv33rwl8/8QyalPK3x9fZ7ZkUuB40nGVA2TrwOKbb76xM844w049NTBuoFatWvbGG2/Yt99+G+yteOyxx1wPhsrJq6++ahUrVrRJkyZZ7969LUfSF7nOyHz9mNnmP82GLjIrXinwWG+dsWJefCBXKFo2ECC0v9psV8iPia6JsXJm7OdVax1+f2PID1yk3VtT/tYZUDVmY4lstCbkCzRC3RlUpeUkHkjPSQycNQxVonIgcAk9i+yX9c+Ohzb+9H7D1nvgOWVCzuz5jZPt60PWF1IP
NQxD6Wy1grLQ9am8yupMa6hjLg8pe2Cd/k29MaGOvy4QwISV9euRP0qjcmjE+wrddiHf3z0eDJSP3LbufoJZgaIhZe8PlA0GF35v04H/Q89mHn+DWdvLYpctXjmlbNtLA2Pzgr0rIb1YuoWO19MMg7oKfWQ5f/2hZ5G134qWDy8brEeyWdmjU8rWOM6s233R66r7oWXVy6H3F6ts6KQIqo/2c8z6hhxrmkVR7y9qfb1Az4JPvWoKrA9a54GbGqi+pKJmVdsc/P6VK6L/tY18mr3R1Sm0nBf+uj4dIwqWYykcWjbBrETV2GXD1psQfnyktl6J/AymVla9KHuLHroOUqRs2ter5xYOCWrDHov43BcqcfCJg1hldaInclnoeiJ7XNSrEE1oL7AkFTIrcKAn8lD0XB1D6hnNIbJ1KtS9995rzz//vH366adWr149++GHH6xr1642evRou/DCC+3XX3+1OnXq2Pfff28tWrQIPq9Tp07u/uOPPx51vbt373a30C6e6tWrZ30qlM72aBaZb54027o6sEwfgLNfClxoC0DeoAb6z1OiN+h1UyOhQoOUNAedVYvWoNdNP4x+ypK+7rf8FVI2olGvH6/QRrXOkusxAECetSW3pELddNNN7s00aNDA8uXL58Zc3HPPPS6oEI27EPVQhNJ9/7Fo7rvvPrvjjjss21A3pXKDNVPMzo2BZTpjoLOYrfoc2ZxrAFlPjXt/itpDUV5wWidu0NlIP0c7LQgqAADpkK0DizfffNPGjx9vr7/+uhtjsWDBAhs6dKgbxN2nT5/DXu+IESNs+PDhB/VYZJl9e8xmPmq2b2dg0I5y6Zr3Prj7DAAAAMimsnVgcf3117teC3+sRNOmTW3lypWux0GBRaVKgXEHa9eutcqVU3ICdT80NSpSwYIF3S3bKFberPOtgVzFRppyMlvvFgAAAOAg2Tp5dseOHZYY0RWvlKhk5f2auVmgFFxMnapp31J6H+bMmWPt2rWzHKXdILOmZxNUAAAAIEfK1qfGTzvtNDemokaNGi4VSoO0NXC7f//+7nFdy0KpUXfffbe7boU/3axSpXr27JnV1QcAAADyjGwdWIwZM8YFCgMHDrR169a5gOGKK66w2267LVjmhhtusO3bt9vll1/uLpDXsWNHmzJlSs6/hgUAAACQg2Tr6WaPlBxz5W0AAAAgm7aTs/UYCwAAAAA5A4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLgRWAAAAACIG4EFAAAAgLjlP9wn7t2719asWWM7duyw8uXLW5kyZeKvDQAAAIDc32OxdetWe+aZZ6xTp05WokQJq1WrljVs2NAFFjVr1rTLLrvM5s6dm3m1BQAAAJCzA4vRo0e7QGLcuHHWpUsXmzRpki1YsMB+/vlnmzVrlo0aNcr27dtnXbt2te7du9svv/ySuTUHAAAAkG0keJ7npaXg+eefb7fccos1btw41XK7d+92wUeBAgWsf//+lhNs2bLFSpYsaZs3b3Y9MQAAAAAsXe3kNAcWuRmBBQAAABBfOznuWaE0iPvHH3+0hQsXut6KjPbXX3/ZRRddZGXLlrXChQtb06ZNbd68ecHHFRfddtttVrlyZfe40rRIwwIAAACOrLgCixkzZrhxFyee
eKKdcMIJVr16dZsyZUqGVW7jxo3WoUMHS0pKssmTJ9uSJUvskUcesdKlSwfLPPjgg/bEE0/Ys88+a3PmzLGiRYtat27dbNeuXRlWDwAAAACWcalQycnJlpiYEou0bt3aNfQVVMjzzz9v9913n/3222+WEW666Sb7+uuvXQATjapepUoVu/baa+26665zy9RNU7FiRXv55Zetd+/eaXodUqEAAACAI5gKdeyxx9p3330XvL9nzx6rUaNG8L7+zsiegvfff9/atGlj55xzjlWoUMFatmxpY8eODT6uAEbX0lD6k09vXPXUTFWxKGVLGyn0BgAAAODwpSuwePLJJ+3SSy+1YcOG2fbt290Us+q1OO6449z/vXr1snvuuccyyq+//uqum1G3bl375JNP7KqrrrJrrrnGXnnlFfe4ggpRD0Uo3fcfi0a9KgpA/JtSuAAAAAAcoStvqydAF8DTuAYFEvp/2bJlbmzD/v37rW3btla1alXLKEq9Uo/Fvffe6+6rx2Lx4sVuPEWfPn0Oe70jRoyw4cOHB++rx4LgAgAAADiCg7fz5cvnGuYfffSRjRkzxvUiKMjo2bNnhgYVopmeGjVqFLZMV/petWqV+7tSpUru/7Vr14aV0X3/sWgKFizocsRCbwAAAACOYGChqWXffvtt10Px2Wef2emnn27/+c9/7Omnn7aMphmh1CMSSlf6rlmzpvu7du3aLoCYOnVqWO+DelDatWuX4fUBAAAAkAGBxejRo12600MPPeQa7hpIrZQkNeRnz57tli1atMgyisZyaL1KhVq+fLm9/vrrbuapQYMGuccTEhJs6NChdvfdd7uB3nrtSy65xM0UpR4UAAAAANlwuln1DrzxxhvuuhUrV6607t2729KlS4OPqwdDg6tDl8Xrww8/dKlXuuideig0NuKyyy4LPq7qaxC5Ao5NmzZZx44dXe9JvXr10vwaTDcLAAAAxNdOTldgodmW3nzzTevUqZP9+eefbprXn376KayMppstVKiQ5SQEFgAAAEB87eR0zQp1/fXX2ymnnGLNmzd3Yx382ZpC5bSgAgAAAED80tVjIRrHoF6Kpk2bWoMGDSw3oMcCAAAAOII9FqKAQjcAAAAASPesUPfff7/t2LEjTWU1S5SucwEAAAAgb0hzYLFkyRJ3/YiBAwfa5MmTbf369cHH9u3bZwsXLnSzMbVv397OO+88K168eGbVGQAAAEA2k+ZUqFdffdV++OEHe/LJJ+2CCy5w+Va6CreuYu33ZLRs2dIuvfRS69u3L4O4AQAAgDwk3YO3JTk52fVQ6FoWO3futHLlylmLFi3c/zkRg7cBAACAIzx4WxITE10goRsAAAAApHmMBQAAAADEQmABAAAAIG4EFgAAAADiRmABAAAA4MgHFnv37rX8+fPb4sWL4391AAAAAHkzsEhKSrIaNWrY/v37M6dGAAAAAPJGKtTNN99sI0eOtA0bNmR8jQAAAADkOId1HQtdfXv58uVWpUoVq1mzphUtWjTs8e+++y6j6gcAAAAgtwYWPXv2zPiaAAAAAMixEjzP8yyPS8+lygEAAIC8Yks62smH1WPhmz9/vi1dutT93bhxY2vZsmU8qwMAAACQQx1WYLFu3Trr3bu3TZ8+3UqVKuWWbdq0yU488USbMGGClS9fPqPrCQAAACC3zQp19dVX29atW+3HH390M0PpputaqKvkmmuuyfhaAgAAAMh9YyyUZ/X5559b27Ztw5Z/++231rVrV9d7kZMwxgIAAACIr518WD0WycnJ7kJ5kbRMjwEAAADIWw4rsDjppJNsyJAhtnr16uCyv/76y4YNG2adO3fOyPoBAAAAyK2BhS6Qp26RWrVqWZ06ddytdu3abtmYMWMyvpYAAAAAct+sUNWrV3dX19Y4i59++skta9iwoXXp0iWj6wcAAAAgNwYWe/futcKFC9uCBQvs5JNPdjcAAAAAeVu6U6E0QLtGjRq2f//+zKkRAAAAgLwxxuLm
m2+2kSNHuutXAAAAAED+wx28vXz5cqtSpYrVrFnTihYtGva4xl8AAAAAyDsOK7Do2bNnxtcEAAAAQN4JLPbt22cJCQnWv39/q1atWubUCgAAAEDuHmORP39+e+ihh1yAAQAAAACHFVj4V97+8ssv2YIAAAAADn+MRY8ePeymm26yRYsWWevWrQ8avH366acfzmoBAAAA5FAJnud56X1SYmLsjg6Nv8hp17jYsmWLlSxZ0jZv3mwlSpTI6uoAAAAAOa6dfFg9FsnJyYdbNwAAAAC50GGNsQAAAACAww4sTjnlFNcN4rv//vtt06ZNwfv//vuvNWrUKD2rBAAAAJDXAotPPvnEdu/eHbx/77332oYNG4L3NQXtsmXLMraGAAAAAHJXYBE5zvswxn0DAAAAyIUYYwEAAADgyAYWmkpWt8hlAAAAAPK2dE03q9Snvn37WsGCBd39Xbt22ZVXXhm8QF7o+AsAAAAAeUe6Aos+ffqE3b/ooosOKnPJJZfEXysAAAAAuTewGDduXObVBAAAAECOxeBtAAAAAHkrsNAF+TRYfOjQocFlGucxaNAgK1u2rBUrVsx69epla9euzdJ6AgAAAHlNjgks5s6da88995w1a9YsbPmwYcPsgw8+sIkTJ9qXX35pq1evtrPOOivL6gkAAADkRTkisNi2bZtdeOGFNnbsWCtdunRw+ebNm+3FF1+00aNH20knnWStW7d240C++eYbmz17dpbWGQAAAMhLckRgoVSnU0891bp06RK2fP78+bZ3796w5Q0aNLAaNWrYrFmzYq5P0+Ju2bIl7AYAAADgCM0KlRUmTJhg3333nUuFirRmzRorUKCAlSpVKmx5xYoV3WOx3HfffXbHHXdkSn0BAACAvChb91j88ccfNmTIEBs/frwVKlQow9Y7YsQIl0bl3/Q6AAAAAHJpYKFUp3Xr1lmrVq0sf/787qYB2k888YT7Wz0Te/bssU2bNoU9T7NCVapUKeZ6deXwEiVKhN0AAAAA5NJUqM6dO9uiRYvClvXr18+No7jxxhutevXqlpSUZFOnTnXTzMqyZcts1apV1q5duyyqNQAAAJD3ZOvAonjx4takSZOwZUWLFnXXrPCXDxgwwIYPH25lypRxPQ9XX321CyqOO+64LKo1AAAAkPdk68AiLR599FFLTEx0PRaa7albt2729NNPZ3W1AAAAgDwlwfM8z/I4TTdbsmRJN5Cb8RYAAABA+tvJ2XrwNgAAAICcgcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADEjcACAAAAQNwILAAAAADk7sDivvvus7Zt21rx4sWtQoUK1rNnT1u2bFlYmV27dtmgQYOsbNmyVqxYMevVq5etXbs2y+oMAAAA5EXZOrD48ssvXdAwe/Zs++yzz2zv3r3WtWtX2759e7DMsGHD7IMPPrCJEye68qtXr7azzjorS+sNAAAA5DUJnud5lkOsX7/e9VwogDj++ONt8+bNVr58eXv99dft7LPPdmV++ukna9iwoc2aNcuOO+64NK13y5YtVrJkSbe+EiVKZPK7AAAAAHKG9LSTs3WPRSS9ISlTpoz7f/78+a4Xo0uX
LsEyDRo0sBo1arjAIpbdu3e7jRR6AwAAAHD4ckxgkZycbEOHDrUOHTpYkyZN3LI1a9ZYgQIFrFSpUmFlK1as6B5LbeyGIi//Vr169UyvPwAAAJCb5ZjAQmMtFi9ebBMmTIh7XSNGjHC9H/7tjz/+yJA6AgAAAHlVfssBBg8ebB9++KF99dVXVq1ateDySpUq2Z49e2zTpk1hvRaaFUqPxVKwYEF3AwAAAJAHeiw0rlxBxbvvvmvTpk2z2rVrhz3eunVrS0pKsqlTpwaXaTraVatWWbt27bKgxgAAAEDelD+7pz9pxqf33nvPXcvCHzehcRGFCxd2/w8YMMCGDx/uBnRrpPrVV1/tgoq0zggFAAAAIJdPN5uQkBB1+bhx46xv377BC+Rde+219sYbb7jZnrp162ZPP/10qqlQkZhuFgAAAIivnZytA4sjhcACAAAAyEPXsQAAAACQPRFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAAIgbgQUAAACAuBFYAAAAACCwAAAAAJD16LEAAAAAEDcCCwAAAABxI7AAAAAAEDcCCwAAAABxI7AAAAAAEDcCCwAAAABxI7AAAAAAEDcCCwAAAABxyx//KoCsk5zsWbLn2X7Ps+RkC/lby832H3jcLU/2zDuwTGU8t+zgMnqeu5+c9vUGylv4eoN/h7x2sB4Hl9F6PfMsX0KC5UtMsAT9n5BgiQlmiYn6X8vN/R+4mSsXfCxBzzmwTI+7/+3A8sA6/XUF1qsygfUFHzuwPPD6IeuKLJMYpW56ncTA6wWfH3ydBD4mAADkcgQW2cDSv7fY3v3J4Q3WYOM3ekPY/Z1KY/mghrLfSPbXHa1MWGP6QAM6+He050Q2pvVcS70xfWA9KY3piIZ6sJ4hdY7SmPfvI+fwA6HoAVNEsBQa9EQGTBFBUtqCqlReO+S1woKq0AAvLKAL3A8L1kLfQ8hrh92P8vqxXjvBsjYQU4CbxRXIctmgCjB9FhBtI8T6jkiIWjZauYQ0PTf289NWMn3rTDjs107P9khruXRt4zQ+PyGObVyicJJVKF7IsjsCi2zg/LGzbdOOvVldjVwrslEXejY+rOGZ6pn98MZltDP8YQ3KkMbkoRqsoa8roUGYH2D5gV305SmBYVjwFaVHJCUwTD2484PKsOeG9g6F3E9rfOeeo8iTJhsAAOnSt30tu/30xpbdEVhkA5VKFLIiSfnCzt4eqgEa7Qxtvsg0lbA0l2jpM6GN4BivG3F2NvrrRj8DG/t1Y5WLfVY6NHUnzeslBeeIUPARGtj4vVRhQUrMYClaYJOSehbWAxfaixbseYsMmFJ68CJ7tyIDrKg9dREBVNi6YvTiHSrYi52GFzvYyw7SeqYv014/G5yrzuptkJPo+M4psrxHLs7tGqv2+v5IU9lMWGf0ekZfa9SyadwlaX6P6ahT+rZx1KVxrtNLU9lCSfksJyCwyAamDD0+q6sAHLZAAGqWzxIsh3zvAQCATMCsUAAAAADiRmABAAAAIG4EFgAAAADiRmABAAAAIG4EFgAAAADiRmABAAAAIG4EFgAAAADiRmABAAAAIG4EFgAAAADiRmABAAAAIG4EFgAAAADilj/+VeR8nue5/7ds2ZLVVQEAAACyDb997LeXU0NgYWZbt251G6N69eqZvW8AAACAHNleLlmyZKplEry0hB+5XHJysq1evdqKFy9uCQkJWRIJKqj5448/rESJEkf89ZE9cByA4wB8H4DfBWS39oFCBQUVVapUscTE1EdR0GOhgSaJiVatWjXLajpYCCzAcQC+D8DvAmgfIDu1Dw7VU+Fj8DYAAACAuBFY
AAAAAIgbgUU2ULBgQRs1apT7H3kXxwE4DsD3AfhdQE5uHzB4GwAAAEDc6LEAAAAAEDcCCwAAAABxI7AAAAAAEDcCi0xy++23u4vthd4aNGgQfHzXrl02aNAgK1u2rBUrVsx69epla9euDVvHqlWr7NRTT7UiRYpYhQoV7Prrr7d9+/ZlVpWRAb766is77bTT3EVktM8nTZp00EVmbrvtNqtcubIVLlzYunTpYr/88ktYmQ0bNtiFF17o5qouVaqUDRgwwLZt2xZWZuHChfaf//zHChUq5C6a8+CDD7L/ctBx0Ldv34O+H7p37x5WhuMgZ7vvvvusbdu27sKr+v7u2bOnLVu2LKxMRv0OTJ8+3Vq1auUGdh599NH28ssvH5H3iIw5Dk444YSDvg+uvPLKsDIcBznbM888Y82aNQteh6Jdu3Y2efLk3PldoCtvI+ONGjXKa9y4sff3338Hb+vXrw8+fuWVV3rVq1f3pk6d6s2bN8877rjjvPbt2wcf37dvn9ekSROvS5cu3vfff+99/PHHXrly5bwRI0awu7Ix7aebb77Ze+edd3RFe+/dd98Ne/z+++/3SpYs6U2aNMn74YcfvNNPP92rXbu2t3PnzmCZ7t27e82bN/dmz57tzZgxwzv66KO9888/P/j45s2bvYoVK3oXXniht3jxYu+NN97wChcu7D333HNH9L3i8I+DPn36uP0c+v2wYcOGsDIcBzlbt27dvHHjxrnP6IIFC7xTTjnFq1Gjhrdt27YM/R349ddfvSJFinjDhw/3lixZ4o0ZM8bLly+fN2XKlCP+nnF4x0GnTp28yy67LOz7QN/zPo6DnO/999/3PvroI+/nn3/2li1b5o0cOdJLSkpyx0Vu+y4gsMjEwEKNw2g2bdrkDqiJEycGly1dutQ1QGbNmuXu66BJTEz01qxZEyzzzDPPeCVKlPB2796dWdVGBopsUCYnJ3uVKlXyHnroobBjoWDBgi44EH0Z6Hlz584Nlpk8ebKXkJDg/fXXX+7+008/7ZUuXTrsOLjxxhu9+vXrs/+yoViBxRlnnBHzORwHuc+6devcsfDll19m6O/ADTfc4E5ihTrvvPNcgxbZ/zjwA4shQ4bEfA7HQe5UunRp74UXXsh13wWkQmUipbgoFeKoo45yqS3qxpL58+fb3r17XRqMT2lSNWrUsFmzZrn7+r9p06ZWsWLFYJlu3brZli1b7Mcff8zMaiOT/Pbbb7ZmzZqw/V6yZEk79thjw/a70p/atGkTLKPyiYmJNmfOnGCZ448/3goUKBB2bKh7fePGjey/HEJd1urOrl+/vl111VX277//Bh/jOMh9Nm/e7P4vU6ZMhv4OqEzoOvwy/jqQvY8D3/jx461cuXLWpEkTGzFihO3YsSP4GMdB7rJ//36bMGGCbd++3aVE5bbvgvxH9NXyEDUWldumRsPff/9td9xxh8uJX7x4sWtcqlGoBmQoHTB6TPR/6AHkP+4/hpzH32/R9mvofldjM1T+/Pndj1Bomdq1ax+0Dv+x0qVLZ+r7QPw0nuKss85y+3HFihU2cuRI69Gjh/sByJcvH8dBLpOcnGxDhw61Dh06uIajZNTvQKwyanDs3LnTjeVC9j0O5IILLrCaNWu6E5EaP3fjjTe6E0XvvPOOe5zjIHdYtGiRCyQ0nkLjKN59911r1KiRLViwIFd9FxBYZBI1EnwasKNAQ18cb775Jl/0QB7Xu3fv4N86C6XviDp16rhejM6dO2dp3ZDxNChTJ5VmzpzJ5s3DYh0Hl19+edj3gSb30PeATjroewG5Q/369V0QoV6rt956y/r06WNffvml5TakQh0hikTr1atny5cvt0qVKtmePXts06ZNYWU0A4AeE/0fOSOAf98vg5zF32/R9mvofl+3bl3Y45r1QTMEcWzkXkqXVBqEvh+E4yD3GDx4sH344Yf2xRdfWLVq1YLLM+p3IFYZzTxDb0X2Pw6i0YlICf0+4DjI+QoUKOBmamrd
urWbLax58+b2+OOP57rvAgKLI0TThersg85E6KBKSkqyqVOnBh9Xt6fGYKibTPS/us1CG5mfffaZO0DUdYacR2kv+uCH7nd1UWrsROh+15eLci5906ZNc13o/o+Nymg6U+Vkhh4bOhtCGlTO9Oeff7oxFvp+EI6DnE/j9tWYVLqDPsOR6YsZ9TugMqHr8Mv460D2Pg6i0VltCf0+4DjIfZKTk2337t2577vgiA4Vz0OuvfZab/r06d5vv/3mff31126KME0Nphkh/KnFNOXctGnT3NRi7dq1c7fIqcW6du3qpqjTdGHly5dnutlsbuvWrW4qON308Ro9erT7e+XKlcHpZkuVKuW999573sKFC93MQNGmm23ZsqU3Z84cb+bMmV7dunXDppvVDBKabvbiiy92U9VNmDDBTTHHdLM54zjQY9ddd52b7UPfD59//rnXqlUrt5937doVXAfHQc521VVXuaml9TsQOo3ojh07gmUy4nfAn2Ly+uuvdzPJPPXUU0w3m4OOg+XLl3t33nmn2//6PtBvw1FHHeUdf/zxwXVwHOR8N910k5sJTPtYv/26r9keP/3001z3XUBgkUk0xVflypW9AgUKeFWrVnX39QXiU0Ny4MCBbroxHQhnnnmm+7IJ9fvvv3s9evRw1yhQUKJgZe/evZlVZWSAL774wjUkI2+aXtSfcvbWW291gYGmme3cubOb0zrUv//+6wKJYsWKuank+vXr5xqjoXQNjI4dO7p16PhSwIKccRyoQaEfB/0oaIrBmjVrujnsQ6cRFI6DnC3a/tdN1zTI6N8BHW8tWrRwvzdqlIa+BrL3cbBq1SoXRJQpU8Z9n+u6RWoYhl7HQjgOcrb+/fu773p9RvXdr99+P6jIbd8FCfrnyPaRAAAAAMhtGGMBAAAAIG4EFgAAAADiRmABAAAAIG4EFgAAAADiRmABAAAAIG4EFgAAAADiRmABAAAAIG4EFgAAAADiRmABAMgVEhISbNKkSVldDQDIswgsAABx69u3r2vYR966d+/O1gWAPCJ/VlcAAJA7KIgYN25c2LKCBQtmWX0AAEcWPRYAgAyhIKJSpUpht9KlS7vH1HvxzDPPWI8ePaxw4cJ21FFH2VtvvRX2/EWLFtlJJ53kHi9btqxdfvnltm3btrAyL730kjVu3Ni9VuXKlW3w4MFhj//zzz925plnWpEiRaxu3br2/vvvs3cB4AghsAAAHBG33nqr9erVy3744Qe78MILrXfv3rZ06VL32Pbt261bt24uEJk7d65NnDjRPv/887DAQYHJoEGDXMChIERBw9FHHx32GnfccYede+65tnDhQjvllFPc62zYsIE9DABHQILned6ReCEAQO4eY/Haa69ZoUKFwpaPHDnS3dRjceWVV7rgwHfcccdZq1at7Omnn7axY8fajTfeaH/88YcVLVrUPf7xxx/baaedZqtXr7aKFSta1apVrV+/fnb33XdHrYNe45ZbbrG77rorGKwUK1bMJk+ezFgPADgCGGMBAMgQJ554YljgIGXKlAn+3a5du7DHdH/BggXub/VcNG/ePBhUSIcOHSw5OdmWLVvmggYFGJ07d061Ds2aNQv+rXWVKFHC1q1bF/d7AwAcGoEFACBDqCEfmZqUUTTuIi2SkpLC7isgUXACAMh8jLEAABwRs2fPPuh+w4YN3d/6X2MvlL7k+/rrry0xMdHq169vxYsXt1q1atnUqVPZWwCQTdFjAQDIELt377Y1a9aE/8jkz2/lypVzf2tAdps2baxjx442fvx4+/bbb+3FF190j2mQ9ahRo6xPnz52++232/r16+3qq6+2iy++2I2vEC3XOI0KFSq42aW2bt3qgg+VAwBkPQILAECGmDJlipsCNpR6G3766afgjE0TJkywgQMHunJvvPGGNWrUyD2m6WE/+eQTGzJkiLVt29bd1wxSo0ePDq5LQceuXbvs0Ucfteuuu84FLGeffTZ7DwCyCWaFAgBk/o9NQoK9
++671rNnT7Y2AORSjLEAAAAAEDcCCwAAAABxY4wFACDTcS1WAMj96LEAAAAAEDcCCwAAAABxI7AAAAAAEDcCCwAAAABxI7AAAAAAEDcCCwAAAABxI7AAAAAAEDcCCwAAAABxI7AAAAAAYPH6fzwywDJOkv9vAAAAAElFTkSuQmCC",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final epoch: 3000\n",
+ "Targets with >5% error at final epoch: 4401\n",
+ " target_name target estimate rel_abs_error\n",
+ "cd_4808/person_count/[age<40,age>34] 82852.0 15902.673828 0.808059\n",
+ "cd_4808/person_count/[age<45,age>39] 72130.0 21323.394531 0.704376\n",
+ " cd_4808/person_count/[age<10,age>4] 70636.0 22059.468750 0.687702\n",
+ " cd_4808/person_count/[age<15,age>9] 67675.0 22953.794922 0.660823\n",
+ "cd_4808/person_count/[age<35,age>29] 65916.0 23415.787109 0.644763\n"
+ ]
+ }
+ ],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "if LOG_PATH.exists():\n",
+ " log = pd.read_csv(LOG_PATH)\n",
+ " log[\"target_name\"] = log[\"target_name\"].str.strip('\"')\n",
+ "\n",
+ " # Mean and max error over epochs (achievable targets only)\n",
+ " by_epoch = (\n",
+ " log[log[\"achievable\"]]\n",
+ " .groupby(\"epoch\")[\"rel_abs_error\"]\n",
+ " .agg([\"mean\", \"max\"])\n",
+ " .reset_index()\n",
+ " )\n",
+ "\n",
+ " fig, ax = plt.subplots(figsize=(8, 4))\n",
+ " ax.plot(by_epoch[\"epoch\"], by_epoch[\"mean\"] * 100, label=\"Mean abs rel error (%)\")\n",
+ " ax.plot(\n",
+ " by_epoch[\"epoch\"],\n",
+ " by_epoch[\"max\"] * 100,\n",
+ " label=\"Max abs rel error (%)\",\n",
+ " linestyle=\"--\",\n",
+ " )\n",
+ " ax.set_xlabel(\"Epoch\")\n",
+ " ax.set_ylabel(\"Error (%)\")\n",
+ " ax.set_title(\"Calibration convergence over training epochs\")\n",
+ " ax.legend()\n",
+ " plt.tight_layout()\n",
+ " plt.show()\n",
+ "\n",
+ " # Identify targets that are still poorly calibrated at the final epoch\n",
+ " final_epoch = log[\"epoch\"].max()\n",
+ " final_log = log[log[\"epoch\"] == final_epoch]\n",
+ " stuck = final_log[final_log[\"rel_abs_error\"] > 0.05]\n",
+ " print(f\"Final epoch: {final_epoch}\")\n",
+ " print(f\"Targets with >5% error at final epoch: {len(stuck)}\")\n",
+ " if len(stuck) > 0:\n",
+ " print(\n",
+ " stuck[[\"target_name\", \"target\", \"estimate\", \"rel_abs_error\"]]\n",
+ " .sort_values(\"rel_abs_error\", ascending=False)\n",
+ " .head(5)\n",
+ " .to_string(index=False)\n",
+ " )\n",
+ "else:\n",
+ " print(f\"File not found: {LOG_PATH}\")\n",
+ " print(\"Pass --log-freq to unified_calibration.py to enable epoch logging.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0e382214b5f147d187d36a2058b9c724",
+ "metadata": {},
+ "source": [
+ "### 3.6 What good vs poor convergence looks like\n",
+ "\n",
+ "**Good convergence** (typical `national` run at epoch 100):\n",
+ "- Mean absolute relative error across achievable targets: < 1%\n",
+ "- Max absolute relative error: < 5%\n",
+ "- Error curve in the log plot is monotonically decreasing and has flattened by epoch 60–70\n",
+ "- No cluster of targets at consistently high error in the by-variable breakdown\n",
+ "\n",
+ "**Signs of poor convergence or data issues:**\n",
+ "- Mean error > 2% at epoch 100: consider increasing `--epochs` or checking `--learning-rate`\n",
+ "- A single variable with consistently high error across all geographies: the calibration matrix may not have enough variation in that variable across clones — check that `n_clones` is sufficient\n",
+ "- Several targets with `achievable=False`: those CDs have no clones assigned to them — inspect the geography assignment seed and clone count\n",
+ "- Error increasing after epoch 50+: learning rate may be too high; try reducing `LEARNING_RATE` to 0.05\n",
+ "\n",
+ "**Not all targets are equally important.** The `loss` column (squared relative error) is what the optimizer minimizes. Targets with large `loss` values at the final epoch are the ones pulling the optimizer's attention. Filter `calibration_log.csv` by `loss > 0.01` at the final epoch to find these.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5b09d5ef5b5e4bb6ab9b829b10b6a29f",
+ "metadata": {},
+ "source": [
+ "### 3.7 `achievable=False` targets\n",
+ "\n",
+ "A target is marked `achievable=False` when `row_sums[j] == 0`, i.e., no record in the calibration matrix has a nonzero contribution to that target. This can happen when:\n",
+ "\n",
+ "- The target's filter condition (e.g., `[snap>0]`) matches no households in any of the N clones assigned to that congressional district\n",
+ "- The clone count is too low for sparsely populated CDs\n",
+ "- The geography assignment (random block → CD mapping) did not assign any clone to that CD\n",
+ "\n",
+ "Non-achievable targets contribute nothing to the loss — the optimizer ignores them. They should be audited rather than silently ignored: a systematic pattern (e.g., all targets in CD 1050 are non-achievable) indicates a geography assignment failure.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2e863705",
+ "metadata": {},
+ "source": [
+ "### 3.8 `validation_results.csv` — post-H5 validation\n",
+ "\n",
+ "After each H5 file is built (Stage 4), the pipeline validates it by loading the H5, running `sim.calculate()` for every calibration target, and comparing the weighted aggregate against the target value from `policy_data.db`. This catches cases where the H5 assembly introduced errors that the matrix builder didn't predict — for example, entity cloning bugs, geography overrides that shifted values, or takeup draws that diverged.\n",
+ "\n",
+ "The validation is performed by `validate_area()` in `validate_staging.py`. For each target applicable to the H5's geographic area:\n",
+ "\n",
+ "1. Computes per-household values using `_calculate_target_values_standalone()` — the same function the matrix builder uses, ensuring apples-to-apples comparison.\n",
+ "2. Multiplies by `household_weight` from the H5 and sums: `sim_value = dot(per_hh, hh_weight)`.\n",
+ "3. Compares against the target value and records the error.\n",
+ "\n",
+ "Each row also gets a **sanity check** — a ceiling test that flags implausible aggregates (e.g., a state with $30T in employment income, or a district with 340M persons). Ceilings are defined per geo level in `SANITY_CEILINGS`.\n",
+ "\n",
+ "#### Column reference\n",
+ "\n",
+ "| Column | Description |\n",
+ "|---|---|\n",
+ "| `area_type` | `states`, `districts`, or `cities` |\n",
+ "| `area_id` | State abbreviation, district friendly name, or city name |\n",
+ "| `variable` | PolicyEngine variable name |\n",
+ "| `target_name` | Human-readable name with constraints (e.g., `snap [snap>0] state:NC`) |\n",
+ "| `period` | Source data period |\n",
+ "| `target_value` | Administrative target from `policy_data.db` |\n",
+ "| `sim_value` | Weighted aggregate from the H5 simulation |\n",
+ "| `error` | `sim_value - target_value` |\n",
+ "| `rel_error` | `error / target_value` (signed) |\n",
+ "| `abs_error` | $\\mid$ `error` $\\mid$ |\n",
+ "| `rel_abs_error` | $\\mid$ `error` $\\mid$ / $\\mid$ `target_value` $\\mid$ |\n",
+ "| `sanity_check` | `PASS` or `FAIL` — ceiling test |\n",
+ "| `sanity_reason` | Reason for sanity failure (empty if PASS) |\n",
+ "| `in_training` | Whether this target was in the `target_config.yaml` training set |\n",
+ "\n",
+ "The `in_training` column distinguishes targets that the optimizer saw (training) from targets it did not (holdout). Training targets with high error indicate poor convergence; holdout targets with high error indicate poor generalization.\n",
+ "\n",
+ "#### Structural sanity checks\n",
+ "\n",
+ "In addition to the per-target validation, `run_sanity_checks()` in `sanity_checks.py` runs structural integrity checks on each H5:\n",
+ "\n",
+ "- All households have positive weight\n",
+ "- Key monetary variables (`employment_income`, `adjusted_gross_income`, `snap`, `ssi`, `eitc`, `social_security`, `income_tax_before_credits`) exist and have reasonable ranges\n",
+ "- Takeup booleans (`takes_up_snap_if_eligible`, etc.) exist and are not all-True or all-False\n",
+ "- Entity ID cross-references are consistent (every person's `person_household_id` points to a valid household)\n",
+ "\n",
+ "#### Pipeline integration\n",
+ "\n",
+ "Validation rows from all areas are aggregated by `_collect_validation_diagnostics()` in `pipeline.py` and written to `runs/{run_id}/diagnostics/validation_results.csv` on the HuggingFace model repo. A summary (median relative absolute error, sanity failure count) is recorded in `meta.json`. National validation output from `validate_national_h5.py` is saved separately as `national_validation.txt`.\n",
+ "\n",
+ "#### Fetching validation results\n",
+ "\n",
+ "```python\n",
+ "from huggingface_hub import hf_hub_download\n",
+ "\n",
+ "path = hf_hub_download(\n",
+ " repo_id=\"policyengine/policyengine-us-data\",\n",
+ " repo_type=\"model\",\n",
+ " filename=f\"calibration/runs/{run_id}/diagnostics/validation_results.csv\",\n",
+ ")\n",
+ "df = pd.read_csv(path)\n",
+ "```"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "pe3.13 (3.13.0)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/methodology.md b/docs/methodology.md
index 9cca425b7..b92208c23 100644
--- a/docs/methodology.md
+++ b/docs/methodology.md
@@ -1,6 +1,6 @@
# Methodology
-PolicyEngine constructs its representative household dataset through a multi-stage pipeline. Survey data from the CPS is merged with tax detail from the IRS PUF, stratified, and supplemented with variables from ACS, SIPP, and SCF. The resulting dataset is then cloned to geographic variants, simulated through PolicyEngine US with stochastic take-up, and calibrated via L0-regularized optimization against administrative targets at the national, state, and congressional district levels. The pipeline produces 488 geographically representative H5 datasets.
+PolicyEngine constructs its representative household dataset through a five-step pipeline that runs on Modal, preceded by a prerequisite database build. The database build (`make database`) populates a SQLite store of administrative calibration targets. The five Modal steps are: (1) build datasets — assemble the enhanced microdata from CPS, PUF, ACS, SIPP, and SCF; (2) build package — run PolicyEngine on every clone to construct a sparse calibration matrix; (3) fit weights — find household weights via L0-regularized optimization against the administrative targets; (4) build H5 files — write 488 geographically representative datasets; (5) promote — move the staged files to production on HuggingFace.
```mermaid
graph TD
@@ -95,7 +95,7 @@ graph TD
classDef output fill:#5091CC,stroke:#2C6496,color:#FFFFFF
```
-## Stage 1: Variable Imputation
+## Stage 1: Baseline Dataset Build through Variable Imputation
The imputation process begins by aging both the CPS and PUF datasets to the target year, then creating a copy of the aged CPS dataset. This allows us to preserve the original CPS structure while adding imputed tax variables.
@@ -113,9 +113,9 @@ We clone the aged CPS dataset to create two versions. The first copy retains ori
This dual approach ensures that variables not collected in CPS are added from the PUF, while variables collected in CPS but with measurement error are replaced with more accurate PUF values. Most importantly, household structure and relationships are preserved in both copies.
-### Quantile Random Forests
+### Quantile Regression Forests
-Quantile Random Forests (QRF) is an extension of random forests that estimates conditional quantiles rather than conditional means. QRF builds an ensemble of decision trees on the training data and stores all observations in leaf nodes rather than just their means. This enables estimation of any quantile of the conditional distribution at prediction time.
+Quantile Regression Forests (QRF) is an extension of random forests that estimates conditional quantiles rather than conditional means. QRF builds an ensemble of decision trees on the training data and stores all observations in leaf nodes rather than just their means. This enables estimation of any quantile of the conditional distribution at prediction time.
#### QRF Sampling Process
@@ -147,16 +147,6 @@ For CPS Copy 2, we replace existing CPS income variables with more accurate PUF
We concatenate these two CPS copies to create the Extended CPS, effectively doubling the dataset size.
-### Additional Imputations
-
-Beyond PUF tax variables, we impute variables from three other data sources:
-
-From the Survey of Income and Program Participation (SIPP), we impute tip income using predictors including employment income, age, number of children under 18, and number of children under 6.
-
-From the Survey of Consumer Finances (SCF), we match auto loan balances based on household demographics and income, then calculate interest on auto loans from these imputed balances. We also impute various net worth components and wealth measures not available in CPS.
-
-From the American Community Survey (ACS), we impute property taxes for homeowners based on state of residence, household income, and demographic characteristics. We also impute rent values for specific tenure types where CPS data is incomplete, along with additional housing-related variables.
-
### Example: Tip Income Imputation
To illustrate how QRF preserves conditional distributions, consider tip income imputation. The training data from SIPP contains workers with employment income and tip income.
@@ -174,29 +164,25 @@ QRF finds that similar workers in SIPP have a conditional distribution of tip in
If the random quantile drawn is 0.85, the imputed tip income would be approximately \$6,500. This approach ensures that some similar workers receive no tips while others receive substantial tips, preserving realistic variation.
-## Stage 2: Stratification and Source Imputation
-
-After creating the Extended CPS, we reduce and enrich the dataset before calibration.
-
### Stratified Sampling
-The Extended CPS contains roughly 400K person records after the PUF cloning step. Running full microsimulation on every clone of this dataset would be prohibitively expensive. We apply stratified sampling to reduce the dataset to approximately 12,000 households while preserving the tails of the income distribution.
+The Extended CPS contains roughly 400K person records after the PUF cloning step. Running full microsimulation on every clone of this dataset would be prohibitively expensive. Before calibration, we apply stratified sampling to reduce the dataset to approximately 12,000 households while preserving the tails of the income distribution.
The stratification works in two steps. First, all households above the 99.5th percentile of adjusted gross income are retained unconditionally — this preserves the top 1% of the AGI distribution, which contributes disproportionately to tax revenue and is difficult to reconstruct from a uniform sample. Second, from the remaining households, we draw a uniform random sample to reach the target size. Weights are adjusted proportionally so that the stratified dataset still represents the full population.
### Source Imputation
-We then impute additional variables from three supplementary surveys onto the stratified CPS. These imputations use quantile regression forests with state of residence as a predictor, which allows the imputed values to reflect geographic variation.
+We then impute additional variables from three supplementary surveys onto the stratified CPS using quantile regression forests.
-**ACS (American Community Survey)**: Rent, real estate taxes. State is included as a predictor, which is important because property tax rates and rent levels vary substantially across states.
+**ACS (American Community Survey)**: Rent, real estate taxes. The ACS imputation includes state FIPS as a predictor, which allows the imputed values to reflect geographic variation in property tax rates and rent levels.
-**SIPP (Survey of Income and Program Participation)**: Tip income, bank account assets, stock assets, bond assets. These financial variables are not available in CPS and are imputed from SIPP's more detailed wealth module.
+**SIPP (Survey of Income and Program Participation)**: Tip income, bank account assets, stock assets, bond assets. The SIPP lacks state identifiers, so these imputations are state-blind at the microdata level — geographic variation in tip income and assets enters only through calibration weights, not through the imputed values themselves.
-**SCF (Survey of Consumer Finances)**: Net worth, auto loan balances, auto loan interest. SCF provides the most comprehensive household balance sheet data among US surveys.
+**SCF (Survey of Consumer Finances)**: Net worth, auto loan balances, auto loan interest. The SCF also lacks state identifiers, so these imputations are likewise state-blind.
The output of this stage is the source-imputed stratified CPS (`source_imputed_stratified_extended_cps_2024.h5`), which serves as the input to the geography-specific calibration pipeline.
-## Stage 3: Geography-Specific Calibration
+## Stage 2: Geography-Specific Calibration Setup
The calibration stage adjusts household weights so that the dataset matches administrative totals at the national, state, and congressional district levels simultaneously. This is the core innovation of the pipeline: rather than calibrating a single national dataset, we create geographic variants of each household and optimize a single weight vector over all variants jointly.
@@ -244,6 +230,10 @@ The **hierarchy inconsistency factor (HIF)** adjusts district-level estimates so
**State-specific uprating factors** adjust variables that depend on state-level policy parameters. For example, ACA premium tax credits depend on state-specific benchmark premiums from CMS and KFF data, so the uprating factor for PTC varies by state.
+## Stage 3: Calibration Weight-Fitting
+
+Once the matrix is built, we use it to fit household weights to calibration targets.
+
### L0-Regularized Optimization
The optimization finds a weight vector **w** such that the matrix-vector product **X · w** approximates the target vector **t**. The loss function minimizes the mean squared relative error between achieved and target values.
@@ -255,9 +245,9 @@ Two presets control the degree of sparsity:
- **Local preset** (λ_L0 = 1e-8): Retains 3–4 million records with nonzero weight. Used for building state and district H5 files where geographic detail matters.
- **National preset** (λ_L0 = 1e-4): Retains approximately 50,000 records. Used for the national web app dataset where fast computation is prioritized over geographic granularity.
-The optimizer is Adam with a learning rate of 0.15, running for 100–200 epochs. Training runs on GPU (A100 or T4) via Modal for production builds, or on CPU for local development.
+The optimizer is Adam with a learning rate of 0.15. The default epoch count is 100; production builds typically run 1,000–1,500 epochs to ensure convergence. Training runs on GPU (A100 or T4) via Modal for production builds, or on CPU for local development.
-## Stage 4: Local Area Dataset Generation
+## Stage 4: Local Area Calibrated Dataset Generation
Calibrated weights are converted into geography-specific H5 datasets — one per state, congressional district, and city.
@@ -277,11 +267,25 @@ Supplemental Poverty Measure thresholds vary by housing tenure and metropolitan
### Output
-The pipeline produces 488 H5 datasets: 51 state files (including DC), 435 congressional district files, a national file, and city files for New York City. Each file is a self-contained PolicyEngine dataset that can be loaded directly into `Microsimulation` for policy analysis.
+The pipeline produces 488 local H5 datasets: 51 state files (including DC), 435 congressional district files, a national file, and a city file for New York City. Each file is a self-contained PolicyEngine dataset that can be loaded directly into `Microsimulation` for policy analysis.
+
+## Key design decisions
+
+### Why 430 clones per household
+
+The pipeline clones each of the ~12,000 stratified households 430 times, producing approximately 5.2 million total records entering calibration. We chose 430 so that the population-weighted random block sampling covers every populated census block in the US with at least one clone in expectation. Fewer clones reduce geographic resolution; more clones increase memory and compute cost proportionally.
+
+### Why L0 regularization (not L1 or L2)
+
+L1 and L2 regularization shrink weights toward zero or toward uniform but retain all records with nonzero weight. Running PolicyEngine simulations at scale requires iterating over every nonzero-weight record, so retaining millions of records makes per-area simulation slow. L0 regularization drives most weights to *exactly* zero, producing a sparse weight vector where only a few hundred thousand records carry nonzero weight. The optimizer selects those records to collectively match the administrative targets, making per-area simulation fast while preserving calibration accuracy.
+
+### Why ~2,800 calibration targets
+
+We draw targets from every granular administrative source available: income by AGI bracket at the national and state level (IRS SOI), population by age and state (Census), benefit totals by program and state (USDA, CMS), and congressional district population counts. We chose this set empirically as roughly the largest number of targets for which the optimization converges stably given the number of clones — adding more targets increases distributional accuracy but risks optimization instability.
## Validation
-We validate the pipeline at multiple stages. Imputation quality is checked via out-of-sample prediction on held-out records from source datasets. Calibration quality is measured by comparing achieved target values (**X · w**) against administrative totals, reported as relative error per target. The validation script (`validate_staging`) computes these metrics across all state and district H5 files, flagging any area where relative error exceeds acceptable thresholds.
+We validate the pipeline at multiple stages. Imputation quality is checked via out-of-sample prediction on held-out records from source datasets. Calibration quality is measured by comparing achieved target values (**X · w**) against administrative totals, reported as relative error per target. The validation script (`validate_staging`) computes these metrics across all state and district H5 files, flagging any area where relative error exceeds a 10% threshold.
Structural integrity checks verify that weights are positive, that household structures remain intact (all members of a household receive the same weight), and that state populations sum to the national total.
@@ -293,7 +297,7 @@ The implementation is available at:
Key files:
- `policyengine_us_data/datasets/cps/extended_cps.py` — PUF imputation onto CPS
- `policyengine_us_data/calibration/create_stratified_cps.py` — Stratified sampling
-- `policyengine_us_data/calibration/create_source_imputed_cps.py` — ACS/SIPP/SCF source imputation
+- `policyengine_us_data/calibration/source_impute.py` — ACS/SIPP/SCF source imputation
- `policyengine_us_data/calibration/unified_calibration.py` — L0 calibration orchestrator
- `policyengine_us_data/calibration/unified_matrix_builder.py` — Sparse calibration matrix builder
- `policyengine_us_data/calibration/clone_and_assign.py` — Geography cloning and block assignment
diff --git a/modal_app/data_build.py b/modal_app/data_build.py
index a423761e4..216a5956c 100644
--- a/modal_app/data_build.py
+++ b/modal_app/data_build.py
@@ -468,49 +468,27 @@ def build_datasets(
for future in as_completed(futures):
future.result() # Raises if script failed
- # GROUP 2: Depends on Group 1 - run in parallel
- # cps.py needs acs, puf.py needs irs_puf + uprating
- print("=== Phase 2: Building CPS and PUF (parallel) ===")
- group2 = [
- (
- "policyengine_us_data/datasets/cps/cps.py",
- SCRIPT_OUTPUTS["policyengine_us_data/datasets/cps/cps.py"],
- ),
- (
- "policyengine_us_data/datasets/puf/puf.py",
- SCRIPT_OUTPUTS["policyengine_us_data/datasets/puf/puf.py"],
- ),
- ]
- with ThreadPoolExecutor(max_workers=2) as executor:
- futures = {
- executor.submit(
- run_script_with_checkpoint,
- script,
- output,
- branch,
- checkpoint_volume,
- env=env,
- log_file=log_file,
- ): script
- for script, output in group2
- }
- for future in as_completed(futures):
- future.result()
-
- # SEQUENTIAL: Extended CPS (needs both cps and puf)
- print("=== Phase 3: Building extended CPS ===")
- run_script_with_checkpoint(
+ # GROUP 2: Sequential chain — each step depends on the previous.
+ # cps.py needs acs; puf.py needs irs_puf + uprating + cps
+ # (pension imputation); extended_cps.py needs both cps and puf.
+ print("=== Phase 2: Building CPS → PUF → extended CPS ===")
+ for script in (
+ "policyengine_us_data/datasets/cps/cps.py",
+ "policyengine_us_data/datasets/puf/puf.py",
"policyengine_us_data/datasets/cps/extended_cps.py",
- SCRIPT_OUTPUTS["policyengine_us_data/datasets/cps/extended_cps.py"],
- branch,
- checkpoint_volume,
- env=env,
- log_file=log_file,
- )
+ ):
+ run_script_with_checkpoint(
+ script,
+ SCRIPT_OUTPUTS[script],
+ branch,
+ checkpoint_volume,
+ env=env,
+ log_file=log_file,
+ )
# GROUP 3: After extended_cps - run in parallel
# enhanced_cps and stratified_cps both depend on extended_cps
- print("=== Phase 4: Building enhanced and stratified CPS (parallel) ===")
+ print("=== Phase 3: Building enhanced and stratified CPS (parallel) ===")
phase4_futures = []
with ThreadPoolExecutor(max_workers=2) as executor:
if not skip_enhanced_cps:
@@ -545,11 +523,11 @@ def build_datasets(
for future in as_completed(phase4_futures):
future.result()
- # GROUP 4: After Phase 4 - run in parallel
+ # GROUP 4: After Phase 3 - run in parallel
# create_source_imputed_cps needs stratified_cps
# small_enhanced_cps needs enhanced_cps
print(
- "=== Phase 5: Building source imputed CPS "
+ "=== Phase 4: Building source imputed CPS "
"and small enhanced CPS (parallel) ==="
)
phase5_futures = []
diff --git a/modal_app/pipeline.py b/modal_app/pipeline.py
index 95d293d81..491c5d262 100644
--- a/modal_app/pipeline.py
+++ b/modal_app/pipeline.py
@@ -875,11 +875,12 @@ def run_pipeline(
print("\n[Step 3/5] Fit weights (skipped - completed)")
# ── Step 4: Build H5s + stage + diagnostics (parallel) ──
- # Per plan: all four tasks run in parallel:
# 4a. coordinate_publish (regional H5s)
# 4b. coordinate_national_publish (national H5)
# 4c. stage_base_datasets (datasets → HF staging)
- # 4d. upload_run_diagnostics (diagnostics → HF)
+ # 4d. upload_run_diagnostics (calibration diagnostics → HF)
+ # 4e. _write_validation_diagnostics (after H5 builds)
+ # 4f. upload_run_diagnostics (validation diagnostics → HF)
if not _step_completed(meta, "publish_and_stage"):
print(
"\n[Step 4/5] Building H5s, staging datasets, "
@@ -914,13 +915,13 @@ def run_pipeline(
)
# While H5 builds run, stage base datasets
- # and upload diagnostics in this container
+ # and upload calibration diagnostics in this container
pipeline_volume.reload()
print(" Staging base datasets to HF...")
stage_base_datasets(run_id, version, branch)
- print(" Uploading run diagnostics...")
+ print(" Uploading calibration diagnostics...")
upload_run_diagnostics(run_id, branch)
# Now wait for H5 builds to finish
@@ -959,6 +960,10 @@ def run_pipeline(
vol=pipeline_volume,
)
+ # Upload validation diagnostics (written after H5 builds)
+ print(" Uploading validation diagnostics...")
+ upload_run_diagnostics(run_id, branch)
+
_record_step(
meta,
"publish_and_stage",
diff --git a/policyengine_us_data/calibration/clone_and_assign.py b/policyengine_us_data/calibration/clone_and_assign.py
index 52a53c20e..8605fd42f 100644
--- a/policyengine_us_data/calibration/clone_and_assign.py
+++ b/policyengine_us_data/calibration/clone_and_assign.py
@@ -193,50 +193,6 @@ def _sample(size, mask_slice=None):
)
-def save_geography(geography: GeographyAssignment, path) -> None:
- """Save a GeographyAssignment to a compressed .npz file.
-
- Args:
- geography: The geography assignment to save.
- path: Output file path (should end in .npz).
- """
- from pathlib import Path
-
- path = Path(path)
- np.savez_compressed(
- path,
- block_geoid=geography.block_geoid,
- cd_geoid=geography.cd_geoid,
- county_fips=geography.county_fips,
- state_fips=geography.state_fips,
- n_records=np.array([geography.n_records]),
- n_clones=np.array([geography.n_clones]),
- )
-
-
-def load_geography(path) -> GeographyAssignment:
- """Load a GeographyAssignment from a .npz file.
-
- Args:
- path: Path to the .npz file saved by save_geography.
-
- Returns:
- GeographyAssignment with all fields restored.
- """
- from pathlib import Path
-
- path = Path(path)
- data = np.load(path, allow_pickle=True)
- return GeographyAssignment(
- block_geoid=data["block_geoid"],
- cd_geoid=data["cd_geoid"],
- county_fips=data["county_fips"],
- state_fips=data["state_fips"],
- n_records=int(data["n_records"][0]),
- n_clones=int(data["n_clones"][0]),
- )
-
-
def double_geography_for_puf(
geography: GeographyAssignment,
) -> GeographyAssignment:
diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py
index b3e6085a9..68ce32cee 100644
--- a/policyengine_us_data/calibration/publish_local_area.py
+++ b/policyengine_us_data/calibration/publish_local_area.py
@@ -477,13 +477,13 @@ def build_h5(
zip_codes[la_mask] = "90001"
data["zip_code"] = {time_period: zip_codes.astype("S")}
- # === Gap 4: Congressional district GEOID ===
+ # === Congressional district GEOID ===
clone_cd_geoids = np.array([int(cd) for cd in active_clone_cds], dtype=np.int32)
data["congressional_district_geoid"] = {
time_period: clone_cd_geoids,
}
- # === Gap 1: SPM threshold recalculation ===
+ # === SPM threshold recalculation ===
print("Recalculating SPM thresholds...")
unique_cds_list = sorted(set(active_clone_cds))
cd_geoadj_values = load_cd_geoadj_values(unique_cds_list)
diff --git a/policyengine_us_data/calibration/unified_matrix_builder.py b/policyengine_us_data/calibration/unified_matrix_builder.py
index 1917e1a14..e6d1ef49f 100644
--- a/policyengine_us_data/calibration/unified_matrix_builder.py
+++ b/policyengine_us_data/calibration/unified_matrix_builder.py
@@ -2274,7 +2274,7 @@ def build_matrix(
raise RuntimeError(f"Clone {ci} failed: {exc}") from exc
else:
- # ---- Sequential clone processing (unchanged) ----
+ # ---- Sequential clone processing ----
clone_dir = Path(cache_dir) if cache_dir else None
if clone_dir:
clone_dir.mkdir(parents=True, exist_ok=True)
diff --git a/policyengine_us_data/datasets/org/org.py b/policyengine_us_data/datasets/org/org.py
index b709bbc69..09ca8165f 100644
--- a/policyengine_us_data/datasets/org/org.py
+++ b/policyengine_us_data/datasets/org/org.py
@@ -1,3 +1,4 @@
+import logging
from functools import lru_cache
from microimpute.models.qrf import QRF
@@ -392,7 +393,13 @@ def load_org_training_data() -> pd.DataFrame:
"""Load ORG donor rows built from official CPS basic monthly files."""
cache_path = STORAGE_FOLDER / ORG_FILENAME
if cache_path.exists():
- return pd.read_csv(cache_path)
+ try:
+ return pd.read_csv(cache_path)
+ except (EOFError, pd.errors.ParserError):
+ logging.warning(
+ "Corrupt ORG cache %s — deleting and rebuilding", cache_path
+ )
+ cache_path.unlink()
months = []
for month in ORG_MONTHS:
diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
index bde0f33ff..242e0dc8f 100644
--- a/policyengine_us_data/datasets/puf/puf.py
+++ b/policyengine_us_data/datasets/puf/puf.py
@@ -167,14 +167,7 @@ def impute_pension_contributions_to_puf(puf_df):
from policyengine_us import Microsimulation
from policyengine_us_data.datasets.cps import CPS_2024
- # CPS_2024 may not exist yet during parallel CI builds.
- # Fall back to CPS_2021 release artifact if needed.
- try:
- cps = Microsimulation(dataset=CPS_2024)
- except Exception:
- from policyengine_us_data.datasets.cps import CPS_2021
-
- cps = Microsimulation(dataset=CPS_2021)
+ cps = Microsimulation(dataset=CPS_2024)
cps.subsample(10_000)
predictors = [