From e91724566a4125872d8958fe8439b07a68a5409d Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Wed, 1 Apr 2026 16:34:28 -0400
Subject: [PATCH 1/2] Add pre-publish H5 validation and improve pipeline
 observability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #677 (partial — items 2, 3, 5 from the issue).

- New validate_h5.py utility that checks entity dimension consistency,
  household_weight existence, and zero-weight sanity before upload.
- Integrate validation into worker_script.py so malformed H5 files are
  caught before being marked as completed.
- Surface Modal call ID in pipeline.yaml via ::notice annotations for
  GH Actions → Modal correlation.
- Add continue-on-error + clear ::error annotation to versioning.yaml
  so a broken PAT produces a human-readable failure instead of a
  cryptic git-auth error.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/pipeline.yaml               |   4 +-
 .github/workflows/versioning.yaml             |  11 ++
 modal_app/worker_script.py                    |   6 +
 .../tests/test_validate_h5.py                 | 179 ++++++++++++++++++
 policyengine_us_data/utils/validate_h5.py     | 134 +++++++++++++
 5 files changed, 333 insertions(+), 1 deletion(-)
 create mode 100644 policyengine_us_data/tests/test_validate_h5.py
 create mode 100644 policyengine_us_data/utils/validate_h5.py

diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml
index 606cf7f19..2982cdb38 100644
--- a/.github/workflows/pipeline.yaml
+++ b/.github/workflows/pipeline.yaml
@@ -64,5 +64,7 @@ jobs:
               num_workers=int('${NUM_WORKERS}'),
               skip_national='${SKIP_NATIONAL}' == 'true',
           )
-          print(f'Pipeline spawned. Monitor on the Modal dashboard.')
+          print(f'::notice ::Modal call ID: {fc.object_id}')
+          print(f'::notice ::Dashboard: https://modal.com/apps/policyengine/main/deployed/policyengine-us-data-pipeline')
+          print(f'Pipeline spawned. Call ID: {fc.object_id}')
           "
diff --git a/.github/workflows/versioning.yaml b/.github/workflows/versioning.yaml
index 9bad75b38..97334a27a 100644
--- a/.github/workflows/versioning.yaml
+++ b/.github/workflows/versioning.yaml
@@ -15,11 +15,22 @@ jobs:
         if: (github.event.head_commit.message != 'Update package version')
         runs-on: ubuntu-latest
         steps:
+          # Checkout requires a PAT (POLICYENGINE_GITHUB) with repo write
+          # access so the workflow can push the version-bump commit back to
+          # main.  If the secret is missing or expired the step fails with a
+          # cryptic git-auth error.  See issue #677 for PAT rotation.
           - name: Checkout repo
+            id: checkout
+            continue-on-error: true
             uses: actions/checkout@v4
             with:
               token: ${{ secrets.POLICYENGINE_GITHUB }}
               fetch-depth: 0
+          - name: Abort if checkout failed (PAT issue)
+            if: steps.checkout.outcome == 'failure'
+            run: |
+              echo "::error ::Checkout failed — the POLICYENGINE_GITHUB PAT is likely expired or missing. See https://github.com/PolicyEngine/policyengine-us-data/issues/677"
+              exit 1
           - name: Setup Python
             uses: actions/setup-python@v5
             with:
diff --git a/modal_app/worker_script.py b/modal_app/worker_script.py
index 27dbb8c2a..f3c18181d 100644
--- a/modal_app/worker_script.py
+++ b/modal_app/worker_script.py
@@ -211,6 +211,7 @@ def main():
         NYC_COUNTY_FIPS,
         AT_LARGE_DISTRICTS,
     )
+    from policyengine_us_data.utils.validate_h5 import validate_h5_or_raise
     from policyengine_us_data.calibration.calibration_utils import (
         STATE_CODES,
     )
@@ -426,6 +427,11 @@ def main():
                 raise ValueError(f"Unknown item type: {item_type}")
 
             if path:
+                validate_h5_or_raise(
+                    path,
+                    label=f"{item_type}:{item_id}",
+                    period=args.period,
+                )
                 results["completed"].append(f"{item_type}:{item_id}")
                 print(
                     f"Completed {item_type}:{item_id}",
diff --git a/policyengine_us_data/tests/test_validate_h5.py b/policyengine_us_data/tests/test_validate_h5.py
new file mode 100644
index 000000000..b3a86ef6a
--- /dev/null
+++ b/policyengine_us_data/tests/test_validate_h5.py
@@ -0,0 +1,179 @@
+"""Tests for H5 pre-publish validation."""
+
+from unittest.mock import patch, MagicMock
+
+import h5py
+import numpy as np
+import pytest
+
+from policyengine_us_data.utils.validate_h5 import (
+    validate_h5_entity_dimensions,
+    validate_h5_or_raise,
+)
+
+
+def _make_mock_tbs(variable_entities: dict[str, str]):
+    """Build a mock CountryTaxBenefitSystem with given variable→entity mappings."""
+    tbs = MagicMock()
+    variables = {}
+    for var_name, entity_key in variable_entities.items():
+        var_mock = MagicMock()
+        var_mock.entity.key = entity_key
+        variables[var_name] = var_mock
+    tbs.variables = variables
+    return tbs
+
+
+def _write_h5(path, period, datasets: dict[str, np.ndarray]):
+    with h5py.File(path, "w") as f:
+        grp = f.create_group(str(period))
+        for name, arr in datasets.items():
+            grp.create_dataset(name, data=arr)
+
+
+PERIOD = 2024
+N_PERSONS = 10
+N_HOUSEHOLDS = 5
+
+
+@pytest.fixture
+def mock_tbs():
+    return _make_mock_tbs(
+        {
+            "person_id": "person",
+            "household_id": "household",
+            "age": "person",
+            "household_weight": "household",
+            "income": "person",
+        }
+    )
+
+
+class TestDimensionsMatch:
+    def test_all_correct(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "good.h5"
+        _write_h5(
+            h5_path,
+            PERIOD,
+            {
+                "person_id": np.arange(N_PERSONS),
+                "household_id": np.arange(N_HOUSEHOLDS),
+                "age": np.ones(N_PERSONS),
+                "income": np.ones(N_PERSONS),
+                "household_weight": np.ones(N_HOUSEHOLDS),
+            },
+        )
+        with patch(
+            "policyengine_us.CountryTaxBenefitSystem",
+            return_value=mock_tbs,
+        ):
+            results = validate_h5_entity_dimensions(h5_path, period=PERIOD)
+        assert results == []
+
+    def test_or_raise_passes(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "good.h5"
+        _write_h5(
+            h5_path,
+            PERIOD,
+            {
+                "person_id": np.arange(N_PERSONS),
+                "household_id": np.arange(N_HOUSEHOLDS),
+                "age": np.ones(N_PERSONS),
+                "income": np.ones(N_PERSONS),
+                "household_weight": np.ones(N_HOUSEHOLDS),
+            },
+        )
+        with patch(
+            "policyengine_us.CountryTaxBenefitSystem",
+            return_value=mock_tbs,
+        ):
+            validate_h5_or_raise(h5_path, period=PERIOD)
+
+
+class TestPersonDimensionMismatch:
+    def test_wrong_person_variable_length(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "bad_dim.h5"
+        _write_h5(
+            h5_path,
+            PERIOD,
+            {
+                "person_id": np.arange(N_PERSONS),
+                "household_id": np.arange(N_HOUSEHOLDS),
+                "age": np.ones(N_PERSONS + 99),  # wrong length
+                "income": np.ones(N_PERSONS),
+                "household_weight": np.ones(N_HOUSEHOLDS),
+            },
+        )
+        with patch(
+            "policyengine_us.CountryTaxBenefitSystem",
+            return_value=mock_tbs,
+        ):
+            results = validate_h5_entity_dimensions(h5_path, period=PERIOD)
+        fails = [r for r in results if r["status"] == "FAIL"]
+        assert len(fails) == 1
+        assert "age" in fails[0]["detail"]
+
+    def test_or_raise_raises(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "bad_dim.h5"
+        _write_h5(
+            h5_path,
+            PERIOD,
+            {
+                "person_id": np.arange(N_PERSONS),
+                "household_id": np.arange(N_HOUSEHOLDS),
+                "age": np.ones(N_PERSONS + 99),
+                "income": np.ones(N_PERSONS),
+                "household_weight": np.ones(N_HOUSEHOLDS),
+            },
+        )
+        with patch(
+            "policyengine_us.CountryTaxBenefitSystem",
+            return_value=mock_tbs,
+        ):
+            with pytest.raises(ValueError, match="age"):
+                validate_h5_or_raise(h5_path, period=PERIOD)
+
+
+class TestMissingHouseholdWeight:
+    def test_missing_weight(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "no_weight.h5"
+        _write_h5(
+            h5_path,
+            PERIOD,
+            {
+                "person_id": np.arange(N_PERSONS),
+                "household_id": np.arange(N_HOUSEHOLDS),
+                "age": np.ones(N_PERSONS),
+                "income": np.ones(N_PERSONS),
+            },
+        )
+        with patch(
+            "policyengine_us.CountryTaxBenefitSystem",
+            return_value=mock_tbs,
+        ):
+            results = validate_h5_entity_dimensions(h5_path, period=PERIOD)
+        checks = [r["check"] for r in results]
+        assert "household_weight_exists" in checks
+
+
+class TestAllZeroWeights:
+    def test_zero_weights(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "zero_weight.h5"
+        _write_h5(
+            h5_path,
+            PERIOD,
+            {
+                "person_id": np.arange(N_PERSONS),
+                "household_id": np.arange(N_HOUSEHOLDS),
+                "age": np.ones(N_PERSONS),
+                "income": np.ones(N_PERSONS),
+                "household_weight": np.zeros(N_HOUSEHOLDS),
+            },
+        )
+        with patch(
+            "policyengine_us.CountryTaxBenefitSystem",
+            return_value=mock_tbs,
+        ):
+            results = validate_h5_entity_dimensions(h5_path, period=PERIOD)
+        checks = [r["check"] for r in results]
+        assert "household_weight_nonzero" in checks
diff --git a/policyengine_us_data/utils/validate_h5.py b/policyengine_us_data/utils/validate_h5.py
new file mode 100644
index 000000000..0e04e0998
--- /dev/null
+++ b/policyengine_us_data/utils/validate_h5.py
@@ -0,0 +1,134 @@
+"""Pre-publish validation for H5 dataset files.
+
+Checks entity dimension consistency and weight sanity before upload.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import h5py
+import numpy as np
+
+from policyengine_us_data.utils.downsample import ENTITY_ID_VARIABLES
+
+
+def validate_h5_entity_dimensions(
+    h5_path: str | Path, period: int = 2024
+) -> list[dict]:
+    """Validate that every variable in the H5 has the correct entity length.
+
+    Args:
+        h5_path: Path to an H5 dataset file.
+        period: Tax year key inside the H5 (top-level group).
+
+    Returns:
+        List of ``{check, status, detail}`` dicts.
+    """
+    from policyengine_us import CountryTaxBenefitSystem
+
+    tbs = CountryTaxBenefitSystem()
+    results: list[dict] = []
+    h5_path = Path(h5_path)
+
+    with h5py.File(h5_path, "r") as f:
+        group = f[str(period)]
+        variable_names = list(group.keys())
+
+        entity_counts: dict[str, int] = {}
+        for entity_key, id_var in ENTITY_ID_VARIABLES.items():
+            if id_var in group:
+                entity_counts[entity_key] = len(group[id_var])
+
+        # Dimension checks
+        for var_name in variable_names:
+            variable_meta = tbs.variables.get(var_name)
+            if variable_meta is None:
+                continue
+            entity_key = getattr(getattr(variable_meta, "entity", None), "key", None)
+            expected = entity_counts.get(entity_key)
+            if expected is None:
+                continue
+            actual = len(group[var_name])
+            if actual != expected:
+                results.append(
+                    {
+                        "check": "dimension",
+                        "status": "FAIL",
+                        "detail": (
+                            f"{var_name} ({entity_key}): "
+                            f"expected {expected}, got {actual}"
+                        ),
+                    }
+                )
+
+        # household_weight existence
+        if "household_weight" not in group:
+            results.append(
+                {
+                    "check": "household_weight_exists",
+                    "status": "FAIL",
+                    "detail": "household_weight not found in H5",
+                }
+            )
+        else:
+            weights = np.asarray(group["household_weight"])
+            if np.all(weights == 0):
+                results.append(
+                    {
+                        "check": "household_weight_nonzero",
+                        "status": "FAIL",
+                        "detail": "all household_weight values are zero",
+                    }
+                )
+
+        # Reasonable household count
+        hh_count = entity_counts.get("household", 0)
+        if hh_count == 0:
+            results.append(
+                {
+                    "check": "household_count",
+                    "status": "FAIL",
+                    "detail": "household count is zero",
+                }
+            )
+
+    return results
+
+
+def validate_h5_or_raise(
+    h5_path: str | Path, label: str = "", period: int = 2024
+) -> None:
+    """Run all H5 validations and raise on any failure.
+
+    Args:
+        h5_path: Path to the H5 file.
+        label: Optional label for error messages.
+        period: Tax year key inside the H5.
+
+    Raises:
+        ValueError: If any validation check fails.
+    """
+    failures = validate_h5_entity_dimensions(h5_path, period=period)
+    if failures:
+        tag = f" [{label}]" if label else ""
+        lines = [f"H5 validation failed{tag} for {h5_path}:"]
+        for f in failures:
+            lines.append(f"  {f['check']}: {f['detail']}")
+        raise ValueError("\n".join(lines))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print(f"Usage: {sys.argv[0]} <h5_path> [period]", file=sys.stderr)
+        sys.exit(1)
+    path = sys.argv[1]
+    yr = int(sys.argv[2]) if len(sys.argv) > 2 else 2024
+    issues = validate_h5_entity_dimensions(path, period=yr)
+    if issues:
+        for issue in issues:
+            print(f"[{issue['status']}] {issue['check']}: {issue['detail']}")
+        sys.exit(1)
+    else:
+        print(f"All checks passed for {path}")

From 00b5e869afd8cdccd0ec99bc40fde1b25b86857b Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Wed, 1 Apr 2026 16:41:06 -0400
Subject: [PATCH 2/2] Fix H5 validator to handle both flat and variable/period
 layouts

Pipeline-built files (build_h5) use variable/period nesting while
storage files use flat top-level datasets. Auto-detect layout in
_read_array(). Tests now cover both formats.

Verified against real files: SC.h5 (nested) and extended_cps_2024.h5 (flat).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../tests/test_validate_h5.py                 | 150 ++++++++----------
 policyengine_us_data/utils/validate_h5.py     |  48 ++++--
 2 files changed, 103 insertions(+), 95 deletions(-)

diff --git a/policyengine_us_data/tests/test_validate_h5.py b/policyengine_us_data/tests/test_validate_h5.py
index b3a86ef6a..599f6ddf5 100644
--- a/policyengine_us_data/tests/test_validate_h5.py
+++ b/policyengine_us_data/tests/test_validate_h5.py
@@ -13,7 +13,7 @@
 
 
 def _make_mock_tbs(variable_entities: dict[str, str]):
-    """Build a mock CountryTaxBenefitSystem with given variable→entity mappings."""
+    """Build a mock CountryTaxBenefitSystem with given variable->entity mappings."""
     tbs = MagicMock()
     variables = {}
     for var_name, entity_key in variable_entities.items():
@@ -24,17 +24,33 @@ def _make_mock_tbs(variable_entities: dict[str, str]):
     return tbs
 
 
-def _write_h5(path, period, datasets: dict[str, np.ndarray]):
+def _write_h5_flat(path, datasets: dict[str, np.ndarray]):
+    """Flat layout: datasets at the top level (storage files)."""
     with h5py.File(path, "w") as f:
-        grp = f.create_group(str(period))
         for name, arr in datasets.items():
-            grp.create_dataset(name, data=arr)
+            f.create_dataset(name, data=arr)
+
+
+def _write_h5_nested(path, period, datasets: dict[str, np.ndarray]):
+    """Nested layout: variable/period (pipeline-built files)."""
+    with h5py.File(path, "w") as f:
+        for name, arr in datasets.items():
+            grp = f.create_group(name)
+            grp.create_dataset(str(period), data=arr)
 
 
 PERIOD = 2024
 N_PERSONS = 10
 N_HOUSEHOLDS = 5
 
+GOOD_DATA = {
+    "person_id": np.arange(N_PERSONS),
+    "household_id": np.arange(N_HOUSEHOLDS),
+    "age": np.ones(N_PERSONS),
+    "income": np.ones(N_PERSONS),
+    "household_weight": np.ones(N_HOUSEHOLDS),
+}
+
 
 @pytest.fixture
 def mock_tbs():
@@ -49,20 +65,10 @@ def mock_tbs():
     )
 
 
-class TestDimensionsMatch:
+class TestFlatLayout:
     def test_all_correct(self, tmp_path, mock_tbs):
         h5_path = tmp_path / "good.h5"
-        _write_h5(
-            h5_path,
-            PERIOD,
-            {
-                "person_id": np.arange(N_PERSONS),
-                "household_id": np.arange(N_HOUSEHOLDS),
-                "age": np.ones(N_PERSONS),
-                "income": np.ones(N_PERSONS),
-                "household_weight": np.ones(N_HOUSEHOLDS),
-            },
-        )
+        _write_h5_flat(h5_path, GOOD_DATA)
         with patch(
             "policyengine_us.CountryTaxBenefitSystem",
             return_value=mock_tbs,
@@ -70,62 +76,59 @@ def test_all_correct(self, tmp_path, mock_tbs):
             results = validate_h5_entity_dimensions(h5_path, period=PERIOD)
         assert results == []
 
-    def test_or_raise_passes(self, tmp_path, mock_tbs):
-        h5_path = tmp_path / "good.h5"
-        _write_h5(
-            h5_path,
-            PERIOD,
-            {
-                "person_id": np.arange(N_PERSONS),
-                "household_id": np.arange(N_HOUSEHOLDS),
-                "age": np.ones(N_PERSONS),
-                "income": np.ones(N_PERSONS),
-                "household_weight": np.ones(N_HOUSEHOLDS),
-            },
-        )
+    def test_wrong_person_length(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "bad.h5"
+        data = {**GOOD_DATA, "age": np.ones(N_PERSONS + 99)}
+        _write_h5_flat(h5_path, data)
         with patch(
             "policyengine_us.CountryTaxBenefitSystem",
             return_value=mock_tbs,
         ):
-            validate_h5_or_raise(h5_path, period=PERIOD)
+            results = validate_h5_entity_dimensions(h5_path, period=PERIOD)
+        dim_fails = [r for r in results if r["check"] == "dimension"]
+        assert len(dim_fails) == 1
+        assert "age" in dim_fails[0]["detail"]
 
 
-class TestPersonDimensionMismatch:
-    def test_wrong_person_variable_length(self, tmp_path, mock_tbs):
-        h5_path = tmp_path / "bad_dim.h5"
-        _write_h5(
-            h5_path,
-            PERIOD,
-            {
-                "person_id": np.arange(N_PERSONS),
-                "household_id": np.arange(N_HOUSEHOLDS),
-                "age": np.ones(N_PERSONS + 99),  # wrong length
-                "income": np.ones(N_PERSONS),
-                "household_weight": np.ones(N_HOUSEHOLDS),
-            },
-        )
+class TestNestedLayout:
+    def test_all_correct(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "good_nested.h5"
+        _write_h5_nested(h5_path, PERIOD, GOOD_DATA)
         with patch(
             "policyengine_us.CountryTaxBenefitSystem",
             return_value=mock_tbs,
         ):
             results = validate_h5_entity_dimensions(h5_path, period=PERIOD)
-        fails = [r for r in results if r["status"] == "FAIL"]
-        assert len(fails) == 1
-        assert "age" in fails[0]["detail"]
-
-    def test_or_raise_raises(self, tmp_path, mock_tbs):
-        h5_path = tmp_path / "bad_dim.h5"
-        _write_h5(
-            h5_path,
-            PERIOD,
-            {
-                "person_id": np.arange(N_PERSONS),
-                "household_id": np.arange(N_HOUSEHOLDS),
-                "age": np.ones(N_PERSONS + 99),
-                "income": np.ones(N_PERSONS),
-                "household_weight": np.ones(N_HOUSEHOLDS),
-            },
-        )
+        assert results == []
+
+    def test_wrong_person_length(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "bad_nested.h5"
+        data = {**GOOD_DATA, "age": np.ones(N_PERSONS + 99)}
+        _write_h5_nested(h5_path, PERIOD, data)
+        with patch(
+            "policyengine_us.CountryTaxBenefitSystem",
+            return_value=mock_tbs,
+        ):
+            results = validate_h5_entity_dimensions(h5_path, period=PERIOD)
+        dim_fails = [r for r in results if r["check"] == "dimension"]
+        assert len(dim_fails) == 1
+        assert "age" in dim_fails[0]["detail"]
+
+
+class TestOrRaise:
+    def test_passes(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "good.h5"
+        _write_h5_flat(h5_path, GOOD_DATA)
+        with patch(
+            "policyengine_us.CountryTaxBenefitSystem",
+            return_value=mock_tbs,
+        ):
+            validate_h5_or_raise(h5_path, period=PERIOD)
+
+    def test_raises_on_mismatch(self, tmp_path, mock_tbs):
+        h5_path = tmp_path / "bad.h5"
+        data = {**GOOD_DATA, "age": np.ones(N_PERSONS + 99)}
+        _write_h5_flat(h5_path, data)
         with patch(
             "policyengine_us.CountryTaxBenefitSystem",
             return_value=mock_tbs,
@@ -137,16 +140,8 @@ def test_or_raise_raises(self, tmp_path, mock_tbs):
 class TestMissingHouseholdWeight:
     def test_missing_weight(self, tmp_path, mock_tbs):
         h5_path = tmp_path / "no_weight.h5"
-        _write_h5(
-            h5_path,
-            PERIOD,
-            {
-                "person_id": np.arange(N_PERSONS),
-                "household_id": np.arange(N_HOUSEHOLDS),
-                "age": np.ones(N_PERSONS),
-                "income": np.ones(N_PERSONS),
-            },
-        )
+        data = {k: v for k, v in GOOD_DATA.items() if k != "household_weight"}
+        _write_h5_flat(h5_path, data)
         with patch(
             "policyengine_us.CountryTaxBenefitSystem",
             return_value=mock_tbs,
@@ -159,17 +154,8 @@ def test_missing_weight(self, tmp_path, mock_tbs):
 class TestAllZeroWeights:
     def test_zero_weights(self, tmp_path, mock_tbs):
         h5_path = tmp_path / "zero_weight.h5"
-        _write_h5(
-            h5_path,
-            PERIOD,
-            {
-                "person_id": np.arange(N_PERSONS),
-                "household_id": np.arange(N_HOUSEHOLDS),
-                "age": np.ones(N_PERSONS),
-                "income": np.ones(N_PERSONS),
-                "household_weight": np.zeros(N_HOUSEHOLDS),
-            },
-        )
+        data = {**GOOD_DATA, "household_weight": np.zeros(N_HOUSEHOLDS)}
+        _write_h5_flat(h5_path, data)
         with patch(
             "policyengine_us.CountryTaxBenefitSystem",
             return_value=mock_tbs,
diff --git a/policyengine_us_data/utils/validate_h5.py b/policyengine_us_data/utils/validate_h5.py
index 0e04e0998..f168e862b 100644
--- a/policyengine_us_data/utils/validate_h5.py
+++ b/policyengine_us_data/utils/validate_h5.py
@@ -14,6 +14,27 @@
 from policyengine_us_data.utils.downsample import ENTITY_ID_VARIABLES
 
 
+def _read_array(f: h5py.File, var_name: str, period: int):
+    """Read a variable array, handling both H5 layouts.
+
+    Pipeline-built files use ``variable/period`` nesting (groups at top level,
+    datasets underneath keyed by year).  Storage flat files store datasets
+    directly at the top level with no period sub-key.
+
+    Returns None if the variable is not found.
+    """
+    if var_name not in f:
+        return None
+    item = f[var_name]
+    if isinstance(item, h5py.Dataset):
+        return item
+    # Group — look for period sub-key
+    period_key = str(period)
+    if period_key in item:
+        return item[period_key]
+    return None
+
+
 def validate_h5_entity_dimensions(
     h5_path: str | Path, period: int = 2024
 ) -> list[dict]:
@@ -21,10 +42,10 @@ def validate_h5_entity_dimensions(
 
     Args:
         h5_path: Path to an H5 dataset file.
-        period: Tax year key inside the H5 (top-level group).
+        period: Tax year key inside the H5.
 
     Returns:
-        List of ``{check, status, detail}`` dicts.
+        List of ``{check, status, detail}`` dicts (empty means all OK).
     """
     from policyengine_us import CountryTaxBenefitSystem
 
@@ -33,15 +54,14 @@ def validate_h5_entity_dimensions(
     h5_path = Path(h5_path)
 
     with h5py.File(h5_path, "r") as f:
-        group = f[str(period)]
-        variable_names = list(group.keys())
+        variable_names = list(f.keys())
 
         entity_counts: dict[str, int] = {}
         for entity_key, id_var in ENTITY_ID_VARIABLES.items():
-            if id_var in group:
-                entity_counts[entity_key] = len(group[id_var])
+            arr = _read_array(f, id_var, period)
+            if arr is not None:
+                entity_counts[entity_key] = len(arr)
 
-        # Dimension checks
         for var_name in variable_names:
             variable_meta = tbs.variables.get(var_name)
             if variable_meta is None:
@@ -50,7 +70,10 @@ def validate_h5_entity_dimensions(
             expected = entity_counts.get(entity_key)
             if expected is None:
                 continue
-            actual = len(group[var_name])
+            arr = _read_array(f, var_name, period)
+            if arr is None:
+                continue
+            actual = len(arr)
             if actual != expected:
                 results.append(
                     {
@@ -63,8 +86,9 @@ def validate_h5_entity_dimensions(
                     }
                 )
 
-        # household_weight existence
-        if "household_weight" not in group:
+        # household_weight existence and sanity
+        hw = _read_array(f, "household_weight", period)
+        if hw is None:
             results.append(
                 {
                     "check": "household_weight_exists",
@@ -73,8 +97,7 @@ def validate_h5_entity_dimensions(
                 }
             )
         else:
-            weights = np.asarray(group["household_weight"])
-            if np.all(weights == 0):
+            if np.all(np.asarray(hw) == 0):
                 results.append(
                     {
                         "check": "household_weight_nonzero",
@@ -83,7 +106,6 @@ def validate_h5_entity_dimensions(
                     }
                 )
 
-        # Reasonable household count
         hh_count = entity_counts.get("household", 0)
         if hh_count == 0:
             results.append(