diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 000000000..01691fa13 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,12 @@ +coverage: + status: + project: + default: + informational: true + patch: + default: + informational: true + +comment: + layout: "condensed_header, condensed_files" + behavior: default diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml deleted file mode 100644 index ebc7b6d2a..000000000 --- a/.github/workflows/code_changes.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# Workflow that runs on code changes after merge to main. - -name: Code changes -on: - workflow_call: - workflow_dispatch: - push: - branches: - - main - paths: - - pyproject.toml - -jobs: - Lint: - uses: ./.github/workflows/reusable_lint.yaml - - Test: - needs: Lint - uses: ./.github/workflows/reusable_test.yaml - with: - full_suite: true - upload_data: true - deploy_docs: true - secrets: inherit - - Publish: - runs-on: ubuntu-latest - needs: [Lint, Test] - if: github.event.head_commit.message == 'Update package version' - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.13 - - name: Install uv - uses: astral-sh/setup-uv@v5 - - name: Install package - run: uv sync --dev - - name: Build package - run: uv run python -m build - - name: Publish a Python distribution to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.PYPI }} - skip-existing: true \ No newline at end of file diff --git a/.github/workflows/local_area_promote.yaml b/.github/workflows/local_area_promote.yaml deleted file mode 100644 index 8d7d235ea..000000000 --- a/.github/workflows/local_area_promote.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: Promote Local Area H5 Files - -on: - workflow_dispatch: - inputs: - version: - description: 'Version to promote (e.g. 
1.23.0)' - required: true - type: string - branch: - description: 'Branch to use for repo setup' - required: false - default: 'main' - type: string - -jobs: - promote-local-area: - runs-on: ubuntu-latest - permissions: - contents: read - env: - HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} - MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} - - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.13' - - - name: Install Modal CLI - run: pip install modal - - - name: Promote staged files to production - run: | - VERSION="${{ github.event.inputs.version }}" - BRANCH="${{ github.event.inputs.branch }}" - echo "Promoting version ${VERSION} from branch ${BRANCH}" - modal run modal_app/local_area.py::main_promote --version="${VERSION}" --branch="${BRANCH}" diff --git a/.github/workflows/local_area_publish.yaml b/.github/workflows/local_area_publish.yaml deleted file mode 100644 index 47958a90c..000000000 --- a/.github/workflows/local_area_publish.yaml +++ /dev/null @@ -1,131 +0,0 @@ -name: Publish Local Area H5 Files - -on: - # TEMPORARILY DISABLED - re-enable push/repository_dispatch triggers when ready - # push: - # branches: [main] - # paths: - # - 'policyengine_us_data/calibration/**' - # - '.github/workflows/local_area_publish.yaml' - # - 'modal_app/**' - # repository_dispatch: - # types: [calibration-updated] - workflow_dispatch: - inputs: - num_workers: - description: 'Number of parallel workers' - required: false - default: '8' - type: string - skip_upload: - description: 'Skip upload (build only)' - required: false - default: false - type: boolean - -# Trigger strategy: -# 1. Automatic: Code changes to calibration/ pushed to main -# 2. repository_dispatch: Calibration workflow triggers after uploading new weights -# 3. 
workflow_dispatch: Manual trigger with optional parameters - -jobs: - publish-local-area: - runs-on: ubuntu-latest - permissions: - contents: read - env: - HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} - MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} - - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.13' - - - name: Install Modal CLI - run: pip install modal - - - name: Run local area build and stage on Modal - run: | - NUM_WORKERS="${{ github.event.inputs.num_workers || '8' }}" - SKIP_UPLOAD="${{ github.event.inputs.skip_upload || 'false' }}" - BRANCH="${{ github.head_ref || github.ref_name }}" - - CMD="modal run modal_app/local_area.py::main --branch=${BRANCH} --num-workers=${NUM_WORKERS}" - - if [ "$SKIP_UPLOAD" = "true" ]; then - CMD="${CMD} --skip-upload" - fi - - echo "Running: $CMD" - $CMD - - - name: Post-build summary - if: success() - run: | - echo "## Build + Stage Complete" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "Files have been uploaded to GCS and staged on HuggingFace." >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Next step: Validation runs automatically" >> $GITHUB_STEP_SUMMARY - echo "The validate-staging job will now check all staged H5s." 
>> $GITHUB_STEP_SUMMARY - - validate-staging: - needs: publish-local-area - runs-on: ubuntu-latest - env: - HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.13' - - - name: Set up uv - uses: astral-sh/setup-uv@v5 - - - name: Install dependencies - run: uv sync - - - name: Validate staged H5s - run: | - uv run python -m policyengine_us_data.calibration.validate_staging \ - --area-type states --output validation_results.csv - - - name: Upload validation results to HF - run: | - uv run python -c " - from policyengine_us_data.utils.huggingface import upload - upload('validation_results.csv', - 'policyengine/policyengine-us-data', - 'calibration/logs/validation_results.csv') - " - - - name: Post validation summary - if: always() - run: | - echo "## Validation Results" >> $GITHUB_STEP_SUMMARY - if [ -f validation_results.csv ]; then - TOTAL=$(tail -n +2 validation_results.csv | wc -l) - FAILS=$(grep -c ',FAIL,' validation_results.csv || true) - echo "- **${TOTAL}** targets validated" >> $GITHUB_STEP_SUMMARY - echo "- **${FAILS}** sanity failures" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "Review in dashboard, then trigger **Promote** workflow." >> $GITHUB_STEP_SUMMARY - else - echo "Validation did not produce output." >> $GITHUB_STEP_SUMMARY - fi - - - name: Upload validation artifact - uses: actions/upload-artifact@v4 - with: - name: validation-results - path: validation_results.csv diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml index 0c71a9d5c..894452c11 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -63,6 +63,7 @@ jobs: national_epochs=int('${NATIONAL_EPOCHS}'), num_workers=int('${NUM_WORKERS}'), skip_national='${SKIP_NATIONAL}' == 'true', + scope='${SCOPE}', ) print(f'Pipeline spawned. 
Monitor on the Modal dashboard.') " diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml new file mode 100644 index 000000000..a15817f98 --- /dev/null +++ b/.github/workflows/pr.yaml @@ -0,0 +1,103 @@ +name: PR checks + +on: + pull_request: + branches: [main] + +jobs: + check-fork: + runs-on: ubuntu-latest + steps: + - name: Check if PR is from fork + run: | + if [ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]; then + echo "::error::PRs must be from branches in PolicyEngine/policyengine-us-data, not forks." + echo "Fork PRs cannot access secrets required for data downloads." + echo "Please close this PR and push your branch directly to the upstream repo." + exit 1 + fi + + check-lock-freshness: + runs-on: ubuntu-latest + needs: check-fork + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - uses: astral-sh/setup-uv@v5 + - name: Check lock file is up-to-date + run: | + uv lock --locked || { + echo "::error::uv.lock is outdated. Run 'uv lock' and commit the changes." + exit 1 + } + + lint: + runs-on: ubuntu-latest + needs: check-fork + steps: + - uses: actions/checkout@v4 + - name: Install ruff + run: pip install "ruff>=0.9.0" # quoted so the shell does not parse >= as an output redirect + - name: Check formatting + run: ruff format --check . + + check-changelog: + runs-on: ubuntu-latest + needs: check-fork + steps: + - uses: actions/checkout@v4 + - name: Check for changelog fragment + run: | + FRAGMENTS=$(find changelog.d -type f ! -name '.gitkeep' | wc -l) + if [ "$FRAGMENTS" -eq 0 ]; then + echo "::error::No changelog fragment found in changelog.d/" + echo "Add one with: echo 'Description.' 
> changelog.d/\$(git branch --show-current).<type>.md" + echo "Types: added, changed, fixed, removed, breaking" + exit 1 + fi + + unit-tests: + runs-on: ubuntu-latest + needs: [check-fork, lint] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - uses: astral-sh/setup-uv@v5 + - name: Install package + run: uv sync --dev + - name: Run unit tests with coverage + env: + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + run: > + uv run pytest policyengine_us_data/tests/unit/ + --cov=policyengine_us_data + --cov-report=xml + -v + - name: Upload coverage to Codecov + if: always() + uses: codecov/codecov-action@v4 + with: + file: coverage.xml + flags: unit + fail_ci_if_error: false + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + + smoke-test: + runs-on: ubuntu-latest + needs: [check-fork, lint] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install package (no dev deps) + run: python -m pip install . + - name: Test basic import + run: python -c "import policyengine_us_data; print('Minimal import OK')" + - name: Test core import + run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" diff --git a/.github/workflows/pr_changelog.yaml b/.github/workflows/pr_changelog.yaml deleted file mode 100644 index 49ac82a9d..000000000 --- a/.github/workflows/pr_changelog.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: Changelog entry - -on: - pull_request: - branches: [main] - -jobs: - check-changelog: - name: Check changelog fragment - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Check for changelog fragment - run: | - FRAGMENTS=$(find changelog.d -type f ! -name '.gitkeep' | wc -l) - if [ "$FRAGMENTS" -eq 0 ]; then - echo "::error::No changelog fragment found in changelog.d/" - echo "Add one with: echo 'Description.' 
> changelog.d/\$(git branch --show-current)..md" - echo "Types: added, changed, fixed, removed, breaking" - exit 1 - fi diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml deleted file mode 100644 index bc10cc6f1..000000000 --- a/.github/workflows/pr_code_changes.yaml +++ /dev/null @@ -1,177 +0,0 @@ -# Workflow that runs on code changes to a pull request. - -name: PR code changes -on: - pull_request: - branches: - - main - paths: - - pyproject.toml - - uv.lock - - modal_app/** - - policyengine_us_data/** - - tests/** - - .github/workflows/** - - Makefile - -concurrency: - group: pr-code-changes-${{ github.event.pull_request.number }} - cancel-in-progress: true - -jobs: - check-fork: - runs-on: ubuntu-latest - steps: - - name: Check if PR is from fork - run: | - if [ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]; then - echo "❌ ERROR: This PR is from a fork repository." - echo "PRs must be created from branches in the main PolicyEngine/policyengine-us-data repository." - echo "Please close this PR and create a new one following these steps:" - echo "1. git checkout main" - echo "2. git pull upstream main" - echo "3. git checkout -b your-branch-name" - echo "4. git push -u upstream your-branch-name" - echo "5. 
Create PR from the upstream branch" - exit 1 - fi - echo "✅ PR is from the correct repository" - - decide-test-scope: - name: Decide PR test scope - runs-on: ubuntu-latest - needs: check-fork - outputs: - full_suite: ${{ steps.decide.outputs.full_suite }} - reason: ${{ steps.decide.outputs.reason }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - id: decide - env: - BASE_SHA: ${{ github.event.pull_request.base.sha }} - HEAD_SHA: ${{ github.event.pull_request.head.sha }} - PR_LABELS_JSON: ${{ toJson(github.event.pull_request.labels.*.name) }} - run: | - python - <<'PY' - import fnmatch - import json - import os - import subprocess - - labels = set(json.loads(os.environ["PR_LABELS_JSON"])) - changed_files = subprocess.check_output( - [ - "git", - "diff", - "--name-only", - os.environ["BASE_SHA"], - os.environ["HEAD_SHA"], - ], - text=True, - ).splitlines() - - full_suite_label = "full-data-ci" - critical_patterns = [ - "modal_app/**", - "policyengine_us_data/calibration/**", - "policyengine_us_data/datasets/**", - "policyengine_us_data/db/**", - "policyengine_us_data/storage/download_private_prerequisites.py", - "policyengine_us_data/utils/loss.py", - "policyengine_us_data/utils/mortgage_interest.py", - "policyengine_us_data/utils/soi.py", - "policyengine_us_data/utils/uprating.py", - ] - - matched_files = [ - path - for path in changed_files - if any(fnmatch.fnmatch(path, pattern) for pattern in critical_patterns) - ] - - if full_suite_label in labels: - full_suite = True - reason = f"label:{full_suite_label}" - elif matched_files: - full_suite = True - reason = f"critical-path:{matched_files[0]}" - else: - full_suite = False - reason = "basic-pytest-only" - - with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as output: - output.write(f"full_suite={'true' if full_suite else 'false'}\n") - output.write(f"reason={reason}\n") - - summary = [ - "### PR test scope", - f"- full suite: `{'true' if full_suite else 'false'}`", - f"- reason: 
`{reason}`", - ] - if matched_files: - summary.append(f"- first matching file: `{matched_files[0]}`") - with open(os.environ["GITHUB_STEP_SUMMARY"], "a", encoding="utf-8") as out: - out.write("\n".join(summary) + "\n") - PY - - check-lock-freshness: - name: Check uv.lock freshness - runs-on: ubuntu-latest - needs: check-fork - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.13' - - name: Install uv - uses: astral-sh/setup-uv@v5 - - name: Check lock file is up-to-date - run: | - uv lock --locked || { - echo "::error::uv.lock is outdated. Run 'uv lock' and commit the changes." - exit 1 - } - - Lint: - needs: [check-fork, check-lock-freshness] - uses: ./.github/workflows/reusable_lint.yaml - - SmokeTestForMultipleVersions: - name: Smoke test (${{ matrix.os }}, Python ${{ matrix.python-version }}) - runs-on: ${{ matrix.os }} - needs: [check-fork, Lint] - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ['3.13'] - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Install package ONLY (no dev deps) - run: python -m pip install . 
- - - name: Test basic import - run: python -c "import policyengine_us_data; print('Minimal import OK')" - - - name: Test specific core import - run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" - - Test: - needs: [check-fork, Lint, decide-test-scope] - uses: ./.github/workflows/reusable_test.yaml - with: - full_suite: ${{ needs.decide-test-scope.outputs.full_suite == 'true' }} - upload_data: false - deploy_docs: false - secrets: inherit diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml new file mode 100644 index 000000000..2ef081ab4 --- /dev/null +++ b/.github/workflows/push.yaml @@ -0,0 +1,240 @@ +name: Push to main + +on: + push: + branches: [main] + +jobs: + # ── Lint ──────────────────────────────────────────────────── + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install ruff + run: pip install "ruff>=0.9.0" # quoted so the shell does not parse >= as an output redirect + - name: Check formatting + run: ruff format --check . + + # ── Download prerequisites ────────────────────────────────── + download-prerequisites: + runs-on: ubuntu-latest + needs: lint + if: github.event.head_commit.message != 'Update package version' + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install Modal CLI + run: pip install modal + - name: Download prerequisites on Modal + run: | + modal run modal_app/data_build.py \ + --script download_prerequisites \ + --branch=${{ github.ref_name }} + + # ── Phase 1: Independent datasets (parallel) ─────────────── + phase1: + needs: download-prerequisites + runs-on: ubuntu-latest + strategy: + fail-fast: true + matrix: + dataset: [uprating, acs, irs_puf] + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + HUGGING_FACE_TOKEN: 
${{ secrets.HUGGING_FACE_TOKEN }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install Modal CLI + run: pip install modal + - name: "Build + test: ${{ matrix.dataset }}" + run: | + modal run modal_app/data_build.py \ + --script ${{ matrix.dataset }} \ + --run-tests \ + --branch=${{ github.ref_name }} + + # ── Phase 2: CPS + PUF (depend on Phase 1) ───────────────── + phase2: + needs: phase1 + runs-on: ubuntu-latest + strategy: + fail-fast: true + matrix: + dataset: [cps, puf] + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install Modal CLI + run: pip install modal + - name: "Build + test: ${{ matrix.dataset }}" + run: | + modal run modal_app/data_build.py \ + --script ${{ matrix.dataset }} \ + --run-tests \ + --branch=${{ github.ref_name }} + + # ── Phase 3: Extended CPS (depends on Phase 2) ───────────── + phase3: + needs: phase2 + runs-on: ubuntu-latest + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install Modal CLI + run: pip install modal + - name: "Build + test: extended_cps" + run: | + modal run modal_app/data_build.py \ + --script extended_cps \ + --run-tests \ + --branch=${{ github.ref_name }} + + # ── Phase 4: Enhanced + Stratified CPS (depend on Phase 3) ─ + phase4: + needs: phase3 + runs-on: ubuntu-latest + strategy: + fail-fast: true + matrix: + dataset: [enhanced_cps, stratified_cps] + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + HUGGING_FACE_TOKEN: ${{ 
secrets.HUGGING_FACE_TOKEN }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install Modal CLI + run: pip install modal + - name: "Build + test: ${{ matrix.dataset }}" + run: | + modal run modal_app/data_build.py \ + --script ${{ matrix.dataset }} \ + --run-tests \ + --branch=${{ github.ref_name }} + + # ── Phase 5: Source imputed + Small enhanced (depend on 4) ── + phase5: + needs: phase4 + runs-on: ubuntu-latest + strategy: + fail-fast: true + matrix: + dataset: [source_imputed_cps, small_enhanced_cps] + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install Modal CLI + run: pip install modal + - name: "Build + test: ${{ matrix.dataset }}" + run: | + modal run modal_app/data_build.py \ + --script ${{ matrix.dataset }} \ + --run-tests \ + --branch=${{ github.ref_name }} + + # ── Remaining integration tests (depend on Phase 4) ───────── + remaining-tests: + needs: phase4 + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + test: + - policyengine_us_data/tests/integration/test_census_cps.py + - policyengine_us_data/tests/integration/test_database_build.py + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install Modal CLI + run: pip install modal + - name: "Test: ${{ matrix.test }}" + run: | + modal run modal_app/data_build.py \ + --test ${{ matrix.test }} \ + --branch=${{ github.ref_name }} + + # ── Manual approval gate ──────────────────────────────────── + approval-gate: + needs: [phase5, remaining-tests] + runs-on: 
ubuntu-latest + environment: pipeline-approval + steps: + - run: echo "Pipeline approved. Dispatching H5 build." + + # ── Dispatch pipeline ─────────────────────────────────────── + trigger-pipeline: + needs: approval-gate + runs-on: ubuntu-latest + steps: + - name: Trigger pipeline workflow + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + await github.rest.actions.createWorkflowDispatch({ + owner: context.repo.owner, + repo: context.repo.repo, + workflow_id: 'pipeline.yaml', + ref: 'main', + inputs: { scope: 'all' } + }) + console.log('Pipeline dispatched with scope=all') + + # ── PyPI publish (version bump commits only) ──────────────── + publish: + runs-on: ubuntu-latest + needs: lint + if: github.event.head_commit.message == 'Update package version' + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - uses: astral-sh/setup-uv@v5 + - name: Install package + run: uv sync --dev + - name: Build package + run: uv run python -m build + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI }} + skip-existing: true diff --git a/.github/workflows/reusable_lint.yaml b/.github/workflows/reusable_lint.yaml deleted file mode 100644 index 862e90a8a..000000000 --- a/.github/workflows/reusable_lint.yaml +++ /dev/null @@ -1,14 +0,0 @@ -name: Reusable Lint - -on: - workflow_call: - -jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Install ruff - run: pip install ruff>=0.9.0 - - name: Check formatting - run: ruff format --check . 
diff --git a/.github/workflows/reusable_test.yaml b/.github/workflows/reusable_test.yaml deleted file mode 100644 index 4575a508c..000000000 --- a/.github/workflows/reusable_test.yaml +++ /dev/null @@ -1,88 +0,0 @@ -name: Reusable Test - -on: - workflow_call: - inputs: - full_suite: - description: 'Run full test suite including data build' - required: false - default: false - type: boolean - upload_data: - description: 'Upload data after build' - required: false - default: false - type: boolean - deploy_docs: - description: 'Deploy documentation to GitHub Pages' - required: false - default: false - type: boolean - secrets: - HUGGING_FACE_TOKEN: - required: false - POLICYENGINE_US_DATA_GITHUB_TOKEN: - required: false - MODAL_TOKEN_ID: - required: false - MODAL_TOKEN_SECRET: - required: false - -jobs: - test: - runs-on: ubuntu-latest - permissions: - contents: write - id-token: write - env: - HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }} - MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} - MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - - name: Install uv - uses: astral-sh/setup-uv@v5 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.13' - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '24' - - - name: Install Modal CLI - if: inputs.full_suite - run: pip install modal - - - name: Run data build and tests on Modal - if: inputs.full_suite - run: | - modal run modal_app/data_build.py \ - ${{ inputs.upload_data && '--upload' || '--no-upload' }} \ - --branch=${{ github.head_ref || github.ref_name }} - - - name: Install package - run: uv sync --dev - - - name: Run basic tests - if: ${{ !inputs.full_suite }} - run: uv run pytest - - - name: Test documentation builds - run: uv run make documentation - env: - BASE_URL: ${{ inputs.deploy_docs && 
'/policyengine-us-data' || '' }} - - - name: Deploy Github Pages documentation - if: inputs.deploy_docs - uses: JamesIves/github-pages-deploy-action@v4 - with: - branch: gh-pages - folder: docs/_build/html - clean: true diff --git a/.github/workflows/versioning.yaml b/.github/workflows/versioning.yaml index b6fae4c68..20b8e8d57 100644 --- a/.github/workflows/versioning.yaml +++ b/.github/workflows/versioning.yaml @@ -15,10 +15,16 @@ jobs: if: (github.event.head_commit.message != 'Update package version') runs-on: ubuntu-latest steps: + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ secrets.APP_ID }} + private-key: ${{ secrets.APP_PRIVATE_KEY }} - name: Checkout repo uses: actions/checkout@v4 with: - token: ${{ secrets.POLICYENGINE_GITHUB }} + token: ${{ steps.app-token.outputs.token }} fetch-depth: 0 - name: Setup Python uses: actions/setup-python@v5 diff --git a/CLAUDE.md b/CLAUDE.md index 25e315030..09e0da618 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,9 +6,29 @@ - `make data` - Generate project datasets ## Testing -- `pytest` - Run all tests -- `pytest path/to/test_file.py::test_function` - Run a specific test -- `make test` - Also runs all tests + +### Running Tests +- `make test-unit` - Run unit tests only (fast, no data dependencies) +- `make test-integration` - Run integration tests (requires built H5 datasets) +- `make test` - Run all tests +- `pytest policyengine_us_data/tests/unit/ -v` - Unit tests directly +- `pytest policyengine_us_data/tests/integration/test_cps.py -v` - Specific integration test + +### Test Organization +Tests are split into two directories: + +- **`policyengine_us_data/tests/unit/`** — Self-contained tests that use synthetic data, mocks, patches, or checked-in fixtures. Run in seconds with no external dependencies. 
+ - `unit/datasets/` — unit tests for dataset code (no `test_` prefix on folder) + - `unit/calibration/` — unit tests for calibration code (no `test_` prefix on folder) + +- **`policyengine_us_data/tests/integration/`** — Tests that require built H5 datasets, HuggingFace downloads, Microsimulation objects, or database ETL. Named after the dataset they test. + +### Test Placement Rules +- **NEVER** put tests that require H5 files or Microsimulation in `unit/` +- **NEVER** put tests that use only synthetic data or mocks in `integration/` +- Integration test files are named after their dataset dependency: `test_cps.py` tests `cps_2024.h5` +- Sanity checks (value ranges, population counts) belong in the per-dataset integration test file, not in a separate sanity file +- When adding a new integration test, add it to the existing per-dataset file if one exists ## Formatting - `make format` - Format all code using ruff @@ -22,7 +42,17 @@ - **Documentation**: Google-style docstrings with Args and Returns sections - **Error Handling**: Use validation checks with specific error messages - **Line Length**: ruff default (see pyproject.toml for any override) -- **Python Version**: Targeting Python 3.11 +- **Python Version**: Targeting Python 3.12-3.13 + +## CI/CD Structure +Four workflow files in `.github/workflows/`: + +- **`pr.yaml`** — Runs on every PR to main: fork check, lint, uv.lock freshness, changelog fragment, unit tests with Codecov, smoke test. ~2-3 minutes. +- **`push.yaml`** — Runs on push to main. Two paths: + - Version bump commits (`Update package version`): build and publish to PyPI + - All other commits: per-dataset Modal build with integration tests after each stage → manual approval gate → pipeline dispatch +- **`pipeline.yaml`** — Dispatch only. Spawns the H5 generation pipeline on Modal with scope filtering (all/national/state/congressional/local/test). +- **`versioning.yaml`** — Auto-bumps version when changelog.d fragments are merged. 
Commits `Update package version` which triggers the publish path in push.yaml. ## Git and PR Guidelines - **CRITICAL**: NEVER create PRs from personal forks - ALL PRs MUST be created from branches pushed to the upstream PolicyEngine repository diff --git a/Makefile b/Makefile index 53800346d..73fcb234b 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all format test install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset upload-database push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-database promote-dataset promote build-h5s validate-local refresh-soi-targets push-pr-branch +.PHONY: all format test test-unit test-integration install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset upload-database push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-database promote-dataset promote build-h5s validate-local refresh-soi-targets push-pr-branch SOI_SOURCE_YEAR ?= 2021 SOI_TARGET_YEAR ?= 2023 @@ -24,6 +24,12 @@ format: test: pytest +test-unit: + pytest policyengine_us_data/tests/unit/ + +test-integration: + pytest policyengine_us_data/tests/integration/ + install: pip install policyengine-us pip install -e ".[dev]" --config-settings editable_mode=compat diff --git a/changelog.d/fix-us-data-pypi.fixed.md b/changelog.d/fix-us-data-pypi.fixed.md new file mode 100644 index 
000000000..ba949f67f --- /dev/null +++ b/changelog.d/fix-us-data-pypi.fixed.md @@ -0,0 +1 @@ +Migrated versioning workflow from expired PAT to GitHub App token for reliable PyPI publishing. Reorganized tests into unit/ and integration/ directories. Consolidated CI/CD from 9 workflow files down to 4 (pr.yaml, push.yaml, pipeline.yaml, versioning.yaml). Added queue-based Modal architecture with scope filtering for H5 builds. Added Codecov integration and per-dataset build timing in GitHub Actions summaries. diff --git a/modal_app/data_build.py b/modal_app/data_build.py index a423761e4..ab0c386c7 100644 --- a/modal_app/data_build.py +++ b/modal_app/data_build.py @@ -79,13 +79,56 @@ # Test modules to run individually for checkpoint tracking TEST_MODULES = [ - "policyengine_us_data/tests/test_import.py", - "policyengine_us_data/tests/test_database.py", - "policyengine_us_data/tests/test_pandas3_compatibility.py", - "policyengine_us_data/tests/test_datasets/", - "policyengine_us_data/tests/test_calibration/", + "policyengine_us_data/tests/unit/", + "policyengine_us_data/tests/integration/", ] +# Short names for --script mode (maps to SCRIPT_OUTPUTS keys) +SCRIPT_SHORT_NAMES = { + "download_prerequisites": "policyengine_us_data/storage/download_private_prerequisites.py", + "uprating": "policyengine_us_data/utils/uprating.py", + "acs": "policyengine_us_data/datasets/acs/acs.py", + "irs_puf": "policyengine_us_data/datasets/puf/irs_puf.py", + "cps": "policyengine_us_data/datasets/cps/cps.py", + "puf": "policyengine_us_data/datasets/puf/puf.py", + "extended_cps": "policyengine_us_data/datasets/cps/extended_cps.py", + "enhanced_cps": "policyengine_us_data/datasets/cps/enhanced_cps.py", + "stratified_cps": "policyengine_us_data/calibration/create_stratified_cps.py", + "source_imputed_cps": "policyengine_us_data/calibration/create_source_imputed_cps.py", + "small_enhanced_cps": "policyengine_us_data/datasets/cps/small_enhanced_cps.py", +} + +# Files downloaded by 
download_private_prerequisites.py that must be +# checkpointed so subsequent --script calls in separate containers +# can access them. +PREREQUISITE_FILES = [ + "policyengine_us_data/storage/puf_2015.csv", + "policyengine_us_data/storage/demographics_2015.csv", + "policyengine_us_data/storage/soi.csv", + "policyengine_us_data/storage/np2023_d5_mid.csv", + "policyengine_us_data/storage/calibration/policy_data.db", +] + +# Integration tests to run after each script build. +# Scripts not listed here have no associated tests. +SCRIPT_TESTS = { + "acs": ["policyengine_us_data/tests/integration/test_acs.py"], + "cps": ["policyengine_us_data/tests/integration/test_cps.py"], + "extended_cps": ["policyengine_us_data/tests/integration/test_extended_cps.py"], + "enhanced_cps": [ + "policyengine_us_data/tests/integration/test_enhanced_cps.py", + "policyengine_us_data/tests/integration/test_sparse_enhanced_cps.py", + "policyengine_us_data/tests/integration/test_sipp_assets.py", + ], + "source_imputed_cps": [ + "policyengine_us_data/tests/integration/test_source_imputed_cps_masking.py", + "policyengine_us_data/tests/integration/test_source_imputed_cps_consistency.py", + ], + "small_enhanced_cps": [ + "policyengine_us_data/tests/integration/test_small_enhanced_cps.py", + ], +} + def setup_gcp_credentials(): """Write GCP credentials JSON to a temp file for google.auth.default().""" @@ -124,10 +167,32 @@ def get_current_commit() -> str: return "unknown" +def _get_storage_folder() -> Path: + """Resolve the installed package's STORAGE_FOLDER path. + + This is where Dataset classes (CPS_2024, etc.) look for H5 files. + In an editable install it matches the source tree; in a regular + install it's inside .venv/lib/.../site-packages/. 
+ """ + try: + from policyengine_us_data.storage import STORAGE_FOLDER + + return Path(STORAGE_FOLDER) + except ImportError: + # Fallback if package not importable (shouldn't happen in + # the Modal image, but safe for local dev) + return Path("policyengine_us_data/storage") + + def get_checkpoint_path(branch: str, output_file: str) -> Path: - """Get the checkpoint path for an output file, scoped by branch and commit.""" + """Get the checkpoint path for an output file, scoped by branch and commit. + + Preserves the relative path structure to avoid filename collisions + (e.g., calibration/policy_data.db stays distinct from policy_data.db). + """ commit = get_current_commit() - return Path(VOLUME_MOUNT) / branch / commit / Path(output_file).name + # Use the relative path as-is (not just filename) to avoid collisions + return Path(VOLUME_MOUNT) / branch / commit / output_file def is_checkpointed(branch: str, output_file: str) -> bool: @@ -140,13 +205,44 @@ def is_checkpointed(branch: str, output_file: str) -> bool: return False +def _resolve_local_path(output_file: str) -> Path: + """Resolve where a checkpointed file should be restored to. + + Maps the relative source-tree path to the installed package's + STORAGE_FOLDER so that Dataset classes can find the files. + """ + output_path = Path(output_file) + storage_folder = _get_storage_folder() + + # Files under policyengine_us_data/storage/ get mapped to + # the installed package's STORAGE_FOLDER + storage_prefix = Path("policyengine_us_data/storage") + try: + relative = output_path.relative_to(storage_prefix) + return storage_folder / relative + except ValueError: + # Not under storage/ — use the path as-is (relative to cwd) + return output_path + + def restore_from_checkpoint(branch: str, output_file: str) -> bool: - """Restore output file from checkpoint volume if it exists.""" + """Restore output file from checkpoint volume to STORAGE_FOLDER. 
+ + Writes to the installed package's storage directory so that + Dataset classes (which use STORAGE_FOLDER) can find the files. + """ checkpoint_path = get_checkpoint_path(branch, output_file) if checkpoint_path.exists() and checkpoint_path.stat().st_size > 0: - local_path = Path(output_file) + local_path = _resolve_local_path(output_file) local_path.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(checkpoint_path, local_path) + # Also restore to the source-tree relative path so that + # scripts run via subprocess (which use cwd-relative paths) + # can find the file. + source_path = Path(output_file) + if source_path != local_path: + source_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(checkpoint_path, source_path) print(f"Restored from checkpoint: {output_file}") return True return False @@ -157,12 +253,24 @@ def save_checkpoint( output_file: str, volume: modal.Volume, ) -> None: - """Save output file to checkpoint volume.""" - local_path = Path(output_file) + """Save output file to checkpoint volume. + + Checks both the installed package path and the source-tree + relative path to find the file. + """ + local_path = _resolve_local_path(output_file) + source_path = Path(output_file) + # Try installed path first, fall back to source-tree path + actual_path = None if local_path.exists() and local_path.stat().st_size > 0: + actual_path = local_path + elif source_path.exists() and source_path.stat().st_size > 0: + actual_path = source_path + + if actual_path: checkpoint_path = get_checkpoint_path(branch, output_file) checkpoint_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(local_path, checkpoint_path) + shutil.copy2(actual_path, checkpoint_path) with _volume_lock: volume.commit() print(f"Checkpointed: {output_file}") @@ -220,7 +328,7 @@ def run_script( Raises: subprocess.CalledProcessError: If the script fails. 
""" - cmd = ["uv", "run", "python", "-u", script_path] + cmd = ["python", "-u", script_path] if args: cmd.extend(args) run_env = env or os.environ.copy() @@ -318,7 +426,7 @@ def run_tests_with_checkpoints( print(f"Running tests: {module}") result = subprocess.run( - ["uv", "run", "python", "-u", "-m", "pytest", module, "-v"], + ["python", "-u", "-m", "pytest", module, "-v"], env=env, ) @@ -654,6 +762,152 @@ def build_datasets( return "Data build completed successfully" +@app.function( + image=image, + secrets=[hf_secret, gcp_secret], + volumes={ + VOLUME_MOUNT: checkpoint_volume, + PIPELINE_MOUNT: pipeline_volume, + }, + memory=32768, + cpu=8.0, + timeout=14400, + nonpreemptible=True, +) +def run_single_script( + script_name: str, + branch: str = "main", + run_tests: bool = False, +) -> str: + """Run a single dataset build script with checkpointing. + + Optionally runs associated integration tests after the build, + inside the same container where the data was just created. + + Args: + script_name: Short name (e.g. 'cps') or full path to the script. + branch: Git branch for checkpoint scoping. + run_tests: If True, run integration tests for this dataset + after building. + + Returns: + Status message. + + Raises: + subprocess.CalledProcessError: If the build or tests fail. + """ + setup_gcp_credentials() + os.chdir("/root/policyengine-us-data") + + # Reload volume to see writes from prior --script containers + checkpoint_volume.reload() + + # Resolve short name to full path + script_path = SCRIPT_SHORT_NAMES.get(script_name, script_name) + + # Handle download_prerequisites specially (no SCRIPT_OUTPUTS entry) + if script_name == "download_prerequisites": + run_script(script_path) + # Checkpoint prerequisite files so subsequent containers can + # restore them. 
+ for prereq in PREREQUISITE_FILES: + save_checkpoint(branch, prereq, checkpoint_volume) + return f"Completed {script_name}" + + output_files = SCRIPT_OUTPUTS.get(script_path) + if output_files is None: + raise ValueError( + f"Unknown script: {script_name}. " + f"Valid names: {', '.join(SCRIPT_SHORT_NAMES.keys())}" + ) + + # Restore prerequisite files from checkpoint volume + for prereq in PREREQUISITE_FILES: + restore_from_checkpoint(branch, prereq) + + # Restore any existing checkpoints for dependencies + for dep_path, dep_outputs in SCRIPT_OUTPUTS.items(): + if dep_path == script_path: + continue + if isinstance(dep_outputs, str): + dep_outputs = [dep_outputs] + for dep_output in dep_outputs: + restore_from_checkpoint(branch, dep_output) + + run_script_with_checkpoint( + script_path, + output_files, + branch, + checkpoint_volume, + ) + + # Run associated integration tests inside this container + if run_tests: + test_paths = SCRIPT_TESTS.get(script_name, []) + if test_paths: + print(f"\n=== Running integration tests for {script_name} ===") + cmd = ["python", "-m", "pytest", "-v", "--tb=short"] + cmd.extend(test_paths) + subprocess.run(cmd, check=True, env=os.environ.copy()) + print(f"=== Tests passed for {script_name} ===") + else: + print(f"No integration tests defined for {script_name}") + + return f"Completed {script_name}" + + +@app.function( + image=image, + secrets=[hf_secret, gcp_secret], + volumes={ + VOLUME_MOUNT: checkpoint_volume, + PIPELINE_MOUNT: pipeline_volume, + }, + memory=32768, + cpu=8.0, + timeout=3600, + nonpreemptible=True, +) +def run_integration_test( + test_path: str, + branch: str = "main", +) -> str: + """Run integration tests inside Modal where built data exists. + + Restores all checkpointed artifacts (prerequisites + datasets), + then runs pytest on the given test path. + + Args: + test_path: Path to a test file or directory. + branch: Git branch for checkpoint scoping. + + Returns: + Status message. 
+ + Raises: + subprocess.CalledProcessError: If tests fail. + """ + setup_gcp_credentials() + os.chdir("/root/policyengine-us-data") + + # Reload volume to see writes from prior containers + checkpoint_volume.reload() + + # Restore all prerequisites and dataset outputs + for prereq in PREREQUISITE_FILES: + restore_from_checkpoint(branch, prereq) + for dep_path, dep_outputs in SCRIPT_OUTPUTS.items(): + if isinstance(dep_outputs, str): + dep_outputs = [dep_outputs] + for dep_output in dep_outputs: + restore_from_checkpoint(branch, dep_output) + + print(f"\n=== Running integration test: {test_path} ===") + cmd = ["python", "-m", "pytest", test_path, "-v", "--tb=short"] + subprocess.run(cmd, check=True, env=os.environ.copy()) + return f"Tests passed: {test_path}" + + @app.local_entrypoint() def main( upload: bool = False, @@ -662,13 +916,30 @@ def main( clear_checkpoints: bool = False, skip_tests: bool = False, skip_enhanced_cps: bool = False, + script: str = "", + run_tests: bool = False, + test: str = "", ): - result = build_datasets.remote( - upload=upload, - branch=branch, - sequential=sequential, - clear_checkpoints=clear_checkpoints, - skip_tests=skip_tests, - skip_enhanced_cps=skip_enhanced_cps, - ) - print(result) + if test: + result = run_integration_test.remote( + test_path=test, + branch=branch, + ) + print(result) + elif script: + result = run_single_script.remote( + script_name=script, + branch=branch, + run_tests=run_tests, + ) + print(result) + else: + result = build_datasets.remote( + upload=upload, + branch=branch, + sequential=sequential, + clear_checkpoints=clear_checkpoints, + skip_tests=skip_tests, + skip_enhanced_cps=skip_enhanced_cps, + ) + print(result) diff --git a/modal_app/images.py b/modal_app/images.py index f62739d48..6ec80f25f 100644 --- a/modal_app/images.py +++ b/modal_app/images.py @@ -3,6 +3,11 @@ Bakes source code and dependencies into image layers at build time. 
Modal caches layers by content hash of copied files -- if code changes, the image rebuilds; if not, the cached layer is reused. + +Uses `uv pip install --system` to install packages directly into +the system Python (no venv). This matches the policyengine-api-v2 +simulation-api pattern: containers start with everything already +importable, no `uv run` wrapper needed. """ import subprocess @@ -63,7 +68,7 @@ def _base_image(extras: list[str] | None = None): .env(GIT_ENV) .run_commands( f"cd /root/policyengine-us-data && " - f"UV_HTTP_TIMEOUT=300 uv sync --frozen {extra_flags}" + f"UV_HTTP_TIMEOUT=300 uv pip install --system -e '.[dev]' {extra_flags}" ) ) diff --git a/modal_app/local_area.py b/modal_app/local_area.py index 2630d0e15..83c26b5c4 100644 --- a/modal_app/local_area.py +++ b/modal_app/local_area.py @@ -139,6 +139,65 @@ def get_version() -> str: return pyproject["project"]["version"] +def generate_work_items(scope: str, db_path: str) -> List[Dict]: + """Auto-generate a flat list of work items based on scope. + + Args: + scope: One of 'all', 'national', 'state', 'congressional', + 'local', or 'test'. + db_path: Path to policy_data.db for querying districts. + + Returns: + List of work item dicts: [{"type": str, "id": str}, ...] 
+ """ + from policyengine_us_data.calibration.calibration_utils import ( + get_all_cds_from_database, + STATE_CODES, + ) + from policyengine_us_data.calibration.publish_local_area import ( + get_district_friendly_name, + ) + + all_states = list(STATE_CODES.values()) + db_uri = f"sqlite:///{db_path}" + all_cds = get_all_cds_from_database(db_uri) + all_districts = [get_district_friendly_name(cd) for cd in all_cds] + all_cities = ["NYC"] + + items = [] + + if scope == "national": + items.append({"type": "national", "id": "US"}) + + elif scope == "state": + for s in all_states: + items.append({"type": "state", "id": s}) + + elif scope == "congressional": + for d in all_districts: + items.append({"type": "district", "id": d}) + + elif scope == "local": + for c in all_cities: + items.append({"type": "city", "id": c}) + + elif scope == "test": + items.append({"type": "national", "id": "US"}) + items.append({"type": "state", "id": "NY"}) + items.append({"type": "district", "id": "NV-01"}) + + else: # "all" or unrecognized + items.append({"type": "national", "id": "US"}) + for s in all_states: + items.append({"type": "state", "id": s}) + for d in all_districts: + items.append({"type": "district", "id": d}) + for c in all_cities: + items.append({"type": "city", "id": c}) + + return items + + def partition_work( states: List[str], districts: List[str], @@ -326,8 +385,6 @@ def build_areas_worker( work_items_json = json.dumps(work_items) worker_cmd = [ - "uv", - "run", "python", "modal_app/worker_script.py", "--work-items", @@ -390,6 +447,278 @@ def build_areas_worker( return results +# ── Queue-based architecture ────────────────────────────────── +# +# build_single_area: processes ONE work item per container (1 CPU). +# queue_coordinator: generates items from scope, spawns workers, +# collects results. 
+
+
+@app.function(
+    image=image,
+    secrets=[hf_secret, gcp_secret],
+    volumes={
+        VOLUME_MOUNT: staging_volume,
+        "/pipeline": pipeline_volume,
+    },
+    memory=16384,
+    cpu=1.0,
+    timeout=7200,
+    nonpreemptible=True,
+)
+def build_single_area(
+    work_item: Dict,
+    branch: str,
+    version: str,
+    calibration_inputs: Dict[str, str],
+    validate: bool = True,
+) -> Dict:
+    """Build a single H5 file for one area.
+
+    Each container processes exactly one work item (state, district,
+    city, or national), validates the output, and writes to the
+    staging volume.
+
+    Args:
+        work_item: {"type": "state|district|city|national", "id": "XX"}
+        branch: Git branch (for repo setup).
+        version: Package version string.
+        calibration_inputs: Dict with weights, dataset, database paths
+            and n_clones/seed.
+        validate: Whether to run per-item validation.
+
+    Returns:
+        Dict with completed, failed, errors, validation_rows keys.
+    """
+    setup_gcp_credentials()
+    setup_repo(branch)
+
+    output_dir = Path(VOLUME_MOUNT) / version
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    work_items_json = json.dumps([work_item])
+
+    repo_root = Path("/root/policyengine-us-data")
+    cal_dir = repo_root / "policyengine_us_data" / "calibration"
+
+    worker_cmd = [
+        "python",
+        "modal_app/worker_script.py",
+        "--work-items",
+        work_items_json,
+        "--weights-path",
+        calibration_inputs["weights"],
+        "--dataset-path",
+        calibration_inputs["dataset"],
+        "--db-path",
+        calibration_inputs["database"],
+        "--output-dir",
+        str(output_dir),
+        "--target-config",
+        str(cal_dir / "target_config.yaml"),
+        "--validation-config",
+        str(cal_dir / "target_config_full.yaml"),
+    ]
+    if "n_clones" in calibration_inputs:
+        worker_cmd.extend(["--n-clones", str(calibration_inputs["n_clones"])])
+    if "seed" in calibration_inputs:
+        worker_cmd.extend(["--seed", str(calibration_inputs["seed"])])
+    if not validate:
+        worker_cmd.append("--no-validate")
+
+    item_key = f"{work_item['type']}:{work_item['id']}"
+    print(f"Building {item_key}...")
+
+    result = subprocess.run(
+        worker_cmd,
+        capture_output=True,  # stderr is read on the failure path below
+        text=True,
+        env=os.environ.copy(),
+    )
+
+    if result.returncode != 0:
+        print(f"FAILED {item_key}: {result.stderr[:200]}")
+        return {
+            "completed": [],
+            "failed": [item_key],
+            "errors": [{"item": item_key, "error": result.stderr}],
+            "validation_rows": [],
+        }
+
+    try:
+        results = json.loads(result.stdout)
+    except json.JSONDecodeError:
+        results = {
+            "completed": [],
+            "failed": [item_key],
+            "errors": [
+                {
+                    "item": item_key,
+                    "error": f"Failed to parse output: {result.stdout[:200]}",
+                }
+            ],
+            "validation_rows": [],
+        }
+
+    staging_volume.commit()
+    print(f"Completed {item_key}")
+    return results
+
+
+@app.function(
+    image=image,
+    secrets=[hf_secret, gcp_secret],
+    volumes={
+        VOLUME_MOUNT: staging_volume,
+        "/pipeline": pipeline_volume,
+    },
+    memory=8192,
+    cpu=1.0,
+    timeout=86400,
+    nonpreemptible=True,
+)
+def queue_coordinator(
+    scope: str = "all",
+    branch: str = "main",
+    n_clones: int = 430,
+    validate: bool = True,
+    max_parallel: int = 50,
+    run_id: str = "",
+) -> Dict:
+    """Queue-based coordinator for H5 builds.
+
+    Generates work items based on scope, spawns one single-item
+    worker per remaining item, and collects results.
+
+    Args:
+        scope: Dataset scope (all/national/state/congressional/local/test).
+        branch: Git branch.
+        n_clones: Number of clones for calibration.
+        validate: Whether to run per-item validation.
+        max_parallel: Requested concurrency cap; currently only logged.
+        run_id: Optional run identifier.
+
+    Returns:
+        Summary dict with completed count, failed items, and
+        validation results.
+ """ + setup_gcp_credentials() + setup_repo(branch) + + version = get_version() + if not run_id: + from policyengine_us_data.utils.run_id import generate_run_id + + sha = os.environ.get("GIT_COMMIT", "unknown") + run_id = generate_run_id(version, sha) + + print("=" * 60) + print(f"Queue Coordinator") + print(f" Run ID: {run_id}") + print(f" Scope: {scope}") + print(f" Branch: {branch}") + print("=" * 60) + + # Load pipeline artifacts + pipeline_volume.reload() + artifacts = Path("/pipeline/artifacts") + weights_path = artifacts / "calibration_weights.npy" + db_path = artifacts / "policy_data.db" + dataset_path = artifacts / "source_imputed_stratified_extended_cps.h5" + + for label, p in [ + ("weights", weights_path), + ("dataset", dataset_path), + ("database", db_path), + ]: + if not p.exists(): + raise RuntimeError( + f"Missing {label} on pipeline volume: {p}. " + f"Run upstream pipeline steps first." + ) + + calibration_inputs = { + "weights": str(weights_path), + "dataset": str(dataset_path), + "database": str(db_path), + "n_clones": n_clones, + "seed": 42, + } + + # Generate work items + items = generate_work_items(scope, str(db_path)) + print(f"Generated {len(items)} work items for scope '{scope}'") + + # Check for already-completed items on volume + version_dir = Path(VOLUME_MOUNT) / version + staging_volume.reload() + completed = get_completed_from_volume(version_dir) + remaining = [ + item for item in items if f"{item['type']}:{item['id']}" not in completed + ] + print(f"Already completed: {len(completed)}, remaining: {len(remaining)}") + + if not remaining: + print("All items already built!") + return { + "run_id": run_id, + "total": len(items), + "completed": len(items), + "failed": 0, + "errors": [], + "validation_rows": [], + } + + # Spawn workers — one per item, up to max_parallel + handles = [] + for item in remaining: + handle = build_single_area.spawn( + work_item=item, + branch=branch, + version=version, + calibration_inputs=calibration_inputs, + 
validate=validate, + ) + handles.append((item, handle)) + if len(handles) % 10 == 0: + print(f" Spawned {len(handles)}/{len(remaining)} workers...") + + print(f"Spawned {len(handles)} workers (max_parallel={max_parallel})") + + # Collect results + all_completed = list(completed) + all_errors = [] + all_validation_rows = [] + + for i, (item, handle) in enumerate(handles): + item_key = f"{item['type']}:{item['id']}" + try: + result = handle.get() + all_completed.extend(result.get("completed", [])) + all_errors.extend(result.get("errors", [])) + all_validation_rows.extend(result.get("validation_rows", [])) + status = "OK" if result.get("completed") else "FAILED" + print(f" [{i + 1}/{len(handles)}] {item_key}: {status}") + except Exception as e: + all_errors.append({"item": item_key, "error": str(e)}) + print(f" [{i + 1}/{len(handles)}] {item_key}: CRASHED - {e}") + + total_completed = len(all_completed) + total_failed = len(all_errors) + + print(f"\nQueue complete: {total_completed} completed, {total_failed} failed") + + return { + "run_id": run_id, + "scope": scope, + "total": len(items), + "completed": total_completed, + "failed": total_failed, + "errors": all_errors[:10], + "validation_rows": all_validation_rows, + } + + @app.function( image=image, secrets=[hf_secret], @@ -404,8 +733,6 @@ def validate_staging(branch: str, version: str, run_id: str = "") -> Dict: result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" @@ -467,8 +794,6 @@ def upload_to_staging( result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" @@ -559,8 +884,6 @@ def promote_publish(branch: str = "main", version: str = "", run_id: str = "") - result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" @@ -709,8 +1032,6 @@ def coordinate_publish( else: fp_result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" @@ -735,8 +1056,6 @@ def coordinate_publish( staging_volume.commit() result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" @@ -899,7 +1218,7 @@ 
def main( n_clones: int = 430, run_id: str = "", ): - """Local entrypoint for Modal CLI.""" + """Local entrypoint for Modal CLI (legacy partition-based).""" result = coordinate_publish.remote( branch=branch, num_workers=num_workers, @@ -913,6 +1232,32 @@ def main( print(result) +@app.local_entrypoint() +def main_queue( + scope: str = "all", + branch: str = "main", + n_clones: int = 430, + max_parallel: int = 50, + run_id: str = "", +): + """Queue-based entrypoint: one container per work item. + + Usage: + modal run modal_app/local_area.py::main_queue --scope=test + modal run modal_app/local_area.py::main_queue --scope=all --max-parallel=50 + """ + result = queue_coordinator.remote( + scope=scope, + branch=branch, + n_clones=n_clones, + max_parallel=max_parallel, + run_id=run_id, + ) + import json + + print(json.dumps(result, indent=2, default=str)) + + @app.function( image=image, secrets=[hf_secret, gcp_secret], @@ -1029,8 +1374,6 @@ def coordinate_national_publish( print("Running national H5 validation...") val_result = subprocess.run( [ - "uv", - "run", "python", "-m", "policyengine_us_data.calibration.validate_national_h5", @@ -1054,8 +1397,6 @@ def coordinate_national_publish( print(f"Uploading {national_h5} to HF staging...") result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" @@ -1129,8 +1470,6 @@ def promote_national_publish( result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" diff --git a/modal_app/pipeline.py b/modal_app/pipeline.py index 95d293d81..e60cab265 100644 --- a/modal_app/pipeline.py +++ b/modal_app/pipeline.py @@ -287,6 +287,7 @@ def _record_step( coordinate_national_publish, promote_publish, promote_national_publish, + queue_coordinator, ) app.include(_local_area_app) @@ -349,8 +350,6 @@ def stage_base_datasets( pairs_json = _json.dumps(files_with_paths) result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" @@ -413,8 +412,6 @@ def upload_run_diagnostics( result = subprocess.run( [ - "uv", - "run", "python", 
"-c", f""" @@ -611,6 +608,7 @@ def run_pipeline( skip_national: bool = False, resume_run_id: str = None, clear_checkpoints: bool = False, + scope: str = "all", ) -> str: """Run the full pipeline end-to-end. @@ -888,17 +886,30 @@ def run_pipeline( step_start = time.time() # Spawn H5 builds (run on separate Modal containers) - print(f" Spawning regional H5 build ({num_workers} workers)...") - regional_h5_handle = coordinate_publish.spawn( - branch=branch, - num_workers=num_workers, - skip_upload=False, - n_clones=n_clones, - validate=True, - run_id=run_id, - expected_fingerprint=meta.fingerprint or "", - ) - print(f" → coordinate_publish fc: {regional_h5_handle.object_id}") + if scope != "all": + # Queue-based: one container per item, filtered by scope + print(f" Spawning queue-based H5 build (scope={scope})...") + regional_h5_handle = queue_coordinator.spawn( + scope=scope, + branch=branch, + n_clones=n_clones, + validate=True, + run_id=run_id, + ) + print(f" → queue_coordinator fc: {regional_h5_handle.object_id}") + else: + # Legacy partition-based: N workers with chunked items + print(f" Spawning regional H5 build ({num_workers} workers)...") + regional_h5_handle = coordinate_publish.spawn( + branch=branch, + num_workers=num_workers, + skip_upload=False, + n_clones=n_clones, + validate=True, + run_id=run_id, + expected_fingerprint=meta.fingerprint or "", + ) + print(f" → coordinate_publish fc: {regional_h5_handle.object_id}") national_h5_handle = None if not skip_national: @@ -1088,8 +1099,6 @@ def promote_run( try: result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" @@ -1143,8 +1152,6 @@ def promote_run( try: result = subprocess.run( [ - "uv", - "run", "python", "-c", f""" diff --git a/modal_app/remote_calibration_runner.py b/modal_app/remote_calibration_runner.py index 30126e24e..71cfb42a7 100644 --- a/modal_app/remote_calibration_runner.py +++ b/modal_app/remote_calibration_runner.py @@ -173,8 +173,6 @@ def _fit_weights_impl( script_path = 
"policyengine_us_data/calibration/unified_calibration.py" cmd = [ - "uv", - "run", "python", script_path, "--device", @@ -234,8 +232,6 @@ def _fit_from_package_impl( script_path = "policyengine_us_data/calibration/unified_calibration.py" cmd = [ - "uv", - "run", "python", script_path, "--device", @@ -345,8 +341,6 @@ def _build_package_impl( pkg_path = f"{artifacts}/calibration_package.pkl" script_path = "policyengine_us_data/calibration/unified_calibration.py" cmd = [ - "uv", - "run", "python", script_path, "--device", diff --git a/policyengine_us_data/tests/test_calibration/__init__.py b/policyengine_us_data/tests/integration/__init__.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/__init__.py rename to policyengine_us_data/tests/integration/__init__.py diff --git a/policyengine_us_data/tests/integration/conftest.py b/policyengine_us_data/tests/integration/conftest.py new file mode 100644 index 000000000..827603ee1 --- /dev/null +++ b/policyengine_us_data/tests/integration/conftest.py @@ -0,0 +1,56 @@ +"""Integration test configuration. + +Skips tests when prerequisite data files are not available. +Provides shared fixtures for calibration database and dataset paths. 
+""" + +import pytest +from sqlalchemy import create_engine + +from policyengine_us_data.db.create_database_tables import ( + create_or_replace_views, +) +from policyengine_us_data.storage import STORAGE_FOLDER + +# ── Skip logic for missing datasets ─────────────────────────── + +NEEDS_ECPS = not (STORAGE_FOLDER / "enhanced_cps_2024.h5").exists() +NEEDS_CPS = not (STORAGE_FOLDER / "cps_2024.h5").exists() + +collect_ignore_glob = [] +if NEEDS_ECPS: + collect_ignore_glob.extend( + [ + "test_enhanced_cps.py", + "test_small_enhanced_cps.py", + "test_sparse_enhanced_cps.py", + "test_sipp_assets.py", + ] + ) +if NEEDS_CPS: + collect_ignore_glob.append("test_cps.py") + + +# ── Shared fixtures for calibration tests ───────────────────── + + +@pytest.fixture(scope="session", autouse=True) +def refresh_policy_db_views(): + db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" + if db_path.exists(): + engine = create_engine(f"sqlite:///{db_path}") + try: + create_or_replace_views(engine) + finally: + engine.dispose() + + +@pytest.fixture(scope="module") +def db_uri(): + db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" + return f"sqlite:///{db_path}" + + +@pytest.fixture(scope="module") +def dataset_path(): + return str(STORAGE_FOLDER / "source_imputed_stratified_extended_cps_2024.h5") diff --git a/policyengine_us_data/tests/test_datasets/test_acs.py b/policyengine_us_data/tests/integration/test_acs.py similarity index 100% rename from policyengine_us_data/tests/test_datasets/test_acs.py rename to policyengine_us_data/tests/integration/test_acs.py diff --git a/policyengine_us_data/tests/test_datasets/test_census_cps.py b/policyengine_us_data/tests/integration/test_census_cps.py similarity index 100% rename from policyengine_us_data/tests/test_datasets/test_census_cps.py rename to policyengine_us_data/tests/integration/test_census_cps.py diff --git a/policyengine_us_data/tests/test_datasets/test_cps.py b/policyengine_us_data/tests/integration/test_cps.py 
similarity index 63% rename from policyengine_us_data/tests/test_datasets/test_cps.py rename to policyengine_us_data/tests/integration/test_cps.py index 3073d4319..ed49a3e7a 100644 --- a/policyengine_us_data/tests/test_datasets/test_cps.py +++ b/policyengine_us_data/tests/integration/test_cps.py @@ -1,4 +1,31 @@ +"""Integration tests for CPS dataset (requires cps_2024.h5).""" + import numpy as np +import pytest + + +@pytest.fixture(scope="module") +def cps_sim(): + from policyengine_us_data.datasets.cps import CPS_2024 + from policyengine_us import Microsimulation + + return Microsimulation(dataset=CPS_2024) + + +# ── Sanity checks ───────────────────────────────────────────── + + +def test_cps_employment_income_positive(cps_sim): + total = cps_sim.calculate("employment_income").sum() + assert total > 5e12, f"CPS employment_income sum is {total:.2e}, expected > 5T." + + +def test_cps_household_count(cps_sim): + total_hh = cps_sim.calculate("household_weight").values.sum() + assert 100e6 < total_hh < 200e6, f"CPS total households = {total_hh:.2e}." 
+ + +# ── Calibration checks ──────────────────────────────────────── def test_cps_has_auto_loan_interest(): diff --git a/policyengine_us_data/tests/test_database_build.py b/policyengine_us_data/tests/integration/test_database_build.py similarity index 100% rename from policyengine_us_data/tests/test_database_build.py rename to policyengine_us_data/tests/integration/test_database_build.py diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/integration/test_enhanced_cps.py similarity index 75% rename from policyengine_us_data/tests/test_datasets/test_enhanced_cps.py rename to policyengine_us_data/tests/integration/test_enhanced_cps.py index 3f5f0759b..53adc1901 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/integration/test_enhanced_cps.py @@ -1,3 +1,90 @@ +"""Integration tests for Enhanced CPS dataset (requires enhanced_cps_2024.h5).""" + +import pytest + + +@pytest.fixture(scope="module") +def ecps_sim(): + from policyengine_us_data.datasets.cps import EnhancedCPS_2024 + from policyengine_us import Microsimulation + + return Microsimulation(dataset=EnhancedCPS_2024) + + +# ── Sanity checks ───────────────────────────────────────────── + + +def test_ecps_employment_income_positive(ecps_sim): + """Employment income must be in the trillions, not zero.""" + total = ecps_sim.calculate("employment_income").sum() + assert total > 5e12, ( + f"employment_income sum is {total:.2e}, expected > 5T. " + "Likely missing employment_income_before_lsr in dataset." + ) + + +def test_ecps_self_employment_income_positive(ecps_sim): + total = ecps_sim.calculate("self_employment_income").sum() + assert total > 50e9, f"self_employment_income sum is {total:.2e}, expected > 50B." 
+ + +def test_ecps_household_count(ecps_sim): + """Household count should be roughly 130-160M.""" + total_hh = ecps_sim.calculate("household_weight").values.sum() + assert 100e6 < total_hh < 200e6, ( + f"Total households = {total_hh:.2e}, expected 100M-200M." + ) + + +def test_ecps_person_count(ecps_sim): + """Weighted person count should be roughly 330M.""" + total_people = ecps_sim.calculate("household_weight", map_to="person").values.sum() + assert 250e6 < total_people < 400e6, ( + f"Total people = {total_people:.2e}, expected 250M-400M." + ) + + +def test_ecps_poverty_rate_reasonable(ecps_sim): + """SPM poverty rate should be 5-30%, not 40%+.""" + in_poverty = ecps_sim.calculate("person_in_poverty", map_to="person") + rate = in_poverty.mean() + assert 0.05 < rate < 0.30, ( + f"Poverty rate = {rate:.1%}, expected 5-30%. " + "If ~40%, income variables are likely zero." + ) + + +def test_ecps_income_tax_positive(ecps_sim): + """Federal income tax revenue should be in the trillions.""" + total = ecps_sim.calculate("income_tax").sum() + assert total > 1e12, f"income_tax sum is {total:.2e}, expected > 1T." + + +def test_ecps_mean_employment_income_reasonable(ecps_sim): + """Mean employment income per person should be $15k-$80k.""" + income = ecps_sim.calculate("employment_income", map_to="person") + mean = income.mean() + assert 15_000 < mean < 80_000, ( + f"Mean employment income = ${mean:,.0f}, expected $15k-$80k."
+ ) + + +def test_ecps_file_size(): + """Enhanced CPS H5 file should be >100MB.""" + from policyengine_us_data.storage import STORAGE_FOLDER + + path = STORAGE_FOLDER / "enhanced_cps_2024.h5" + if not path.exists(): + pytest.skip("enhanced_cps_2024.h5 not found") + size_mb = path.stat().st_size / (1024 * 1024) + assert size_mb > 100, ( + f"enhanced_cps_2024.h5 is only {size_mb:.1f}MB, expected >100MB" + ) + + +# ── Feature checks ──────────────────────────────────────────── + + def test_ecps_employment_income_direct(): """Direct check that employment income from the actual dataset is > 5T. diff --git a/tests/test_no_formula_variables_stored.py b/policyengine_us_data/tests/integration/test_extended_cps.py similarity index 100% rename from tests/test_no_formula_variables_stored.py rename to policyengine_us_data/tests/integration/test_extended_cps.py diff --git a/policyengine_us_data/tests/test_datasets/test_sipp_assets.py b/policyengine_us_data/tests/integration/test_sipp_assets.py similarity index 100% rename from policyengine_us_data/tests/test_datasets/test_sipp_assets.py rename to policyengine_us_data/tests/integration/test_sipp_assets.py diff --git a/policyengine_us_data/tests/test_datasets/test_small_enhanced_cps.py b/policyengine_us_data/tests/integration/test_small_enhanced_cps.py similarity index 100% rename from policyengine_us_data/tests/test_datasets/test_small_enhanced_cps.py rename to policyengine_us_data/tests/integration/test_small_enhanced_cps.py diff --git a/policyengine_us_data/tests/test_calibration/test_xw_consistency.py b/policyengine_us_data/tests/integration/test_source_imputed_cps_consistency.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_xw_consistency.py rename to policyengine_us_data/tests/integration/test_source_imputed_cps_consistency.py diff --git a/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py b/policyengine_us_data/tests/integration/test_source_imputed_cps_masking.py 
similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py rename to policyengine_us_data/tests/integration/test_source_imputed_cps_masking.py diff --git a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py b/policyengine_us_data/tests/integration/test_sparse_enhanced_cps.py similarity index 89% rename from policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py rename to policyengine_us_data/tests/integration/test_sparse_enhanced_cps.py index d5db2a715..a2b3f6c02 100644 --- a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py +++ b/policyengine_us_data/tests/integration/test_sparse_enhanced_cps.py @@ -1,3 +1,5 @@ +"""Integration tests for Sparse Enhanced CPS dataset (requires enhanced_cps_2024.h5).""" + import pytest from pathlib import Path import logging @@ -25,6 +27,33 @@ def sim(data): return Microsimulation(dataset=data) +@pytest.fixture(scope="module") +def sparse_sim(): + path = STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5" + if not path.exists(): + pytest.skip("sparse_enhanced_cps_2024.h5 not found") + return Microsimulation(dataset=Dataset.from_file(path)) + + +# ── Sparse dataset sanity checks ────────────────────────────── + + +def test_sparse_household_count(sparse_sim): + total_hh = sparse_sim.calculate("household_weight").values.sum() + assert 100e6 < total_hh < 200e6, ( + f"Sparse total households = {total_hh:.2e}, expected 100M-200M." + ) + + +def test_sparse_poverty_rate_reasonable(sparse_sim): + in_poverty = sparse_sim.calculate("person_in_poverty", map_to="person") + rate = in_poverty.mean() + assert 0.05 < rate < 0.30, f"Sparse poverty rate = {rate:.1%}, expected 5-30%." 
+ + +# ── Reweighting and calibration checks ──────────────────────── + + @pytest.mark.filterwarnings("ignore:DataFrame is highly fragmented") @pytest.mark.filterwarnings("ignore:The distutils package is deprecated") @pytest.mark.filterwarnings( diff --git a/policyengine_us_data/tests/test_calibration/conftest.py b/policyengine_us_data/tests/test_calibration/conftest.py deleted file mode 100644 index 9c7a21790..000000000 --- a/policyengine_us_data/tests/test_calibration/conftest.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Shared fixtures for local area calibration tests.""" - -import pytest -from sqlalchemy import create_engine - -from policyengine_us_data.db.create_database_tables import ( - create_or_replace_views, -) -from policyengine_us_data.storage import STORAGE_FOLDER - - -@pytest.fixture(scope="session", autouse=True) -def refresh_policy_db_views(): - db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" - if db_path.exists(): - engine = create_engine(f"sqlite:///{db_path}") - try: - create_or_replace_views(engine) - finally: - engine.dispose() - - -@pytest.fixture(scope="module") -def db_uri(): - db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" - return f"sqlite:///{db_path}" - - -@pytest.fixture(scope="module") -def dataset_path(): - return str(STORAGE_FOLDER / "source_imputed_stratified_extended_cps_2024.h5") diff --git a/policyengine_us_data/tests/test_datasets/conftest.py b/policyengine_us_data/tests/test_datasets/conftest.py deleted file mode 100644 index 4b886225e..000000000 --- a/policyengine_us_data/tests/test_datasets/conftest.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Skip dataset tests that need full data build artifacts. - -In basic CI (full_suite=false), H5 files are not built locally -and Microsimulation requires ~16GB RAM. These tests run inside -Modal containers (32GB) during full_suite=true builds. 
-""" - -from policyengine_us_data.storage import STORAGE_FOLDER - -NEEDS_ECPS = not (STORAGE_FOLDER / "enhanced_cps_2024.h5").exists() -NEEDS_CPS = not (STORAGE_FOLDER / "cps_2024.h5").exists() - -collect_ignore_glob = [] -if NEEDS_ECPS: - collect_ignore_glob.extend( - [ - "test_enhanced_cps.py", - "test_dataset_sanity.py", - "test_small_enhanced_cps.py", - "test_sparse_enhanced_cps.py", - "test_sipp_assets.py", - ] - ) -if NEEDS_CPS: - collect_ignore_glob.append("test_cps.py") diff --git a/policyengine_us_data/tests/test_datasets/test_dataset_sanity.py b/policyengine_us_data/tests/test_datasets/test_dataset_sanity.py deleted file mode 100644 index 1a8bdba4d..000000000 --- a/policyengine_us_data/tests/test_datasets/test_dataset_sanity.py +++ /dev/null @@ -1,147 +0,0 @@ -"""Sanity checks for built datasets. - -Catch catastrophic data issues: missing income variables, wrong -population counts, corrupted files, or undersized H5 outputs. -These run after every data build and would have caught the -enhanced CPS overwrite bug (PR #569) where -employment_income_before_lsr was dropped, zeroing all income. -""" - -import pytest - - -@pytest.fixture(scope="module") -def ecps_sim(): - from policyengine_us_data.datasets.cps import EnhancedCPS_2024 - from policyengine_us import Microsimulation - - return Microsimulation(dataset=EnhancedCPS_2024) - - -@pytest.fixture(scope="module") -def cps_sim(): - from policyengine_us_data.datasets.cps import CPS_2024 - from policyengine_us import Microsimulation - - return Microsimulation(dataset=CPS_2024) - - -# ── Enhanced CPS sanity checks ────────────────────────────────── - - -def test_ecps_employment_income_positive(ecps_sim): - """Employment income must be in the trillions, not zero.""" - total = ecps_sim.calculate("employment_income").sum() - assert total > 5e12, ( - f"employment_income sum is {total:.2e}, expected > 5T. " - "Likely missing employment_income_before_lsr in dataset." 
- ) - - -def test_ecps_self_employment_income_positive(ecps_sim): - total = ecps_sim.calculate("self_employment_income").sum() - assert total > 50e9, f"self_employment_income sum is {total:.2e}, expected > 50B." - - -def test_ecps_household_count(ecps_sim): - """Household count should be roughly 130-160M.""" - total_hh = ecps_sim.calculate("household_weight").values.sum() - assert 100e6 < total_hh < 200e6, ( - f"Total households = {total_hh:.2e}, expected 100M-200M." - ) - - -def test_ecps_person_count(ecps_sim): - """Weighted person count should be roughly 330M.""" - total_people = ecps_sim.calculate("household_weight", map_to="person").values.sum() - assert 250e6 < total_people < 400e6, ( - f"Total people = {total_people:.2e}, expected 250M-400M." - ) - - -def test_ecps_poverty_rate_reasonable(ecps_sim): - """SPM poverty rate should be 8-25%, not 40%+.""" - in_poverty = ecps_sim.calculate("person_in_poverty", map_to="person") - rate = in_poverty.mean() - assert 0.05 < rate < 0.30, ( - f"Poverty rate = {rate:.1%}, expected 5-30%. " - "If ~40%, income variables are likely zero." - ) - - -def test_ecps_income_tax_positive(ecps_sim): - """Federal income tax revenue should be in the trillions.""" - total = ecps_sim.calculate("income_tax").sum() - assert total > 1e12, f"income_tax sum is {total:.2e}, expected > 1T." - - -def test_ecps_mean_employment_income_reasonable(ecps_sim): - """Mean employment income per person should be $20k-$60k.""" - income = ecps_sim.calculate("employment_income", map_to="person") - mean = income.mean() - assert 15_000 < mean < 80_000, ( - f"Mean employment income = ${mean:,.0f}, expected $15k-$80k." - ) - - -# ── CPS sanity checks ─────────────────────────────────────────── - - -def test_cps_employment_income_positive(cps_sim): - total = cps_sim.calculate("employment_income").sum() - assert total > 5e12, f"CPS employment_income sum is {total:.2e}, expected > 5T." 
- - -def test_cps_household_count(cps_sim): - total_hh = cps_sim.calculate("household_weight").values.sum() - assert 100e6 < total_hh < 200e6, f"CPS total households = {total_hh:.2e}." - - -# ── Sparse Enhanced CPS sanity checks ───────────────────────── - - -@pytest.fixture(scope="module") -def sparse_sim(): - from policyengine_core.data import Dataset - from policyengine_us import Microsimulation - from policyengine_us_data.storage import STORAGE_FOLDER - - path = STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5" - if not path.exists(): - pytest.skip("sparse_enhanced_cps_2024.h5 not found") - return Microsimulation(dataset=Dataset.from_file(path)) - - -def test_sparse_employment_income_positive(sparse_sim): - """Sparse dataset employment income must be in the trillions.""" - total = sparse_sim.calculate("employment_income").sum() - assert total > 5e12, f"Sparse employment_income sum is {total:.2e}, expected > 5T." - - -def test_sparse_household_count(sparse_sim): - total_hh = sparse_sim.calculate("household_weight").values.sum() - assert 100e6 < total_hh < 200e6, ( - f"Sparse total households = {total_hh:.2e}, expected 100M-200M." - ) - - -def test_sparse_poverty_rate_reasonable(sparse_sim): - in_poverty = sparse_sim.calculate("person_in_poverty", map_to="person") - rate = in_poverty.mean() - assert 0.05 < rate < 0.30, f"Sparse poverty rate = {rate:.1%}, expected 5-30%." 
- - -# ── File size checks ─────────────────────────────────────────── - - -def test_ecps_file_size(): - """Enhanced CPS H5 file should be >100MB (was 590MB before bug).""" - from policyengine_us_data.storage import STORAGE_FOLDER - - path = STORAGE_FOLDER / "enhanced_cps_2024.h5" - if not path.exists(): - pytest.skip("enhanced_cps_2024.h5 not found") - size_mb = path.stat().st_size / (1024 * 1024) - assert size_mb > 100, ( - f"enhanced_cps_2024.h5 is only {size_mb:.1f}MB, expected >100MB" - ) diff --git a/policyengine_us_data/tests/test_datasets/__init__.py b/policyengine_us_data/tests/unit/__init__.py similarity index 100% rename from policyengine_us_data/tests/test_datasets/__init__.py rename to policyengine_us_data/tests/unit/__init__.py diff --git a/policyengine_us_data/tests/unit/calibration/__init__.py b/policyengine_us_data/tests/unit/calibration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/policyengine_us_data/tests/test_calibration/create_test_fixture.py b/policyengine_us_data/tests/unit/calibration/create_test_fixture.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/create_test_fixture.py rename to policyengine_us_data/tests/unit/calibration/create_test_fixture.py diff --git a/policyengine_us_data/tests/test_calibration/test_block_assignment.py b/policyengine_us_data/tests/unit/calibration/test_block_assignment.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_block_assignment.py rename to policyengine_us_data/tests/unit/calibration/test_block_assignment.py diff --git a/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py b/policyengine_us_data/tests/unit/calibration/test_clone_and_assign.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_clone_and_assign.py rename to policyengine_us_data/tests/unit/calibration/test_clone_and_assign.py diff --git 
a/policyengine_us_data/tests/test_calibration/test_county_assignment.py b/policyengine_us_data/tests/unit/calibration/test_county_assignment.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_county_assignment.py rename to policyengine_us_data/tests/unit/calibration/test_county_assignment.py diff --git a/policyengine_us_data/tests/test_calibration/test_drop_target_groups.py b/policyengine_us_data/tests/unit/calibration/test_drop_target_groups.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_drop_target_groups.py rename to policyengine_us_data/tests/unit/calibration/test_drop_target_groups.py diff --git a/policyengine_us_data/tests/test_calibration/test_fixture_50hh.h5 b/policyengine_us_data/tests/unit/calibration/test_fixture_50hh.h5 similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_fixture_50hh.h5 rename to policyengine_us_data/tests/unit/calibration/test_fixture_50hh.h5 diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/unit/calibration/test_mortgage_interest.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_mortgage_interest.py rename to policyengine_us_data/tests/unit/calibration/test_mortgage_interest.py diff --git a/policyengine_us_data/tests/test_calibration/test_puf_impute.py b/policyengine_us_data/tests/unit/calibration/test_puf_impute.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_puf_impute.py rename to policyengine_us_data/tests/unit/calibration/test_puf_impute.py diff --git a/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py b/policyengine_us_data/tests/unit/calibration/test_retirement_imputation.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_retirement_imputation.py rename to policyengine_us_data/tests/unit/calibration/test_retirement_imputation.py diff 
--git a/policyengine_us_data/tests/test_calibration/test_source_impute.py b/policyengine_us_data/tests/unit/calibration/test_source_impute.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_source_impute.py rename to policyengine_us_data/tests/unit/calibration/test_source_impute.py diff --git a/policyengine_us_data/tests/test_calibration/test_stacked_dataset_builder.py b/policyengine_us_data/tests/unit/calibration/test_stacked_dataset_builder.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_stacked_dataset_builder.py rename to policyengine_us_data/tests/unit/calibration/test_stacked_dataset_builder.py diff --git a/policyengine_us_data/tests/test_calibration/test_target_config.py b/policyengine_us_data/tests/unit/calibration/test_target_config.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_target_config.py rename to policyengine_us_data/tests/unit/calibration/test_target_config.py diff --git a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py b/policyengine_us_data/tests/unit/calibration/test_unified_calibration.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_unified_calibration.py rename to policyengine_us_data/tests/unit/calibration/test_unified_calibration.py diff --git a/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py b/policyengine_us_data/tests/unit/calibration/test_unified_matrix_builder.py similarity index 99% rename from policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py rename to policyengine_us_data/tests/unit/calibration/test_unified_matrix_builder.py index 4317296d7..536160c08 100644 --- a/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py +++ b/policyengine_us_data/tests/unit/calibration/test_unified_matrix_builder.py @@ -277,8 +277,8 @@ def test_inactive_targets_are_excluded(self): def 
test_legacy_target_overview_without_reform_id(self): b = self._make_builder() _create_legacy_target_overview(self.engine) + b._target_overview_columns = None try: - b._target_overview_columns = None df = b._query_targets({"domain_variables": ["aca_ptc"]}) self.assertGreater(len(df), 0) self.assertIn("reform_id", df.columns) diff --git a/policyengine_us_data/tests/test_calibration/test_validate_staging.py b/policyengine_us_data/tests/unit/calibration/test_validate_staging.py similarity index 100% rename from policyengine_us_data/tests/test_calibration/test_validate_staging.py rename to policyengine_us_data/tests/unit/calibration/test_validate_staging.py diff --git a/policyengine_us_data/tests/unit/datasets/__init__.py b/policyengine_us_data/tests/unit/datasets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/policyengine_us_data/tests/test_datasets/test_county_fips.py b/policyengine_us_data/tests/unit/datasets/test_county_fips.py similarity index 100% rename from policyengine_us_data/tests/test_datasets/test_county_fips.py rename to policyengine_us_data/tests/unit/datasets/test_county_fips.py diff --git a/policyengine_us_data/tests/test_datasets/test_cps_takeup.py b/policyengine_us_data/tests/unit/datasets/test_cps_takeup.py similarity index 100% rename from policyengine_us_data/tests/test_datasets/test_cps_takeup.py rename to policyengine_us_data/tests/unit/datasets/test_cps_takeup.py diff --git a/policyengine_us_data/tests/test_datasets/test_disaggregate_puf.py b/policyengine_us_data/tests/unit/datasets/test_disaggregate_puf.py similarity index 100% rename from policyengine_us_data/tests/test_datasets/test_disaggregate_puf.py rename to policyengine_us_data/tests/unit/datasets/test_disaggregate_puf.py diff --git a/policyengine_us_data/tests/test_datasets/test_irs_puf.py b/policyengine_us_data/tests/unit/datasets/test_irs_puf.py similarity index 100% rename from policyengine_us_data/tests/test_datasets/test_irs_puf.py rename to 
policyengine_us_data/tests/unit/datasets/test_irs_puf.py diff --git a/policyengine_us_data/tests/test_constraint_validation.py b/policyengine_us_data/tests/unit/test_constraint_validation.py similarity index 100% rename from policyengine_us_data/tests/test_constraint_validation.py rename to policyengine_us_data/tests/unit/test_constraint_validation.py diff --git a/policyengine_us_data/tests/test_database.py b/policyengine_us_data/tests/unit/test_database.py similarity index 100% rename from policyengine_us_data/tests/test_database.py rename to policyengine_us_data/tests/unit/test_database.py diff --git a/tests/test_etl_irs_soi_overlay.py b/policyengine_us_data/tests/unit/test_etl_irs_soi_overlay.py similarity index 100% rename from tests/test_etl_irs_soi_overlay.py rename to policyengine_us_data/tests/unit/test_etl_irs_soi_overlay.py diff --git a/policyengine_us_data/tests/test_etl_national_targets.py b/policyengine_us_data/tests/unit/test_etl_national_targets.py similarity index 92% rename from policyengine_us_data/tests/test_etl_national_targets.py rename to policyengine_us_data/tests/unit/test_etl_national_targets.py index 7e38e18be..10e0ece31 100644 --- a/policyengine_us_data/tests/test_etl_national_targets.py +++ b/policyengine_us_data/tests/unit/test_etl_national_targets.py @@ -8,7 +8,6 @@ create_database, ) from policyengine_us_data.db.etl_national_targets import ( - TAX_EXPENDITURE_REFORM_ID, load_national_targets, ) @@ -90,6 +89,7 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp tax_expenditure_df = pd.DataFrame( [ { + "reform_id": 1, "variable": "salt_deduction", "value": 21.247e9, "source": "Joint Committee on Taxation", @@ -97,6 +97,7 @@ def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp "year": 2024, }, { + "reform_id": 5, "variable": "qualified_business_income_deduction", "value": 63.1e9, "source": "Joint Committee on Taxation", @@ -124,16 +125,12 @@ def 
test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeyp assert stale_rows assert all(not target.active for target in stale_rows) - reform_rows = ( - session.query(Target) - .filter(Target.reform_id == TAX_EXPENDITURE_REFORM_ID) - .all() - ) + reform_rows = session.query(Target).filter(Target.reform_id > 0).all() assert len(reform_rows) == 2 assert all(target.active for target in reform_rows) - assert {target.variable for target in reform_rows} == { - "salt_deduction", - "qualified_business_income_deduction", + assert {(target.variable, target.reform_id) for target in reform_rows} == { + ("salt_deduction", 1), + ("qualified_business_income_deduction", 5), } assert all( "Modeled as repeal-based income tax expenditure target" diff --git a/policyengine_us_data/tests/test_etl_state_income_tax.py b/policyengine_us_data/tests/unit/test_etl_state_income_tax.py similarity index 100% rename from policyengine_us_data/tests/test_etl_state_income_tax.py rename to policyengine_us_data/tests/unit/test_etl_state_income_tax.py diff --git a/policyengine_us_data/tests/test_extended_cps.py b/policyengine_us_data/tests/unit/test_extended_cps.py similarity index 100% rename from policyengine_us_data/tests/test_extended_cps.py rename to policyengine_us_data/tests/unit/test_extended_cps.py diff --git a/tests/test_h6_reform.py b/policyengine_us_data/tests/unit/test_h6_reform.py similarity index 100% rename from tests/test_h6_reform.py rename to policyengine_us_data/tests/unit/test_h6_reform.py diff --git a/policyengine_us_data/tests/test_import.py b/policyengine_us_data/tests/unit/test_import.py similarity index 100% rename from policyengine_us_data/tests/test_import.py rename to policyengine_us_data/tests/unit/test_import.py diff --git a/policyengine_us_data/tests/test_modal_resilience.py b/policyengine_us_data/tests/unit/test_modal_resilience.py similarity index 100% rename from policyengine_us_data/tests/test_modal_resilience.py rename to 
policyengine_us_data/tests/unit/test_modal_resilience.py diff --git a/policyengine_us_data/tests/test_pandas3_compatibility.py b/policyengine_us_data/tests/unit/test_pandas3_compatibility.py similarity index 100% rename from policyengine_us_data/tests/test_pandas3_compatibility.py rename to policyengine_us_data/tests/unit/test_pandas3_compatibility.py diff --git a/policyengine_us_data/tests/test_pipeline.py b/policyengine_us_data/tests/unit/test_pipeline.py similarity index 100% rename from policyengine_us_data/tests/test_pipeline.py rename to policyengine_us_data/tests/unit/test_pipeline.py diff --git a/policyengine_us_data/tests/test_puf_impute.py b/policyengine_us_data/tests/unit/test_puf_impute.py similarity index 100% rename from policyengine_us_data/tests/test_puf_impute.py rename to policyengine_us_data/tests/unit/test_puf_impute.py diff --git a/tests/test_refresh_soi_table_targets.py b/policyengine_us_data/tests/unit/test_refresh_soi_table_targets.py similarity index 99% rename from tests/test_refresh_soi_table_targets.py rename to policyengine_us_data/tests/unit/test_refresh_soi_table_targets.py index 2913b5047..2a491e2cd 100644 --- a/tests/test_refresh_soi_table_targets.py +++ b/policyengine_us_data/tests/unit/test_refresh_soi_table_targets.py @@ -4,9 +4,9 @@ import pandas as pd +# Navigate from tests/unit/ up to policyengine_us_data/ MODULE_PATH = ( - Path(__file__).resolve().parent.parent - / "policyengine_us_data" + Path(__file__).resolve().parent.parent.parent / "storage" / "calibration_targets" / "refresh_soi_table_targets.py" diff --git a/policyengine_us_data/tests/test_retirement_limits.py b/policyengine_us_data/tests/unit/test_retirement_limits.py similarity index 100% rename from policyengine_us_data/tests/test_retirement_limits.py rename to policyengine_us_data/tests/unit/test_retirement_limits.py diff --git a/policyengine_us_data/tests/test_schema_views_and_lookups.py b/policyengine_us_data/tests/unit/test_schema_views_and_lookups.py similarity 
index 100% rename from policyengine_us_data/tests/test_schema_views_and_lookups.py rename to policyengine_us_data/tests/unit/test_schema_views_and_lookups.py diff --git a/tests/test_soi_utils.py b/policyengine_us_data/tests/unit/test_soi_utils.py similarity index 97% rename from tests/test_soi_utils.py rename to policyengine_us_data/tests/unit/test_soi_utils.py index d73dce21e..2bf544608 100644 --- a/tests/test_soi_utils.py +++ b/policyengine_us_data/tests/unit/test_soi_utils.py @@ -7,8 +7,9 @@ import pandas as pd -REPO_ROOT = Path(__file__).resolve().parent.parent -PACKAGE_ROOT = REPO_ROOT / "policyengine_us_data" +# Navigate from tests/unit/ up to policyengine_us_data/, then up to repo root +PACKAGE_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = PACKAGE_ROOT.parent def load_soi_module(): diff --git a/policyengine_us_data/tests/test_stochastic_variables.py b/policyengine_us_data/tests/unit/test_stochastic_variables.py similarity index 100% rename from policyengine_us_data/tests/test_stochastic_variables.py rename to policyengine_us_data/tests/unit/test_stochastic_variables.py diff --git a/policyengine_us_data/tests/test_validation_queries.py b/policyengine_us_data/tests/unit/test_validation_queries.py similarity index 100% rename from policyengine_us_data/tests/test_validation_queries.py rename to policyengine_us_data/tests/unit/test_validation_queries.py diff --git a/policyengine_us_data/tests/test_version_manifest.py b/policyengine_us_data/tests/unit/test_version_manifest.py similarity index 100% rename from policyengine_us_data/tests/test_version_manifest.py rename to policyengine_us_data/tests/unit/test_version_manifest.py diff --git a/tests/test_weeks_unemployed.py b/policyengine_us_data/tests/unit/test_weeks_unemployed.py similarity index 89% rename from tests/test_weeks_unemployed.py rename to policyengine_us_data/tests/unit/test_weeks_unemployed.py index d64d8b64c..c277243c5 100644 --- a/tests/test_weeks_unemployed.py +++ 
b/policyengine_us_data/tests/unit/test_weeks_unemployed.py @@ -14,8 +14,9 @@ class TestWeeksUnemployed: def test_lkweeks_in_person_columns(self): """Test that LKWEEKS is in PERSON_COLUMNS, not WKSUNEM.""" # Read the source file directly to check column names - census_cps_path = Path(__file__).parent.parent / ( - "policyengine_us_data/datasets/cps/census_cps.py" + # Navigate from tests/unit/ up to policyengine_us_data/ + census_cps_path = Path(__file__).parent.parent.parent / ( + "datasets/cps/census_cps.py" ) content = census_cps_path.read_text() @@ -27,9 +28,8 @@ def test_lkweeks_in_person_columns(self): def test_cps_uses_lkweeks(self): """Test that cps.py uses LKWEEKS, not WKSUNEM.""" - cps_path = Path(__file__).parent.parent / ( - "policyengine_us_data/datasets/cps/cps.py" - ) + # Navigate from tests/unit/ up to policyengine_us_data/ + cps_path = Path(__file__).parent.parent.parent / ("datasets/cps/cps.py") content = cps_path.read_text() # Check for correct variable reference diff --git a/pyproject.toml b/pyproject.toml index 46e23bfaf..61e17e4ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,9 @@ dev = [ "yaml-changelog>=0.1.7", "build", "tomli", - "itables", "towncrier>=24.8.0", + "itables", + "towncrier>=24.8.0", + "pytest-cov", ] @@ -79,7 +81,8 @@ include-package-data = true [tool.pytest.ini_options] addopts = "-v" testpaths = [ - "policyengine_us_data/tests", + "policyengine_us_data/tests/unit", + "policyengine_us_data/tests/integration", ] [tool.towncrier] diff --git a/tests/test_reproducibility.py b/tests/test_reproducibility.py deleted file mode 100644 index 6ffa34c3e..000000000 --- a/tests/test_reproducibility.py +++ /dev/null @@ -1,272 +0,0 @@ -""" -Reproducibility tests for Enhanced CPS generation. - -These tests ensure the pipeline produces consistent results -and can be reproduced in different environments. 
-""" - -import pytest -import numpy as np -import pandas as pd -from pathlib import Path -import hashlib - - -class TestReproducibility: - """Test suite for reproducibility validation.""" - - def test_environment_setup(self): - """Test that required packages are installed.""" - required_packages = [ - "policyengine_us", - "policyengine_us_data", - "quantile_forest", - "pandas", - "numpy", - "torch", - ] - - for package in required_packages: - try: - __import__(package.replace("-", "_")) - except ImportError: - pytest.fail(f"Required package '{package}' not installed") - - def test_deterministic_imputation(self): - """Test that imputation produces deterministic results with fixed seed.""" - from policyengine_us_data.datasets.cps.enhanced_cps.imputation import ( - QuantileRegressionForestImputer, - ) - - # Create small test data - n_samples = 100 - predictors = pd.DataFrame( - { - "age": np.random.randint(18, 80, n_samples), - "sex": np.random.choice([1, 2], n_samples), - "filing_status": np.random.choice([1, 2], n_samples), - } - ) - - target = pd.Series(np.random.lognormal(10, 1, n_samples)) - - # Run imputation twice with same seed - imputer1 = QuantileRegressionForestImputer(random_state=42) - imputer1.fit(predictors, target) - result1 = imputer1.predict(predictors) - - imputer2 = QuantileRegressionForestImputer(random_state=42) - imputer2.fit(predictors, target) - result2 = imputer2.predict(predictors) - - # Results should be identical - np.testing.assert_array_almost_equal(result1, result2) - - def test_weight_optimization_convergence(self): - """Test that weight optimization converges consistently.""" - from policyengine_us_data.datasets.cps.enhanced_cps.reweight import ( - optimize_weights, - ) - - # Create test loss matrix - n_households = 100 - n_targets = 10 - - loss_matrix = np.random.rand(n_households, n_targets) - targets = np.random.rand(n_targets) * 1e6 - initial_weights = np.ones(n_households) - - # Run optimization twice - weights1, loss1 = 
optimize_weights( - loss_matrix, - targets, - initial_weights, - n_iterations=100, - dropout_rate=0.05, - seed=42, - ) - - weights2, loss2 = optimize_weights( - loss_matrix, - targets, - initial_weights, - n_iterations=100, - dropout_rate=0.05, - seed=42, - ) - - # Results should be very close - np.testing.assert_allclose(weights1, weights2, rtol=1e-5) - assert abs(loss1 - loss2) < 1e-6 - - def test_validation_metrics_stable(self): - """Test that validation metrics are stable across runs.""" - # This would load actual data in practice - # For now, test with synthetic data - - metrics = { - "gini_coefficient": 0.521, - "top_10_share": 0.472, - "top_1_share": 0.198, - "poverty_rate": 0.116, - } - - # In practice, would calculate from data - # Here we verify expected ranges - assert 0.50 <= metrics["gini_coefficient"] <= 0.55 - assert 0.45 <= metrics["top_10_share"] <= 0.50 - assert 0.18 <= metrics["top_1_share"] <= 0.22 - assert 0.10 <= metrics["poverty_rate"] <= 0.13 - - def test_output_checksums(self): - """Test that output files match expected checksums.""" - test_data_dir = Path("data/test") - - if not test_data_dir.exists(): - pytest.skip("Test data not generated") - - checksum_file = test_data_dir / "checksums.txt" - if not checksum_file.exists(): - pytest.skip("Checksum file not found") - - # Read expected checksums - expected_checksums = {} - with open(checksum_file) as f: - for line in f: - if line.strip(): - filename, checksum = line.strip().split(": ") - expected_checksums[filename] = checksum - - # Verify files - for filename, expected_checksum in expected_checksums.items(): - file_path = test_data_dir / filename - if file_path.exists() and filename != "checksums.txt": - with open(file_path, "rb") as f: - actual_checksum = hashlib.sha256(f.read()).hexdigest() - assert actual_checksum == expected_checksum, ( - f"Checksum mismatch for {filename}" - ) - - def test_memory_usage(self): - """Test that memory usage stays within bounds.""" - import psutil - 
import os - - process = psutil.Process(os.getpid()) - memory_before = process.memory_info().rss / 1024 / 1024 # MB - - # Run a small imputation task - n_samples = 10000 - data = pd.DataFrame( - { - "age": np.random.randint(18, 80, n_samples), - "income": np.random.lognormal(10, 1, n_samples), - } - ) - - # Process data - data["income_bracket"] = pd.qcut(data["income"], 10) - - memory_after = process.memory_info().rss / 1024 / 1024 # MB - memory_used = memory_after - memory_before - - # Should use less than 500MB for this small task - assert memory_used < 500, f"Used {memory_used:.1f}MB, expected <500MB" - - def test_platform_independence(self): - """Test that code works across platforms.""" - import platform - - system = platform.system() - assert system in [ - "Linux", - "Darwin", - "Windows", - ], f"Unsupported platform: {system}" - - # Test path handling - test_path = Path("data") / "test" / "file.csv" - assert str(test_path).replace("\\", "/") == "data/test/file.csv" - - def test_api_credentials_documented(self): - """Test that API credential requirements are documented.""" - readme_path = Path("REPRODUCTION.md") - assert readme_path.exists(), "REPRODUCTION.md not found" - - content = readme_path.read_text() - - # Check for credential documentation - required_sections = [ - "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN", - "CENSUS_API_KEY", - "PUF Data Access", - ] - - for section in required_sections: - assert section in content, f"Missing documentation for '{section}'" - - def test_synthetic_data_generation(self): - """Test that synthetic data can be generated for testing.""" - from scripts.generate_test_data import ( - generate_synthetic_cps, - generate_synthetic_puf, - ) - - # Generate small datasets - households, persons = generate_synthetic_cps(n_households=10) - puf = generate_synthetic_puf(n_returns=50) - - # Verify structure - assert len(households) == 10 - assert len(persons) > 10 # Multiple persons per household - assert len(puf) == 50 - - # Verify 
required columns - assert "household_id" in households.columns - assert "person_id" in persons.columns - assert "wages" in puf.columns - - def test_smoke_test_pipeline(self): - """Run a minimal version of the full pipeline.""" - # This test would be marked as slow and only run in CI - pytest.skip("Full pipeline test - run with --runslow") - - # Would include: - # 1. Load test data - # 2. Run imputation on subset - # 3. Run reweighting with few targets - # 4. Validate outputs exist - - def test_documentation_completeness(self): - """Test that all necessary documentation exists.""" - required_docs = [ - "README.md", - "REPRODUCTION.md", - "CLAUDE.md", - "docs/methodology.md", - "docs/data.md", - ] - - for doc in required_docs: - doc_path = Path(doc) - assert doc_path.exists(), f"Missing documentation: {doc}" - - # Check not empty - content = doc_path.read_text() - assert len(content) > 100, f"Documentation too short: {doc}" - - -@pytest.mark.slow -class TestFullReproduction: - """Full reproduction tests (run with --runslow flag).""" - - def test_full_pipeline_subset(self): - """Test full pipeline on data subset.""" - # This would run the complete pipeline on a small subset - # Taking ~10 minutes instead of hours - pass - - def test_validation_dashboard(self): - """Test that validation dashboard can be generated.""" - # Would test dashboard generation - pass diff --git a/uv.lock b/uv.lock index e554b94ef..404713f5e 100644 --- a/uv.lock +++ b/uv.lock @@ -364,6 +364,60 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, ] +[[package]] +name = "coverage" +version = "7.13.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/9d/e0/70553e3000e345daff267cec284ce4cbf3fc141b6da229ac52775b5428f1/coverage-7.13.5.tar.gz", hash = "sha256:c81f6515c4c40141f83f502b07bbfa5c240ba25bbe73da7b33f1e5b6120ff179", size = 915967, upload-time = "2026-03-17T10:33:18.341Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/c3/a396306ba7db865bf96fc1fb3b7fd29bcbf3d829df642e77b13555163cd6/coverage-7.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:460cf0114c5016fa841214ff5564aa4864f11948da9440bc97e21ad1f4ba1e01", size = 219554, upload-time = "2026-03-17T10:30:42.208Z" }, + { url = "https://files.pythonhosted.org/packages/a6/16/a68a19e5384e93f811dccc51034b1fd0b865841c390e3c931dcc4699e035/coverage-7.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0e223ce4b4ed47f065bfb123687686512e37629be25cc63728557ae7db261422", size = 219908, upload-time = "2026-03-17T10:30:43.906Z" }, + { url = "https://files.pythonhosted.org/packages/29/72/20b917c6793af3a5ceb7fb9c50033f3ec7865f2911a1416b34a7cfa0813b/coverage-7.13.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6e3370441f4513c6252bf042b9c36d22491142385049243253c7e48398a15a9f", size = 251419, upload-time = "2026-03-17T10:30:45.545Z" }, + { url = "https://files.pythonhosted.org/packages/8c/49/cd14b789536ac6a4778c453c6a2338bc0a2fb60c5a5a41b4008328b9acc1/coverage-7.13.5-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:03ccc709a17a1de074fb1d11f217342fb0d2b1582ed544f554fc9fc3f07e95f5", size = 254159, upload-time = "2026-03-17T10:30:47.204Z" }, + { url = "https://files.pythonhosted.org/packages/9d/00/7b0edcfe64e2ed4c0340dac14a52ad0f4c9bd0b8b5e531af7d55b703db7c/coverage-7.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3f4818d065964db3c1c66dc0fbdac5ac692ecbc875555e13374fdbe7eedb4376", size = 255270, upload-time = "2026-03-17T10:30:48.812Z" }, + { url = 
"https://files.pythonhosted.org/packages/93/89/7ffc4ba0f5d0a55c1e84ea7cee39c9fc06af7b170513d83fbf3bbefce280/coverage-7.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:012d5319e66e9d5a218834642d6c35d265515a62f01157a45bcc036ecf947256", size = 257538, upload-time = "2026-03-17T10:30:50.77Z" }, + { url = "https://files.pythonhosted.org/packages/81/bd/73ddf85f93f7e6fa83e77ccecb6162d9415c79007b4bc124008a4995e4a7/coverage-7.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8dd02af98971bdb956363e4827d34425cb3df19ee550ef92855b0acb9c7ce51c", size = 251821, upload-time = "2026-03-17T10:30:52.5Z" }, + { url = "https://files.pythonhosted.org/packages/a0/81/278aff4e8dec4926a0bcb9486320752811f543a3ce5b602cc7a29978d073/coverage-7.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f08fd75c50a760c7eb068ae823777268daaf16a80b918fa58eea888f8e3919f5", size = 253191, upload-time = "2026-03-17T10:30:54.543Z" }, + { url = "https://files.pythonhosted.org/packages/70/ee/fe1621488e2e0a58d7e94c4800f0d96f79671553488d401a612bebae324b/coverage-7.13.5-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:843ea8643cf967d1ac7e8ecd4bb00c99135adf4816c0c0593fdcc47b597fcf09", size = 251337, upload-time = "2026-03-17T10:30:56.663Z" }, + { url = "https://files.pythonhosted.org/packages/37/a6/f79fb37aa104b562207cc23cb5711ab6793608e246cae1e93f26b2236ed9/coverage-7.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:9d44d7aa963820b1b971dbecd90bfe5fe8f81cff79787eb6cca15750bd2f79b9", size = 255404, upload-time = "2026-03-17T10:30:58.427Z" }, + { url = "https://files.pythonhosted.org/packages/75/f0/ed15262a58ec81ce457ceb717b7f78752a1713556b19081b76e90896e8d4/coverage-7.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:7132bed4bd7b836200c591410ae7d97bf7ae8be6fc87d160b2bd881df929e7bf", size = 250903, upload-time = "2026-03-17T10:31:00.093Z" }, + { url = 
"https://files.pythonhosted.org/packages/0f/e9/9129958f20e7e9d4d56d51d42ccf708d15cac355ff4ac6e736e97a9393d2/coverage-7.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a698e363641b98843c517817db75373c83254781426e94ada3197cabbc2c919c", size = 252780, upload-time = "2026-03-17T10:31:01.916Z" }, + { url = "https://files.pythonhosted.org/packages/a4/d7/0ad9b15812d81272db94379fe4c6df8fd17781cc7671fdfa30c76ba5ff7b/coverage-7.13.5-cp312-cp312-win32.whl", hash = "sha256:bdba0a6b8812e8c7df002d908a9a2ea3c36e92611b5708633c50869e6d922fdf", size = 222093, upload-time = "2026-03-17T10:31:03.642Z" }, + { url = "https://files.pythonhosted.org/packages/29/3d/821a9a5799fac2556bcf0bd37a70d1d11fa9e49784b6d22e92e8b2f85f18/coverage-7.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:d2c87e0c473a10bffe991502eac389220533024c8082ec1ce849f4218dded810", size = 222900, upload-time = "2026-03-17T10:31:05.651Z" }, + { url = "https://files.pythonhosted.org/packages/d4/fa/2238c2ad08e35cf4f020ea721f717e09ec3152aea75d191a7faf3ef009a8/coverage-7.13.5-cp312-cp312-win_arm64.whl", hash = "sha256:bf69236a9a81bdca3bff53796237aab096cdbf8d78a66ad61e992d9dac7eb2de", size = 221515, upload-time = "2026-03-17T10:31:07.293Z" }, + { url = "https://files.pythonhosted.org/packages/74/8c/74fedc9663dcf168b0a059d4ea756ecae4da77a489048f94b5f512a8d0b3/coverage-7.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ec4af212df513e399cf11610cc27063f1586419e814755ab362e50a85ea69c1", size = 219576, upload-time = "2026-03-17T10:31:09.045Z" }, + { url = "https://files.pythonhosted.org/packages/0c/c9/44fb661c55062f0818a6ffd2685c67aa30816200d5f2817543717d4b92eb/coverage-7.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:941617e518602e2d64942c88ec8499f7fbd49d3f6c4327d3a71d43a1973032f3", size = 219942, upload-time = "2026-03-17T10:31:10.708Z" }, + { url = 
"https://files.pythonhosted.org/packages/5f/13/93419671cee82b780bab7ea96b67c8ef448f5f295f36bf5031154ec9a790/coverage-7.13.5-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:da305e9937617ee95c2e39d8ff9f040e0487cbf1ac174f777ed5eddd7a7c1f26", size = 250935, upload-time = "2026-03-17T10:31:12.392Z" }, + { url = "https://files.pythonhosted.org/packages/ac/68/1666e3a4462f8202d836920114fa7a5ee9275d1fa45366d336c551a162dd/coverage-7.13.5-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:78e696e1cc714e57e8b25760b33a8b1026b7048d270140d25dafe1b0a1ee05a3", size = 253541, upload-time = "2026-03-17T10:31:14.247Z" }, + { url = "https://files.pythonhosted.org/packages/4e/5e/3ee3b835647be646dcf3c65a7c6c18f87c27326a858f72ab22c12730773d/coverage-7.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02ca0eed225b2ff301c474aeeeae27d26e2537942aa0f87491d3e147e784a82b", size = 254780, upload-time = "2026-03-17T10:31:16.193Z" }, + { url = "https://files.pythonhosted.org/packages/44/b3/cb5bd1a04cfcc49ede6cd8409d80bee17661167686741e041abc7ee1b9a9/coverage-7.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:04690832cbea4e4663d9149e05dba142546ca05cb1848816760e7f58285c970a", size = 256912, upload-time = "2026-03-17T10:31:17.89Z" }, + { url = "https://files.pythonhosted.org/packages/1b/66/c1dceb7b9714473800b075f5c8a84f4588f887a90eb8645282031676e242/coverage-7.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0590e44dd2745c696a778f7bab6aa95256de2cbc8b8cff4f7db8ff09813d6969", size = 251165, upload-time = "2026-03-17T10:31:19.605Z" }, + { url = "https://files.pythonhosted.org/packages/b7/62/5502b73b97aa2e53ea22a39cf8649ff44827bef76d90bf638777daa27a9d/coverage-7.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:d7cfad2d6d81dd298ab6b89fe72c3b7b05ec7544bdda3b707ddaecff8d25c161", size = 252908, upload-time = "2026-03-17T10:31:21.312Z" }, + { url = "https://files.pythonhosted.org/packages/7d/37/7792c2d69854397ca77a55c4646e5897c467928b0e27f2d235d83b5d08c6/coverage-7.13.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e092b9499de38ae0fbfbc603a74660eb6ff3e869e507b50d85a13b6db9863e15", size = 250873, upload-time = "2026-03-17T10:31:23.565Z" }, + { url = "https://files.pythonhosted.org/packages/a3/23/bc866fb6163be52a8a9e5d708ba0d3b1283c12158cefca0a8bbb6e247a43/coverage-7.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:48c39bc4a04d983a54a705a6389512883d4a3b9862991b3617d547940e9f52b1", size = 255030, upload-time = "2026-03-17T10:31:25.58Z" }, + { url = "https://files.pythonhosted.org/packages/7d/8b/ef67e1c222ef49860701d346b8bbb70881bef283bd5f6cbba68a39a086c7/coverage-7.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:2d3807015f138ffea1ed9afeeb8624fd781703f2858b62a8dd8da5a0994c57b6", size = 250694, upload-time = "2026-03-17T10:31:27.316Z" }, + { url = "https://files.pythonhosted.org/packages/46/0d/866d1f74f0acddbb906db212e096dee77a8e2158ca5e6bb44729f9d93298/coverage-7.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ee2aa19e03161671ec964004fb74b2257805d9710bf14a5c704558b9d8dbaf17", size = 252469, upload-time = "2026-03-17T10:31:29.472Z" }, + { url = "https://files.pythonhosted.org/packages/7a/f5/be742fec31118f02ce42b21c6af187ad6a344fed546b56ca60caacc6a9a0/coverage-7.13.5-cp313-cp313-win32.whl", hash = "sha256:ce1998c0483007608c8382f4ff50164bfc5bd07a2246dd272aa4043b75e61e85", size = 222112, upload-time = "2026-03-17T10:31:31.526Z" }, + { url = "https://files.pythonhosted.org/packages/66/40/7732d648ab9d069a46e686043241f01206348e2bbf128daea85be4d6414b/coverage-7.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:631efb83f01569670a5e866ceb80fe483e7c159fac6f167e6571522636104a0b", size = 222923, upload-time = "2026-03-17T10:31:33.633Z" }, + { url 
= "https://files.pythonhosted.org/packages/48/af/fea819c12a095781f6ccd504890aaddaf88b8fab263c4940e82c7b770124/coverage-7.13.5-cp313-cp313-win_arm64.whl", hash = "sha256:f4cd16206ad171cbc2470dbea9103cf9a7607d5fe8c242fdf1edf36174020664", size = 221540, upload-time = "2026-03-17T10:31:35.445Z" }, + { url = "https://files.pythonhosted.org/packages/23/d2/17879af479df7fbbd44bd528a31692a48f6b25055d16482fdf5cdb633805/coverage-7.13.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0428cbef5783ad91fe240f673cc1f76b25e74bbfe1a13115e4aa30d3f538162d", size = 220262, upload-time = "2026-03-17T10:31:37.184Z" }, + { url = "https://files.pythonhosted.org/packages/5b/4c/d20e554f988c8f91d6a02c5118f9abbbf73a8768a3048cb4962230d5743f/coverage-7.13.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e0b216a19534b2427cc201a26c25da4a48633f29a487c61258643e89d28200c0", size = 220617, upload-time = "2026-03-17T10:31:39.245Z" }, + { url = "https://files.pythonhosted.org/packages/29/9c/f9f5277b95184f764b24e7231e166dfdb5780a46d408a2ac665969416d61/coverage-7.13.5-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:972a9cd27894afe4bc2b1480107054e062df08e671df7c2f18c205e805ccd806", size = 261912, upload-time = "2026-03-17T10:31:41.324Z" }, + { url = "https://files.pythonhosted.org/packages/d5/f6/7f1ab39393eeb50cfe4747ae8ef0e4fc564b989225aa1152e13a180d74f8/coverage-7.13.5-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4b59148601efcd2bac8c4dbf1f0ad6391693ccf7a74b8205781751637076aee3", size = 263987, upload-time = "2026-03-17T10:31:43.724Z" }, + { url = "https://files.pythonhosted.org/packages/a0/d7/62c084fb489ed9c6fbdf57e006752e7c516ea46fd690e5ed8b8617c7d52e/coverage-7.13.5-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:505d7083c8b0c87a8fa8c07370c285847c1f77739b22e299ad75a6af6c32c5c9", size = 266416, upload-time = "2026-03-17T10:31:45.769Z" }, + { url = 
"https://files.pythonhosted.org/packages/a9/f6/df63d8660e1a0bff6125947afda112a0502736f470d62ca68b288ea762d8/coverage-7.13.5-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:60365289c3741e4db327e7baff2a4aaacf22f788e80fa4683393891b70a89fbd", size = 267558, upload-time = "2026-03-17T10:31:48.293Z" }, + { url = "https://files.pythonhosted.org/packages/5b/02/353ca81d36779bd108f6d384425f7139ac3c58c750dcfaafe5d0bee6436b/coverage-7.13.5-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1b88c69c8ef5d4b6fe7dea66d6636056a0f6a7527c440e890cf9259011f5e606", size = 261163, upload-time = "2026-03-17T10:31:50.125Z" }, + { url = "https://files.pythonhosted.org/packages/2c/16/2e79106d5749bcaf3aee6d309123548e3276517cd7851faa8da213bc61bf/coverage-7.13.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5b13955d31d1633cf9376908089b7cebe7d15ddad7aeaabcbe969a595a97e95e", size = 263981, upload-time = "2026-03-17T10:31:51.961Z" }, + { url = "https://files.pythonhosted.org/packages/29/c7/c29e0c59ffa6942030ae6f50b88ae49988e7e8da06de7ecdbf49c6d4feae/coverage-7.13.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:f70c9ab2595c56f81a89620e22899eea8b212a4041bd728ac6f4a28bf5d3ddd0", size = 261604, upload-time = "2026-03-17T10:31:53.872Z" }, + { url = "https://files.pythonhosted.org/packages/40/48/097cdc3db342f34006a308ab41c3a7c11c3f0d84750d340f45d88a782e00/coverage-7.13.5-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:084b84a8c63e8d6fc7e3931b316a9bcafca1458d753c539db82d31ed20091a87", size = 265321, upload-time = "2026-03-17T10:31:55.997Z" }, + { url = "https://files.pythonhosted.org/packages/bb/1f/4994af354689e14fd03a75f8ec85a9a68d94e0188bbdab3fc1516b55e512/coverage-7.13.5-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad14385487393e386e2ea988b09d62dd42c397662ac2dabc3832d71253eee479", size = 260502, upload-time = "2026-03-17T10:31:58.308Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/c6/9bb9ef55903e628033560885f5c31aa227e46878118b63ab15dc7ba87797/coverage-7.13.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7f2c47b36fe7709a6e83bfadf4eefb90bd25fbe4014d715224c4316f808e59a2", size = 262688, upload-time = "2026-03-17T10:32:00.141Z" }, + { url = "https://files.pythonhosted.org/packages/14/4f/f5df9007e50b15e53e01edea486814783a7f019893733d9e4d6caad75557/coverage-7.13.5-cp313-cp313t-win32.whl", hash = "sha256:67e9bc5449801fad0e5dff329499fb090ba4c5800b86805c80617b4e29809b2a", size = 222788, upload-time = "2026-03-17T10:32:02.246Z" }, + { url = "https://files.pythonhosted.org/packages/e1/98/aa7fccaa97d0f3192bec013c4e6fd6d294a6ed44b640e6bb61f479e00ed5/coverage-7.13.5-cp313-cp313t-win_amd64.whl", hash = "sha256:da86cdcf10d2519e10cabb8ac2de03da1bcb6e4853790b7fbd48523332e3a819", size = 223851, upload-time = "2026-03-17T10:32:04.416Z" }, + { url = "https://files.pythonhosted.org/packages/3d/8b/e5c469f7352651e5f013198e9e21f97510b23de957dd06a84071683b4b60/coverage-7.13.5-cp313-cp313t-win_arm64.whl", hash = "sha256:0ecf12ecb326fe2c339d93fc131816f3a7367d223db37817208905c89bded911", size = 222104, upload-time = "2026-03-17T10:32:06.65Z" }, + { url = "https://files.pythonhosted.org/packages/9e/ee/a4cf96b8ce1e566ed238f0659ac2d3f007ed1d14b181bcb684e19561a69a/coverage-7.13.5-py3-none-any.whl", hash = "sha256:34b02417cf070e173989b3db962f7ed56d2f644307b2cf9d5a0f258e13084a61", size = 211346, upload-time = "2026-03-17T10:33:15.691Z" }, +] + [[package]] name = "datetime" version = "6.0" @@ -610,6 +664,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, { url = 
"https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, @@ -617,6 +672,7 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, { url = 
"https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, @@ -1855,6 +1911,7 @@ dev = [ { name = "jupyter-book" }, { name = "mystmd" }, { name = "pytest" }, + { name = "pytest-cov" }, { name = "quantile-forest" }, { name = "ruff" }, { name = "tabulate" }, @@ -1900,6 +1957,7 @@ dev = [ { name = "jupyter-book" }, { name = "mystmd", specifier = ">=1.7.0" }, { name = "pytest" }, + { name = "pytest-cov" }, { name = "quantile-forest" }, { name = "ruff", specifier = ">=0.9.0" }, { name = "tabulate" }, @@ -2149,6 +2207,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, ] +[[package]] +name = "pytest-cov" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "coverage" }, + { name = "pluggy" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/51/a849f96e117386044471c8ec2bd6cfebacda285da9525c9106aeb28da671/pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2", size = 55592, upload-time = "2026-03-21T20:11:16.284Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/7a/d968e294073affff457b041c2be9868a40c1c71f4a35fcc1e45e5493067b/pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678", size = 22876, upload-time = "2026-03-21T20:11:14.438Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0"