From f4675e57bdc017b92771913e01706894eb43675a Mon Sep 17 00:00:00 2001 From: Alireza Hajebrahimi <6937697+iarata@users.noreply.github.com> Date: Thu, 26 Mar 2026 13:00:44 +0100 Subject: [PATCH] Refactored v0.2.0 --- .github/workflows/ci.yml | 29 + .github/workflows/release.yml | 132 ++ CHANGELOG.md | 32 +- README.md | 214 +-- docs/_static/.gitkeep | 1 + docs/_templates/.gitkeep | 1 + docs/api.rst | 142 ++ docs/architecture.rst | 13 + docs/cli.rst | 13 + docs/conf.py | 24 + docs/index.rst | 11 + docs/overview.rst | 8 + examples/demo.ipynb | 634 ++------ examples/disc.py | 97 +- examples/generated_types.py | 332 ++-- examples/main.py | 80 +- pyproject.toml | 29 +- src/carp/__init__.py | 22 +- src/carp/cli.py | 137 +- src/carp/commandline/__init__.py | 1 + src/carp/commandline/app.py | 56 + src/carp/commandline/common.py | 48 + src/carp/commandline/convert.py | 28 + src/carp/commandline/count.py | 23 + src/carp/commandline/export.py | 54 + src/carp/commandline/participants.py | 22 + src/carp/commandline/schema.py | 22 + src/carp/constants.py | 7 + src/carp/core/__init__.py | 18 + src/carp/core/dependencies.py | 33 + src/carp/core/fields.py | 56 + src/carp/core/files.py | 53 + src/carp/core/models.py | 28 + src/carp/core/naming.py | 20 + src/carp/export/__init__.py | 5 + src/carp/export/service.py | 93 ++ src/carp/frames/__init__.py | 5 + src/carp/frames/service.py | 139 ++ src/carp/participants/__init__.py | 7 + src/carp/participants/directory.py | 152 ++ src/carp/participants/parser.py | 78 + src/carp/participants/service.py | 51 + src/carp/participants/view.py | 105 ++ src/carp/plotting/__init__.py | 11 +- src/carp/plotting/map_viz.py | 416 ----- src/carp/plotting/prepare.py | 81 + src/carp/plotting/render.py | 56 + src/carp/plotting/service.py | 130 ++ src/carp/reader.py | 1417 ----------------- src/carp/records/__init__.py | 5 + src/carp/records/service.py | 81 + src/carp/schema/__init__.py | 5 + src/carp/schema/service.py | 30 + src/carp/study.py | 47 + 
src/carp/types/__init__.py | 5 + src/carp/types/infer.py | 64 + src/carp/types/render.py | 97 ++ src/carp/types/service.py | 28 + tests/conftest.py | 33 + .../multi_phase/phase_a/data-streams.json | 109 ++ .../multi_phase/phase_a/participant-data.json | 64 + .../multi_phase/phase_b/data-streams.json | 133 ++ .../multi_phase/phase_b/participant-data.json | 46 + tests/test_cli.py | 57 + tests/test_core.py | 55 + tests/test_edge_frames_plotting.py | 84 + tests/test_edge_types_cli.py | 103 ++ tests/test_export.py | 24 + tests/test_frames.py | 29 + tests/test_participants.py | 42 + tests/test_real_data.py | 23 + tests/test_records_schema.py | 29 + tests/test_structure.py | 22 + tests/test_types_plotting.py | 44 + uv.lock | 272 ++++ 75 files changed, 3700 insertions(+), 2997 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/release.yml create mode 100644 docs/_static/.gitkeep create mode 100644 docs/_templates/.gitkeep create mode 100644 docs/api.rst create mode 100644 docs/architecture.rst create mode 100644 docs/cli.rst create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/overview.rst create mode 100644 src/carp/commandline/__init__.py create mode 100644 src/carp/commandline/app.py create mode 100644 src/carp/commandline/common.py create mode 100644 src/carp/commandline/convert.py create mode 100644 src/carp/commandline/count.py create mode 100644 src/carp/commandline/export.py create mode 100644 src/carp/commandline/participants.py create mode 100644 src/carp/commandline/schema.py create mode 100644 src/carp/constants.py create mode 100644 src/carp/core/__init__.py create mode 100644 src/carp/core/dependencies.py create mode 100644 src/carp/core/fields.py create mode 100644 src/carp/core/files.py create mode 100644 src/carp/core/models.py create mode 100644 src/carp/core/naming.py create mode 100644 src/carp/export/__init__.py create mode 100644 src/carp/export/service.py create mode 
100644 src/carp/frames/__init__.py create mode 100644 src/carp/frames/service.py create mode 100644 src/carp/participants/__init__.py create mode 100644 src/carp/participants/directory.py create mode 100644 src/carp/participants/parser.py create mode 100644 src/carp/participants/service.py create mode 100644 src/carp/participants/view.py delete mode 100644 src/carp/plotting/map_viz.py create mode 100644 src/carp/plotting/prepare.py create mode 100644 src/carp/plotting/render.py create mode 100644 src/carp/plotting/service.py delete mode 100644 src/carp/reader.py create mode 100644 src/carp/records/__init__.py create mode 100644 src/carp/records/service.py create mode 100644 src/carp/schema/__init__.py create mode 100644 src/carp/schema/service.py create mode 100644 src/carp/study.py create mode 100644 src/carp/types/__init__.py create mode 100644 src/carp/types/infer.py create mode 100644 src/carp/types/render.py create mode 100644 src/carp/types/service.py create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/multi_phase/phase_a/data-streams.json create mode 100644 tests/fixtures/multi_phase/phase_a/participant-data.json create mode 100644 tests/fixtures/multi_phase/phase_b/data-streams.json create mode 100644 tests/fixtures/multi_phase/phase_b/participant-data.json create mode 100644 tests/test_cli.py create mode 100644 tests/test_core.py create mode 100644 tests/test_edge_frames_plotting.py create mode 100644 tests/test_edge_types_cli.py create mode 100644 tests/test_export.py create mode 100644 tests/test_frames.py create mode 100644 tests/test_participants.py create mode 100644 tests/test_real_data.py create mode 100644 tests/test_records_schema.py create mode 100644 tests/test_structure.py create mode 100644 tests/test_types_plotting.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..5546673 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,29 @@ +name: ci + +on: + push: + branches: + 
- "**" + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e . + python -m pip install pytest pytest-cov mypy ruff sphinx sphinx-rtd-theme pandas pyarrow folium matplotlib + - name: Lint + run: ruff check src examples tests docs + - name: Type check + run: mypy src/carp + - name: Test + run: pytest --cov=src/carp --cov-branch --cov-fail-under=100 + - name: Build docs + run: sphinx-build -b html docs docs/_build/html diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..05eebb6 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,132 @@ +name: release + +on: + push: + tags: + - "**" + +concurrency: + group: release-${{ github.ref }} + cancel-in-progress: false + +jobs: + validate_tag: + runs-on: ubuntu-latest + outputs: + version: ${{ steps.version.outputs.version }} + tag: ${{ steps.version.outputs.tag }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - id: version + name: Validate tag against package version + run: | + version=$(python - <<'PY' + import pathlib + import tomllib + + project = tomllib.loads(pathlib.Path("pyproject.toml").read_text()) + print(project["project"]["version"]) + PY + ) + tag="${GITHUB_REF_NAME}" + if [ "${tag}" != "${version}" ] && [ "${tag}" != "v${version}" ]; then + echo "Tag ${tag} does not match package version ${version}." 
>&2 + exit 1 + fi + echo "version=${version}" >> "${GITHUB_OUTPUT}" + echo "tag=${tag}" >> "${GITHUB_OUTPUT}" + + test: + needs: validate_tag + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e . + python -m pip install pytest pytest-cov pandas pyarrow folium matplotlib + - name: Run tests + run: pytest --cov=src/carp --cov-branch --cov-fail-under=100 + + quality: + needs: validate_tag + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e . + python -m pip install pytest pytest-cov mypy ruff sphinx sphinx-rtd-theme pandas pyarrow folium matplotlib + - name: Lint + run: ruff check src examples tests docs + - name: Type check + run: mypy src/carp + - name: Build docs + run: sphinx-build -W -b html docs docs/_build/html + + build: + needs: [test, quality] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Build distributions + run: | + python -m pip install --upgrade pip + python -m pip install build twine + python -m build + python -m twine check dist/* + - uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ + + publish_pypi: + needs: build + runs-on: ubuntu-latest + environment: + name: pypi + permissions: + id-token: write + steps: + - uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - uses: pypa/gh-action-pypi-publish@release/v1 + + publish_github: + needs: [validate_tag, publish_pypi] + runs-on: ubuntu-latest + 
permissions: + contents: write + steps: + - uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Generate checksums + run: shasum -a 256 dist/* > dist/SHA256SUMS.txt + - uses: softprops/action-gh-release@v2 + with: + name: Release ${{ needs.validate_tag.outputs.tag }} + tag_name: ${{ needs.validate_tag.outputs.tag }} + generate_release_notes: true + files: dist/* diff --git a/CHANGELOG.md b/CHANGELOG.md index 681e02e..7932cbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,35 @@ # Changelog -All notable changes to this project will be documented in this file. +## [0.2.0] - 2026-03-26 -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +### Added + +- New `CarpStudy` public API as the primary entrypoint for CARP study analysis +- Modular service layout under `carp.core`, `participants`, `records`, `schema`, `export`, `frames`, `types`, `plotting`, and `commandline` +- Self-contained pytest suite with committed multi-phase fixtures and optional `sleep-data` smoke coverage +- 100% line and branch coverage enforcement for `src/carp` +- Sphinx documentation site with autodoc and Napoleon support +- GitHub Actions CI for linting, type-checking, tests, and docs builds +- Tag-driven CD workflow that validates version tags, publishes to PyPI, and creates GitHub releases +- Dedicated `test` and `docs` dependency groups + +### Changed + +- Replaced the legacy method-heavy design with a thin `CarpStudy` composition root and focused services +- Kept the `carp` CLI command set stable while rewriting the implementation behind modular handlers +- Switched plotting defaults to `dk.cachet.carp.location` +- Made parquet filenames namespace-aware to avoid same-name type collisions +- Added Google-style docstrings and expanded type annotations across the package +- Refreshed the README, example scripts, 
generated type example, and notebook to use the new API +- Normalized Ruff, MyPy, coverage, and documentation build configuration in `pyproject.toml` + +### Removed -## [Unreleased] +- Legacy `carp.reader` monolith +- Legacy `carp.plotting.map_viz` module +- Old `CarpDataStream`-centric example usage and stale plotting/type-generation references -## [0.1.0] - 2024-12-02 +## [0.1.0] ### Added diff --git a/README.md b/README.md index 87a5fb1..574cfb4 100644 --- a/README.md +++ b/README.md @@ -4,202 +4,76 @@ [![Python versions](https://img.shields.io/pypi/pyversions/carp-analytics-python.svg)](https://pypi.org/project/carp-analytics-python/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -A high-performance Python library for processing and analysing data from [CARP](https://carp.computerome.dk/) (Copenhagen Research Platform) studies. - -> [!BETA] -> The CARP package is at the beta phase and the APIs and methods might change more often. 
- -## Features - -- **Schema Discovery**: Automatically scans and infers the schema of the data -- **Data Grouping**: Efficiently groups data by any field (e.g., data type, device ID) into separate files -- **Parquet Export**: Convert JSON data to Parquet for faster subsequent analysis -- **Participant Management**: Link and track participants across multiple study phases -- **Visualization**: Generate location heatmaps and other visualizations -- **Pandas Integration**: Seamlessly work with DataFrames - -## Installation - -```bash -pip install carp-analytics-python -``` - -### With Optional Dependencies - -```bash -# For pandas/parquet support -pip install carp-analytics-python[pandas] - -# For visualization support -pip install carp-analytics-python[viz] - -# For scientific computing (numpy, scipy, scikit-learn) -pip install carp-analytics-python[science] - -# Install everything -pip install carp-analytics-python[all] -``` - -### Development Installation - -```bash -git clone https://github.com/carp-dk/carp-analytics-python.git -cd carp-analytics-python - -# Using uv (recommended) -uv sync - -# Or using pip -pip install -e . -``` +`carp-analytics-python` is a Python library for working with CARP study data. It focuses on streaming JSON records, participant lookup, schema discovery, export, parquet conversion, and optional plotting. 
## Quick Start ```python -from carp import CarpDataStream +from carp import CarpStudy -# Initialize with a data file -data = CarpDataStream("data/study-phase-1/data-streams.json") - -# Scan and print the schema -data.print_schema() - -# Convert to Parquet for faster analysis -data.convert_to_parquet("output_parquet") - -# Load data as a DataFrame -df = data.get_dataframe("dk.cachet.carp.stepcount", "output_parquet") -print(df.head()) +study = CarpStudy("sleep-data/phase-1-1/data-streams.json") +print(study.records.count()) +print(study.participants.summary_rows()[0]) ``` -## Working with Participants - -```python -from carp import CarpDataStream - -# Load data from multiple phases -data = CarpDataStream([ - "data/phase-1/data-streams.json", - "data/phase-2/data-streams.json", -]) - -# Print participant summary -data.print_participants() - -# Access participant data via email -participant = data.participant("user@example.com") - -# Get participant info -print(participant.info()) - -# Get available data types for this participant -participant.print_data_types() - -# Get a DataFrame of step count data -df = participant.dataframe("dk.cachet.carp.stepcount", "output_parquet") -``` +## Main API -## Data Export +`CarpStudy` is the primary entrypoint. 
```python -# Export specific data type to JSON -data.export_to_json("heartbeat_data.json", data_type="dk.cachet.carp.heartbeat") +from carp import CarpStudy -# Group data by data type -data.group_by_field("dataStream.dataType.name", "output_by_type") +study = CarpStudy([ + "sleep-data/phase-1-1/data-streams.json", + "sleep-data/phase-2-1/data-streams.json", +]) -# Group data by participant -data.group_by_participant("output_by_participant") +study.schema.scan() +study.export.export_json("output.json", data_type="dk.cachet.carp.stepcount") +study.frames.convert_to_parquet("output_parquet") +study.participant("alice@example.com").info() ``` -## Visualization +## CLI -```python -# Generate location heatmap for a participant -participant = data.participant("user@example.com") -participant.visualize.location(output_file="user_locations.html") +```bash +carp schema sleep-data/phase-1-1/data-streams.json +carp count sleep-data/phase-1-1/data-streams.json +carp participants sleep-data/phase-1-1/data-streams.json +carp export sleep-data/phase-1-1/data-streams.json -o output.json -t dk.cachet.carp.stepcount +carp group sleep-data/phase-1-1/data-streams.json -o grouped_output +carp convert sleep-data/phase-1-1/data-streams.json -o output_parquet ``` -## Command Line Interface +## Documentation -The package includes a CLI for common operations: +The docs are built with Sphinx, `autodoc`, and `napoleon`. 
```bash -# Show schema of data files -carp schema data/study/data-streams.json - -# Convert JSON to Parquet -carp convert data/study/data-streams.json -o output_parquet - -# Count items in data files -carp count data/study/data-streams.json - -# List participants -carp participants data/study/data-streams.json - -# Export filtered data -carp export data/study/data-streams.json -o output.json -t dk.cachet.carp.stepcount - -# Group data by field -carp group data/study/data-streams.json -f dataStream.dataType.name -o grouped_output +python -m pip install sphinx sphinx-rtd-theme +sphinx-build -b html docs docs/_build/html ``` -## API Reference - -### `CarpDataStream` - -The main class for working with CARP data streams. - -| Method | Description | -|--------|-------------| -| `scan_schema()` | Scan and infer the data schema | -| `print_schema()` | Print the inferred schema as a table | -| `convert_to_parquet(output_dir)` | Convert JSON to Parquet files | -| `get_dataframe(data_type, parquet_dir)` | Load data as a pandas DataFrame | -| `export_to_json(output_path, data_type)` | Export data to JSON file | -| `group_by_field(field_path, output_dir)` | Group data by a specific field | -| `participant(email)` | Access participant data via fluent API | -| `print_participants()` | Print participant summary table | - -### `ParticipantAccessor` - -Fluent API for accessing individual participant data. +## Release Automation -| Method | Description | -|--------|-------------| -| `info()` | Get participant information as a dictionary | -| `print_info()` | Print participant info as a table | -| `all_data(data_type)` | Generator for all participant data | -| `data_types()` | Get all unique data types | -| `dataframe(data_type, parquet_dir)` | Get data as a pandas DataFrame | -| `visualize.location()` | Generate location heatmap | +Pushing a new version tag triggers the release workflow. The tag must match the +package version in `pyproject.toml` as either `0.1.0` or `v0.1.0`. 
-## Requirements +The release workflow reruns tests, linting, type checks, docs builds, and +package builds before it publishes the distributions to PyPI and attaches the +same artifacts to a GitHub release. -- Python 3.10+ -- ijson (for streaming JSON parsing) -- rich (for terminal output) -- tqdm (for progress bars) +PyPI publishing uses GitHub Actions trusted publishing. Configure a trusted +publisher on PyPI for this repository and the `release` workflow, with the +`pypi` environment enabled in GitHub. -Optional: -- pandas, pyarrow (for DataFrame and Parquet support) -- matplotlib, folium (for visualization) -- numpy, scipy, scikit-learn (for scientific computing) +## Examples -## Contributing - -Contributions are welcome! Please feel free to submit a Pull Request. - -1. Fork the repository -2. Create your feature branch (`git checkout -b feature/featA`) -3. Commit your changes (`git commit -m 'Add some featA'`) -4. Push to the branch (`git push origin feature/featA`) -5. Open a Pull Request - -## Licence - -This project is licensed under the MIT Licence - see the [Licence](LICENSE) file for details. +```bash +python examples/main.py sleep-data/phase-1-1/data-streams.json +python examples/disc.py sleep-data/phase-1-1/data-streams.json +``` -## Acknowledgments +## Optional Dependencies -- [CARP - Copenhagen Research Platform](https://carp.dk/) +`pandas` and `pyarrow` enable dataframe and parquet support. `folium` enables plotting. diff --git a/docs/_static/.gitkeep b/docs/_static/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/docs/_static/.gitkeep @@ -0,0 +1 @@ + diff --git a/docs/_templates/.gitkeep b/docs/_templates/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/docs/_templates/.gitkeep @@ -0,0 +1 @@ + diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..45844b3 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,142 @@ +API Reference +============= + +.. 
automodule:: carp + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.study + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.core.models + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.core.fields + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.core.files + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.core.naming + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.core.dependencies + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.participants.parser + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.participants.directory + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.participants.view + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.participants.service + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.records.service + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.schema.service + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.export.service + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.frames.service + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.types.infer + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.types.render + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.types.service + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.plotting.prepare + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.plotting.render + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.plotting.service + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.commandline.app + :members: + :undoc-members: + :show-inheritance: + +.. 
automodule:: carp.commandline.common + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.commandline.schema + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.commandline.count + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.commandline.participants + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.commandline.export + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: carp.commandline.convert + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/architecture.rst b/docs/architecture.rst new file mode 100644 index 0000000..4500287 --- /dev/null +++ b/docs/architecture.rst @@ -0,0 +1,13 @@ +Architecture +============ + +The package is intentionally split into small services: + +* ``carp.study`` composes the public `CarpStudy` entrypoint. +* ``carp.participants`` handles participant parsing and lookup. +* ``carp.records`` streams and filters JSON records. +* ``carp.schema`` infers measurement schemas. +* ``carp.export`` writes JSON output and grouped files. +* ``carp.frames`` loads pandas dataframes and writes parquet files. +* ``carp.types`` generates dataclasses from sampled records. +* ``carp.plotting`` renders HTML maps for participant data. diff --git a/docs/cli.rst b/docs/cli.rst new file mode 100644 index 0000000..e15895a --- /dev/null +++ b/docs/cli.rst @@ -0,0 +1,13 @@ +CLI +=== + +The command line interface exposes the same core flows as the Python API. + +.. 
code-block:: bash + + carp schema sleep-data/phase-1-1/data-streams.json + carp count sleep-data/phase-1-1/data-streams.json + carp participants sleep-data/phase-1-1/data-streams.json + carp export sleep-data/phase-1-1/data-streams.json -o output.json -t dk.cachet.carp.stepcount + carp group sleep-data/phase-1-1/data-streams.json -o grouped_output + carp convert sleep-data/phase-1-1/data-streams.json -o output_parquet diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..ab221f3 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,24 @@ +"""Sphinx configuration for CARP Analytics.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +project_root = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(project_root / "src")) + +project = "CARP Analytics Python" +author = "CARP Team" +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", +] +autodoc_typehints = "description" +napoleon_google_docstring = True +napoleon_numpy_docstring = False +templates_path = ["_templates"] +exclude_patterns = ["_build"] +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..e024384 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,11 @@ +CARP Analytics Python +===================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents + + overview + api + cli + architecture diff --git a/docs/overview.rst b/docs/overview.rst new file mode 100644 index 0000000..ce5e00c --- /dev/null +++ b/docs/overview.rst @@ -0,0 +1,8 @@ +Overview +======== + +`carp-analytics-python` is built around :class:`carp.study.CarpStudy`. +It provides services for records, participants, schema discovery, export, +dataframe conversion, type generation, and plotting. + +The package is documented with Google-style docstrings and Sphinx autodoc. 
diff --git a/examples/demo.ipynb b/examples/demo.ipynb index a6bfc40..a8b8e60 100644 --- a/examples/demo.ipynb +++ b/examples/demo.ipynb @@ -1,504 +1,134 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "c33366d9", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "from pathlib import Path\n", - "\n", - "if \"src\" not in sys.path:\n", - " sys.path.append(str(Path.cwd() / \"src\"))\n", - "\n", - "from sleepiness import SleepinessData\n", - "\n", - "file_paths = [\n", - " \"sleep-data/phase-1-1/data-streams.json\",\n", - " \"sleep-data/phase-2-1/data-streams.json\",\n", - " \"sleep-data/phase-3-1/data-streams.json\"\n", - "]\n", - "# OR\n", - "# file_paths = \"data/phase-1-1/data-streams.json\"\n", - "\n", - "sd = SleepinessData(file_paths)" - ] - }, - { - "cell_type": "markdown", - "id": "52dc794a", - "metadata": {}, - "source": [ - "## Participant Data Integration\n", - "When loading multiple data folders, the library automatically loads `participant-data.json` from each folder and unifies participants across folders (using email/SSN as identifiers)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18e00b45", - "metadata": {}, - "outputs": [], - "source": [ - "# View all participants across all loaded data folders\n", - "sd.print_participants()" - ] - }, - { - "cell_type": "markdown", - "id": "914c2bed", - "metadata": {}, - "source": [ - "### Data with Participant Info\n", - "Iterate through data items enriched with participant information:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d35b3bd5", - "metadata": {}, - "outputs": [], - "source": [ - "# Get participant info for a specific deployment\n", - "# for item in sd._get_item_generator():\n", - "# deployment_id = item.get('studyDeploymentId')\n", - "# if deployment_id:\n", - "# participant = sd.get_participant(deployment_id)\n", - "# if participant:\n", - "# print(f\"Deployment: {deployment_id[:30]}...\")\n", - "# print(f\" Unified ID: {participant.unified_participant_id}\")\n", - "# print(f\" Email: {participant.email}\")\n", - "# print(f\" Source folder: {participant.source_folder}\")\n", - "# break\n", - "\n", - "# Get participant info\n", - "sd.participant(\"test@example.com\").info()\n", - "sd.participant(\"test@example.com\").print_info()\n", - "\n", - "# Get all data for this participant\n", - "count = 0\n", - "for item in sd.participant(\"test@example.com\").all_data():\n", - " print(item)\n", - " count += 1\n", - " if count >= 5:\n", - " print(\"Limit output for demo\")\n", - " break\n", - "\n", - "# Filter by data type\n", - "for item in sd.participant(\"test@example.com\").all_data(\"dk.cachet.carp.location\"):\n", - " print(item)\n", - "\n", - "# See available fields\n", - "sd.participant(\"test@example.com\").available_fields()\n", - "sd.participant(\"test@example.com\").print_available_fields()\n", - "\n", - "# See data types available\n", - "sd.participant(\"test@example.com\").data_types()\n", - "sd.participant(\"test@example.com\").print_data_types()\n", - "\n", - "# Get count\n", - 
"sd.participant(\"test@example.com\").count()\n", - "\n", - "# Get DataFrame\n", - "df = sd.participant(\"test@example.com\").dataframe(\"dk.cachet.carp.stepcount\")\n", - "\n", - "# Check if exists\n", - "sd.participant(\"test@example.com\").exists" - ] - }, - { - "cell_type": "markdown", - "id": "6145e273", - "metadata": {}, - "source": [ - "### DataFrame with Participant Info\n", - "Get a DataFrame enriched with participant columns:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "507cb8e8", - "metadata": {}, - "outputs": [], - "source": [ - "# Get DataFrame with participant columns\n", - "df = sd.get_dataframe_with_participants(\"dk.cachet.carp.stepcount\")\n", - "if df is not None and not df.empty:\n", - " print(df[['participant_id', 'participant_email', 'participant_folder']].head())" - ] - }, - { - "cell_type": "markdown", - "id": "d158d50b", - "metadata": {}, - "source": [ - "### Visualize Participant Data on Map\n", - "Generate a heatmap aggregating data for a specific participant across all their deployments:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "761607c2", - "metadata": {}, - "outputs": [], - "source": [ - "from sleepiness.plotting import LocationVisualizer\n", - "\n", - "# Create visualizer\n", - "viz = LocationVisualizer(sd)\n", - "\n", - "# Plot heatmap for a specific participant (e.g., P0002 who appears in all 3 phases)\n", - "viz.plot_participant_heatmap(\n", - " unified_participant_id=\"P0002\", # Choose a participant from the summary table\n", - " output_file=\"participant_heatmap.html\",\n", - " location_type=\"dk.cachet.carp.location\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f9477bd0", - "metadata": {}, - "source": [ - "## 1. Schema Discovery\n", - "Scan the file to understand the structure of the data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5877bce4", - "metadata": {}, - "outputs": [], - "source": [ - "sd.print_schema()" - ] - }, - { - "cell_type": "markdown", - "id": "77f655a6", - "metadata": {}, - "source": [ - "### Generate Type Definitions\n", - "You can generate a Python module with dataclasses representing the data schema. This allows for type-safe access to the data, including nested JSON objects." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6bbdbd5", - "metadata": {}, - "outputs": [], - "source": [ - "import importlib\n", - "import sleepiness.reader\n", - "importlib.reload(sleepiness.reader)\n", - "\n", - "# Re-initialize sd to ensure latest code is used\n", - "sd = sleepiness.reader.SleepinessData(file_paths)\n", - "sd.generate_type_definitions(output_file=\"generated_types.py\", sample_size=500)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2fd20bb", - "metadata": {}, - "outputs": [], - "source": [ - "# Example usage of generated types\n", - "try:\n", - " import generated_types\n", - " import importlib\n", - " importlib.reload(generated_types)\n", - " \n", - " # Read one item and convert\n", - " gen = sd._get_item_generator()\n", - " item = next(gen)\n", - " \n", - " obj = generated_types.SleepinessItem.from_dict(item)\n", - " print(f\"Converted object type: {type(obj)}\")\n", - " if obj.dataStream and obj.dataStream.dataType:\n", - " print(f\"Data Stream: {obj.dataStream.dataType.name}\")\n", - "except ImportError:\n", - " print(\"Could not import generated_types. 
Please restart kernel or check file.\")\n", - "except Exception as e:\n", - " print(f\"Error: {e}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62ff430f", - "metadata": {}, - "outputs": [], - "source": [ - "item = next(sd._get_item_generator())\n", - "obj = generated_types.SleepinessItem.from_dict(item)\n", - "sd.generate_type_definitions(output_file=\"generated_types.py\", sample_size=500)\n", - "\n", - "item = next(sd._get_item_generator())\n", - "obj = generated_types.SleepinessItem.from_dict(item)\n", - "\n", - "# Type-safe access\n", - "print(obj.dataStream.dataType.name)" - ] - }, - { - "cell_type": "markdown", - "id": "f243a62f", - "metadata": {}, - "source": [ - "## 2. Count Items\n", - "Count the total number of records in the file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "814969c3", - "metadata": {}, - "outputs": [], - "source": [ - "count = sd.count_items()\n", - "print(f\"Total items: {count}\")" - ] - }, - { - "cell_type": "markdown", - "id": "3f357eba", - "metadata": {}, - "source": [ - "## 3. Grouping Data\n", - "Split the large JSON file into smaller files based on the data type." - ] - }, - { - "cell_type": "markdown", - "id": "0e151d64", - "metadata": {}, - "source": [ - "### Explore Available Fields\n", - "You can scan a sample of the data to list all available fields in dot-notation. This is helpful for deciding which field to group by." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c3c6be4", - "metadata": {}, - "outputs": [], - "source": [ - "fields = sd.list_all_fields(sample_size=500)\n", - "print(\"Available fields for grouping:\")\n", - "for f in fields:\n", - " print(f\" - {f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56ffebb8", - "metadata": {}, - "outputs": [], - "source": [ - "output_groups = \"output_groups\"\n", - "# sd.group_by_field(\"dataStream.studyDeploymentId\", output_groups)\n", - "sd.group_by_email(output_groups)" - ] - }, - { - "cell_type": "markdown", - "id": "3f9f3497", - "metadata": {}, - "source": [ - "## 4. Export to JSON\n", - "Export a specific data type to a separate JSON file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e9aa571", - "metadata": {}, - "outputs": [], - "source": [ - "sd.export_to_json(\"heartbeat.json\", data_type=\"dk.cachet.carp.heartbeat\")" - ] - }, - { - "cell_type": "markdown", - "id": "fc1c9eb8", - "metadata": {}, - "source": [ - "## 5. Convert to Parquet\n", - "Convert the data to Parquet format for efficient storage and loading." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a648c91", - "metadata": {}, - "outputs": [], - "source": [ - "parquet_dir = \"output_parquet\"\n", - "sd.convert_to_parquet(parquet_dir)" - ] - }, - { - "cell_type": "markdown", - "id": "b5f02117", - "metadata": {}, - "source": [ - "## 6. Load DataFrame\n", - "Load data into a pandas DataFrame, utilizing the Parquet files if available." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d9112e1", - "metadata": {}, - "outputs": [], - "source": [ - "# Load stepcount data\n", - "df = sd.get_dataframe(\"dk.cachet.carp.completedtask\", parquet_dir)\n", - "\n", - "if df is not None:\n", - " print(f\"Loaded {len(df)} records\")\n", - " display(df.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7472a455", - "metadata": {}, - "outputs": [], - "source": [ - "# df first row\n", - "df.iloc[313].measurement" - ] - }, - { - "cell_type": "markdown", - "id": "b10095ea", - "metadata": {}, - "source": [ - "## 7. Plotting\n", - "Generate a heatmap of user locations and overlay step count data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7d3b527", - "metadata": {}, - "outputs": [], - "source": [ - "from sleepiness.plotting import LocationVisualizer\n", - "\n", - "# Initialize visualizer\n", - "viz = LocationVisualizer(sd)\n", - "\n", - "# Pick a user ID (you can find one from the grouping step or list_all_fields)\n", - "# For demo purposes, let's try to find a valid ID from the loaded dataframe if available, \n", - "# or just use a hardcoded one if you know it.\n", - "study_deployment_id = \"0efd5a7f-6428-48db-8099-8d65a62606b4\" # Example ID\n", - "\n", - "# Generate heatmap\n", - "# Note: Ensure you have 'dk.cachet.carp.geolocation' and 'dk.cachet.carp.stepcount' data available\n", - "# You might need to run convert_to_parquet first if you haven't.\n", - "\n", - "\n", - "viz.plot_user_heatmap(\n", - " study_deployment_id=study_deployment_id,\n", - " location_type=\"dk.cachet.carp.location\", # Adjust type name if different\n", - " step_type=\"dk.cachet.carp.stepcount\", # Adjust type name if different\n", - " output_file=\"user_heatmap.html\"\n", - ")\n", - "\n", - "# Display the map in the notebook\n", - "# from IPython.display import IFrame\n", - "# IFrame(src='user_heatmap.html', width=700, height=600)" - ] - }, - { - 
"cell_type": "markdown", - "id": "63223a42", - "metadata": {}, - "source": [ - "### Plotting with Type-Safe Objects\n", - "You can also convert the data to type-safe objects and pass them directly to the visualizer. This is useful if you want to manipulate the objects before plotting." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "997894b1", - "metadata": {}, - "outputs": [], - "source": [ - "# 1. Get DataFrames\n", - "df_loc = sd.get_dataframe(\"dk.cachet.carp.location\", parquet_dir)\n", - "df_steps = sd.get_dataframe(\"dk.cachet.carp.stepcount\", parquet_dir)\n", - "\n", - "# 2. Filter by User\n", - "# Using the same ID as above\n", - "if df_loc is not None and not df_loc.empty:\n", - " df_loc_user = df_loc[df_loc['studyDeploymentId'] == study_deployment_id]\n", - " df_steps_user = df_steps[df_steps['studyDeploymentId'] == study_deployment_id] if df_steps is not None else pd.DataFrame()\n", - "\n", - " # 3. Convert to Objects\n", - " # Note: generated_types.SleepinessItem.from_dict expects a dictionary structure matching the JSON.\n", - " # If df_loc comes from Parquet, it might have nested columns as dicts (if read correctly) or flat columns.\n", - " # Let's assume it has nested columns or we convert it.\n", - " \n", - " # If the dataframe has nested dicts (e.g. 'measurement' column contains dicts):\n", - " location_items = [generated_types.SleepinessItem.from_dict(row) for row in df_loc_user.to_dict('records')]\n", - " step_items = [generated_types.SleepinessItem.from_dict(row) for row in df_steps_user.to_dict('records')]\n", - " \n", - " print(f\"Converted {len(location_items)} location items and {len(step_items)} step items.\")\n", - "\n", - " # 4. 
Plot\n", - " viz.plot_heatmap_from_items(\n", - " location_items=location_items,\n", - " step_items=step_items,\n", - " output_file=\"user_heatmap_objects.html\"\n", - " )\n", - " \n", - " # Display\n", - " # IFrame(src='user_heatmap_objects.html', width=700, height=600)\n", - "else:\n", - " print(\"No data found to plot.\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "carp-analytics-python (3.13.5)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CARP Analytics Notebook Example\n", + "\n", + "This notebook shows the current `CarpStudy` API with the bundled `sleep-data` dataset or the committed test fixtures." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "if \"src\" not in sys.path:\n", + " sys.path.append(str(Path.cwd() / \"src\"))\n", + "\n", + "from carp import CarpStudy\n", + "\n", + "\n", + "def default_paths() -> list[Path]:\n", + " sleep_paths = sorted(Path(\"sleep-data\").glob(\"phase-*/data-streams.json\"))\n", + " if sleep_paths:\n", + " return sleep_paths\n", + " return sorted(Path(\"tests/fixtures/multi_phase\").glob(\"*/data-streams.json\"))\n", + "\n", + "\n", + "file_paths = default_paths()\n", + "study = CarpStudy(file_paths)\n", + "file_paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Total records: {study.records.count():,}\")\n", + "print(f\"Data types: {study.records.data_types()}\")\n", + "study.schema.scan()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "participant_rows = study.participants.summary_rows()\n", + "participant_rows[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example_email = next((row[\"emails\"] for row in participant_rows if row[\"emails\"] != \"N/A\"), None)\n", + "participant = study.participant(example_email) if example_email else None\n", + "participant.info() if participant else None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " step_frame = study.frames.get_dataframe(\"dk.cachet.carp.stepcount\")\n", + " step_frame.head()\n", + "except RuntimeError as exc:\n", + " print(exc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_path = Path(\"examples/generated_types.py\")\n", + "study.types.generate(generated_path, 
sample_size=25)\n", + "\n", + "import generated_types\n", + "\n", + "first_record = next(study.records.iter_records())\n", + "generated_types.StudyItem.from_dict(first_record)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if participant is not None:\n", + " try:\n", + " participant.plot_location(output_file=\"examples/user_heatmap.html\")\n", + " except RuntimeError as exc:\n", + " print(exc)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/disc.py b/examples/disc.py index 4de8498..0832a0a 100644 --- a/examples/disc.py +++ b/examples/disc.py @@ -1,59 +1,38 @@ -# discover_schema.py -import ijson -from collections import defaultdict -import yaml - -def discover_schema(file_path): - schema = defaultdict(set) - - with open(file_path, 'rb') as f: - parser = ijson.parse(f) - current_path = [] - for prefix, event, value in parser: - current_path = prefix.split('.') - full_path = '.'.join(current_path) - - if event == 'map_key': - current_path.append(value) - continue - elif event in ('start_map', 'start_array'): - pass - elif event in ('end_map', 'end_array'): - if current_path: - current_path.pop() - continue - - # leaf value - if value is None: - type_name = 'null' - elif event == 'string': - type_name = 'string' - elif event in ('number', 'integer'): - type_name = 'number' - elif event == 'boolean': - type_name = 'boolean' - else: - type_name = event - - schema['.'.join(current_path)].add(type_name) - - # Convert to nice nested dict - nested = {} - for path, types in schema.items(): - parts = path.split('.') 
- d = nested - for part in parts[:-1]: - if part not in d: - d[part] = {'_type': 'object', '_children': {}} - elif '_children' not in d[part]: - d[part]['_children'] = {} - d = d[part]['_children'] - key = parts[-1] - d[key] = {'_type': list(types)} if len(types) > 1 else {'_type': list(types)[0]} - - return nested - -if __name__ == '__main__': - import sys - schema = discover_schema(sys.argv[1]) - print(yaml.dump(schema, default_flow_style=False, sort_keys=False)) \ No newline at end of file +"""Compact schema-discovery example for `CarpStudy`.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +from carp import CarpStudy + + +def _default_paths() -> list[Path]: + """Return bundled data-stream files for schema discovery.""" + + sleep_paths = sorted(Path("sleep-data").glob("phase-*/data-streams.json")) + if sleep_paths: + return sleep_paths + return sorted(Path("tests/fixtures/multi_phase").glob("*/data-streams.json")) + + +def main() -> int: + """Load a study and print schema and field examples.""" + + file_paths = [Path(arg) for arg in sys.argv[1:]] or _default_paths() + study = CarpStudy(file_paths, load_participants=False) + print("Observed data types:") + for data_type in study.records.data_types(): + print(f" - {data_type}") + print("\nSchema summary:") + for data_type, fields in study.schema.scan().items(): + print(f" {data_type}: {', '.join(fields)}") + print("\nSample field paths:") + for field in study.records.list_fields(sample_size=3)[:12]: + print(f" - {field}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/generated_types.py b/examples/generated_types.py index afd1dbe..7e8f3bf 100644 --- a/examples/generated_types.py +++ b/examples/generated_types.py @@ -1,250 +1,122 @@ -# Auto-generated type definitions +"""Example generated dataclasses for CARP study records.""" from __future__ import annotations -from dataclasses import dataclass -from typing import List, Optional, Any, 
Dict + import json +from dataclasses import dataclass +from typing import Any -def parse_json_field(value): - if isinstance(value, str): - try: - return json.loads(value) - except: - return value - return value - -@dataclass -class SleepinessItem: - sequenceId: int = None - studyDeploymentId: str = None - deviceRoleName: str = None - measurement: Measurement = None - triggerIds: List[int] = None - syncPoint: SyncPoint = None - dataStream: DataStream = None - @classmethod - def from_dict(cls, obj: Any) -> Any: - if not isinstance(obj, dict): return obj - instance = cls() - val = obj.get('sequenceId') - instance.sequenceId = val - val = obj.get('studyDeploymentId') - instance.studyDeploymentId = val - val = obj.get('deviceRoleName') - instance.deviceRoleName = val - val = obj.get('measurement') - if val is not None: - instance.measurement = Measurement.from_dict(val) - val = obj.get('triggerIds') - instance.triggerIds = val - val = obj.get('syncPoint') - if val is not None: - instance.syncPoint = SyncPoint.from_dict(val) - val = obj.get('dataStream') - if val is not None: - instance.dataStream = DataStream.from_dict(val) - return instance - -@dataclass -class Measurement: - sensorStartTime: int = None - data: Data = None +def parse_json_field(value: Any) -> Any: + """Parse JSON text when a field stores serialized payload data.""" + + if not isinstance(value, str): + return value + try: + return json.loads(value) + except json.JSONDecodeError: + return value + + +@dataclass(slots=True) +class DataType: + """Data-type metadata for one CARP record.""" + + namespace: str | None = None + name: str | None = None @classmethod def from_dict(cls, obj: Any) -> Any: - if not isinstance(obj, dict): return obj - instance = cls() - val = obj.get('sensorStartTime') - instance.sensorStartTime = val - val = obj.get('data') - if val is not None: - instance.data = Data.from_dict(val) - return instance - -@dataclass -class Data: - __type: str = None - period: int = None - deviceType: 
str = None - deviceRoleName: str = None - batteryLevel: int = None - batteryStatus: str = None - screenEvent: str = None - type_: str = None - confidence: int = None - triggerId: int = None - taskName: str = None - destinationDeviceRoleName: str = None - control: str = None - steps: int = None - time: str = None - speed: float = None - isMock: bool = None - heading: float = None - accuracy: float = None - altitude: float = None - latitude: float = None - longitude: float = None - speedAccuracy: float = None - headingAccuracy: float = None - verticalAccuracy: float = None - elapsedRealtimeNanos: int = None - elapsedRealtimeUncertaintyNanos: float = None - date: str = None - sunset: str = None - country: str = None - sunrise: str = None - tempMax: float = None - tempMin: float = None - areaName: str = None - humidity: float = None - pressure: float = None - windSpeed: float = None - cloudiness: float = None - windDegree: float = None - temperature: float = None - weatherMain: str = None - weatherDescription: str = None + """Build a data-type object from a dictionary.""" + + return obj if not isinstance(obj, dict) else cls(obj.get("namespace"), obj.get("name")) + + +@dataclass(slots=True) +class DataStream: + """Stream metadata attached to a CARP record.""" + + studyDeploymentId: str | None = None + deviceRoleName: str | None = None + dataType: DataType | None = None @classmethod def from_dict(cls, obj: Any) -> Any: - if not isinstance(obj, dict): return obj - instance = cls() - val = obj.get('__type') - instance.__type = val - val = obj.get('period') - instance.period = val - val = obj.get('deviceType') - instance.deviceType = val - val = obj.get('deviceRoleName') - instance.deviceRoleName = val - val = obj.get('batteryLevel') - instance.batteryLevel = val - val = obj.get('batteryStatus') - instance.batteryStatus = val - val = obj.get('screenEvent') - instance.screenEvent = val - val = obj.get('type') - instance.type_ = val - val = obj.get('confidence') - 
instance.confidence = val - val = obj.get('triggerId') - instance.triggerId = val - val = obj.get('taskName') - instance.taskName = val - val = obj.get('destinationDeviceRoleName') - instance.destinationDeviceRoleName = val - val = obj.get('control') - instance.control = val - val = obj.get('steps') - instance.steps = val - val = obj.get('time') - instance.time = val - val = obj.get('speed') - instance.speed = val - val = obj.get('isMock') - instance.isMock = val - val = obj.get('heading') - instance.heading = val - val = obj.get('accuracy') - instance.accuracy = val - val = obj.get('altitude') - instance.altitude = val - val = obj.get('latitude') - instance.latitude = val - val = obj.get('longitude') - instance.longitude = val - val = obj.get('speedAccuracy') - instance.speedAccuracy = val - val = obj.get('headingAccuracy') - instance.headingAccuracy = val - val = obj.get('verticalAccuracy') - instance.verticalAccuracy = val - val = obj.get('elapsedRealtimeNanos') - instance.elapsedRealtimeNanos = val - val = obj.get('elapsedRealtimeUncertaintyNanos') - instance.elapsedRealtimeUncertaintyNanos = val - val = obj.get('date') - instance.date = val - val = obj.get('sunset') - instance.sunset = val - val = obj.get('country') - instance.country = val - val = obj.get('sunrise') - instance.sunrise = val - val = obj.get('tempMax') - instance.tempMax = val - val = obj.get('tempMin') - instance.tempMin = val - val = obj.get('areaName') - instance.areaName = val - val = obj.get('humidity') - instance.humidity = val - val = obj.get('pressure') - instance.pressure = val - val = obj.get('windSpeed') - instance.windSpeed = val - val = obj.get('cloudiness') - instance.cloudiness = val - val = obj.get('windDegree') - instance.windDegree = val - val = obj.get('temperature') - instance.temperature = val - val = obj.get('weatherMain') - instance.weatherMain = val - val = obj.get('weatherDescription') - instance.weatherDescription = val - return instance - -@dataclass -class SyncPoint: 
- synchronizedOn: str = None - sensorTimestampAtSyncPoint: int = None - relativeClockSpeed: float = None + """Build stream metadata from a dictionary.""" + + if not isinstance(obj, dict): + return obj + return cls( + studyDeploymentId=obj.get("studyDeploymentId"), + deviceRoleName=obj.get("deviceRoleName"), + dataType=DataType.from_dict(obj.get("dataType")), + ) + + +@dataclass(slots=True) +class MeasurementData: + """Common measurement payload used in the examples.""" + + steps: int | None = None + latitude: float | None = None + longitude: float | None = None + response_json: Any = None @classmethod def from_dict(cls, obj: Any) -> Any: - if not isinstance(obj, dict): return obj - instance = cls() - val = obj.get('synchronizedOn') - instance.synchronizedOn = val - val = obj.get('sensorTimestampAtSyncPoint') - instance.sensorTimestampAtSyncPoint = val - val = obj.get('relativeClockSpeed') - instance.relativeClockSpeed = val - return instance - -@dataclass -class DataStream: - studyDeploymentId: str = None - deviceRoleName: str = None - dataType: DataType = None + """Build a measurement payload from a dictionary.""" + + if not isinstance(obj, dict): + return obj + return cls( + steps=obj.get("steps"), + latitude=obj.get("latitude"), + longitude=obj.get("longitude"), + response_json=parse_json_field(obj.get("response_json")), + ) + + +@dataclass(slots=True) +class Measurement: + """Measurement wrapper for one CARP record.""" + + sensorStartTime: int | None = None + data: MeasurementData | None = None @classmethod def from_dict(cls, obj: Any) -> Any: - if not isinstance(obj, dict): return obj - instance = cls() - val = obj.get('studyDeploymentId') - instance.studyDeploymentId = val - val = obj.get('deviceRoleName') - instance.deviceRoleName = val - val = obj.get('dataType') - if val is not None: - instance.dataType = DataType.from_dict(val) - return instance - -@dataclass -class DataType: - namespace: str = None - name: str = None + """Build a measurement object from 
a dictionary.""" + + if not isinstance(obj, dict): + return obj + return cls( + sensorStartTime=obj.get("sensorStartTime"), + data=MeasurementData.from_dict(obj.get("data")), + ) + + +@dataclass(slots=True) +class StudyItem: + """Example typed CARP record used by the examples notebook.""" + + sequenceId: int | None = None + studyDeploymentId: str | None = None + deviceRoleName: str | None = None + triggerIds: list[Any] | None = None + measurement: Measurement | None = None + dataStream: DataStream | None = None @classmethod def from_dict(cls, obj: Any) -> Any: - if not isinstance(obj, dict): return obj - instance = cls() - val = obj.get('namespace') - instance.namespace = val - val = obj.get('name') - instance.name = val - return instance + """Build a typed study item from a dictionary.""" + + if not isinstance(obj, dict): + return obj + return cls( + sequenceId=obj.get("sequenceId"), + studyDeploymentId=obj.get("studyDeploymentId"), + deviceRoleName=obj.get("deviceRoleName"), + triggerIds=obj.get("triggerIds"), + measurement=Measurement.from_dict(obj.get("measurement")), + dataStream=DataStream.from_dict(obj.get("dataStream")), + ) diff --git a/examples/main.py b/examples/main.py index 24edf10..0ff30ed 100644 --- a/examples/main.py +++ b/examples/main.py @@ -1,41 +1,49 @@ #!/usr/bin/env python3 -""" -Example script demonstrating basic usage of the carp-analytics-python library. 
+"""End-to-end example usage for `CarpStudy`.""" -Run from the project root after installing the package: - python examples/main.py data/study/data-streams.json -""" +from __future__ import annotations -from carp import CarpDataStream import sys +from pathlib import Path -def main(): - file_path = "data/study/data-streams.json" - if len(sys.argv) > 1: - file_path = sys.argv[1] - - print(f"Loading {file_path}...") - data = CarpDataStream(file_path) - - # Scan and print schema - print("Scanning schema...") - data.print_schema() - - # Example: Grouping data by data type - # output_dir = "output_groups" - # print(f"Grouping data into {output_dir}...") - # data.group_by_field("dataStream.dataType.name", output_dir) - - # Convert to Parquet - parquet_dir = "output_parquet" - data.convert_to_parquet(parquet_dir) - - # Load back as DataFrame - df = data.get_dataframe("dk.cachet.carp.stepcount", parquet_dir) - if df is not None: - print(f"Loaded {len(df)} stepcount records.") - print(df.head()) - - -if __name__ == '__main__': - main() \ No newline at end of file +from carp import CarpStudy + + +def _default_paths() -> list[Path]: + """Return bundled study paths for the example.""" + + sleep_paths = sorted(Path("sleep-data").glob("phase-*/data-streams.json")) + if sleep_paths: + return sleep_paths + fixture_root = Path("tests/fixtures/multi_phase") + return sorted(fixture_root.glob("*/data-streams.json")) + + +def main() -> int: + """Run the example against one or more study files.""" + + file_paths = [Path(arg) for arg in sys.argv[1:]] or _default_paths() + study = CarpStudy(file_paths, load_participants=True) + print(f"Loaded {len(file_paths)} study file(s)") + print(f"Total records: {study.records.count():,}") + print(f"Data types: {', '.join(study.records.data_types())}") + rows = study.participants.summary_rows() + print(f"Unified participants: {len(rows)}") + for row in rows[:3]: + print(f" {row['unified_id']}: {row['emails']} ({row['deployments']} deployments)") + 
example_email = next((row["emails"] for row in rows if row["emails"] != "N/A"), None) + if example_email: + participant = study.participant(example_email) + print(f"Example participant: {participant.info()}") + try: + step_frame = study.frames.get_dataframe("dk.cachet.carp.stepcount") + except RuntimeError as exc: + print(f"Skipping dataframe example: {exc}") + else: + print("Step-count preview:") + print(step_frame.head().to_string(index=False)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/pyproject.toml b/pyproject.toml index 0388c4e..f657906 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,17 @@ dev = [ "mypy>=1.5.0", "ruff>=0.1.0", "pre-commit>=3.4.0", + "sphinx>=8.1.3", + "sphinx-rtd-theme>=3.1.0", +] +test = [ + "pandas>=2.0.0", + "pyarrow>=14.0.0", + "matplotlib>=3.7.0", + "folium>=0.14.0", +] +docs = [ + "sphinx>=8.0.0", ] [build-system] @@ -101,7 +112,10 @@ packages = ["src/carp"] [tool.ruff] target-version = "py310" -line-length = 100 +line-length = 140 +extend-exclude = ["examples/demo.ipynb"] + +[tool.ruff.lint] select = [ "E", # pycodestyle errors "W", # pycodestyle warnings @@ -116,7 +130,7 @@ ignore = [ "B008", # do not perform function calls in argument defaults ] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["carp"] [tool.mypy] @@ -126,16 +140,25 @@ warn_unused_configs = true disallow_untyped_defs = true exclude = ["examples/", "tests/"] +[[tool.mypy.overrides]] +module = ["ijson", "pandas", "pyarrow", "pyarrow.*", "folium", "folium.*"] +ignore_missing_imports = true + [tool.pytest.ini_options] testpaths = ["tests"] python_files = ["test_*.py"] -addopts = "-v --tb=short" +addopts = "-v --tb=short --cov=src/carp --cov-branch --cov-report=term-missing" +markers = [ + "real_data: optional tests that use local sleep-data when available", +] [tool.coverage.run] source = ["src/carp"] branch = true [tool.coverage.report] +fail_under = 100 +show_missing = true exclude_lines = [ "pragma: 
no cover", "if TYPE_CHECKING:", diff --git a/src/carp/__init__.py b/src/carp/__init__.py index 40bd9bd..e24f3e6 100644 --- a/src/carp/__init__.py +++ b/src/carp/__init__.py @@ -1,20 +1,6 @@ -""" -CARP Analytics Python - A high-performance library for processing CARP study data. +"""Public package interface for CARP Analytics.""" -This library provides tools for streaming, processing, and analysing large JSON -data streams from CARP (Copenhagen Research Platform) clinical and research studies. -""" +from .study import CarpStudy -from .reader import CarpDataStream, ParticipantManager, ParticipantInfo, ParticipantAccessor - -__version__ = "0.1.0" -__author__ = "Copenhagen Research Platform" -__email__ = "support@carp.dk" - -__all__ = [ - "CarpDataStream", - "ParticipantManager", - "ParticipantInfo", - "ParticipantAccessor", - "__version__", -] +__all__ = ["CarpStudy"] +__version__ = "0.2.0" diff --git a/src/carp/cli.py b/src/carp/cli.py index 586416f..e751160 100644 --- a/src/carp/cli.py +++ b/src/carp/cli.py @@ -1,139 +1,10 @@ -""" -Command-line interface for CARP Analytics Python. 
-""" +"""Command-line entrypoint for CARP Analytics.""" -import argparse -import sys - -from rich.console import Console - -console = Console() +from __future__ import annotations +import sys -def main() -> int: - """Main entry point for the CLI.""" - parser = argparse.ArgumentParser( - prog="carp", - description="CARP Analytics - Process and analyze data from CARP research studies", - ) - parser.add_argument( - "--version", - action="store_true", - help="Show version and exit", - ) - - subparsers = parser.add_subparsers(dest="command", help="Available commands") - - # Schema command - schema_parser = subparsers.add_parser("schema", help="Scan and print data schema") - schema_parser.add_argument("files", nargs="+", help="JSON data files to process") - - # Convert command - convert_parser = subparsers.add_parser("convert", help="Convert JSON to Parquet") - convert_parser.add_argument("files", nargs="+", help="JSON data files to convert") - convert_parser.add_argument( - "-o", "--output", - default="output_parquet", - help="Output directory for Parquet files (default: output_parquet)", - ) - convert_parser.add_argument( - "--batch-size", - type=int, - default=10000, - help="Batch size for conversion (default: 10000)", - ) - - # Count command - count_parser = subparsers.add_parser("count", help="Count items in data files") - count_parser.add_argument("files", nargs="+", help="JSON data files to count") - - # Participants command - participants_parser = subparsers.add_parser( - "participants", - help="List participants from data files", - ) - participants_parser.add_argument("files", nargs="+", help="JSON data files to process") - - # Export command - export_parser = subparsers.add_parser("export", help="Export data to JSON") - export_parser.add_argument("files", nargs="+", help="JSON data files to process") - export_parser.add_argument( - "-o", "--output", - required=True, - help="Output JSON file path", - ) - export_parser.add_argument( - "-t", "--type", - 
dest="data_type", - help="Filter by data type (e.g., dk.cachet.carp.stepcount)", - ) - - # Group command - group_parser = subparsers.add_parser("group", help="Group data by field") - group_parser.add_argument("files", nargs="+", help="JSON data files to process") - group_parser.add_argument( - "-f", "--field", - default="dataStream.dataType.name", - help="Field path to group by (default: dataStream.dataType.name)", - ) - group_parser.add_argument( - "-o", "--output", - default="output_grouped", - help="Output directory (default: output_grouped)", - ) - - args = parser.parse_args() - - if args.version: - from carp import __version__ - console.print(f"carp-analytics-python version {__version__}") - return 0 - - if not args.command: - parser.print_help() - return 0 - - # Import here to avoid slow startup for --help - from carp import CarpDataStream - - try: - if args.command == "schema": - sd = CarpDataStream(args.files, load_participants=False) - sd.print_schema() - - elif args.command == "convert": - sd = CarpDataStream(args.files, load_participants=False) - sd.convert_to_parquet(args.output, batch_size=args.batch_size) - - elif args.command == "count": - sd = CarpDataStream(args.files, load_participants=False) - count = sd.count_items() - console.print(f"[bold green]Total items: {count:,}[/bold green]") - - elif args.command == "participants": - sd = CarpDataStream(args.files, load_participants=True) - sd.print_participants() - - elif args.command == "export": - sd = CarpDataStream(args.files, load_participants=False) - sd.export_to_json(args.output, data_type=args.data_type) - - elif args.command == "group": - sd = CarpDataStream(args.files, load_participants=False) - sd.group_by_field(args.field, args.output) - - except FileNotFoundError as e: - console.print(f"[bold red]Error: {e}[/bold red]") - return 1 - except KeyboardInterrupt: - console.print("\n[yellow]Interrupted.[/yellow]") - return 130 - except Exception as e: - console.print(f"[bold red]Error: 
{e}[/bold red]") - return 1 - - return 0 - +from carp.commandline.app import main if __name__ == "__main__": sys.exit(main()) diff --git a/src/carp/commandline/__init__.py b/src/carp/commandline/__init__.py new file mode 100644 index 0000000..9472d5e --- /dev/null +++ b/src/carp/commandline/__init__.py @@ -0,0 +1 @@ +"""Command-line support for CARP Analytics.""" diff --git a/src/carp/commandline/app.py b/src/carp/commandline/app.py new file mode 100644 index 0000000..90e5a9f --- /dev/null +++ b/src/carp/commandline/app.py @@ -0,0 +1,56 @@ +"""Argument parsing and dispatch for the CARP CLI.""" + +from __future__ import annotations + +import argparse + +from .common import console, print_version +from .convert import register as register_convert +from .count import register as register_count +from .export import register_export, register_group +from .participants import register as register_participants +from .schema import register as register_schema + + +def _build_parser() -> argparse.ArgumentParser: + """Construct the top-level CLI parser.""" + + parser = argparse.ArgumentParser( + prog="carp", + description="CARP Analytics - Process and analyze data from CARP research studies", + ) + parser.add_argument("--version", action="store_true", help="Show version and exit") + subparsers = parser.add_subparsers(dest="command", help="Available commands") + for register in ( + register_schema, + register_convert, + register_count, + register_participants, + register_export, + register_group, + ): + register(subparsers) + return parser + + +def main(argv: list[str] | None = None) -> int: + """Run the CARP command-line interface.""" + + parser = _build_parser() + args = parser.parse_args(argv) + if args.version: + return print_version() + if not args.command: + parser.print_help() + return 0 + try: + return int(args.handler(args)) + except FileNotFoundError as exc: + console.print(f"[bold red]Error: {exc}[/bold red]") + return 1 + except KeyboardInterrupt: + 
console.print("\n[yellow]Interrupted.[/yellow]") + return 130 + except Exception as exc: + console.print(f"[bold red]Error: {exc}[/bold red]") + return 1 diff --git a/src/carp/commandline/common.py b/src/carp/commandline/common.py new file mode 100644 index 0000000..0f558f4 --- /dev/null +++ b/src/carp/commandline/common.py @@ -0,0 +1,48 @@ +"""Shared CLI helpers and presenters.""" + +from __future__ import annotations + +from typing import Any + +from rich.console import Console +from rich.table import Table + +from carp import __version__ +from carp.study import CarpStudy + +console = Console() + + +def build_study(files: Any, load_participants: bool = True) -> CarpStudy: + """Construct a study from CLI arguments.""" + + return CarpStudy(files, load_participants=load_participants) + + +def print_version() -> int: + """Print the package version and return a success status.""" + + console.print(f"carp-analytics-python version {__version__}") + return 0 + + +def print_schema(schema_map: dict[str, list[str]]) -> None: + """Render a schema table.""" + + table = Table(title="Inferred Schema") + table.add_column("Data Type", style="cyan") + table.add_column("Fields", style="magenta") + for data_type, fields in schema_map.items(): + table.add_row(data_type, ", ".join(fields)) + console.print(table) + + +def print_participants(rows: list[dict[str, str]]) -> None: + """Render participant summary rows.""" + + table = Table(title="Participants Summary") + for column in ("unified_id", "deployments", "folders", "emails", "ssns", "names"): + table.add_column(column.replace("_", " ").title()) + for row in rows: + table.add_row(*(row[key] for key in row)) + console.print(table) diff --git a/src/carp/commandline/convert.py b/src/carp/commandline/convert.py new file mode 100644 index 0000000..fede4d2 --- /dev/null +++ b/src/carp/commandline/convert.py @@ -0,0 +1,28 @@ +"""CLI command for parquet conversion.""" + +from __future__ import annotations + +from typing import Any + +from 
.common import build_study, console + + +def register(subparsers: Any) -> None: + """Register the `convert` subcommand.""" + + parser = subparsers.add_parser("convert", help="Convert JSON to Parquet") + parser.add_argument("files", nargs="+", help="JSON data files to convert") + parser.add_argument("-o", "--output", default="output_parquet", help="Output directory") + parser.add_argument("--batch-size", type=int, default=10_000, help="Batch size") + parser.set_defaults(handler=run) + + +def run(args: Any) -> int: + """Execute the `convert` subcommand.""" + + files = build_study(args.files, load_participants=False).frames.convert_to_parquet( + args.output, + batch_size=args.batch_size, + ) + console.print(f"[bold green]Created {len(files)} parquet files.[/bold green]") + return 0 diff --git a/src/carp/commandline/count.py b/src/carp/commandline/count.py new file mode 100644 index 0000000..37eebfa --- /dev/null +++ b/src/carp/commandline/count.py @@ -0,0 +1,23 @@ +"""CLI command for record counting.""" + +from __future__ import annotations + +from typing import Any + +from .common import build_study, console + + +def register(subparsers: Any) -> None: + """Register the `count` subcommand.""" + + parser = subparsers.add_parser("count", help="Count items in data files") + parser.add_argument("files", nargs="+", help="JSON data files to count") + parser.set_defaults(handler=run) + + +def run(args: Any) -> int: + """Execute the `count` subcommand.""" + + count = build_study(args.files, load_participants=False).records.count() + console.print(f"[bold green]Total items: {count:,}[/bold green]") + return 0 diff --git a/src/carp/commandline/export.py b/src/carp/commandline/export.py new file mode 100644 index 0000000..b8be9b9 --- /dev/null +++ b/src/carp/commandline/export.py @@ -0,0 +1,54 @@ +"""CLI commands for exporting study data.""" + +from __future__ import annotations + +from typing import Any + +from .common import build_study, console + + +def 
register_export(subparsers: Any) -> None: + """Register the `export` subcommand.""" + + parser = subparsers.add_parser("export", help="Export data to JSON") + parser.add_argument("files", nargs="+", help="JSON data files to process") + parser.add_argument("-o", "--output", required=True, help="Output JSON file path") + parser.add_argument("-t", "--type", dest="data_type", help="Filter by data type") + parser.set_defaults(handler=run_export) + + +def register_group(subparsers: Any) -> None: + """Register the `group` subcommand.""" + + parser = subparsers.add_parser("group", help="Group data by field") + parser.add_argument("files", nargs="+", help="JSON data files to process") + parser.add_argument( + "-f", + "--field", + default="dataStream.dataType.name", + help="Field path to group by", + ) + parser.add_argument("-o", "--output", default="output_grouped", help="Output directory") + parser.set_defaults(handler=run_group) + + +def run_export(args: Any) -> int: + """Execute the `export` subcommand.""" + + output = build_study(args.files, load_participants=False).export.export_json( + args.output, + args.data_type, + ) + console.print(f"[bold green]Exported data to {output}[/bold green]") + return 0 + + +def run_group(args: Any) -> int: + """Execute the `group` subcommand.""" + + files = build_study(args.files, load_participants=False).export.group_by_field( + args.field, + args.output, + ) + console.print(f"[bold green]Created {len(files)} grouped files.[/bold green]") + return 0 diff --git a/src/carp/commandline/participants.py b/src/carp/commandline/participants.py new file mode 100644 index 0000000..a58ddb5 --- /dev/null +++ b/src/carp/commandline/participants.py @@ -0,0 +1,22 @@ +"""CLI command for participant summaries.""" + +from __future__ import annotations + +from typing import Any + +from .common import build_study, print_participants + + +def register(subparsers: Any) -> None: + """Register the `participants` subcommand.""" + + parser = 
subparsers.add_parser("participants", help="List participants from data files") + parser.add_argument("files", nargs="+", help="JSON data files to process") + parser.set_defaults(handler=run) + + +def run(args: Any) -> int: + """Execute the `participants` subcommand.""" + + print_participants(build_study(args.files).participants.summary_rows()) + return 0 diff --git a/src/carp/commandline/schema.py b/src/carp/commandline/schema.py new file mode 100644 index 0000000..c0d1386 --- /dev/null +++ b/src/carp/commandline/schema.py @@ -0,0 +1,22 @@ +"""CLI command for schema discovery.""" + +from __future__ import annotations + +from typing import Any + +from .common import build_study, print_schema + + +def register(subparsers: Any) -> None: + """Register the `schema` subcommand.""" + + parser = subparsers.add_parser("schema", help="Scan and print data schema") + parser.add_argument("files", nargs="+", help="JSON data files to process") + parser.set_defaults(handler=run) + + +def run(args: Any) -> int: + """Execute the `schema` subcommand.""" + + print_schema(build_study(args.files, load_participants=False).schema.scan()) + return 0 diff --git a/src/carp/constants.py b/src/carp/constants.py new file mode 100644 index 0000000..f7ad041 --- /dev/null +++ b/src/carp/constants.py @@ -0,0 +1,7 @@ +"""Shared constants for CARP Analytics.""" + +DATA_STREAM_FILE = "data-streams.json" +PARTICIPANT_FILE = "participant-data.json" +DEFAULT_LOCATION_TYPE = "dk.cachet.carp.location" +DEFAULT_STEP_TYPE = "dk.cachet.carp.stepcount" +UNKNOWN_VALUE = "unknown" diff --git a/src/carp/core/__init__.py b/src/carp/core/__init__.py new file mode 100644 index 0000000..d99bee6 --- /dev/null +++ b/src/carp/core/__init__.py @@ -0,0 +1,18 @@ +"""Core utilities shared across CARP Analytics services.""" + +from .fields import collect_field_paths, deployment_id_from_record, full_data_type +from .files import JsonArrayWriter, iter_json_array, resolve_paths +from .models import ParticipantInfo +from 
.naming import parquet_stem, sanitize_filename + +__all__ = [ + "JsonArrayWriter", + "ParticipantInfo", + "collect_field_paths", + "deployment_id_from_record", + "full_data_type", + "iter_json_array", + "parquet_stem", + "resolve_paths", + "sanitize_filename", +] diff --git a/src/carp/core/dependencies.py b/src/carp/core/dependencies.py new file mode 100644 index 0000000..767a266 --- /dev/null +++ b/src/carp/core/dependencies.py @@ -0,0 +1,33 @@ +"""Optional dependency helpers.""" + +from __future__ import annotations + +import importlib +import importlib.util +from typing import Any + + +def module_available(module_name: str) -> bool: + """Return whether a module can be imported.""" + + return importlib.util.find_spec(module_name) is not None + + +def import_or_raise(module_name: str, extra_name: str) -> Any: + """Import a dependency or raise a helpful runtime error. + + Args: + module_name: Importable module name. + extra_name: Package extra or install hint shown to the user. + + Returns: + The imported module. + + Raises: + RuntimeError: If the dependency is unavailable. + """ + + try: + return importlib.import_module(module_name) + except ImportError as exc: # pragma: no cover - exercised through callers. + raise RuntimeError(f"{module_name} is required for this feature. 
Install the `{extra_name}` extras.") from exc diff --git a/src/carp/core/fields.py b/src/carp/core/fields.py new file mode 100644 index 0000000..2ae681a --- /dev/null +++ b/src/carp/core/fields.py @@ -0,0 +1,56 @@ +"""Helpers for nested CARP record structures.""" + +from __future__ import annotations + +from typing import Any + +from carp.constants import UNKNOWN_VALUE + + +def get_nested_value(value: Any, path: str, default: Any = None) -> Any: + """Resolve a dot-separated path from nested dictionaries.""" + + current = value + for part in path.split("."): + if not isinstance(current, dict): + return default + current = current.get(part) + if current is None: + return default + return current + + +def collect_field_paths(value: Any, prefix: str = "") -> set[str]: + """Collect dot-separated field paths from nested dictionaries.""" + + paths: set[str] = set() + if isinstance(value, dict): + for key, child in value.items(): + path = f"{prefix}.{key}" if prefix else key + paths.add(path) + paths.update(collect_field_paths(child, path)) + elif isinstance(value, list): + if prefix: + paths.add(f"{prefix}[]") + if value: + paths.update(collect_field_paths(value[0], f"{prefix}[]")) + return paths + + +def full_data_type(item: dict[str, Any]) -> str: + """Return the fully qualified data type for a CARP record.""" + + data_type = get_nested_value(item, "dataStream.dataType", {}) + namespace = data_type.get("namespace", UNKNOWN_VALUE) + name = data_type.get("name", UNKNOWN_VALUE) + return f"{namespace}.{name}" + + +def deployment_id_from_record(item: dict[str, Any]) -> str | None: + """Return the deployment identifier for a CARP record.""" + + top_level = item.get("studyDeploymentId") + if isinstance(top_level, str): + return top_level + nested = get_nested_value(item, "dataStream.studyDeploymentId") + return nested if isinstance(nested, str) else None diff --git a/src/carp/core/files.py b/src/carp/core/files.py new file mode 100644 index 0000000..51ed658 --- /dev/null +++ 
b/src/carp/core/files.py @@ -0,0 +1,53 @@ +"""Filesystem helpers for CARP Analytics.""" + +from __future__ import annotations + +import json +from collections.abc import Iterator, Sequence +from pathlib import Path +from typing import Any + +import ijson + + +def resolve_paths(file_paths: str | Path | Sequence[str | Path]) -> tuple[Path, ...]: + """Validate and normalize data-stream paths.""" + + raw_paths = [file_paths] if isinstance(file_paths, (str, Path)) else list(file_paths) + resolved = tuple(Path(path) for path in raw_paths) + for path in resolved: + if not path.exists(): + raise FileNotFoundError(f"File not found: {path}") + return resolved + + +def iter_json_array(file_path: Path) -> Iterator[dict[str, Any]]: + """Stream JSON array items from disk using `ijson`.""" + + with file_path.open("rb") as handle: + yield from ijson.items(handle, "item", use_float=True) + + +class JsonArrayWriter: + """Incrementally write JSON arrays without buffering the full payload.""" + + def __init__(self, output_path: Path): + self.output_path = output_path + self.output_path.parent.mkdir(parents=True, exist_ok=True) + self._handle = self.output_path.open("w", encoding="utf-8") + self._first_item = True + self._handle.write("[") + + def write(self, item: dict[str, Any]) -> None: + """Append one JSON object to the array.""" + + if not self._first_item: + self._handle.write(",") + json.dump(item, self._handle) + self._first_item = False + + def close(self) -> None: + """Finalize and close the output file.""" + + self._handle.write("]") + self._handle.close() diff --git a/src/carp/core/models.py b/src/carp/core/models.py new file mode 100644 index 0000000..e05b467 --- /dev/null +++ b/src/carp/core/models.py @@ -0,0 +1,28 @@ +"""Domain models shared by multiple subsystems.""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass +from typing import Any + + +@dataclass(slots=True) +class ParticipantInfo: + """Normalized participant metadata for one 
deployment.""" + + study_deployment_id: str + role_name: str = "Participant" + full_name: str | None = None + sex: str | None = None + ssn: str | None = None + user_id: str | None = None + email: str | None = None + consent_signed: bool = False + consent_timestamp: str | None = None + source_folder: str | None = None + unified_participant_id: str | None = None + + def to_dict(self) -> dict[str, Any]: + """Return a JSON-serializable representation of the participant.""" + + return asdict(self) diff --git a/src/carp/core/naming.py b/src/carp/core/naming.py new file mode 100644 index 0000000..f143977 --- /dev/null +++ b/src/carp/core/naming.py @@ -0,0 +1,20 @@ +"""File and identifier naming helpers.""" + +from __future__ import annotations + +from carp.constants import UNKNOWN_VALUE + + +def sanitize_filename(value: str, allowed: str = "-_") -> str: + """Return a filesystem-safe representation of a string.""" + + safe = "".join(char for char in value if char.isalnum() or char in allowed).strip() + return safe or UNKNOWN_VALUE + + +def parquet_stem(data_type: str) -> str: + """Return a namespace-aware parquet stem for a data type.""" + + namespace, _, name = data_type.rpartition(".") + stem = f"{namespace}__{name}" if namespace else data_type + return sanitize_filename(stem, allowed="-_.") diff --git a/src/carp/export/__init__.py b/src/carp/export/__init__.py new file mode 100644 index 0000000..4d036f8 --- /dev/null +++ b/src/carp/export/__init__.py @@ -0,0 +1,5 @@ +"""Data export services.""" + +from .service import ExportService + +__all__ = ["ExportService"] diff --git a/src/carp/export/service.py b/src/carp/export/service.py new file mode 100644 index 0000000..a04ba98 --- /dev/null +++ b/src/carp/export/service.py @@ -0,0 +1,93 @@ +"""JSON export and grouping services.""" + +from __future__ import annotations + +from collections.abc import Callable, Iterable +from pathlib import Path +from typing import Any + +from carp.constants import UNKNOWN_VALUE +from 
carp.core.fields import get_nested_value +from carp.core.files import JsonArrayWriter +from carp.core.naming import sanitize_filename + + +class ExportService: + """Export CARP records to JSON arrays.""" + + def __init__(self, records: Any) -> None: + self._records = records + + def export_json(self, output_path: str | Path, data_type: str | None = None) -> Path: + """Write matching records to a JSON array file.""" + + writer = JsonArrayWriter(Path(output_path)) + try: + for item in self._records.iter_records(data_type): + writer.write(item) + finally: + writer.close() + return Path(output_path) + + def group_by_field(self, field_path: str, output_dir: str | Path) -> list[Path]: + """Group records by a nested field path.""" + + def key_factory(item: dict[str, Any]) -> str: + value = get_nested_value(item, field_path, UNKNOWN_VALUE) + return sanitize_filename(str(value), allowed="-_.@") + + return self._write_groups(Path(output_dir), self._records.iter_records(), key_factory) + + def group_by_participant(self, output_dir: str | Path, data_type: str | None = None) -> list[Path]: + """Group records by unified participant identifier.""" + + def key_factory(item: dict[str, Any]) -> str: + participant = item.get("_participant", {}) + return sanitize_filename( + str(participant.get("unified_participant_id", UNKNOWN_VALUE)), + allowed="-_.@", + ) + + return self._write_groups(Path(output_dir), self._records.iter_with_participants(data_type), key_factory) + + def group_by_identity( + self, + field_name: str, + output_dir: str | Path, + data_type: str | None = None, + ) -> list[Path]: + """Group records by a participant identity field.""" + + def key_factory(item: dict[str, Any]) -> str: + participant = item.get("_participant", {}) + value = participant.get(field_name) or UNKNOWN_VALUE + return sanitize_filename(str(value), allowed="-_.@") + + return self._write_groups( + Path(output_dir), + self._records.iter_with_participants(data_type), + key_factory, + ) + + def 
_write_groups( + self, + output_dir: Path, + items: Iterable[dict[str, Any]], + key_factory: Callable[[dict[str, Any]], str], + ) -> list[Path]: + """Write grouped JSON files and return created paths.""" + + writers: dict[str, JsonArrayWriter] = {} + output_dir.mkdir(parents=True, exist_ok=True) + try: + for item in items: + key = key_factory(item) + writer = writers.get(key) + if writer is None: + writer = JsonArrayWriter(output_dir / f"{key}.json") + writers[key] = writer + writer.write(item) + finally: + for writer in writers.values(): + writer.close() + return sorted(writer.output_path for writer in writers.values()) diff --git a/src/carp/frames/__init__.py b/src/carp/frames/__init__.py new file mode 100644 index 0000000..5353942 --- /dev/null +++ b/src/carp/frames/__init__.py @@ -0,0 +1,5 @@ +"""Dataframe and parquet services.""" + +from .service import FrameService + +__all__ = ["FrameService"] diff --git a/src/carp/frames/service.py b/src/carp/frames/service.py new file mode 100644 index 0000000..4b87173 --- /dev/null +++ b/src/carp/frames/service.py @@ -0,0 +1,139 @@ +"""Dataframe loading and parquet conversion for CARP studies.""" + +from __future__ import annotations + +from collections import defaultdict +from pathlib import Path +from typing import Any + +from carp.core.dependencies import import_or_raise +from carp.core.naming import parquet_stem + + +class FrameService: + """Load CARP data as dataframes or parquet files.""" + + def __init__(self, records: Any, participant_directory: Any) -> None: + self._records = records + self._participants = participant_directory + + def parquet_path(self, data_type: str, output_dir: str | Path) -> Path: + """Return the namespace-aware parquet path for a data type.""" + + return Path(output_dir) / f"{parquet_stem(data_type)}.parquet" + + def get_dataframe(self, data_type: str, parquet_dir: str | Path | None = None) -> Any: + """Return a dataframe for one data type.""" + + pandas = import_or_raise("pandas", 
"pandas") + if parquet_dir: + parquet_path = self.parquet_path(data_type, parquet_dir) + if parquet_path.exists(): + return pandas.read_parquet(parquet_path) + return pandas.DataFrame(list(self._records.iter_records(data_type))) + + def get_dataframe_with_participants( + self, + data_type: str, + parquet_dir: str | Path | None = None, + ) -> Any: + """Return a dataframe enriched with participant metadata.""" + + pandas = import_or_raise("pandas", "pandas") + frame = self.get_dataframe(data_type, parquet_dir) + if frame.empty: + return frame + deployment_ids = self._deployment_series(frame) + participant_rows = deployment_ids.apply(self._participant_row) + return pandas.concat([frame, participant_rows], axis=1) + + def convert_to_parquet( + self, + output_dir: str | Path, + batch_size: int = 10_000, + ) -> list[Path]: + """Convert the study to namespace-aware parquet files.""" + + pyarrow = import_or_raise("pyarrow", "pandas") + parquet = import_or_raise("pyarrow.parquet", "pandas") + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + buffers: dict[str, list[dict[str, Any]]] = defaultdict(list) + writers: dict[str, Any] = {} + try: + for item in self._records.iter_records(): + data_type = self._records.data_type(item) + buffers[data_type].append(item) + if len(buffers[data_type]) >= batch_size: + self._flush_buffer(pyarrow, parquet, output_path, data_type, buffers, writers) + finally: + for data_type, buffer in buffers.items(): + if buffer: + self._flush_buffer(pyarrow, parquet, output_path, data_type, buffers, writers) + for writer in writers.values(): + writer.close() + return sorted(self.parquet_path(data_type, output_path) for data_type in writers) + + def _participant_row(self, deployment_id: str | None) -> Any: + """Return participant columns for one deployment identifier.""" + + pandas = import_or_raise("pandas", "pandas") + participant = self._participants.get_participant(deployment_id or "") + if not participant: + return 
pandas.Series( + { + "participant_id": None, + "participant_email": None, + "participant_folder": None, + } + ) + return pandas.Series( + { + "participant_id": participant.unified_participant_id, + "participant_email": participant.email, + "participant_folder": participant.source_folder, + } + ) + + def _deployment_series(self, frame: Any) -> Any: + """Return deployment identifiers from a dataframe.""" + + if "studyDeploymentId" in frame.columns: + return frame["studyDeploymentId"] + return frame["dataStream"].apply(lambda value: value.get("studyDeploymentId") if isinstance(value, dict) else None) + + def _flush_buffer( + self, + pyarrow: Any, + parquet: Any, + output_path: Path, + data_type: str, + buffers: dict[str, list[dict[str, Any]]], + writers: dict[str, Any], + ) -> None: + """Flush one buffered parquet batch to disk.""" + + table = pyarrow.Table.from_pylist(buffers[data_type]) + path = self.parquet_path(data_type, output_path) + writer = writers.get(data_type) + if writer is None: + writers[data_type] = parquet.ParquetWriter(path, table.schema) + writer = writers[data_type] + elif not table.schema.equals(writer.schema): + table = self._align_table(pyarrow, table, writer.schema) + writer.write_table(table) + buffers[data_type].clear() + + def _align_table(self, pyarrow: Any, table: Any, schema: Any) -> Any: + """Align a batch to an existing parquet schema.""" + + columns = [] + for field in schema: + if field.name not in table.column_names: + columns.append(pyarrow.nulls(len(table), type=field.type)) + continue + column = table[field.name] + if not column.type.equals(field.type): + column = column.cast(field.type) + columns.append(column) + return pyarrow.Table.from_arrays(columns, schema=schema) diff --git a/src/carp/participants/__init__.py b/src/carp/participants/__init__.py new file mode 100644 index 0000000..7bb3835 --- /dev/null +++ b/src/carp/participants/__init__.py @@ -0,0 +1,7 @@ +"""Participant services and models.""" + +from .directory import 
ParticipantDirectory +from .service import ParticipantService +from .view import ParticipantView + +__all__ = ["ParticipantDirectory", "ParticipantService", "ParticipantView"] diff --git a/src/carp/participants/directory.py b/src/carp/participants/directory.py new file mode 100644 index 0000000..445efa3 --- /dev/null +++ b/src/carp/participants/directory.py @@ -0,0 +1,152 @@ +"""Participant lookup and unification services.""" + +from __future__ import annotations + +from collections import defaultdict +from pathlib import Path + +from carp.constants import PARTICIPANT_FILE +from carp.core.models import ParticipantInfo + +from .parser import load_participant_file + + +def _normalize(value: str | None) -> str | None: + """Normalize string identifiers for matching.""" + + if not value: + return None + clean = value.strip().lower() + return clean or None + + +class ParticipantDirectory: + """Store participant metadata across one or more study phases.""" + + def __init__(self, participants_by_deployment: dict[str, ParticipantInfo] | None = None): + self.participants_by_deployment = participants_by_deployment or {} + self.unified_participants: dict[str, list[ParticipantInfo]] = {} + self._counter = 0 + if self.participants_by_deployment: + self._unify() + + @classmethod + def from_folders(cls, folders: tuple[Path, ...]) -> ParticipantDirectory: + """Build a participant directory from phase folders.""" + + participants: dict[str, ParticipantInfo] = {} + for folder in folders: + file_path = folder / PARTICIPANT_FILE + if file_path.exists(): + participants.update(load_participant_file(file_path)) + return cls(participants) + + def get_participant(self, deployment_id: str) -> ParticipantInfo | None: + """Return one participant by deployment identifier.""" + + return self.participants_by_deployment.get(deployment_id) + + def get_unified_participant(self, unified_id: str) -> list[ParticipantInfo]: + """Return all deployments for one unified participant.""" + + return 
list(self.unified_participants.get(unified_id, [])) + + def find_by_email(self, email: str) -> list[ParticipantInfo]: + """Find all participant deployments matching an email address.""" + + target = _normalize(email) + return [p for p in self.participants_by_deployment.values() if _normalize(p.email) == target] + + def find_by_ssn(self, ssn: str) -> list[ParticipantInfo]: + """Find all participant deployments matching an SSN.""" + + return [p for p in self.participants_by_deployment.values() if p.ssn == ssn] + + def find_by_name(self, name: str) -> list[ParticipantInfo]: + """Find all participant deployments matching a full name.""" + + target = _normalize(name) + return [p for p in self.participants_by_deployment.values() if _normalize(p.full_name) == target] + + def deployment_ids(self, field_name: str, value: str) -> tuple[str, ...]: + """Return deployment identifiers for a participant lookup.""" + + matches = getattr(self, f"find_by_{field_name}")(value) + return tuple(participant.study_deployment_id for participant in matches) + + def summary_rows(self) -> list[dict[str, str]]: + """Return human-readable participant summary rows.""" + + rows: list[dict[str, str]] = [] + for unified_id, participants in self.unified_participants.items(): + folders = sorted({p.source_folder for p in participants if p.source_folder}) + emails = sorted({p.email for p in participants if p.email}) + ssns = sorted({p.ssn for p in participants if p.ssn}) + names = sorted({p.full_name for p in participants if p.full_name}) + rows.append( + { + "unified_id": unified_id, + "deployments": str(len(participants)), + "folders": ", ".join(folders) or "N/A", + "emails": ", ".join(emails) or "N/A", + "ssns": ", ".join(ssns) or "N/A", + "names": ", ".join(names) or "N/A", + } + ) + return rows + + def _register_group(self, participants: list[ParticipantInfo], assigned: set[str]) -> None: + """Register one unified participant group.""" + + unified_id = f"P{self._counter:04d}" + self._counter += 1 
+ for participant in participants: + participant.unified_participant_id = unified_id + assigned.add(participant.study_deployment_id) + self.unified_participants[unified_id] = participants + + def _unify(self) -> None: + """Assign unified participant identifiers across phases.""" + + assigned: set[str] = set() + matchers = ("email", "ssn", "name") + grouped: dict[str, dict[str, list[ParticipantInfo]]] = { + "email": defaultdict(list), + "ssn": defaultdict(list), + "name": defaultdict(list), + } + for participant in self.participants_by_deployment.values(): + if email := _normalize(participant.email): + grouped["email"][email].append(participant) + if participant.ssn: + grouped["ssn"][participant.ssn].append(participant) + if name := _normalize(participant.full_name): + grouped["name"][name].append(participant) + for matcher in matchers: + for participants in grouped[matcher].values(): + pending = [participant for participant in participants if participant.study_deployment_id not in assigned] + if pending: + self._register_group(pending, assigned) + for participant in self.participants_by_deployment.values(): + if participant.study_deployment_id not in assigned: + self._register_group([participant], assigned) + self._propagate() + + def _propagate(self) -> None: + """Share the best known metadata across unified deployments.""" + + for participants in self.unified_participants.values(): + fields = { + "full_name": next((p.full_name for p in participants if p.full_name), None), + "sex": next((p.sex for p in participants if p.sex), None), + "ssn": next((p.ssn for p in participants if p.ssn), None), + "email": next((p.email for p in participants if p.email), None), + "user_id": next((p.user_id for p in participants if p.user_id), None), + "consent_timestamp": next((p.consent_timestamp for p in participants if p.consent_timestamp), None), + } + signed = any(p.consent_signed for p in participants) + for participant in participants: + participant.consent_signed = signed + 
for field_name, value in fields.items(): + if value and not getattr(participant, field_name): + setattr(participant, field_name, value) diff --git a/src/carp/participants/parser.py b/src/carp/participants/parser.py new file mode 100644 index 0000000..654d589 --- /dev/null +++ b/src/carp/participants/parser.py @@ -0,0 +1,78 @@ +"""Parsing helpers for `participant-data.json` files.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from carp.core.models import ParticipantInfo + + +def _coerce_full_name(value: object) -> str | None: + """Normalize CARP full-name payloads.""" + + if isinstance(value, str): + return value or None + if not isinstance(value, dict): + return None + parts = [value.get(key) for key in ("firstName", "middleName", "lastName")] + clean = [part.strip() for part in parts if isinstance(part, str) and part.strip()] + return " ".join(clean) or None + + +def _coerce_ssn(value: object) -> str | None: + """Normalize CARP SSN payloads.""" + + if isinstance(value, str): + return value or None + if isinstance(value, dict): + nested = value.get("socialSecurityNumber") + return str(nested) if nested else None + return None + + +def _apply_consent(participant: ParticipantInfo, value: object) -> None: + """Populate consent-related participant fields.""" + + if not isinstance(value, dict): + return + participant.consent_signed = True + participant.consent_timestamp = value.get("signedTimestamp") + participant.user_id = value.get("userId") + participant.email = value.get("name") + if participant.full_name: + return + consent_payload = value.get("consent") + if not isinstance(consent_payload, str): + return + try: + signature = json.loads(consent_payload).get("signature", {}) + except json.JSONDecodeError: + return + first_name = (signature.get("firstName") or "").strip() + last_name = (signature.get("lastName") or "").strip() + participant.full_name = f"{first_name} {last_name}".strip() or None + + +def 
def load_participant_file(file_path: Path) -> dict[str, ParticipantInfo]:
    """Load participant records from a single phase folder.

    Parses a CARP ``participant-data.json`` file and returns a mapping of
    study deployment id to the participant parsed from its roles. When a
    deployment lists several roles, the last role parsed wins.
    """
    payload = json.loads(file_path.read_text(encoding="utf-8"))
    records: dict[str, ParticipantInfo] = {}
    for entry in payload:
        deployment_id = entry.get("studyDeploymentId")
        if not deployment_id:
            continue
        for role in entry.get("roles", []):
            participant = ParticipantInfo(
                study_deployment_id=deployment_id,
                role_name=role.get("roleName", "Participant"),
                source_folder=file_path.parent.name,
            )
            data = role.get("data", {})
            participant.full_name = _coerce_full_name(
                data.get("dk.carp.webservices.input.full_name")
            )
            participant.sex = data.get("dk.cachet.carp.input.sex")
            participant.ssn = _coerce_ssn(data.get("dk.carp.webservices.input.ssn"))
            _apply_consent(
                participant, data.get("dk.carp.webservices.input.informed_consent")
            )
            records[deployment_id] = participant
    return records
by_name(self, name: str) -> list[Any]: + """Return participant deployments for a full name.""" + + return self._directory.find_by_name(name) + + def deployment_ids(self, field_name: str, value: str) -> tuple[str, ...]: + """Return deployment identifiers for a participant lookup.""" + + return self._directory.deployment_ids(field_name, value) + + def unified(self, unified_id: str) -> list[Any]: + """Return deployments for a unified participant identifier.""" + + return self._directory.get_unified_participant(unified_id) + + def summary_rows(self) -> list[dict[str, str]]: + """Return participant summary rows for presentation layers.""" + + return self._directory.summary_rows() diff --git a/src/carp/participants/view.py b/src/carp/participants/view.py new file mode 100644 index 0000000..383c28c --- /dev/null +++ b/src/carp/participants/view.py @@ -0,0 +1,105 @@ +"""Participant-centric study accessors.""" + +from __future__ import annotations + +from collections.abc import Iterator +from typing import Any + + +class ParticipantView: + """Provide participant-scoped access to study data.""" + + def __init__(self, study: Any, email: str): + self._study = study + self._email = email + + @property + def participants(self) -> list[Any]: + """Return underlying participant models for the view.""" + + return list(self._study.participants.by_email(self._email)) + + @property + def deployment_ids(self) -> tuple[str, ...]: + """Return deployment identifiers for the participant.""" + + return tuple(self._study.participants.deployment_ids("email", self._email)) + + @property + def exists(self) -> bool: + """Return whether the participant exists in the study.""" + + return bool(self.participants) + + def info(self) -> dict[str, Any] | None: + """Return merged participant metadata.""" + + if not self.participants: + return None + base = self.participants[0] + return { + "email": self._email, + "unified_id": base.unified_participant_id, + "full_name": base.full_name, + "ssn": base.ssn, 
+ "sex": base.sex, + "user_id": base.user_id, + "consent_signed": base.consent_signed, + "consent_timestamp": base.consent_timestamp, + "folders": sorted({p.source_folder for p in self.participants if p.source_folder}), + "deployment_ids": sorted(self.deployment_ids), + "num_deployments": len(self.deployment_ids), + } + + def iter_records(self, data_type: str | None = None) -> Iterator[dict[str, Any]]: + """Yield participant records with an optional data-type filter.""" + + yield from self._study.records.iter_records(data_type, self.deployment_ids) + + def available_fields(self, sample_size: int = 100) -> list[str]: + """Return participant-visible field paths.""" + + fields: set[str] = set() + for index, item in enumerate(self.iter_records()): + if index >= sample_size: + break + fields.update(self._study.records.collect_fields(item)) + return sorted(fields) + + def data_types(self) -> list[str]: + """Return unique data types for the participant.""" + + return sorted({self._study.records.data_type(item) for item in self.iter_records()}) + + def count(self, data_type: str | None = None) -> int: + """Return the number of participant records.""" + + return sum(1 for _ in self.iter_records(data_type)) + + def dataframe(self, data_type: str, parquet_dir: str | None = None) -> Any: + """Return a dataframe filtered to the participant.""" + + frame = self._study.frames.get_dataframe(data_type, parquet_dir) + if frame is None or frame.empty: + return frame + deployment_ids = self._study.plots.candidate_series( + frame, + ["studyDeploymentId", "dataStream.studyDeploymentId"], + ) + return frame if deployment_ids is None else frame[deployment_ids.isin(self.deployment_ids)] + + def plot_location( + self, + output_file: str | None = None, + parquet_dir: str | None = None, + include_steps: bool = True, + ) -> str | None: + """Render a location plot for the participant.""" + + result = self._study.plots.participant( + self._email, + output_file=output_file, + 
parquet_dir=parquet_dir, + include_steps=include_steps, + ) + return None if result is None else str(result) diff --git a/src/carp/plotting/__init__.py b/src/carp/plotting/__init__.py index 2a8f3b3..97cd027 100644 --- a/src/carp/plotting/__init__.py +++ b/src/carp/plotting/__init__.py @@ -1,10 +1,5 @@ -""" -Visualization module for CARP Analytics data. +"""Plotting services for CARP studies.""" -This module provides visualization tools for participant location data, -including heatmaps and geographic visualizations. -""" +from .service import PlotService -from .map_viz import LocationVisualizer, ParticipantVisualizer - -__all__ = ["LocationVisualizer", "ParticipantVisualizer"] +__all__ = ["PlotService"] diff --git a/src/carp/plotting/map_viz.py b/src/carp/plotting/map_viz.py deleted file mode 100644 index 3c567d5..0000000 --- a/src/carp/plotting/map_viz.py +++ /dev/null @@ -1,416 +0,0 @@ -import pandas as pd -import folium -from folium.plugins import HeatMap -from typing import Optional, List, Any, Set, TYPE_CHECKING - -if TYPE_CHECKING: - from ..reader import CarpDataStream - -from rich.console import Console - -console = Console() - - -class ParticipantVisualizer: - """ - Fluent API for visualizing participant data. - Usage: sd.participant("email").visualize.location() - """ - - def __init__(self, sleepiness_data: 'CarpDataStream', deployment_ids: Set[str], email: str): - self._sd = sleepiness_data - self._deployment_ids = deployment_ids - self._email = email - - def location( - self, - output_file: Optional[str] = None, - location_type: str = "dk.cachet.carp.geolocation", - step_type: str = "dk.cachet.carp.stepcount", - include_steps: bool = True, - parquet_dir: Optional[str] = "output_parquet" - ) -> Optional[str]: - """ - Generate a location heatmap for this participant. - - Args: - output_file: Output HTML file path. 
Defaults to "{email}_location.html" - location_type: Data type for location data - step_type: Data type for step count data - include_steps: Whether to overlay step count markers - parquet_dir: Directory with parquet files for faster loading - - Returns: - Path to the generated HTML file, or None if no data found - """ - if output_file is None: - # Sanitize email for filename - safe_email = self._email.replace("@", "_at_").replace(".", "_") - output_file = f"{safe_email}_location.html" - - console.print(f"[bold blue]Generating location heatmap for {self._email}...[/bold blue]") - - # Load location data - df_loc = self._sd.get_dataframe(location_type, parquet_dir) - - if df_loc is None or df_loc.empty: - console.print(f"[bold red]No location data found for type {location_type}[/bold red]") - return None - - # Filter by deployment IDs - user_series_loc = self._get_field(df_loc, ['studyDeploymentId', 'dataStream.studyDeploymentId']) - if user_series_loc is not None: - df_loc = df_loc[user_series_loc.isin(self._deployment_ids)] - - if df_loc.empty: - console.print(f"[bold red]No location data found for {self._email}[/bold red]") - return None - - # Load step data if requested - df_steps = pd.DataFrame() - if include_steps: - df_steps_raw = self._sd.get_dataframe(step_type, parquet_dir) - if df_steps_raw is not None and not df_steps_raw.empty: - user_series_steps = self._get_field(df_steps_raw, ['studyDeploymentId', 'dataStream.studyDeploymentId']) - if user_series_steps is not None: - df_steps = df_steps_raw[user_series_steps.isin(self._deployment_ids)] - - # Extract coordinates - df_loc['_lat'] = self._get_field(df_loc, ['measurement.data.latitude', 'latitude']) - df_loc['_lon'] = self._get_field(df_loc, ['measurement.data.longitude', 'longitude']) - df_loc['_time'] = self._get_field(df_loc, ['measurement.sensorStartTime', 'sensorStartTime']) - - if df_loc['_lat'].isnull().all() or df_loc['_lon'].isnull().all(): - console.print("[bold red]Could not find 
latitude/longitude columns in location data[/bold red]") - return None - - # Extract step data - if not df_steps.empty: - df_steps['_steps'] = self._get_field(df_steps, ['measurement.data.steps', 'steps']) - df_steps['_time'] = self._get_field(df_steps, ['measurement.sensorStartTime', 'sensorStartTime']) - - # Render the map - self._render_map(df_loc, df_steps, output_file) - return output_file - - def _get_field(self, df: pd.DataFrame, candidates: List[str]) -> Optional[pd.Series]: - """Extract a series from dataframe using candidate field paths.""" - for path in candidates: - if path in df.columns: - return df[path] - - parts = path.split('.') - if parts[0] in df.columns: - try: - series = df[parts[0]] - for part in parts[1:]: - series = series.apply(lambda x: x.get(part) if isinstance(x, dict) else None) - return series - except Exception: - pass - return None - - def _render_map(self, df_loc: pd.DataFrame, df_steps: pd.DataFrame, output_file: str): - """Render the heatmap to an HTML file.""" - df_loc = df_loc.dropna(subset=['_lat', '_lon']) - - if df_loc.empty: - console.print("[bold red]No valid coordinates found after filtering[/bold red]") - return - - center_lat = df_loc['_lat'].mean() - center_lon = df_loc['_lon'].mean() - - m = folium.Map(location=[center_lat, center_lon], zoom_start=12) - - # Add heatmap layer - heat_data = df_loc[['_lat', '_lon']].values.tolist() - HeatMap(heat_data).add_to(m) - - # Add step markers - if not df_steps.empty and '_steps' in df_steps.columns and '_time' in df_steps.columns: - if '_time' in df_loc.columns: - df_loc_sorted = df_loc.sort_values('_time') - df_steps_sorted = df_steps.sort_values('_time') - - df_loc_sorted['_time'] = df_loc_sorted['_time'].astype('int64') - df_steps_sorted['_time'] = df_steps_sorted['_time'].astype('int64') - - merged = pd.merge_asof( - df_steps_sorted, - df_loc_sorted[['_time', '_lat', '_lon']], - on='_time', - direction='nearest', - tolerance=300_000_000 # 5 minutes in microseconds - ) - - for 
_, row in merged.iterrows(): - if pd.notnull(row['_lat']) and pd.notnull(row['_lon']) and pd.notnull(row['_steps']): - steps = row['_steps'] - if steps > 0: - folium.CircleMarker( - location=[row['_lat'], row['_lon']], - radius=min(max(steps / 10, 3), 20), - popup=f"Steps: {steps}
Time: {row['_time']}", - color="blue", - fill=True, - fill_color="blue" - ).add_to(m) - - m.save(output_file) - console.print(f"[bold green]Heatmap saved to {output_file}[/bold green]") - - -class LocationVisualizer: - def __init__(self, sd: 'CarpDataStream'): - self.sd = sd - - def _get_field(self, df: pd.DataFrame, candidates: List[str]) -> Optional[pd.Series]: - """ - Tries to extract a series from the dataframe using a list of candidate field paths. - Supports dot-notation for nested dict columns. - """ - for path in candidates: - if path in df.columns: - return df[path] - - # Try nested - parts = path.split('.') - if parts[0] in df.columns: - try: - series = df[parts[0]] - for part in parts[1:]: - # Handle None/NaN - series = series.apply(lambda x: x.get(part) if isinstance(x, dict) else None) - return series - except Exception: - pass - return None - - def _render_map(self, df_loc: pd.DataFrame, df_steps: pd.DataFrame, output_file: str): - """ - Internal method to render the map from prepared dataframes. - Expects df_loc to have _lat, _lon, _time columns. - Expects df_steps to have _steps, _time columns. 
- """ - # Drop NaNs in location - df_loc = df_loc.dropna(subset=['_lat', '_lon']) - - if df_loc.empty: - console.print("[bold red]No valid coordinates found after filtering[/bold red]") - return - - # Create Map - center_lat = df_loc['_lat'].mean() - center_lon = df_loc['_lon'].mean() - - m = folium.Map(location=[center_lat, center_lon], zoom_start=12) - - # Add Heatmap - heat_data = df_loc[['_lat', '_lon']].values.tolist() - HeatMap(heat_data).add_to(m) - - # Add Step Markers - if not df_steps.empty: - if '_steps' in df_steps.columns and '_time' in df_steps.columns and '_time' in df_loc.columns: - # Sort by time - df_loc = df_loc.sort_values('_time') - df_steps = df_steps.sort_values('_time') - - # Ensure types match - df_loc['_time'] = df_loc['_time'].astype('int64') - df_steps['_time'] = df_steps['_time'].astype('int64') - - merged = pd.merge_asof( - df_steps, - df_loc[['_time', '_lat', '_lon']], - on='_time', - direction='nearest', - tolerance=300_000_000 # 5 minutes in microseconds - ) - - for idx, row in merged.iterrows(): - if pd.notnull(row['_lat']) and pd.notnull(row['_lon']) and pd.notnull(row['_steps']): - steps = row['_steps'] - if steps > 0: - folium.CircleMarker( - location=[row['_lat'], row['_lon']], - radius=min(max(steps / 10, 3), 20), - popup=f"Steps: {steps}
Time: {row['_time']}", - color="blue", - fill=True, - fill_color="blue" - ).add_to(m) - - # Save - m.save(output_file) - console.print(f"[bold green]Heatmap saved to {output_file}[/bold green]") - - def plot_heatmap_from_items( - self, - location_items: List[Any], - step_items: Optional[List[Any]] = None, - output_file: str = "user_heatmap.html" - ): - """ - Generates a heatmap from a list of type-safe objects (e.g. generated SleepinessItem). - """ - console.print(f"[bold blue]Generating heatmap from {len(location_items)} location items...[/bold blue]") - - # Helper to safely get attributes - def get_attr(obj, path): - parts = path.split('.') - curr = obj - for p in parts: - if curr is None: - return None - curr = getattr(curr, p, None) - return curr - - # Extract Location Data - loc_data = [] - for item in location_items: - lat = get_attr(item, 'measurement.data.latitude') - lon = get_attr(item, 'measurement.data.longitude') - time = get_attr(item, 'measurement.sensorStartTime') - - if lat is not None and lon is not None: - loc_data.append({'_lat': lat, '_lon': lon, '_time': time}) - - df_loc = pd.DataFrame(loc_data) - - if df_loc.empty: - console.print("[bold red]No valid coordinates found in location items[/bold red]") - return - - # Extract Step Data - df_steps = pd.DataFrame() - if step_items: - step_data = [] - for item in step_items: - steps = get_attr(item, 'measurement.data.steps') - time = get_attr(item, 'measurement.sensorStartTime') - if steps is not None: - step_data.append({'_steps': steps, '_time': time}) - df_steps = pd.DataFrame(step_data) - - self._render_map(df_loc, df_steps, output_file) - - def plot_user_heatmap( - self, - study_deployment_id: str, - location_type: str = "dk.cachet.carp.geolocation", - step_type: str = "dk.cachet.carp.stepcount", - parquet_dir: Optional[str] = "output_parquet", - output_file: str = "user_heatmap.html" - ): - """ - Generates a heatmap of user locations and overlays step count data. 
- """ - console.print(f"[bold blue]Generating heatmap for user {study_deployment_id}...[/bold blue]") - - # 1. Load Data - df_loc = self.sd.get_dataframe(location_type, parquet_dir) - df_steps = self.sd.get_dataframe(step_type, parquet_dir) - - if df_loc is None or df_loc.empty: - console.print(f"[bold red]No location data found for type {location_type}[/bold red]") - return - - if df_steps is None: - console.print(f"[yellow]No step data found for type {step_type}. Plotting location only.[/yellow]") - df_steps = pd.DataFrame() - - # 2. Filter by User - user_series_loc = self._get_field(df_loc, ['studyDeploymentId', 'dataStream.studyDeploymentId']) - if user_series_loc is not None: - df_loc = df_loc[user_series_loc == study_deployment_id] - - if df_loc.empty: - console.print(f"[bold red]No location data found for user {study_deployment_id}[/bold red]") - return - - if not df_steps.empty: - user_series_steps = self._get_field(df_steps, ['studyDeploymentId', 'dataStream.studyDeploymentId']) - if user_series_steps is not None: - df_steps = df_steps[user_series_steps == study_deployment_id] - - # 3. Extract Coordinates and Time - df_loc['_lat'] = self._get_field(df_loc, ['measurement.data.latitude', 'latitude']) - df_loc['_lon'] = self._get_field(df_loc, ['measurement.data.longitude', 'longitude']) - df_loc['_time'] = self._get_field(df_loc, ['measurement.sensorStartTime', 'sensorStartTime']) - - if df_loc['_lat'].isnull().all() or df_loc['_lon'].isnull().all(): - console.print("[bold red]Could not find latitude/longitude columns in location data[/bold red]") - return - - # 6. 
Add Step Markers - if not df_steps.empty: - df_steps['_steps'] = self._get_field(df_steps, ['measurement.data.steps', 'steps']) - df_steps['_time'] = self._get_field(df_steps, ['measurement.sensorStartTime', 'sensorStartTime']) - - self._render_map(df_loc, df_steps, output_file) - - def plot_participant_heatmap( - self, - unified_participant_id: str, - location_type: str = "dk.cachet.carp.geolocation", - step_type: str = "dk.cachet.carp.stepcount", - parquet_dir: Optional[str] = "output_parquet", - output_file: str = "participant_heatmap.html" - ): - """ - Generates a heatmap for a specific unified participant across all their deployments. - This aggregates data from all phases/folders for the same participant. - """ - # Get all deployment IDs for this participant - participants = self.sd.participant_manager.get_unified_participant(unified_participant_id) - if not participants: - console.print(f"[bold red]No participant found with ID {unified_participant_id}[/bold red]") - return - - deployment_ids = [p.study_deployment_id for p in participants] - console.print(f"[bold blue]Generating heatmap for participant {unified_participant_id} " - f"({len(deployment_ids)} deployments)...[/bold blue]") - - # 1. Load Data - df_loc = self.sd.get_dataframe(location_type, parquet_dir) - df_steps = self.sd.get_dataframe(step_type, parquet_dir) - - if df_loc is None or df_loc.empty: - console.print(f"[bold red]No location data found for type {location_type}[/bold red]") - return - - if df_steps is None: - console.print(f"[yellow]No step data found for type {step_type}. Plotting location only.[/yellow]") - df_steps = pd.DataFrame() - - # 2. 
Filter by all User deployments - user_series_loc = self._get_field(df_loc, ['studyDeploymentId', 'dataStream.studyDeploymentId']) - if user_series_loc is not None: - df_loc = df_loc[user_series_loc.isin(deployment_ids)] - - if df_loc.empty: - console.print(f"[bold red]No location data found for participant {unified_participant_id}[/bold red]") - return - - if not df_steps.empty: - user_series_steps = self._get_field(df_steps, ['studyDeploymentId', 'dataStream.studyDeploymentId']) - if user_series_steps is not None: - df_steps = df_steps[user_series_steps.isin(deployment_ids)] - - # 3. Extract Coordinates and Time - df_loc['_lat'] = self._get_field(df_loc, ['measurement.data.latitude', 'latitude']) - df_loc['_lon'] = self._get_field(df_loc, ['measurement.data.longitude', 'longitude']) - df_loc['_time'] = self._get_field(df_loc, ['measurement.sensorStartTime', 'sensorStartTime']) - - if df_loc['_lat'].isnull().all() or df_loc['_lon'].isnull().all(): - console.print("[bold red]Could not find latitude/longitude columns in location data[/bold red]") - return - - # 4. 
def _extract_part(value: Any, part: str) -> Any:
    """Return ``value[part]`` when *value* is a dict, else None."""

    if isinstance(value, dict):
        return value.get(part)
    return None


def candidate_series(frame: Any, candidates: Iterable[str]) -> Any:
    """Return the first dataframe series matching a candidate path.

    A candidate may name a column directly, or use dot notation to drill
    into dict-valued columns (e.g. ``measurement.data.latitude``). The
    first candidate whose column (or root column) exists wins; returns
    ``None`` when no candidate matches.
    """
    for path in candidates:
        if path in frame.columns:
            return frame[path]
        root, _, remainder = path.partition(".")
        if root not in frame.columns:
            continue
        series = frame[root]
        for key in remainder.split(".") if remainder else []:
            series = series.apply(_extract_part, args=(key,))
        return series
    return None


def prepare_location_frame(frame: Any) -> Any:
    """Return a copy of *frame* with normalized ``_lat``/``_lon``/``_time`` columns."""

    prepared = frame.copy()
    sources = {
        "_lat": ["measurement.data.latitude", "latitude"],
        "_lon": ["measurement.data.longitude", "longitude"],
        "_time": ["measurement.sensorStartTime", "sensorStartTime"],
    }
    for column, candidates in sources.items():
        prepared[column] = candidate_series(prepared, candidates)
    return prepared
["measurement.sensorStartTime", "sensorStartTime"]) + return steps + + +def frames_from_items(location_items: list[Any], step_items: list[Any] | None = None) -> tuple[Any, Any]: + """Build plotting dataframes from type-safe objects.""" + + pandas = import_or_raise("pandas", "viz") + + def attr_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + current = getattr(current, part, None) + if current is None: + return None + return current + + location_rows = [] + for item in location_items: + latitude = attr_path(item, "measurement.data.latitude") + longitude = attr_path(item, "measurement.data.longitude") + timestamp = attr_path(item, "measurement.sensorStartTime") + if latitude is not None and longitude is not None: + location_rows.append({"_lat": latitude, "_lon": longitude, "_time": timestamp}) + step_rows = [] + for item in step_items or []: + steps = attr_path(item, "measurement.data.steps") + timestamp = attr_path(item, "measurement.sensorStartTime") + if steps is not None: + step_rows.append({"_steps": steps, "_time": timestamp}) + return pandas.DataFrame(location_rows), pandas.DataFrame(step_rows) diff --git a/src/carp/plotting/render.py b/src/carp/plotting/render.py new file mode 100644 index 0000000..d1a32c2 --- /dev/null +++ b/src/carp/plotting/render.py @@ -0,0 +1,56 @@ +"""HTML map rendering helpers.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from carp.core.dependencies import import_or_raise + + +def render_heatmap(location_frame: Any, step_frame: Any, output_path: str | Path) -> str | None: + """Render a heatmap and optional step markers to HTML.""" + + pandas = import_or_raise("pandas", "viz") + folium = import_or_raise("folium", "viz") + heatmap = import_or_raise("folium.plugins", "viz").HeatMap + if {"_lat", "_lon"} - set(location_frame.columns): + return None + location = location_frame.dropna(subset=["_lat", "_lon"]) + if location.empty: + return None + map_view 
= folium.Map(location=[location["_lat"].mean(), location["_lon"].mean()], zoom_start=12) + heatmap(location[["_lat", "_lon"]].values.tolist()).add_to(map_view) + if not step_frame.empty and {"_steps", "_time"} <= set(step_frame.columns): + merged = _merge_steps(pandas, location, step_frame) + for _, row in merged.iterrows(): + if row["_steps"] and pandas.notnull(row["_lat"]) and pandas.notnull(row["_lon"]): + folium.CircleMarker( + location=[row["_lat"], row["_lon"]], + radius=min(max(row["_steps"] / 10, 3), 20), + popup=f"Steps: {row['_steps']}
class PlotService:
    """Render HTML heatmaps from study data or typed objects.

    Collaborators:
        frames: service providing ``get_dataframe(data_type, parquet_dir)``;
            may return ``None`` when no data exists for a type — presumably,
            based on the guards in ``ParticipantView.dataframe`` (confirm
            against the frames service contract).
        participants: service providing ``view(email)`` and ``unified(id)``.
    """

    def __init__(self, frames: Any, participants: Any) -> None:
        self._frames = frames
        self._participants = participants
        # Re-exported so other services can reuse the column lookup helper.
        self.candidate_series = candidate_series

    def participant(
        self,
        email: str,
        output_file: str | None = None,
        location_type: str = DEFAULT_LOCATION_TYPE,
        step_type: str = DEFAULT_STEP_TYPE,
        parquet_dir: str | None = None,
        include_steps: bool = True,
    ) -> str | None:
        """Render a participant heatmap from an email address.

        Returns the output path, or None when the participant is unknown
        or has no matching location data.
        """
        view = self._participants.view(email)
        if not view.exists:
            return None
        default_name = sanitize_filename(email.replace("@", "_at_"), allowed="-_.")
        return self._plot_for_deployments(
            view.deployment_ids,
            output_file or f"{default_name}_location.html",
            location_type,
            step_type,
            parquet_dir,
            include_steps,
        )

    def deployment(
        self,
        deployment_id: str,
        output_file: str = "deployment_heatmap.html",
        location_type: str = DEFAULT_LOCATION_TYPE,
        step_type: str = DEFAULT_STEP_TYPE,
        parquet_dir: str | None = None,
        include_steps: bool = True,
    ) -> str | None:
        """Render a heatmap for a single deployment."""

        return self._plot_for_deployments(
            (deployment_id,),
            output_file,
            location_type,
            step_type,
            parquet_dir,
            include_steps,
        )

    def unified(
        self,
        unified_id: str,
        output_file: str = "participant_heatmap.html",
        location_type: str = DEFAULT_LOCATION_TYPE,
        step_type: str = DEFAULT_STEP_TYPE,
        parquet_dir: str | None = None,
        include_steps: bool = True,
    ) -> str | None:
        """Render a heatmap aggregating all deployments of a unified participant."""

        deployment_ids = tuple(
            participant.study_deployment_id
            for participant in self._participants.unified(unified_id)
        )
        if not deployment_ids:
            return None
        return self._plot_for_deployments(
            deployment_ids,
            output_file,
            location_type,
            step_type,
            parquet_dir,
            include_steps,
        )

    def from_items(
        self,
        location_items: list[Any],
        step_items: list[Any] | None = None,
        output_file: str = "user_heatmap.html",
    ) -> str | None:
        """Render a heatmap from type-safe Python objects."""

        location_frame, step_frame = frames_from_items(location_items, step_items)
        return render_heatmap(location_frame, step_frame, output_file)

    def _plot_for_deployments(
        self,
        deployment_ids: tuple[str, ...],
        output_file: str,
        location_type: str,
        step_type: str,
        parquet_dir: str | None,
        include_steps: bool,
    ) -> str | None:
        """Render a heatmap for a set of deployments.

        Returns the rendered file path, or None when no usable location
        data exists for the deployments.
        """
        location_frame = self._frames.get_dataframe(location_type, parquet_dir)
        # Guard against None before touching .empty: get_dataframe can
        # yield None for absent data types (ParticipantView.dataframe
        # applies the same check).
        if location_frame is None or location_frame.empty:
            return None
        location_ids = candidate_series(
            location_frame, ["studyDeploymentId", "dataStream.studyDeploymentId"]
        )
        if location_ids is None:
            return None
        filtered_location = prepare_location_frame(
            location_frame[location_ids.isin(deployment_ids)]
        )
        if filtered_location.empty:
            return None
        if not include_steps:
            return render_heatmap(filtered_location, filtered_location.iloc[0:0], output_file)
        step_frame = self._frames.get_dataframe(step_type, parquet_dir)
        if step_frame is None:
            # No step data at all: render location only.
            return render_heatmap(filtered_location, filtered_location.iloc[0:0], output_file)
        if step_frame.empty:
            return render_heatmap(filtered_location, step_frame, output_file)
        step_ids = candidate_series(
            step_frame, ["studyDeploymentId", "dataStream.studyDeploymentId"]
        )
        if step_ids is None:
            return render_heatmap(filtered_location, step_frame.iloc[0:0], Path(output_file))
        filtered_steps = prepare_step_frame(step_frame[step_ids.isin(deployment_ids)])
        return render_heatmap(filtered_location, filtered_steps, Path(output_file))
across folders - unified_participant_id: Optional[str] = None - - def to_dict(self) -> Dict[str, Any]: - return { - "study_deployment_id": self.study_deployment_id, - "role_name": self.role_name, - "full_name": self.full_name, - "sex": self.sex, - "ssn": self.ssn, - "user_id": self.user_id, - "email": self.email, - "consent_signed": self.consent_signed, - "consent_timestamp": self.consent_timestamp, - "source_folder": self.source_folder, - "unified_participant_id": self.unified_participant_id, - } - - -class ParticipantManager: - """ - Manages participant data across multiple data folders. - Links participants across folders using SSN or user ID as identifiers. - """ - - def __init__(self): - # studyDeploymentId -> ParticipantInfo - self.participants_by_deployment: Dict[str, ParticipantInfo] = {} - # unified_participant_id -> list of ParticipantInfo (same person across folders) - self.unified_participants: Dict[str, List[ParticipantInfo]] = {} - # For generating unified IDs - self._unified_id_counter = 0 - - def load_participant_data(self, data_folders: List[Path]): - """ - Loads participant data from participant-data.json files in each data folder. 
- """ - console.print( - f"[bold blue]Loading participant data from {len(data_folders)} folders...[/bold blue]" - ) - - for folder in data_folders: - participant_file = folder / "participant-data.json" - if participant_file.exists(): - self._load_single_file(participant_file, folder.name) - else: - console.print(f"[yellow]No participant-data.json found in {folder}[/yellow]") - - # After loading all, unify participants - self._unify_participants() - - console.print( - f"[bold green]Loaded {len(self.participants_by_deployment)} participant deployments, " - f"{len(self.unified_participants)} unique participants[/bold green]" - ) - - def _load_single_file(self, file_path: Path, folder_name: str): - """Load participant data from a single file.""" - try: - with open(file_path, "r") as f: - data = json.load(f) - except (json.JSONDecodeError, IOError) as e: - console.print(f"[red]Error reading {file_path}: {e}[/red]") - return - - for entry in data: - deployment_id = entry.get("studyDeploymentId") - if not deployment_id: - continue - - roles = entry.get("roles", []) - for role in roles: - role_name = role.get("roleName", "Unknown") - role_data = role.get("data", {}) - - # Extract participant info from various fields - participant = ParticipantInfo( - study_deployment_id=deployment_id, - role_name=role_name, - source_folder=folder_name, - ) - - # Extract full name (can be a dict with firstName/lastName or a string) - full_name_data = role_data.get("dk.carp.webservices.input.full_name") - if full_name_data: - if isinstance(full_name_data, dict): - # Combine firstName, middleName, lastName - parts = [] - if full_name_data.get("firstName"): - parts.append(full_name_data["firstName"]) - if full_name_data.get("middleName"): - parts.append(full_name_data["middleName"]) - if full_name_data.get("lastName"): - parts.append(full_name_data["lastName"]) - if parts: - participant.full_name = " ".join(parts) - elif isinstance(full_name_data, str): - participant.full_name = full_name_data 
- - # Extract sex - sex_data = role_data.get("dk.cachet.carp.input.sex") - if sex_data: - participant.sex = sex_data - - # Extract SSN (can be a dict with socialSecurityNumber or a string) - ssn_data = role_data.get("dk.carp.webservices.input.ssn") - if ssn_data: - if isinstance(ssn_data, dict): - ssn_value = ssn_data.get("socialSecurityNumber") - if ssn_value: - participant.ssn = str(ssn_value) - elif isinstance(ssn_data, str): - participant.ssn = ssn_data - - # Extract consent info - consent_data = role_data.get("dk.carp.webservices.input.informed_consent") - if consent_data: - participant.consent_signed = True - if isinstance(consent_data, dict): - participant.consent_timestamp = consent_data.get("signedTimestamp") - participant.user_id = consent_data.get("userId") - participant.email = consent_data.get( - "name" - ) # email is stored in "name" field - - # Extract name from consent signature if not already set - if not participant.full_name: - consent_json_str = consent_data.get("consent") - if consent_json_str and isinstance(consent_json_str, str): - try: - consent_doc = json.loads(consent_json_str) - signature = consent_doc.get("signature", {}) - if isinstance(signature, dict): - first_name = (signature.get("firstName") or "").strip() - last_name = (signature.get("lastName") or "").strip() - if first_name or last_name: - participant.full_name = ( - f"{first_name} {last_name}".strip() - ) - except json.JSONDecodeError: - pass - - self.participants_by_deployment[deployment_id] = participant - - def _unify_participants(self): - """ - Identify the same participant across different folders/deployments. - Uses email as primary identifier (most accurate), falls back to SSN, then full name. 
- """ - # Group by identifier - by_email: Dict[str, List[ParticipantInfo]] = defaultdict(list) - by_ssn: Dict[str, List[ParticipantInfo]] = defaultdict(list) - by_name: Dict[str, List[ParticipantInfo]] = defaultdict(list) - - for p in self.participants_by_deployment.values(): - # Email, SSN, name must be strings for use as dict keys - if p.email and isinstance(p.email, str): - by_email[p.email.lower()].append(p) # normalize email to lowercase - if p.ssn and isinstance(p.ssn, str): - by_ssn[p.ssn].append(p) - if p.full_name and isinstance(p.full_name, str): - by_name[p.full_name.strip().lower()].append(p) # normalize name - - # Assign unified IDs, preferring email grouping (most accurate) - assigned: Set[str] = set() # deployment IDs already assigned - - # First pass: use email (most accurate identifier) - for email, participants in by_email.items(): - unified_id = f"P{self._unified_id_counter:04d}" - self._unified_id_counter += 1 - - for p in participants: - if p.study_deployment_id not in assigned: - p.unified_participant_id = unified_id - assigned.add(p.study_deployment_id) - - self.unified_participants[unified_id] = participants - - # Second pass: use SSN for remaining - for ssn, participants in by_ssn.items(): - unassigned = [p for p in participants if p.study_deployment_id not in assigned] - if not unassigned: - continue - - unified_id = f"P{self._unified_id_counter:04d}" - self._unified_id_counter += 1 - - for p in unassigned: - p.unified_participant_id = unified_id - assigned.add(p.study_deployment_id) - - self.unified_participants[unified_id] = unassigned - - # Third pass: use full name for remaining - for name, participants in by_name.items(): - unassigned = [p for p in participants if p.study_deployment_id not in assigned] - if not unassigned: - continue - - unified_id = f"P{self._unified_id_counter:04d}" - self._unified_id_counter += 1 - - for p in unassigned: - p.unified_participant_id = unified_id - assigned.add(p.study_deployment_id) - - 
self.unified_participants[unified_id] = unassigned - - # Remaining participants get their own unified ID - for p in self.participants_by_deployment.values(): - if p.study_deployment_id not in assigned: - unified_id = f"P{self._unified_id_counter:04d}" - self._unified_id_counter += 1 - p.unified_participant_id = unified_id - self.unified_participants[unified_id] = [p] - - # Propagate name/SSN data across unified participants - # If any deployment has name/SSN, share it with all deployments of same participant - self._propagate_participant_data() - - def _propagate_participant_data(self): - """ - Propagate name, SSN, and other data to all records of the same unified participant. - If one deployment has data that others don't, copy it to all. - """ - for unified_id, participants in self.unified_participants.items(): - # Collect best available data from all records - best_full_name = None - best_ssn = None - best_sex = None - - for p in participants: - if p.full_name and isinstance(p.full_name, str) and not best_full_name: - best_full_name = p.full_name - if p.ssn and isinstance(p.ssn, str) and not best_ssn: - best_ssn = p.ssn - if p.sex and not best_sex: - best_sex = p.sex - - # Apply to all records - for p in participants: - if best_full_name and not p.full_name: - p.full_name = best_full_name - if best_ssn and not p.ssn: - p.ssn = best_ssn - if best_sex and not p.sex: - p.sex = best_sex - - def get_participant(self, study_deployment_id: str) -> Optional[ParticipantInfo]: - """Get participant info by study deployment ID.""" - return self.participants_by_deployment.get(study_deployment_id) - - def get_unified_participant(self, unified_id: str) -> List[ParticipantInfo]: - """Get all deployments for a unified participant.""" - return self.unified_participants.get(unified_id, []) - - def find_by_email(self, email: str) -> List[ParticipantInfo]: - """Find all participant records matching an email address.""" - email_lower = email.lower() - return [ - p - for p in 
self.participants_by_deployment.values() - if p.email and p.email.lower() == email_lower - ] - - def find_by_ssn(self, ssn: str) -> List[ParticipantInfo]: - """Find all participant records matching an SSN.""" - return [p for p in self.participants_by_deployment.values() if p.ssn and p.ssn == ssn] - - def find_by_name(self, name: str) -> List[ParticipantInfo]: - """Find all participant records matching a full name (case-insensitive).""" - name_lower = name.strip().lower() - return [ - p - for p in self.participants_by_deployment.values() - if p.full_name and p.full_name.strip().lower() == name_lower - ] - - def get_deployment_ids_by_email(self, email: str) -> List[str]: - """Get all deployment IDs for a participant by email.""" - return [p.study_deployment_id for p in self.find_by_email(email)] - - def get_deployment_ids_by_ssn(self, ssn: str) -> List[str]: - """Get all deployment IDs for a participant by SSN.""" - return [p.study_deployment_id for p in self.find_by_ssn(ssn)] - - def get_deployment_ids_by_name(self, name: str) -> List[str]: - """Get all deployment IDs for a participant by name.""" - return [p.study_deployment_id for p in self.find_by_name(name)] - - def print_summary(self): - """Print a summary table of participants.""" - table = Table(title="Participants Summary") - table.add_column("Unified ID", style="cyan") - table.add_column("Deployments", style="magenta") - table.add_column("Folders", style="green") - table.add_column("Email", style="yellow") - table.add_column("SSN", style="red") - table.add_column("Full Name", style="white") - - for unified_id, participants in self.unified_participants.items(): - folders = set(p.source_folder for p in participants if p.source_folder) - emails = set(p.email for p in participants if p.email and isinstance(p.email, str)) - ssns = set(p.ssn for p in participants if p.ssn and isinstance(p.ssn, str)) - names = set( - p.full_name for p in participants if p.full_name and isinstance(p.full_name, str) - ) - 
table.add_row( - unified_id, - str(len(participants)), - ", ".join(sorted(folders)), - ", ".join(emails) if emails else "N/A", - ", ".join(ssns) if ssns else "N/A", - ", ".join(names) if names else "N/A", - ) - - console.print(table) - - -class ParticipantAccessor: - """ - Fluent API for accessing participant data. - Usage: sd.participant("email@example.com").info(), .all_data(), .available_fields() - sd.participant("email@example.com").visualize.location() - """ - - def __init__(self, sleepiness_data: "CarpDataStream", email: str): - self._sd = sleepiness_data - self._email = email - self._participants = sleepiness_data.participant_manager.find_by_email(email) - self._deployment_ids = set( - sleepiness_data.participant_manager.get_deployment_ids_by_email(email) - ) - self._visualizer = None - - @property - def exists(self) -> bool: - """Check if participant exists.""" - return len(self._participants) > 0 - - @property - def visualize(self): - """ - Access visualization methods for this participant. - Usage: sd.participant("email").visualize.location() - """ - if self._visualizer is None: - from .plotting import ParticipantVisualizer - - self._visualizer = ParticipantVisualizer(self._sd, self._deployment_ids, self._email) - return self._visualizer - - def info(self) -> Optional[Dict[str, Any]]: - """ - Get participant information as a dictionary. - Returns combined info from all deployments for this participant. 
- """ - if not self._participants: - return None - - # Get first participant as base - base = self._participants[0] - - # Combine info from all records - all_folders = set() - all_deployment_ids = set() - - for p in self._participants: - if p.source_folder: - all_folders.add(p.source_folder) - all_deployment_ids.add(p.study_deployment_id) - - return { - "email": self._email, - "unified_id": base.unified_participant_id, - "full_name": base.full_name, - "ssn": base.ssn, - "sex": base.sex, - "user_id": base.user_id, - "consent_signed": base.consent_signed, - "consent_timestamp": base.consent_timestamp, - "folders": sorted(all_folders), - "deployment_ids": sorted(all_deployment_ids), - "num_deployments": len(all_deployment_ids), - } - - def print_info(self): - """Print participant information in a formatted table.""" - info = self.info() - if not info: - console.print(f"[red]No participant found with email: {self._email}[/red]") - return - - table = Table(title=f"Participant: {self._email}") - table.add_column("Field", style="cyan") - table.add_column("Value", style="white") - - for key, value in info.items(): - if isinstance(value, list): - value = ", ".join(str(v) for v in value) - table.add_row(key, str(value) if value is not None else "N/A") - - console.print(table) - - def all_data(self, data_type: Optional[str] = None) -> Generator[Dict[str, Any], None, None]: - """ - Get all data items for this participant. - Optionally filter by data type (e.g., "dk.cachet.carp.stepcount"). - """ - yield from self._sd._get_data_by_deployment_ids(self._deployment_ids, data_type) - - def available_fields(self, sample_size: int = 100) -> Set[str]: - """ - Discover all available fields in this participant's data. - Scans a sample of records and returns field paths in dot-notation. 
- """ - fields = set() - count = 0 - - for item in self.all_data(): - if count >= sample_size: - break - self._collect_fields(item, "", fields) - count += 1 - - return fields - - def _collect_fields(self, obj: Any, prefix: str, fields: Set[str]): - """Recursively collect field paths.""" - if isinstance(obj, dict): - for key, value in obj.items(): - path = f"{prefix}.{key}" if prefix else key - fields.add(path) - self._collect_fields(value, path, fields) - elif isinstance(obj, list) and obj: - # Sample first item in list - self._collect_fields(obj[0], f"{prefix}[]", fields) - - def print_available_fields(self, sample_size: int = 100): - """Print all available fields in a formatted list.""" - fields = self.available_fields(sample_size) - console.print(f"[bold]Available fields for {self._email}:[/bold]") - for f in sorted(fields): - console.print(f" - {f}") - - def data_types(self) -> Set[str]: - """Get all unique data types for this participant.""" - types = set() - for item in self.all_data(): - data_stream = item.get("dataStream", {}) - data_type = data_stream.get("dataType", {}) - type_name = data_type.get("name") - if type_name: - types.add(type_name) - return types - - def print_data_types(self): - """Print all data types available for this participant.""" - types = self.data_types() - console.print(f"[bold]Data types for {self._email}:[/bold]") - for t in sorted(types): - console.print(f" - {t}") - - def count(self, data_type: Optional[str] = None) -> int: - """Count total data items for this participant.""" - return sum(1 for _ in self.all_data(data_type)) - - def dataframe(self, data_type: str, parquet_dir: Optional[str] = None): - """ - Get a pandas DataFrame of this participant's data for a specific type. - Uses parquet files if available and parquet_dir is specified. - """ - try: - import pandas as pd - except ImportError: - console.print( - "[red]pandas is required for dataframe(). 
Install with: pip install pandas[/red]" - ) - return None - - if parquet_dir: - # Try to load from parquet and filter - df = self._sd.get_dataframe(data_type, parquet_dir) - if df is not None and not df.empty: - return df[df["studyDeploymentId"].isin(self._deployment_ids)] - - # Fall back to streaming - items = list(self.all_data(data_type)) - if not items: - return pd.DataFrame() - return pd.DataFrame(items) - - -class CarpDataStream: - def __init__(self, file_paths: str | Path | List[str | Path], load_participants: bool = True): - if isinstance(file_paths, (str, Path)): - file_paths = [file_paths] - - self.file_paths = [Path(p) for p in file_paths] - for p in self.file_paths: - if not p.exists(): - raise FileNotFoundError(f"File not found: {p}") - - self.schema_cache = {} - self.participant_manager = ParticipantManager() - - # Auto-detect and load participant data from parent folders - if load_participants: - self._auto_load_participants() - - def _auto_load_participants(self): - """ - Automatically detect and load participant data from the data folders - containing the input files. - """ - data_folders = set() - for file_path in self.file_paths: - # Each file is typically in a phase folder like data/phase-1-1/data-streams.json - parent = file_path.parent - if (parent / "participant-data.json").exists(): - data_folders.add(parent) - - if data_folders: - self.participant_manager.load_participant_data(list(data_folders)) - - def load_participants_from_folders(self, folders: List[str | Path]): - """ - Manually load participant data from specific folders. - Useful when files are in a different location than the input data. - """ - folder_paths = [Path(f) for f in folders] - self.participant_manager.load_participant_data(folder_paths) - - def participant(self, email: str) -> ParticipantAccessor: - """ - Access participant data via email using a fluent API. 
- - Usage: - sd.participant("email@example.com").info() - sd.participant("email@example.com").all_data() - sd.participant("email@example.com").available_fields() - sd.participant("email@example.com").data_types() - sd.participant("email@example.com").dataframe("dk.cachet.carp.stepcount") - """ - return ParticipantAccessor(self, email) - - def get_participant(self, study_deployment_id: str) -> Optional[ParticipantInfo]: - """Get participant info by study deployment ID.""" - return self.participant_manager.get_participant(study_deployment_id) - - def find_participant_by_email(self, email: str) -> List[ParticipantInfo]: - """Find all participant records matching an email address.""" - return self.participant_manager.find_by_email(email) - - def find_participant_by_ssn(self, ssn: str) -> List[ParticipantInfo]: - """Find all participant records matching an SSN.""" - return self.participant_manager.find_by_ssn(ssn) - - def find_participant_by_name(self, name: str) -> List[ParticipantInfo]: - """Find all participant records matching a full name.""" - return self.participant_manager.find_by_name(name) - - def get_data_by_email( - self, email: str, data_type: Optional[str] = None - ) -> Generator[Dict[str, Any], None, None]: - """ - Get all data items for a participant identified by email. - Optionally filter by data type. - """ - deployment_ids = set(self.participant_manager.get_deployment_ids_by_email(email)) - yield from self._get_data_by_deployment_ids(deployment_ids, data_type) - - def get_data_by_ssn( - self, ssn: str, data_type: Optional[str] = None - ) -> Generator[Dict[str, Any], None, None]: - """ - Get all data items for a participant identified by SSN. - Optionally filter by data type. 
- """ - deployment_ids = set(self.participant_manager.get_deployment_ids_by_ssn(ssn)) - yield from self._get_data_by_deployment_ids(deployment_ids, data_type) - - def get_data_by_name( - self, name: str, data_type: Optional[str] = None - ) -> Generator[Dict[str, Any], None, None]: - """ - Get all data items for a participant identified by full name. - Optionally filter by data type. - """ - deployment_ids = set(self.participant_manager.get_deployment_ids_by_name(name)) - yield from self._get_data_by_deployment_ids(deployment_ids, data_type) - - def _get_data_by_deployment_ids( - self, deployment_ids: set, data_type: Optional[str] = None - ) -> Generator[Dict[str, Any], None, None]: - """Internal helper to filter data by deployment IDs and optionally by type.""" - if not deployment_ids: - return - - for item in self._get_item_generator(): - item_deployment_id = item.get("studyDeploymentId") - if not item_deployment_id: - item_deployment_id = item.get("dataStream", {}).get("studyDeploymentId") - - if item_deployment_id not in deployment_ids: - continue - - if data_type: - dt = item.get("dataStream", {}).get("dataType", {}) - target_namespace, target_name = data_type.rsplit(".", 1) - if dt.get("name") != target_name or dt.get("namespace") != target_namespace: - continue - - yield item - - def print_participants(self): - """Print a summary of all participants.""" - self.participant_manager.print_summary() - - def _get_item_generator(self) -> Generator[Dict[str, Any], None, None]: - """ - Returns a generator that yields items from the JSON files. - Uses ijson for memory-efficient streaming. - """ - for file_path in self.file_paths: - with open(file_path, "rb") as f: - # Assuming the file is a list of objects. - # 'item' matches objects in a list. 
- # use_float=True ensures numbers are floats, avoiding Decimal schema mismatches in PyArrow - yield from ijson.items(f, "item", use_float=True) - - def _get_item_generator_with_participant( - self, include_participant: bool = False - ) -> Generator[Dict[str, Any], None, None]: - """ - Returns a generator that yields items from the JSON files, - optionally enriched with participant info. - """ - for item in self._get_item_generator(): - if include_participant: - deployment_id = item.get("studyDeploymentId") - if not deployment_id: - deployment_id = item.get("dataStream", {}).get("studyDeploymentId") - - if deployment_id: - participant = self.participant_manager.get_participant(deployment_id) - if participant: - item = item.copy() # Don't mutate original - item["_participant"] = participant.to_dict() - - yield item - - def get_data_with_participants( - self, data_type: Optional[str] = None - ) -> Generator[Dict[str, Any], None, None]: - """ - Yields items enriched with participant information. - If data_type is specified, filters to that type. - """ - gen = self._get_item_generator_with_participant(include_participant=True) - - if data_type: - target_namespace, target_name = data_type.rsplit(".", 1) - for item in gen: - dt = item.get("dataStream", {}).get("dataType", {}) - if dt.get("name") == target_name and dt.get("namespace") == target_namespace: - yield item - else: - yield from gen - - def group_by_participant(self, output_dir: str | Path, data_type: Optional[str] = None): - """ - Groups data by unified participant ID and exports each group to a separate JSON file. - Useful for analyzing individual participant data across all phases. 
- """ - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - console.print(f"[bold blue]Grouping data by participant into {output_dir}...[/bold blue]") - - files = {} - gen = self.get_data_with_participants(data_type) - - try: - for item in tqdm(gen, desc="Grouping by participant"): - participant_info = item.get("_participant", {}) - unified_id = participant_info.get("unified_participant_id", "unknown") - - if unified_id not in files: - f = open(output_dir / f"{unified_id}.json", "w") - f.write("[") - files[unified_id] = {"handle": f, "first": True} - - f_info = files[unified_id] - if not f_info["first"]: - f_info["handle"].write(",") - json.dump(item, f_info["handle"]) - f_info["first"] = False - - finally: - for f_info in files.values(): - f_info["handle"].write("]") - f_info["handle"].close() - - console.print( - f"[bold green]Grouping complete! Created {len(files)} participant files.[/bold green]" - ) - - def group_by_email(self, output_dir: str | Path, data_type: Optional[str] = None): - """ - Groups data by participant email and exports each group to a separate JSON file. - """ - self._group_by_field_value(output_dir, "email", data_type) - - def group_by_ssn(self, output_dir: str | Path, data_type: Optional[str] = None): - """ - Groups data by participant SSN and exports each group to a separate JSON file. - """ - self._group_by_field_value(output_dir, "ssn", data_type) - - def group_by_name(self, output_dir: str | Path, data_type: Optional[str] = None): - """ - Groups data by participant full name and exports each group to a separate JSON file. 
- """ - self._group_by_field_value(output_dir, "full_name", data_type) - - def _group_by_field_value( - self, output_dir: str | Path, field: str, data_type: Optional[str] = None - ): - """Internal helper to group data by a participant field (email, ssn, or full_name).""" - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - console.print(f"[bold blue]Grouping data by {field} into {output_dir}...[/bold blue]") - - files = {} - gen = self.get_data_with_participants(data_type) - - try: - for item in tqdm(gen, desc=f"Grouping by {field}"): - participant_info = item.get("_participant", {}) - value = participant_info.get(field, "unknown") - - if not value or not isinstance(value, str): - value = "unknown" - - # Sanitize filename - safe_value = "".join( - c for c in value if c.isalnum() or c in ("-", "_", "@", ".") - ).strip() - if not safe_value: - safe_value = "unknown" - - if safe_value not in files: - f = open(output_dir / f"{safe_value}.json", "w") - f.write("[") - files[safe_value] = {"handle": f, "first": True} - - f_info = files[safe_value] - if not f_info["first"]: - f_info["handle"].write(",") - json.dump(item, f_info["handle"]) - f_info["first"] = False - - finally: - for f_info in files.values(): - f_info["handle"].write("]") - f_info["handle"].close() - - console.print(f"[bold green]Grouping complete! Created {len(files)} files.[/bold green]") - - def get_dataframe_with_participants( - self, data_type: str, parquet_dir: Optional[str | Path] = None - ): - """ - Returns a pandas DataFrame for the specified data type, enriched with participant info. 
- Adds columns: participant_id, participant_email, participant_folder - """ - try: - import pandas as pd - except ImportError: - console.print("[bold red]pandas is required for DataFrame conversion.[/bold red]") - return None - - # Get base dataframe - df = self.get_dataframe(data_type, parquet_dir) - if df is None or df.empty: - return df - - # Add participant columns - def get_participant_info(deployment_id): - p = self.participant_manager.get_participant(deployment_id) - if p: - return pd.Series( - { - "participant_id": p.unified_participant_id, - "participant_email": p.email, - "participant_folder": p.source_folder, - } - ) - return pd.Series( - {"participant_id": None, "participant_email": None, "participant_folder": None} - ) - - # Extract studyDeploymentId from dataStream column if it exists - if "dataStream" in df.columns: - deployment_ids = df["dataStream"].apply( - lambda x: x.get("studyDeploymentId") if isinstance(x, dict) else None - ) - elif "studyDeploymentId" in df.columns: - deployment_ids = df["studyDeploymentId"] - else: - console.print("[yellow]Could not find studyDeploymentId column[/yellow]") - return df - - participant_info = deployment_ids.apply(get_participant_info) - return pd.concat([df, participant_info], axis=1) - - def scan_schema(self) -> Dict[str, Any]: - """ - Scans the entire file to infer the schema of the data. - Returns a dictionary mapping data types to their field structures. - """ - schemas = defaultdict(set) - - # We need to count items for tqdm, but counting requires a pass. - # For very large files, we might just use file size or unknown length. - # Let's try to estimate or just use a simple progress bar. - - console.print(f"[bold blue]Scanning schema for {len(self.file_paths)} files...[/bold blue]") - - # We can use tqdm wrapping the generator, but we don't know total length easily without reading. - # We can use file size as a proxy if we read raw bytes, but ijson handles the reading. - # Let's just use a counter. 
- - count = 0 - with tqdm(desc="Processing items", unit=" items") as pbar: - for item in self._get_item_generator(): - data_type = item.get("dataStream", {}).get("dataType", {}).get("name", "unknown") - namespace = ( - item.get("dataStream", {}).get("dataType", {}).get("namespace", "unknown") - ) - full_type = f"{namespace}.{data_type}" - - measurement_data = item.get("measurement", {}).get("data", {}) - - # Collect keys - for key in measurement_data.keys(): - schemas[full_type].add(key) - - count += 1 - if count % 1000 == 0: - pbar.update(1000) - pbar.update(count % 1000) - - # Convert sets to lists for JSON serialization/display - self.schema_cache = {k: list(v) for k, v in schemas.items()} - return self.schema_cache - - def print_schema(self): - if not self.schema_cache: - self.scan_schema() - - table = Table(title="Inferred Schema") - table.add_column("Data Type", style="cyan") - table.add_column("Fields", style="magenta") - - for dtype, fields in self.schema_cache.items(): - table.add_row(dtype, ", ".join(sorted(fields))) - - console.print(table) - - def get_data_by_type(self, target_type: str) -> Generator[Dict[str, Any], None, None]: - """ - Yields items of a specific data type. - """ - target_namespace, target_name = target_type.rsplit(".", 1) - - for item in self._get_item_generator(): - dt = item.get("dataStream", {}).get("dataType", {}) - if dt.get("name") == target_name and dt.get("namespace") == target_namespace: - yield item - - def export_to_json(self, output_path: str, data_type: Optional[str] = None): - """ - Exports data to a JSON file. Can filter by data type. 
- """ - console.print(f"[bold green]Exporting data to {output_path}...[/bold green]") - - generator = self.get_data_by_type(data_type) if data_type else self._get_item_generator() - - with open(output_path, "w") as f: - f.write("[") - first = True - for item in tqdm(generator, desc="Exporting"): - if not first: - f.write(",") - json.dump(item, f) - first = False - f.write("]") - - console.print("[bold green]Export complete![/bold green]") - - def group_by_field(self, field_path: str, output_dir: str | Path): - """ - Groups data by a specific field and exports each group to a separate JSON file. - field_path is a dot-separated string, e.g., 'dataStream.dataType.name'. - """ - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - console.print(f"[bold blue]Grouping data by {field_path} into {output_dir}...[/bold blue]") - - # We can't keep all files open if there are too many groups. - # But for things like dataType, there are usually < 20 groups. - # A safe approach for low memory is to read the file once and append to files, - # but opening/closing files for every line is slow. - # A middle ground is to keep a cache of open file handles, closing LRU if too many. 
- - # For simplicity and speed assuming reasonable number of groups (<100): - files = {} - - try: - for item in tqdm(self._get_item_generator(), desc="Grouping"): - # Extract value - value = item - for part in field_path.split("."): - if isinstance(value, dict): - value = value.get(part) - else: - value = None - break - - if value is None: - value = "unknown" - - value = str(value) - # Sanitize filename - safe_value = "".join(c for c in value if c.isalnum() or c in ("-", "_")).strip() - if not safe_value: - safe_value = "unknown" - - if safe_value not in files: - f = open(output_dir / f"{safe_value}.json", "w") - f.write("[") - files[safe_value] = {"handle": f, "first": True} - - f_info = files[safe_value] - if not f_info["first"]: - f_info["handle"].write(",") - json.dump(item, f_info["handle"]) - f_info["first"] = False - - finally: - for f_info in files.values(): - f_info["handle"].write("]") - f_info["handle"].close() - - console.print(f"[bold green]Grouping complete! Created {len(files)} files.[/bold green]") - - def count_items(self) -> int: - """ - Counts the total number of items in the JSON file. - """ - console.print(f"[bold blue]Counting items in {len(self.file_paths)} files...[/bold blue]") - count = 0 - for _ in tqdm(self._get_item_generator(), desc="Counting"): - count += 1 - return count - - def convert_to_parquet(self, output_dir: str | Path, batch_size: int = 10000): - """ - Converts the JSON data to Parquet files, grouped by data type. - Requires pyarrow and pandas. - """ - import importlib.util - - if not importlib.util.find_spec("pyarrow") or not importlib.util.find_spec("pandas"): - console.print( - "[bold red]pyarrow and pandas are required for Parquet conversion. 
Please install them.[/bold red]" - ) - return - - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - console.print(f"[bold blue]Converting to Parquet in {output_dir}...[/bold blue]") - - writers = {} - buffers = defaultdict(list) - - try: - for item in tqdm(self._get_item_generator(), desc="Converting"): - # Determine type - try: - dtype = item.get("dataStream", {}).get("dataType", {}).get("name", "unknown") - # Sanitize - safe_name = "".join(c for c in dtype if c.isalnum() or c in ("-", "_")).strip() - if not safe_name: - safe_name = "unknown" - except (AttributeError, TypeError): - safe_name = "unknown" - - buffers[safe_name].append(item) - - if len(buffers[safe_name]) >= batch_size: - self._flush_buffer_to_parquet( - safe_name, buffers[safe_name], writers, output_dir - ) - buffers[safe_name].clear() - - finally: - # Flush remaining - for name, buf in buffers.items(): - if buf: - self._flush_buffer_to_parquet(name, buf, writers, output_dir) - - # Close writers - for writer in writers.values(): - writer.close() - - console.print( - f"[bold green]Conversion complete! Created {len(writers)} Parquet files.[/bold green]" - ) - - def _flush_buffer_to_parquet(self, name, buffer, writers, output_dir): - import pyarrow as pa - import pyarrow.parquet as pq - - if not buffer: - return - - try: - # PyArrow's from_pylist is robust but might need explicit schema if types vary. - # We let it infer for now. - table = pa.Table.from_pylist(buffer) - except Exception as e: - console.print(f"[red]Error converting batch for {name}: {e}[/red]") - return - - if name not in writers: - file_path = output_dir / f"{name}.parquet" - # Use the schema from the first batch - writers[name] = pq.ParquetWriter(file_path, table.schema) - - try: - # If the new batch has a different schema (e.g. missing fields or new fields), - # write_table might fail or produce a file with multiple schemas (which is bad). 
- # Ideally we should unify schemas, but that requires reading all data first. - # For now, we assume schema consistency or that PyArrow handles minor diffs. - # If strict schema validation fails, we might need to cast. - - # Check if schema matches writer's schema - if not table.schema.equals(writers[name].schema): - # Try to cast to the writer's schema - # This handles cases where a field is missing (null) or type promotion is needed - try: - table = table.cast(writers[name].schema) - except Exception: - # If casting fails, we might have a problem. - # For now, log and skip or try to write anyway (which might fail) - # console.print(f"[yellow]Schema mismatch for {name}. Attempting cast... {cast_error}[/yellow]") - pass - - writers[name].write_table(table) - except Exception as e: - console.print(f"[red]Error writing batch for {name}: {e}[/red]") - - def get_dataframe(self, data_type: str, parquet_dir: Optional[str | Path] = None): - """ - Returns a pandas DataFrame for the specified data type. - If parquet_dir is provided and contains the corresponding parquet file, it loads from there. - Otherwise, it scans the JSON file (which is slower). - """ - try: - import pandas as pd - except ImportError: - console.print( - "[bold red]pandas is required for DataFrame conversion. Please install it.[/bold red]" - ) - return None - - # Check Parquet first - if parquet_dir: - parquet_dir = Path(parquet_dir) - # data_type might be full namespace "dk.cachet.carp.heartbeat" - # or just "heartbeat" if we simplified names in conversion. - # Our conversion uses simplified names. 
- - simple_name = data_type.split(".")[-1] - parquet_path = parquet_dir / f"{simple_name}.parquet" - - if parquet_path.exists(): - console.print(f"[bold blue]Loading {data_type} from {parquet_path}...[/bold blue]") - return pd.read_parquet(parquet_path) - - # Try full name just in case - safe_full_name = "".join(c for c in data_type if c.isalnum() or c in ("-", "_")).strip() - parquet_path_full = parquet_dir / f"{safe_full_name}.parquet" - if parquet_path_full.exists(): - console.print( - f"[bold blue]Loading {data_type} from {parquet_path_full}...[/bold blue]" - ) - return pd.read_parquet(parquet_path_full) - - # Fallback to JSON scan - console.print( - f"[bold yellow]Parquet file not found. Scanning JSON for {data_type}...[/bold yellow]" - ) - data = list(tqdm(self.get_data_by_type(data_type), desc="Loading to DataFrame")) - return pd.DataFrame(data) - - def list_all_fields(self, sample_size: int = 100) -> List[str]: - """ - Scans a sample of items to find all available dot-separated field paths. - Useful for determining what fields can be used in group_by_field. - """ - console.print( - f"[bold blue]Scanning first {sample_size} items to find field paths...[/bold blue]" - ) - paths = set() - - def _recurse(obj, current_path): - if isinstance(obj, dict): - for k, v in obj.items(): - new_path = f"{current_path}.{k}" if current_path else k - paths.add(new_path) - _recurse(v, new_path) - - count = 0 - for item in self._get_item_generator(): - _recurse(item, "") - count += 1 - if count >= sample_size: - break - - return sorted(list(paths)) - - def generate_type_definitions( - self, output_file: str = "generated_types.py", sample_size: int = 1000 - ): - """ - Generates a Python module with dataclasses representing the data schema. - Detects nested JSON strings and generates types for them as well. 
- """ - console.print(f"[bold blue]Inferring schema from first {sample_size} items...[/bold blue]") - schema = self._infer_full_schema(sample_size) - - console.print("[bold blue]Generating code...[/bold blue]") - code = self._generate_code_from_schema(schema) - - with open(output_file, "w") as f: - f.write(code) - console.print(f"[bold green]Generated type definitions in {output_file}[/bold green]") - - def _infer_full_schema(self, sample_size: int) -> Dict[str, Any]: - root_schema = {"type": "object", "fields": {}} - - def merge(schema, value): - if value is None: - schema["nullable"] = True - return - - if isinstance(value, dict): - if schema.get("type") and schema["type"] != "object": - schema["type"] = "Any" # Conflict - return - schema["type"] = "object" - if "fields" not in schema: - schema["fields"] = {} - - for k, v in value.items(): - if k not in schema["fields"]: - schema["fields"][k] = {} - merge(schema["fields"][k], v) - - elif isinstance(value, list): - if schema.get("type") and schema["type"] != "list": - schema["type"] = "Any" - return - schema["type"] = "list" - if "item_type" not in schema: - schema["item_type"] = {} - - for item in value: - merge(schema["item_type"], item) - - else: - # Primitive - # Check if string is JSON - is_json = False - if isinstance(value, str): - try: - if (value.strip().startswith("{") and value.strip().endswith("}")) or ( - value.strip().startswith("[") and value.strip().endswith("]") - ): - parsed = json.loads(value) - if isinstance(parsed, (dict, list)): - is_json = True - schema["is_json_string"] = True - merge(schema, parsed) - return - except (json.JSONDecodeError, TypeError): - pass - - if not is_json: - py_type = type(value).__name__ - # Map python types to type hints - if py_type == "float": - py_type = "float" - elif py_type == "int": - py_type = "int" - elif py_type == "str": - py_type = "str" - elif py_type == "bool": - py_type = "bool" - - if schema.get("type") == "primitive" and schema.get("python_type") != 
py_type: - # If mixing int and float, upgrade to float - if {schema.get("python_type"), py_type} == {"int", "float"}: - schema["python_type"] = "float" - else: - schema["python_type"] = "Any" - else: - schema["type"] = "primitive" - schema["python_type"] = py_type - - count = 0 - for item in self._get_item_generator(): - merge(root_schema, item) - count += 1 - if count >= sample_size: - break - - return root_schema - - def _generate_code_from_schema(self, schema: Dict[str, Any]) -> str: - classes = {} # name -> definition - - def get_type_name(schema, context_name): - if schema.get("type") == "object": - class_name = "".join(x[:1].upper() + x[1:] for x in context_name.split("_")) - if not class_name: - class_name = "Root" - - # Handle collision - base_name = class_name - counter = 1 - while ( - class_name in classes - and classes[class_name] is not None - and classes[class_name] != schema.get("fields") - ): - # Note: comparing fields is a weak check for equality, but sufficient for now - class_name = f"{base_name}{counter}" - counter += 1 - - if class_name not in classes: - classes[class_name] = None # Placeholder - fields = [] - for k, v in schema.get("fields", {}).items(): - field_type = get_type_name(v, k) - fields.append( - ( - k, - field_type, - v.get("nullable", False), - v.get("is_json_string", False), - ) - ) - classes[class_name] = fields - - return class_name - - elif schema.get("type") == "list": - item_type = get_type_name(schema.get("item_type", {}), context_name + "_item") - return f"List[{item_type}]" - - elif schema.get("type") == "primitive": - t = schema.get("python_type", "Any") - return "Any" if t == "Any" else t - - return "Any" - - get_type_name(schema, "SleepinessItem") - - # Generate Code - lines = [ - "# Auto-generated type definitions", - "", - "from __future__ import annotations", - "from dataclasses import dataclass", - "from typing import List, Optional, Any, Dict", - "import json", - "", - "def parse_json_field(value):", - " if 
isinstance(value, str):", - " try:", - " return json.loads(value)", - " except:", - " return value", - " return value", - "", - ] - - for name, fields in classes.items(): - if fields is None: - continue # Should not happen if recursion finished - - lines.append("@dataclass") - lines.append(f"class {name}:") - if not fields: - lines.append(" pass") - - for fname, ftype, nullable, is_json in fields: - safe_fname = fname - if safe_fname in ( - "from", - "class", - "def", - "return", - "import", - "type", - "global", - "for", - "if", - "else", - "while", - ): - safe_fname = f"{fname}_" - - type_hint = ftype - if nullable: - type_hint = f"Optional[{type_hint}]" - - lines.append(f" {safe_fname}: {type_hint} = None") - - # Add from_dict method - lines.append("") - lines.append(" @classmethod") - lines.append(" def from_dict(cls, obj: Any) -> Any:") - lines.append(" if not isinstance(obj, dict): return obj") - lines.append(" instance = cls()") - for fname, ftype, nullable, is_json in fields: - safe_fname = fname - if safe_fname in ( - "from", - "class", - "def", - "return", - "import", - "type", - "global", - "for", - "if", - "else", - "while", - ): - safe_fname = f"{fname}_" - - base_type = ftype - is_list = False - if ftype.startswith("List[") and ftype.endswith("]"): - base_type = ftype[5:-1] - is_list = True - - is_custom_class = base_type in classes - - lines.append(f" val = obj.get('{fname}')") - if is_json: - lines.append(" if isinstance(val, str): val = parse_json_field(val)") - - if is_custom_class: - if is_list: - lines.append(" if val is not None and isinstance(val, list):") - lines.append( - f" instance.{safe_fname} = [{base_type}.from_dict(x) for x in val]" - ) - else: - lines.append(" if val is not None:") - lines.append( - f" instance.{safe_fname} = {base_type}.from_dict(val)" - ) - else: - lines.append(f" instance.{safe_fname} = val") - - lines.append(" return instance") - lines.append("") - - return "\n".join(lines) diff --git 
a/src/carp/records/__init__.py b/src/carp/records/__init__.py new file mode 100644 index 0000000..639e1be --- /dev/null +++ b/src/carp/records/__init__.py @@ -0,0 +1,5 @@ +"""Record iteration services.""" + +from .service import RecordService + +__all__ = ["RecordService"] diff --git a/src/carp/records/service.py b/src/carp/records/service.py new file mode 100644 index 0000000..8637457 --- /dev/null +++ b/src/carp/records/service.py @@ -0,0 +1,81 @@ +"""CARP record iteration, filtering, and inspection.""" + +from __future__ import annotations + +from collections.abc import Iterable, Iterator +from typing import Any + +from carp.core.fields import collect_field_paths, deployment_id_from_record, full_data_type +from carp.core.files import iter_json_array + + +class RecordService: + """Stream and filter CARP records.""" + + def __init__(self, file_paths: tuple[Any, ...], participant_directory: Any) -> None: + self._file_paths = file_paths + self._participants = participant_directory + + def iter_records( + self, + data_type: str | None = None, + deployment_ids: Iterable[str] | None = None, + ) -> Iterator[dict[str, Any]]: + """Yield records matching optional data-type and deployment filters.""" + + allowed_ids = set(deployment_ids or []) + for file_path in self._file_paths: + for item in iter_json_array(file_path): + if allowed_ids and deployment_id_from_record(item) not in allowed_ids: + continue + if data_type and full_data_type(item) != data_type: + continue + yield item + + def iter_with_participants(self, data_type: str | None = None) -> Iterator[dict[str, Any]]: + """Yield records enriched with participant metadata.""" + + for item in self.iter_records(data_type): + participant = self._participants.get_participant(deployment_id_from_record(item) or "") + if not participant: + yield item + continue + enriched = dict(item) + enriched["_participant"] = participant.to_dict() + yield enriched + + def count( + self, + data_type: str | None = None, + deployment_ids: 
Iterable[str] | None = None, + ) -> int: + """Return the number of matching records.""" + + return sum(1 for _ in self.iter_records(data_type, deployment_ids)) + + def list_fields(self, sample_size: int = 100) -> list[str]: + """Return field paths sampled from the first records.""" + + fields: set[str] = set() + for index, item in enumerate(self.iter_records()): + if index >= sample_size: + break + fields.update(self.collect_fields(item)) + return sorted(fields) + + def data_types(self) -> list[str]: + """Return all observed record data types.""" + + return sorted({self.data_type(item) for item in self.iter_records()}) + + @staticmethod + def collect_fields(item: dict[str, Any]) -> set[str]: + """Collect field paths for one record.""" + + return collect_field_paths(item) + + @staticmethod + def data_type(item: dict[str, Any]) -> str: + """Return the fully qualified data type for one record.""" + + return full_data_type(item) diff --git a/src/carp/schema/__init__.py b/src/carp/schema/__init__.py new file mode 100644 index 0000000..29e6fac --- /dev/null +++ b/src/carp/schema/__init__.py @@ -0,0 +1,5 @@ +"""Schema discovery services.""" + +from .service import SchemaService + +__all__ = ["SchemaService"] diff --git a/src/carp/schema/service.py b/src/carp/schema/service.py new file mode 100644 index 0000000..f3583ab --- /dev/null +++ b/src/carp/schema/service.py @@ -0,0 +1,30 @@ +"""Schema discovery for CARP studies.""" + +from __future__ import annotations + +from collections import defaultdict +from typing import Any + + +class SchemaService: + """Infer lightweight measurement schemas grouped by data type.""" + + def __init__(self, records: Any) -> None: + self._records = records + self._cache: dict[str, list[str]] = {} + + def scan(self) -> dict[str, list[str]]: + """Return inferred measurement keys grouped by data type.""" + + schemas: dict[str, set[str]] = defaultdict(set) + for item in self._records.iter_records(): + measurement = item.get("measurement", 
{}).get("data", {}) + for key in measurement.keys(): + schemas[self._records.data_type(item)].add(key) + self._cache = {key: sorted(values) for key, values in sorted(schemas.items())} + return self._cache + + def cached(self) -> dict[str, list[str]]: + """Return the cached schema, scanning the study if needed.""" + + return self._cache or self.scan() diff --git a/src/carp/study.py b/src/carp/study.py new file mode 100644 index 0000000..88ea2c0 --- /dev/null +++ b/src/carp/study.py @@ -0,0 +1,47 @@ +"""Composition root for the modular CARP Analytics API.""" + +from __future__ import annotations + +from pathlib import Path + +from carp.constants import PARTICIPANT_FILE +from carp.core.files import resolve_paths +from carp.export import ExportService +from carp.frames import FrameService +from carp.participants import ParticipantDirectory, ParticipantService +from carp.plotting import PlotService +from carp.records import RecordService +from carp.schema import SchemaService +from carp.types import TypeDefinitionService + + +def _discover_participant_folders(file_paths: tuple[Path, ...]) -> tuple[Path, ...]: + """Return phase folders that contain participant metadata.""" + + folders = {path.parent for path in file_paths if (path.parent / PARTICIPANT_FILE).exists()} + return tuple(sorted(folders)) + + +class CarpStudy: + """Primary public entrypoint for working with CARP study data.""" + + def __init__( + self, + file_paths: str | Path | tuple[str | Path, ...] 
| list[str | Path], + load_participants: bool = True, + ): + self.file_paths = resolve_paths(file_paths) + participant_folders = _discover_participant_folders(self.file_paths) if load_participants else () + self._directory = ParticipantDirectory.from_folders(participant_folders) + self.records = RecordService(self.file_paths, self._directory) + self.participants = ParticipantService(self, self._directory) + self.schema = SchemaService(self.records) + self.export = ExportService(self.records) + self.frames = FrameService(self.records, self._directory) + self.types = TypeDefinitionService(self.records) + self.plots = PlotService(self.frames, self.participants) + + def participant(self, email: str) -> object: + """Return a participant-scoped view by email.""" + + return self.participants.view(email) diff --git a/src/carp/types/__init__.py b/src/carp/types/__init__.py new file mode 100644 index 0000000..240dfa7 --- /dev/null +++ b/src/carp/types/__init__.py @@ -0,0 +1,5 @@ +"""Type-generation services.""" + +from .service import TypeDefinitionService + +__all__ = ["TypeDefinitionService"] diff --git a/src/carp/types/infer.py b/src/carp/types/infer.py new file mode 100644 index 0000000..f235e7d --- /dev/null +++ b/src/carp/types/infer.py @@ -0,0 +1,64 @@ +"""Schema inference helpers for generated type definitions.""" + +from __future__ import annotations + +import json +from typing import Any + + +def _maybe_json_string(value: object) -> Any | None: + """Parse JSON-like strings when possible.""" + + if not isinstance(value, str): + return None + stripped = value.strip() + if not stripped or stripped[0] not in "[{" or stripped[-1] not in "]}": + return None + try: + parsed = json.loads(stripped) + except json.JSONDecodeError: + return None + return parsed if isinstance(parsed, (dict, list)) else None + + +def merge_schema(schema: dict[str, Any], value: Any) -> None: + """Merge a Python value into an inferred schema.""" + + if value is None: + schema["nullable"] = True + 
return + parsed = _maybe_json_string(value) + if parsed is not None: + schema["is_json_string"] = True + merge_schema(schema, parsed) + return + if isinstance(value, dict): + schema["type"] = "object" + fields = schema.setdefault("fields", {}) + for key, child in value.items(): + merge_schema(fields.setdefault(key, {}), child) + return + if isinstance(value, list): + schema["type"] = "list" + item_type = schema.setdefault("item_type", {}) + for child in value: + merge_schema(item_type, child) + return + python_type = type(value).__name__ + if schema.get("type") == "primitive" and schema.get("python_type") != python_type: + pair = {schema.get("python_type"), python_type} + schema["python_type"] = "float" if pair == {"int", "float"} else "Any" + return + schema["type"] = "primitive" + schema["python_type"] = python_type + + +def infer_schema(records: Any, sample_size: int) -> dict[str, Any]: + """Infer a schema from sampled study records.""" + + root = {"type": "object", "fields": {}} + for index, item in enumerate(records): + if index >= sample_size: + break + merge_schema(root, item) + return root diff --git a/src/carp/types/render.py b/src/carp/types/render.py new file mode 100644 index 0000000..6a3896c --- /dev/null +++ b/src/carp/types/render.py @@ -0,0 +1,97 @@ +"""Code rendering for inferred type definitions.""" + +from __future__ import annotations + +from typing import Any + + +def render_types(schema: dict[str, Any], root_name: str = "StudyItem") -> str: + """Render dataclass code from an inferred schema.""" + + classes: dict[str, list[tuple[str, str, bool, bool]] | None] = {} + + def type_name(node: dict[str, Any], context: str) -> str: + if node.get("type") == "object": + class_name = "".join(part[:1].upper() + part[1:] for part in context.split("_")) or root_name + while class_name in classes: + class_name = f"{class_name}Item" + classes[class_name] = None + fields = [] + for key, value in node.get("fields", {}).items(): + fields.append( + ( + key, + 
type_name(value, key), + value.get("nullable", False), + value.get("is_json_string", False), + ) + ) + classes[class_name] = fields + return class_name + if node.get("type") == "list": + return f"list[{type_name(node.get('item_type', {}), context + '_item')}]" + if node.get("type") == "primitive": + return str(node.get("python_type", "Any")) + return "Any" + + type_name(schema, root_name) + lines = [ + '"""Auto-generated type definitions for CARP data."""', + "", + "from __future__ import annotations", + "", + "import json", + "from dataclasses import dataclass", + "from typing import Any", + "", + "", + "def parse_json_field(value: Any) -> Any:", + ' """Parse JSON-like string fields when possible."""', + "", + " if not isinstance(value, str):", + " return value", + " try:", + " return json.loads(value)", + " except json.JSONDecodeError:", + " return value", + "", + ] + for class_name, fields in classes.items(): + lines.extend(["@dataclass(slots=True)", f"class {class_name}:", f' """Generated dataclass for `{class_name}`."""']) + if not fields: + lines.extend([" pass", ""]) + continue + for name, annotation, nullable, _ in fields: + type_hint = f"{annotation} | None" if nullable else annotation + safe_name = f"{name}_" if name in {"class", "from", "type"} else name + lines.append(f" {safe_name}: {type_hint} = None") + lines.extend( + [ + "", + " @classmethod", + " def from_dict(cls, obj: Any) -> Any:", + ' """Build an instance from a dictionary."""', + "", + " if not isinstance(obj, dict):", + " return obj", + " instance = cls()", + ] + ) + for name, annotation, _, is_json in fields: + safe_name = f"{name}_" if name in {"class", "from", "type"} else name + base_type = annotation.removeprefix("list[").removesuffix("]") + lines.append(f" value = obj.get('{name}')") + if is_json: + lines.append(" value = parse_json_field(value)") + if annotation.startswith("list[") and base_type in classes: + lines.extend( + [ + " if isinstance(value, list):", + f" value = 
[{base_type}.from_dict(item) for item in value]", + ] + ) + elif base_type in classes: + lines.extend([" if value is not None:", f" value = {base_type}.from_dict(value)"]) + lines.append(f" instance.{safe_name} = value") + lines.extend([" return instance", ""]) + return "\n".join(lines) diff --git a/src/carp/types/service.py b/src/carp/types/service.py new file mode 100644 index 0000000..a274261 --- /dev/null +++ b/src/carp/types/service.py @@ -0,0 +1,28 @@ +"""Type-definition generation services.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from .infer import infer_schema +from .render import render_types + + +class TypeDefinitionService: + """Generate typed Python models from sampled CARP records.""" + + def __init__(self, records: Any) -> None: + self._records = records + + def generate( + self, + output_file: str | Path = "generated_types.py", + sample_size: int = 1_000, + ) -> Path: + """Generate a Python module containing inferred dataclasses.""" + + schema = infer_schema(self._records.iter_records(), sample_size) + output_path = Path(output_file) + output_path.write_text(render_types(schema), encoding="utf-8") + return output_path diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..49c7d1b --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,33 @@ +"""Shared pytest fixtures for CARP Analytics.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from carp import CarpStudy + + +@pytest.fixture() +def fixture_root() -> Path: + """Return the self-contained multi-phase fixture root.""" + + return Path(__file__).parent / "fixtures" / "multi_phase" + + +@pytest.fixture() +def study_paths(fixture_root: Path) -> list[Path]: + """Return the default synthetic study file paths.""" + + return [ + fixture_root / "phase_a" / "data-streams.json", + fixture_root / "phase_b" / "data-streams.json", + ] + + +@pytest.fixture() +def study(study_paths: list[Path]) 
-> CarpStudy: + """Return a study backed by self-contained fixtures.""" + + return CarpStudy(study_paths) diff --git a/tests/fixtures/multi_phase/phase_a/data-streams.json b/tests/fixtures/multi_phase/phase_a/data-streams.json new file mode 100644 index 0000000..111918d --- /dev/null +++ b/tests/fixtures/multi_phase/phase_a/data-streams.json @@ -0,0 +1,109 @@ +[ + { + "studyDeploymentId": "deploy-email-a", + "dataStream": { + "studyDeploymentId": "deploy-email-a", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "stepcount" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 1000, + "data": { + "steps": 100 + } + }, + "sequenceId": 1, + "syncPoint": 1, + "triggerIds": [], + "deviceRoleName": "Phone" + }, + { + "studyDeploymentId": "deploy-email-a", + "dataStream": { + "studyDeploymentId": "deploy-email-a", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "location" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 1000, + "data": { + "latitude": 55.1, + "longitude": 12.1 + } + }, + "sequenceId": 2, + "syncPoint": 1, + "triggerIds": [], + "deviceRoleName": "Phone" + }, + { + "studyDeploymentId": "deploy-ssn-a", + "dataStream": { + "studyDeploymentId": "deploy-ssn-a", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "stepcount" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 2000, + "data": { + "steps": 50 + } + }, + "sequenceId": 3, + "syncPoint": 1, + "triggerIds": [], + "deviceRoleName": "Phone" + }, + { + "dataStream": { + "studyDeploymentId": "deploy-name-a", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "survey" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 3000, + "data": { + "response_json": "{\"score\": 3, \"tags\": [\"rested\", \"calm\"]}" + } + }, + "sequenceId": 4, + "syncPoint": 1, + "triggerIds": [ + "survey" + ], + "deviceRoleName": "Phone" + }, + { + "studyDeploymentId": "deploy-unknown-a", + 
"dataStream": { + "studyDeploymentId": "deploy-unknown-a", + "dataType": { + "namespace": "com.acme", + "name": "stepcount" + }, + "deviceRoleName": "Watch" + }, + "measurement": { + "sensorStartTime": 4000, + "data": { + "steps": 9 + } + }, + "sequenceId": 5, + "syncPoint": 1, + "triggerIds": [], + "deviceRoleName": "Watch" + } +] diff --git a/tests/fixtures/multi_phase/phase_a/participant-data.json b/tests/fixtures/multi_phase/phase_a/participant-data.json new file mode 100644 index 0000000..3f4d24d --- /dev/null +++ b/tests/fixtures/multi_phase/phase_a/participant-data.json @@ -0,0 +1,64 @@ +[ + { + "studyDeploymentId": "deploy-email-a", + "roles": [ + { + "roleName": "Participant", + "data": { + "dk.carp.webservices.input.full_name": { + "firstName": "Alice", + "lastName": "Example" + }, + "dk.carp.webservices.input.informed_consent": { + "signedTimestamp": "2024-01-01T00:00:00Z", + "userId": "user-email-a", + "name": "alice@example.com", + "consent": "{\"signature\": {\"firstName\": \"Alice\", \"lastName\": \"Example\"}}" + } + } + } + ], + "common": {} + }, + { + "studyDeploymentId": "deploy-ssn-a", + "roles": [ + { + "roleName": "Participant", + "data": { + "dk.carp.webservices.input.full_name": "Bob Example", + "dk.carp.webservices.input.ssn": { + "socialSecurityNumber": "1111" + }, + "dk.cachet.carp.input.sex": "male" + } + } + ], + "common": {} + }, + { + "studyDeploymentId": "deploy-name-a", + "roles": [ + { + "roleName": "Participant", + "data": { + "dk.carp.webservices.input.full_name": { + "firstName": "Charlie", + "lastName": "Example" + } + } + } + ], + "common": {} + }, + { + "studyDeploymentId": "deploy-unknown-a", + "roles": [ + { + "roleName": "Participant", + "data": {} + } + ], + "common": {} + } +] diff --git a/tests/fixtures/multi_phase/phase_b/data-streams.json b/tests/fixtures/multi_phase/phase_b/data-streams.json new file mode 100644 index 0000000..167d50d --- /dev/null +++ b/tests/fixtures/multi_phase/phase_b/data-streams.json @@ -0,0 
+1,133 @@ +[ + { + "studyDeploymentId": "deploy-email-b", + "dataStream": { + "studyDeploymentId": "deploy-email-b", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "stepcount" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 5000, + "data": { + "steps": 150, + "cadence": 90 + } + }, + "sequenceId": 6, + "syncPoint": 1, + "triggerIds": [], + "deviceRoleName": "Phone" + }, + { + "studyDeploymentId": "deploy-email-b", + "dataStream": { + "studyDeploymentId": "deploy-email-b", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "location" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 5000, + "data": { + "latitude": 55.2, + "longitude": 12.2 + } + }, + "sequenceId": 7, + "syncPoint": 1, + "triggerIds": [], + "deviceRoleName": "Phone" + }, + { + "studyDeploymentId": "deploy-ssn-b", + "dataStream": { + "studyDeploymentId": "deploy-ssn-b", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "stepcount" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 6000, + "data": { + "steps": 70 + } + }, + "sequenceId": 8, + "syncPoint": 1, + "triggerIds": [], + "deviceRoleName": "Phone" + }, + { + "studyDeploymentId": "deploy-name-b", + "dataStream": { + "studyDeploymentId": "deploy-name-b", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "survey" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 7000, + "data": { + "response_json": "{\"score\": 5}" + } + }, + "sequenceId": 9, + "syncPoint": 1, + "triggerIds": [ + "survey" + ], + "deviceRoleName": "Phone" + }, + { + "studyDeploymentId": "deploy-name-b", + "dataStream": { + "studyDeploymentId": "deploy-name-b", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "location" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 7100, + "data": { + "latitude": 56.0, + "longitude": 13.0 + } + }, + "sequenceId": 10, + "syncPoint": 1, + "triggerIds": [], + 
"deviceRoleName": "Phone" + }, + { + "studyDeploymentId": "deploy-orphan", + "dataStream": { + "studyDeploymentId": "deploy-orphan", + "dataType": { + "namespace": "dk.cachet.carp", + "name": "weather" + }, + "deviceRoleName": "Phone" + }, + "measurement": { + "sensorStartTime": 8000, + "data": { + "temperature": 21 + } + }, + "sequenceId": 11, + "syncPoint": 1, + "triggerIds": [], + "deviceRoleName": "Phone" + } +] diff --git a/tests/fixtures/multi_phase/phase_b/participant-data.json b/tests/fixtures/multi_phase/phase_b/participant-data.json new file mode 100644 index 0000000..5cd0b16 --- /dev/null +++ b/tests/fixtures/multi_phase/phase_b/participant-data.json @@ -0,0 +1,46 @@ +[ + { + "studyDeploymentId": "deploy-email-b", + "roles": [ + { + "roleName": "Participant", + "data": { + "dk.carp.webservices.input.informed_consent": { + "signedTimestamp": "2024-01-02T00:00:00Z", + "userId": "user-email-b", + "name": "alice@example.com", + "consent": "{\"signature\": {\"firstName\": \"Alice\", \"lastName\": \"Example\"}}" + } + } + } + ], + "common": {} + }, + { + "studyDeploymentId": "deploy-ssn-b", + "roles": [ + { + "roleName": "Participant", + "data": { + "dk.carp.webservices.input.full_name": "Robert Example", + "dk.carp.webservices.input.ssn": { + "socialSecurityNumber": "1111" + } + } + } + ], + "common": {} + }, + { + "studyDeploymentId": "deploy-name-b", + "roles": [ + { + "roleName": "Participant", + "data": { + "dk.carp.webservices.input.full_name": "Charlie Example" + } + } + ], + "common": {} + } +] diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..5896922 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,57 @@ +"""Tests for CLI wiring and command execution.""" + +from __future__ import annotations + +from argparse import Namespace + +from carp.commandline import app as cli_app + + +def test_cli_commands_and_help(capsys, study_paths, tmp_path) -> None: + """Exercise the public CLI commands.""" + + assert 
cli_app.main(["--version"]) == 0 + assert cli_app.main([]) == 0 + assert cli_app.main(["schema", *map(str, study_paths)]) == 0 + assert cli_app.main(["count", *map(str, study_paths)]) == 0 + assert cli_app.main(["participants", *map(str, study_paths)]) == 0 + assert cli_app.main( + ["export", *map(str, study_paths), "-o", str(tmp_path / "export.json"), "-t", "dk.cachet.carp.location"] + ) == 0 + assert cli_app.main(["group", *map(str, study_paths), "-o", str(tmp_path / "grouped")]) == 0 + captured = capsys.readouterr().out + assert "carp-analytics-python version" in captured + assert "Total items" in captured + + +def test_cli_convert_and_error_paths(monkeypatch, capsys, study_paths, tmp_path) -> None: + """Exercise CLI conversion and exception-handling branches.""" + + assert cli_app.main(["convert", *map(str, study_paths), "-o", str(tmp_path / "parquet"), "--batch-size", "1"]) == 0 + assert cli_app.main(["count", "missing.json"]) == 1 + + class FakeParser: + """Minimal fake parser for exception tests.""" + + def parse_args(self, _argv): + return Namespace(version=False, command="test", handler=lambda _args: (_ for _ in ()).throw(KeyboardInterrupt())) + + def print_help(self): + return None + + monkeypatch.setattr(cli_app, "_build_parser", lambda: FakeParser()) + assert cli_app.main(["ignored"]) == 130 + monkeypatch.setattr( + cli_app, + "_build_parser", + lambda: type( + "BrokenParser", + (), + { + "parse_args": lambda self, _argv: Namespace(version=False, command="x", handler=lambda _args: (_ for _ in ()).throw(ValueError("boom"))), + "print_help": lambda self: None, + }, + )(), + ) + assert cli_app.main(["ignored"]) == 1 + assert "Error: boom" in capsys.readouterr().out diff --git a/tests/test_core.py b/tests/test_core.py new file mode 100644 index 0000000..110303a --- /dev/null +++ b/tests/test_core.py @@ -0,0 +1,55 @@ +"""Tests for shared CARP helpers.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from 
carp.core.dependencies import import_or_raise, module_available +from carp.core.fields import collect_field_paths, deployment_id_from_record, get_nested_value +from carp.core.files import JsonArrayWriter, iter_json_array, resolve_paths +from carp.core.naming import parquet_stem, sanitize_filename +from carp.participants.directory import ParticipantDirectory +from carp.participants.parser import load_participant_file + + +def test_core_helpers_cover_nested_values_and_paths(study_paths: list[Path]) -> None: + """Exercise shared path and field helpers.""" + + record = next(iter_json_array(study_paths[0])) + assert resolve_paths(study_paths) == tuple(study_paths) + assert get_nested_value(record, "measurement.data.steps") == 100 + assert get_nested_value(record, "missing.value", "fallback") == "fallback" + assert deployment_id_from_record(record) == "deploy-email-a" + assert "measurement.data.steps" in collect_field_paths(record) + assert sanitize_filename("alice@example.com", allowed="-_.@") == "alice@example.com" + assert parquet_stem("dk.cachet.carp.stepcount") == "dk.cachet.carp__stepcount" + + +def test_json_array_writer_and_module_helpers(tmp_path: Path) -> None: + """Exercise JSON writing and optional dependency errors.""" + + output_path = tmp_path / "output.json" + writer = JsonArrayWriter(output_path) + writer.write({"value": 1}) + writer.write({"value": 2}) + writer.close() + assert output_path.read_text(encoding="utf-8") == '[{"value": 1},{"value": 2}]' + assert module_available("json") is True + with pytest.raises(RuntimeError): + import_or_raise("module_that_does_not_exist_for_tests", "test") + + +def test_participant_loader_handles_invalid_consent(tmp_path: Path) -> None: + """Exercise parser branches for invalid consent payloads and missing folders.""" + + participant_file = tmp_path / "participant-data.json" + participant_file.write_text( + 
'[{"studyDeploymentId":"x","roles":[{"roleName":"Participant","data":{"dk.carp.webservices.input.informed_consent":"broken"}}]}]', + encoding="utf-8", + ) + loaded = load_participant_file(participant_file) + assert loaded["x"].consent_signed is False + empty_directory = ParticipantDirectory.from_folders((tmp_path / "missing",)) + assert empty_directory.summary_rows() == [] diff --git a/tests/test_edge_frames_plotting.py b/tests/test_edge_frames_plotting.py new file mode 100644 index 0000000..f8eb23e --- /dev/null +++ b/tests/test_edge_frames_plotting.py @@ -0,0 +1,84 @@ +"""Additional edge-case coverage for frames and plotting.""" + +from __future__ import annotations + +from types import SimpleNamespace + +from carp.core.dependencies import import_or_raise +from carp.core.fields import collect_field_paths +from carp.plotting.prepare import candidate_series, frames_from_items, prepare_location_frame, prepare_step_frame +from carp.plotting.render import _merge_steps, render_heatmap + + +def test_frame_service_edge_branches(study, tmp_path) -> None: + """Exercise dataframe and parquet helper branches.""" + + pandas = import_or_raise("pandas", "test") + pyarrow = import_or_raise("pyarrow", "test") + assert study.records.list_fields(sample_size=0) == [] + assert collect_field_paths([]) == set() + assert "items[]" in collect_field_paths({"items": []}) + assert study.frames.get_dataframe_with_participants("missing.type").empty + nested = pandas.DataFrame({"dataStream": [{"studyDeploymentId": "nested-id"}]}) + assert study.frames._deployment_series(nested).tolist() == ["nested-id"] + assert study.frames._participant_row("deploy-email-a")["participant_email"] == "alice@example.com" + aligned = study.frames._align_table( + pyarrow, + pyarrow.Table.from_pylist([{"steps": 1}]), + pyarrow.schema([("steps", pyarrow.float64()), ("cadence", pyarrow.int64())]), + ) + assert aligned.column_names == ["steps", "cadence"] + assert aligned["steps"][0].as_py() == 1.0 + assert 
aligned["cadence"][0].as_py() is None + assert study.participant("alice@example.com").dataframe("missing.type").empty + assert study.participant("alice@example.com").available_fields(sample_size=0) == [] + assert study.frames.convert_to_parquet(tmp_path / "flush", batch_size=50) + assert study.frames.get_dataframe("missing.type", tmp_path / "flush").empty + + +def test_plotting_helpers_and_edge_paths(study, tmp_path, monkeypatch) -> None: + """Exercise helper functions and low-probability plotting branches.""" + + pandas = import_or_raise("pandas", "test") + location_items = [ + SimpleNamespace( + measurement=SimpleNamespace( + data=SimpleNamespace(latitude=1.0, longitude=2.0), + sensorStartTime=10, + ) + ), + SimpleNamespace(measurement=None), + ] + step_items = [ + SimpleNamespace(measurement=SimpleNamespace(data=SimpleNamespace(steps=3), sensorStartTime=10)), + SimpleNamespace(measurement=SimpleNamespace(data=SimpleNamespace(steps=None), sensorStartTime=11)), + ] + location_frame, step_frame = frames_from_items(location_items, step_items) + assert not location_frame.empty and not step_frame.empty + assert candidate_series(pandas.DataFrame({"value": [1]}), ["missing", "value"]).tolist() == [1] + assert candidate_series(pandas.DataFrame({"nested": [{"a": {"b": 1}}]}), ["nested.a.b"]).tolist() == [1] + assert candidate_series(pandas.DataFrame({"value": [1]}), ["missing.path"]) is None + assert list(prepare_location_frame(study.frames.get_dataframe("dk.cachet.carp.location"))["_lat"]) == [55.1, 55.2, 56.0] + assert list(prepare_step_frame(study.frames.get_dataframe("dk.cachet.carp.stepcount"))["_steps"]) == [100, 50, 150, 70] + assert render_heatmap(location_frame.iloc[0:0], step_frame, tmp_path / "empty.html") is None + assert render_heatmap(location_frame, pandas.DataFrame({"_steps": [0], "_time": [10], "_lat": [1.0], "_lon": [2.0]}), tmp_path / "zero.html") is not None + assert _merge_steps(pandas, location_frame, pandas.DataFrame({"_steps": [1]})).empty + assert 
study.plots.unified("missing") is None + assert study.plots.deployment("missing", output_file=str(tmp_path / "missing.html")) is None + assert study.plots.deployment("deploy-email-a", location_type="missing.type", output_file=str(tmp_path / "noloc.html")) is None + assert study.plots.deployment("deploy-email-a", step_type="missing.type", output_file=str(tmp_path / "nosteps.html")) is not None + assert study.plots.from_items(location_items, step_items, output_file=str(tmp_path / "objects.html")) is not None + monkeypatch.setattr(study.plots, "candidate_series", lambda *_args, **_kwargs: None) + assert study.participant("alice@example.com").dataframe("dk.cachet.carp.stepcount").shape[0] == 4 + + calls = {"count": 0} + + def staged_series(*_args, **_kwargs): + calls["count"] += 1 + frame = _args[0] + if calls["count"] == 1: + return pandas.Series(["deploy-email-a"] * len(frame), index=frame.index) + return None + + monkeypatch.setattr("carp.plotting.service.candidate_series", staged_series) + assert study.plots.deployment("deploy-email-a", output_file=str(tmp_path / "staged.html")) is not None diff --git a/tests/test_edge_types_cli.py b/tests/test_edge_types_cli.py new file mode 100644 index 0000000..34690ab --- /dev/null +++ b/tests/test_edge_types_cli.py @@ -0,0 +1,103 @@ +"""Additional edge-case coverage for CLI and type-generation helpers.""" + +from __future__ import annotations + +import runpy +from pathlib import Path + +from carp import CarpStudy +from carp.core.fields import get_nested_value +from carp.participants.parser import load_participant_file +from carp.types.infer import _maybe_json_string, infer_schema, merge_schema +from carp.types.render import render_types + + +def test_cli_module_entrypoint(monkeypatch) -> None: + """Execute the module-level CLI entrypoint.""" + + exit_codes = [] + monkeypatch.setattr("carp.commandline.app.main", lambda: 7) + monkeypatch.setattr("sys.exit", lambda code: exit_codes.append(code)) + runpy.run_module("carp.cli", 
run_name="__main__") + assert exit_codes == [7] + + +def test_parser_and_schema_edge_branches(study_paths: list[Path], tmp_path: Path) -> None: + """Exercise parser branches not covered by the default fixture.""" + + assert get_nested_value({"a": 1}, "a.b", "fallback") == "fallback" + assert CarpStudy(study_paths).schema.cached()["dk.cachet.carp.location"] == ["latitude", "longitude"] + + participant_file = tmp_path / "participant-data.json" + participant_file.write_text( + """ + [ + {"roles": [{"data": {}}]}, + { + "studyDeploymentId": "string-ssn", + "roles": [ + { + "roleName": "Participant", + "data": { + "dk.carp.webservices.input.ssn": "2222", + "dk.carp.webservices.input.informed_consent": { + "name": "eve@example.com", + "consent": "{broken json}", + "note": 1 + } + } + } + ] + }, + { + "studyDeploymentId": "non-string-consent", + "roles": [ + { + "roleName": "Participant", + "data": { + "dk.carp.webservices.input.informed_consent": { + "name": "nonstr@example.com", + "consent": 1 + } + } + } + ] + } + ] + """, + encoding="utf-8", + ) + loaded = load_participant_file(participant_file) + assert loaded["string-ssn"].ssn == "2222" + assert loaded["string-ssn"].email == "eve@example.com" + assert loaded["string-ssn"].full_name is None + assert loaded["non-string-consent"].email == "nonstr@example.com" + + +def test_type_inference_and_rendering_edge_branches() -> None: + """Exercise edge branches in schema inference and code rendering.""" + + assert _maybe_json_string("plain text") is None + assert _maybe_json_string("{broken}") is None + schema = {} + merge_schema(schema, None) + merge_schema(schema, {"value": [1, 2.0]}) + assert schema["nullable"] is True + assert infer_schema(iter([{"a": 1}, {"a": 2}]), sample_size=0)["fields"] == {} + + rendered = render_types( + { + "type": "object", + "fields": { + "child": {"type": "object", "fields": {}}, + "other": {"type": "object", "fields": {"child": {"type": "object", "fields": {"value": {"type": "primitive", 
"python_type": "int"}}}}}, + "matching": {"type": "object", "fields": {"child": {"type": "object", "fields": {}}}}, + "items": {"type": "list", "item_type": {"type": "object", "fields": {"from": {"type": "primitive", "python_type": "str"}}}}, + "mystery": {}, + }, + } + ) + assert "class Child:" in rendered + assert "class ChildItem:" in rendered + assert "from_: str = None" in rendered + assert "mystery: Any = None" in rendered diff --git a/tests/test_export.py b/tests/test_export.py new file mode 100644 index 0000000..8f618f1 --- /dev/null +++ b/tests/test_export.py @@ -0,0 +1,24 @@ +"""Tests for JSON export and grouping flows.""" + +from __future__ import annotations + +import json + + +def test_export_json_and_group_by_field(study, tmp_path) -> None: + """Exercise JSON export and field-based grouping.""" + + export_path = study.export.export_json(tmp_path / "records.json", "dk.cachet.carp.location") + payload = json.loads(export_path.read_text(encoding="utf-8")) + assert len(payload) == 3 + grouped = study.export.group_by_field("dataStream.dataType.namespace", tmp_path / "grouped") + assert {path.name for path in grouped} == {"com.acme.json", "dk.cachet.carp.json"} + + +def test_group_by_participant_and_identity(study, tmp_path) -> None: + """Exercise participant-aware grouping flows.""" + + participant_files = study.export.group_by_participant(tmp_path / "participants") + identity_files = study.export.group_by_identity("email", tmp_path / "emails") + assert len(participant_files) == 5 + assert {path.name for path in identity_files} == {"alice@example.com.json", "unknown.json"} diff --git a/tests/test_frames.py b/tests/test_frames.py new file mode 100644 index 0000000..5198f84 --- /dev/null +++ b/tests/test_frames.py @@ -0,0 +1,29 @@ +"""Tests for dataframe and parquet services.""" + +from __future__ import annotations + + +def test_dataframe_loading_and_participant_columns(study) -> None: + """Exercise dataframe loading from JSON and participant enrichment.""" 
+ + frame = study.frames.get_dataframe("dk.cachet.carp.stepcount") + assert frame.shape[0] == 4 + enriched = study.frames.get_dataframe_with_participants("dk.cachet.carp.weather") + assert enriched.loc[0, "participant_id"] is None + assert study.frames.parquet_path("dk.cachet.carp.stepcount", "out").name == "dk.cachet.carp__stepcount.parquet" + + +def test_parquet_conversion_and_reload(study, tmp_path) -> None: + """Exercise namespace-aware parquet conversion and reload.""" + + output_dir = tmp_path / "parquet" + created = study.frames.convert_to_parquet(output_dir, batch_size=1) + assert {path.name for path in created} == { + "com.acme__stepcount.parquet", + "dk.cachet.carp__location.parquet", + "dk.cachet.carp__stepcount.parquet", + "dk.cachet.carp__survey.parquet", + "dk.cachet.carp__weather.parquet", + } + frame = study.frames.get_dataframe("dk.cachet.carp.stepcount", output_dir) + assert set(frame.columns) >= {"studyDeploymentId", "measurement"} diff --git a/tests/test_participants.py b/tests/test_participants.py new file mode 100644 index 0000000..c2e5db2 --- /dev/null +++ b/tests/test_participants.py @@ -0,0 +1,42 @@ +"""Tests for participant lookup and unified views.""" + +from __future__ import annotations + +from carp.participants.view import ParticipantView + + +def test_participant_lookups_and_summary(study) -> None: + """Exercise participant lookup methods and summary rows.""" + + assert len(study.participants.by_email("alice@example.com")) == 2 + assert len(study.participants.by_ssn("1111")) == 2 + assert len(study.participants.by_name("Charlie Example")) == 2 + summary_rows = study.participants.summary_rows() + assert len(summary_rows) == 4 + assert any(row["emails"] == "alice@example.com" for row in summary_rows) + + +def test_participant_view_info_fields_and_dataframe(study, tmp_path) -> None: + """Exercise the participant-scoped view object.""" + + participant = study.participant("alice@example.com") + assert isinstance(participant, 
ParticipantView) + info = participant.info() + assert info is not None + assert info["num_deployments"] == 2 + assert participant.count() == 4 + assert participant.data_types() == ["dk.cachet.carp.location", "dk.cachet.carp.stepcount"] + assert "measurement.data.latitude" in participant.available_fields() + assert "measurement.data.steps" in participant.available_fields() + assert participant.dataframe("dk.cachet.carp.stepcount").shape[0] == 2 + assert participant.plot_location(output_file=str(tmp_path / "participant.html")) is not None + + +def test_missing_participant_view_and_unified_lookup(study) -> None: + """Exercise missing participants and unified participant lookups.""" + + missing = study.participant("nobody@example.com") + assert missing.exists is False + assert missing.info() is None + unified_id = study.participant("alice@example.com").info()["unified_id"] + assert len(study.participants.unified(unified_id)) == 2 diff --git a/tests/test_real_data.py b/tests/test_real_data.py new file mode 100644 index 0000000..82945e3 --- /dev/null +++ b/tests/test_real_data.py @@ -0,0 +1,23 @@ +"""Optional real-data integration tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from carp import CarpStudy + +SLEEP_DATA_ROOT = Path(__file__).resolve().parents[1] / "sleep-data" + + +@pytest.mark.skipif(not SLEEP_DATA_ROOT.exists(), reason="sleep-data is not available") +def test_real_data_smoke() -> None: + """Exercise stable invariants on local real study data.""" + + file_paths = sorted(SLEEP_DATA_ROOT.glob("phase-*/data-streams.json")) + study = CarpStudy(file_paths) + assert study.records.count() > 0 + assert len(study.records.data_types()) >= 3 + assert len(study.schema.scan()) >= 3 + assert len(study.participants.summary_rows()) >= 1 diff --git a/tests/test_records_schema.py b/tests/test_records_schema.py new file mode 100644 index 0000000..55bbd13 --- /dev/null +++ b/tests/test_records_schema.py @@ -0,0 +1,29 @@ 
+"""Tests for record iteration and schema discovery.""" + +from __future__ import annotations + + +def test_record_filters_and_participant_enrichment(study) -> None: + """Exercise record filtering and participant enrichment.""" + + assert study.records.count() == 11 + assert study.records.count("dk.cachet.carp.stepcount") == 4 + filtered = list(study.records.iter_records(deployment_ids=("deploy-email-a",))) + assert len(filtered) == 2 + enriched = list(study.records.iter_with_participants("dk.cachet.carp.stepcount")) + assert all("_participant" in item for item in enriched) + + +def test_record_field_listing_data_types_and_schema_cache(study) -> None: + """Exercise schema discovery and deployment-id fallback paths.""" + + data_types = study.records.data_types() + assert "com.acme.stepcount" in data_types + assert "dk.cachet.carp.survey" in data_types + assert "triggerIds[]" in study.records.list_fields() + survey = list(study.records.iter_records("dk.cachet.carp.survey")) + assert len(survey) == 2 + assert study.records.count(deployment_ids=("deploy-name-a", "deploy-name-b")) == 3 + schema = study.schema.scan() + assert schema["dk.cachet.carp.stepcount"] == ["cadence", "steps"] + assert study.schema.cached() == schema diff --git a/tests/test_structure.py b/tests/test_structure.py new file mode 100644 index 0000000..8eab46b --- /dev/null +++ b/tests/test_structure.py @@ -0,0 +1,22 @@ +"""Structural repository tests.""" + +from __future__ import annotations + +from pathlib import Path + + +def test_python_files_stay_under_two_hundred_lines() -> None: + """Enforce the 200-line limit for Python source and test files.""" + + root = Path(__file__).resolve().parents[1] + python_files = [ + path + for path in root.rglob("*.py") + if all(part not in {".venv.nosync", "dist", "__pycache__"} for part in path.parts) + ] + offenders = [] + for path in python_files: + line_count = len(path.read_text(encoding="utf-8").splitlines()) + if line_count > 200: + 
offenders.append((path.relative_to(root), line_count)) + assert offenders == [] diff --git a/tests/test_types_plotting.py b/tests/test_types_plotting.py new file mode 100644 index 0000000..b25deb7 --- /dev/null +++ b/tests/test_types_plotting.py @@ -0,0 +1,44 @@ +"""Tests for generated types and plotting services.""" + +from __future__ import annotations + +import importlib.util +import sys + + +def test_generate_type_definitions(study, tmp_path) -> None: + """Exercise generated type definitions for JSON-string payloads.""" + + output_path = study.types.generate(tmp_path / "generated_types.py", sample_size=11) + code = output_path.read_text(encoding="utf-8") + assert "parse_json_field" in code + assert "class StudyItem" in code + spec = importlib.util.spec_from_file_location("generated_types", output_path) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + sys.modules[spec.name] = module + spec.loader.exec_module(module) + payload = {"measurement": {"data": {"response_json": '{"score": 1}'}}} + instance = module.StudyItem.from_dict(payload) + assert instance.measurement.data.response_json.score == 1 + + +def test_plot_service_outputs_html(study, tmp_path) -> None: + """Exercise participant, deployment, unified, and item-based plots.""" + + participant_path = study.plots.participant("alice@example.com", output_file=str(tmp_path / "alice.html")) + assert participant_path is not None + assert "leaflet" in (tmp_path / "alice.html").read_text(encoding="utf-8").lower() + unified_id = study.participant("alice@example.com").info()["unified_id"] + assert study.plots.unified(unified_id, output_file=str(tmp_path / "unified.html")) is not None + assert study.plots.deployment("deploy-email-a", output_file=str(tmp_path / "solo.html"), include_steps=False) is not None + location_items = [] + assert study.plots.from_items(location_items, output_file=str(tmp_path / "none.html")) is None + + +def test_plot_service_handles_missing_filters(study, 
monkeypatch, tmp_path) -> None: + """Exercise plot branches for missing participants and missing columns.""" + + assert study.plots.participant("missing@example.com") is None + monkeypatch.setattr("carp.plotting.service.candidate_series", lambda *_args, **_kwargs: None) + assert study.plots.deployment("deploy-email-a", output_file=str(tmp_path / "missing.html")) is None diff --git a/uv.lock b/uv.lock index f4509f9..78780d6 100644 --- a/uv.lock +++ b/uv.lock @@ -7,6 +7,24 @@ resolution-markers = [ "python_full_version < '3.11'", ] +[[package]] +name = "alabaster" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/d9c74d0daf3f742840fd818d69cfae176fa332022fd44e3469487d5a9420/alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e", size = 24210, upload-time = "2024-07-26T18:15:03.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" }, +] + +[[package]] +name = "babel" +version = "2.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7d/b2/51899539b6ceeeb420d40ed3cd4b7a40519404f9baf3d4ac99dc413a834b/babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d", size = 9959554, upload-time = "2026-02-01T12:30:56.078Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845, upload-time = "2026-02-01T12:30:53.445Z" }, +] + [[package]] name = "branca" version = "0.8.2" @@ -65,6 +83,21 @@ 
dev = [ { name = "pytest" }, { name = "pytest-cov" }, { name = "ruff" }, + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" }, + { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, + { name = "sphinx-rtd-theme" }, +] +docs = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" }, + { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, +] +test = [ + { name = "folium" }, + { name = "matplotlib" }, + { name = "pandas" }, + { name = "pyarrow" }, ] [package.metadata] @@ -91,6 +124,15 @@ dev = [ { name = "pytest", specifier = ">=7.4.0" }, { name = "pytest-cov", specifier = ">=4.1.0" }, { name = "ruff", specifier = ">=0.1.0" }, + { name = "sphinx", specifier = ">=8.1.3" }, + { name = "sphinx-rtd-theme", specifier = ">=3.1.0" }, +] +docs = [{ name = "sphinx", specifier = ">=8.0.0" }] +test = [ + { name = "folium", specifier = ">=0.14.0" }, + { name = "matplotlib", specifier = ">=3.7.0" }, + { name = "pandas", specifier = ">=2.0.0" }, + { name = "pyarrow", specifier = ">=14.0.0" }, ] [[package]] @@ -487,6 +529,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "docutils" +version = "0.21.2" +source = { registry = 
"https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.11'", +] +sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444, upload-time = "2024-04-23T18:57:18.24Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" }, +] + +[[package]] +name = "docutils" +version = "0.22.4" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/ae/b6/03bb70946330e88ffec97aefd3ea75ba575cb2e762061e0e62a213befee8/docutils-0.22.4.tar.gz", hash = "sha256:4db53b1fde9abecbb74d91230d32ab626d94f6badfc575d6db9194a49df29968", size = 2291750, upload-time = "2025-12-18T19:00:26.443Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl", hash = "sha256:d0013f540772d1420576855455d050a2180186c91c15779301ac2ccb3eeb68de", size = 633196, upload-time = "2025-12-18T19:00:18.077Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.1" @@ -691,6 +758,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/f2/53b6e9bdd2a91202066764eaa74b572ba4dede0fe47a5a26f4de34b7541a/ijson-3.4.0.post0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a0fedf09c0f6ffa2a99e7e7fd9c5f3caf74e655c1ee015a0797383e99382ebc3", size = 54657, upload-time = "2025-10-10T05:29:24.482Z" }, ] +[[package]] +name = "imagesize" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/6c/e6/7bf14eeb8f8b7251141944835abd42eb20a658d89084b7e1f3e5fe394090/imagesize-2.0.0.tar.gz", hash = "sha256:8e8358c4a05c304f1fccf7ff96f036e7243a189e9e42e90851993c558cfe9ee3", size = 1773045, upload-time = "2026-03-03T14:18:29.941Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/53/fb7122b71361a0d121b669dcf3d31244ef75badbbb724af388948de543e2/imagesize-2.0.0-py2.py3-none-any.whl", hash = "sha256:5667c5bbb57ab3f1fa4bc366f4fbc971db3d5ed011fd2715fd8001f782718d96", size = 9441, upload-time = "2026-03-03T14:18:27.892Z" }, +] + [[package]] name = "iniconfig" version = "2.3.0" @@ -1790,6 +1866,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/25/7a/b0178788f8dc6cafce37a212c99565fa1fe7872c70c6c9c1e1a372d9d88f/rich-14.2.0-py3-none-any.whl", hash = "sha256:76bc51fe2e57d2b1be1f96c524b890b816e334ab4c1e45888799bfaab0021edd", size = 243393, upload-time = "2025-10-09T14:16:51.245Z" }, ] +[[package]] +name = "roman-numerals" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/f9/41dc953bbeb056c17d5f7a519f50fdf010bd0553be2d630bc69d1e022703/roman_numerals-4.1.0.tar.gz", hash = "sha256:1af8b147eb1405d5839e78aeb93131690495fe9da5c91856cb33ad55a7f1e5b2", size = 9077, upload-time = "2025-12-17T18:25:34.381Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/54/6f679c435d28e0a568d8e8a7c0a93a09010818634c3c3907fc98d8983770/roman_numerals-4.1.0-py3-none-any.whl", hash = "sha256:647ba99caddc2cc1e55a51e4360689115551bf4476d90e8162cf8c345fe233c7", size = 7676, upload-time = "2025-12-17T18:25:33.098Z" }, +] + [[package]] name = "ruff" version = "0.14.7" @@ -2005,6 +2090,193 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = 
"sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "snowballstemmer" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/75/a7/9810d872919697c9d01295633f5d574fb416d47e535f258272ca1f01f447/snowballstemmer-3.0.1.tar.gz", hash = "sha256:6d5eeeec8e9f84d4d56b847692bacf79bc2c8e90c7f80ca4444ff8b6f2e52895", size = 105575, upload-time = "2025-05-09T16:34:51.843Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/78/3565d011c61f5a43488987ee32b6f3f656e7f107ac2782dd57bdd7d91d9a/snowballstemmer-3.0.1-py3-none-any.whl", hash = "sha256:6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064", size = 103274, upload-time = "2025-05-09T16:34:50.371Z" }, +] + +[[package]] +name = "sphinx" +version = "8.1.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.11'", +] +dependencies = [ + { name = "alabaster", marker = "python_full_version < '3.11'" }, + { name = "babel", marker = "python_full_version < '3.11'" }, + { name = "colorama", marker = "python_full_version < '3.11' and sys_platform == 'win32'" }, + { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "imagesize", marker = "python_full_version < '3.11'" }, + { name = "jinja2", marker = "python_full_version < '3.11'" }, + { name = "packaging", marker = "python_full_version < '3.11'" }, + { name = "pygments", marker = "python_full_version < '3.11'" }, + { name = "requests", marker = "python_full_version < '3.11'" }, + { name = "snowballstemmer", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-htmlhelp", 
marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version < '3.11'" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611, upload-time = "2024-10-13T20:27:13.93Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/60/1ddff83a56d33aaf6f10ec8ce84b4c007d9368b21008876fceda7e7381ef/sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2", size = 3487125, upload-time = "2024-10-13T20:27:10.448Z" }, +] + +[[package]] +name = "sphinx" +version = "9.0.4" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version == '3.11.*'", +] +dependencies = [ + { name = "alabaster", marker = "python_full_version == '3.11.*'" }, + { name = "babel", marker = "python_full_version == '3.11.*'" }, + { name = "colorama", marker = "python_full_version == '3.11.*' and sys_platform == 'win32'" }, + { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" }, + { name = "imagesize", marker = "python_full_version == '3.11.*'" }, + { name = "jinja2", marker = "python_full_version == '3.11.*'" }, + { name = "packaging", marker = "python_full_version == '3.11.*'" }, + { name = "pygments", marker = "python_full_version == '3.11.*'" }, + { name = "requests", marker = "python_full_version == '3.11.*'" }, + { name = "roman-numerals", marker = "python_full_version == '3.11.*'" }, + { name = "snowballstemmer", marker = "python_full_version == '3.11.*'" }, 
+ { name = "sphinxcontrib-applehelp", marker = "python_full_version == '3.11.*'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version == '3.11.*'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version == '3.11.*'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version == '3.11.*'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version == '3.11.*'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version == '3.11.*'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/50/a8c6ccc36d5eacdfd7913ddccd15a9cee03ecafc5ee2bc40e1f168d85022/sphinx-9.0.4.tar.gz", hash = "sha256:594ef59d042972abbc581d8baa577404abe4e6c3b04ef61bd7fc2acbd51f3fa3", size = 8710502, upload-time = "2025-12-04T07:45:27.343Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/3f/4bbd76424c393caead2e1eb89777f575dee5c8653e2d4b6afd7a564f5974/sphinx-9.0.4-py3-none-any.whl", hash = "sha256:5bebc595a5e943ea248b99c13814c1c5e10b3ece718976824ffa7959ff95fffb", size = 3917713, upload-time = "2025-12-04T07:45:24.944Z" }, +] + +[[package]] +name = "sphinx" +version = "9.1.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", +] +dependencies = [ + { name = "alabaster", marker = "python_full_version >= '3.12'" }, + { name = "babel", marker = "python_full_version >= '3.12'" }, + { name = "colorama", marker = "python_full_version >= '3.12' and sys_platform == 'win32'" }, + { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, + { name = "imagesize", marker = "python_full_version >= '3.12'" }, + { name = "jinja2", marker = "python_full_version >= '3.12'" }, + { name = "packaging", marker = "python_full_version >= '3.12'" }, + { name = "pygments", marker = "python_full_version >= '3.12'" }, + { name = "requests", marker = "python_full_version >= '3.12'" }, + { name = 
"roman-numerals", marker = "python_full_version >= '3.12'" }, + { name = "snowballstemmer", marker = "python_full_version >= '3.12'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.12'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.12'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.12'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.12'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.12'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cd/bd/f08eb0f4eed5c83f1ba2a3bd18f7745a2b1525fad70660a1c00224ec468a/sphinx-9.1.0.tar.gz", hash = "sha256:7741722357dd75f8190766926071fed3bdc211c74dd2d7d4df5404da95930ddb", size = 8718324, upload-time = "2025-12-31T15:09:27.646Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/f7/b1884cb3188ab181fc81fa00c266699dab600f927a964df02ec3d5d1916a/sphinx-9.1.0-py3-none-any.whl", hash = "sha256:c84fdd4e782504495fe4f2c0b3413d6c2bf388589bb352d439b2a3bb99991978", size = 3921742, upload-time = "2025-12-31T15:09:25.561Z" }, +] + +[[package]] +name = "sphinx-rtd-theme" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" }, + { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version >= '3.12'" }, + { name = "sphinxcontrib-jquery" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/84/68/a1bfbf38c0f7bccc9b10bbf76b94606f64acb1552ae394f0b8285bfaea25/sphinx_rtd_theme-3.1.0.tar.gz", hash = "sha256:b44276f2c276e909239a4f6c955aa667aaafeb78597923b1c60babc76db78e4c", size = 7620915, upload-time = "2026-01-12T16:03:31.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/c7/b5c8015d823bfda1a346adb2c634a2101d50bb75d421eb6dcb31acd25ebc/sphinx_rtd_theme-3.1.0-py2.py3-none-any.whl", hash = "sha256:1785824ae8e6632060490f67cf3a72d404a85d2d9fc26bce3619944de5682b89", size = 7655617, upload-time = "2026-01-12T16:03:28.101Z" }, +] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053, upload-time = "2024-07-29T01:09:00.465Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300, upload-time = "2024-07-29T01:08:58.99Z" }, +] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967, upload-time = "2024-07-29T01:09:23.417Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530, upload-time = "2024-07-29T01:09:21.945Z" }, +] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617, upload-time = "2024-07-29T01:09:37.889Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705, upload-time = "2024-07-29T01:09:36.407Z" }, +] + +[[package]] +name = "sphinxcontrib-jquery" +version = "4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" }, + { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/de/f3/aa67467e051df70a6330fe7770894b3e4f09436dea6881ae0b4f3d87cad8/sphinxcontrib-jquery-4.1.tar.gz", hash = "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a", size = 122331, upload-time = "2023-03-14T15:01:01.944Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/76/85/749bd22d1a68db7291c89e2ebca53f4306c3f205853cf31e9de279034c3c/sphinxcontrib_jquery-4.1-py2.py3-none-any.whl", hash = "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae", size = 121104, upload-time = "2023-03-14T15:01:00.356Z" }, +] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787, upload-time = "2019-01-21T16:10:16.347Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071, upload-time = "2019-01-21T16:10:14.333Z" }, +] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165, upload-time = "2024-07-29T01:09:56.435Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743, upload-time = "2024-07-29T01:09:54.885Z" }, +] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080, upload-time = "2024-07-29T01:10:09.332Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" }, +] + [[package]] name = "threadpoolctl" version = "3.6.0"