diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..5546673
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,29 @@
+name: ci
+
+on:
+ push:
+ branches:
+ - "**"
+ pull_request:
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.13"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install -e .
+ python -m pip install pytest pytest-cov mypy ruff sphinx sphinx-rtd-theme pandas pyarrow folium matplotlib
+ - name: Lint
+ run: ruff check src examples tests docs
+ - name: Type check
+ run: mypy src/carp
+ - name: Test
+ run: pytest --cov=src/carp --cov-branch --cov-fail-under=100
+ - name: Build docs
+ run: sphinx-build -b html docs docs/_build/html
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..05eebb6
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,132 @@
+name: release
+
+on:
+ push:
+ tags:
+ - "**"
+
+concurrency:
+ group: release-${{ github.ref }}
+ cancel-in-progress: false
+
+jobs:
+ validate_tag:
+ runs-on: ubuntu-latest
+ outputs:
+ version: ${{ steps.version.outputs.version }}
+ tag: ${{ steps.version.outputs.tag }}
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.13"
+ - id: version
+ name: Validate tag against package version
+ run: |
+ version=$(python - <<'PY'
+ import pathlib
+ import tomllib
+
+ project = tomllib.loads(pathlib.Path("pyproject.toml").read_text())
+ print(project["project"]["version"])
+ PY
+ )
+ tag="${GITHUB_REF_NAME}"
+ if [ "${tag}" != "${version}" ] && [ "${tag}" != "v${version}" ]; then
+ echo "Tag ${tag} does not match package version ${version}." >&2
+ exit 1
+ fi
+ echo "version=${version}" >> "${GITHUB_OUTPUT}"
+ echo "tag=${tag}" >> "${GITHUB_OUTPUT}"
+
+ test:
+ needs: validate_tag
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install -e .
+ python -m pip install pytest pytest-cov pandas pyarrow folium matplotlib
+ - name: Run tests
+ run: pytest --cov=src/carp --cov-branch --cov-fail-under=100
+
+ quality:
+ needs: validate_tag
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.13"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install -e .
+ python -m pip install pytest pytest-cov mypy ruff sphinx sphinx-rtd-theme pandas pyarrow folium matplotlib
+ - name: Lint
+ run: ruff check src examples tests docs
+ - name: Type check
+ run: mypy src/carp
+ - name: Build docs
+ run: sphinx-build -W -b html docs docs/_build/html
+
+ build:
+ needs: [test, quality]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.13"
+ - name: Build distributions
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install build twine
+ python -m build
+ python -m twine check dist/*
+ - uses: actions/upload-artifact@v4
+ with:
+ name: python-package-distributions
+ path: dist/
+
+ publish_pypi:
+ needs: build
+ runs-on: ubuntu-latest
+ environment:
+ name: pypi
+ permissions:
+ id-token: write
+ steps:
+ - uses: actions/download-artifact@v4
+ with:
+ name: python-package-distributions
+ path: dist/
+ - uses: pypa/gh-action-pypi-publish@release/v1
+
+ publish_github:
+ needs: [validate_tag, publish_pypi]
+ runs-on: ubuntu-latest
+ permissions:
+ contents: write
+ steps:
+ - uses: actions/download-artifact@v4
+ with:
+ name: python-package-distributions
+ path: dist/
+ - name: Generate checksums
+ run: shasum -a 256 dist/* > dist/SHA256SUMS.txt
+ - uses: softprops/action-gh-release@v2
+ with:
+ name: Release ${{ needs.validate_tag.outputs.tag }}
+ tag_name: ${{ needs.validate_tag.outputs.tag }}
+ generate_release_notes: true
+ files: dist/*
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 681e02e..7932cbe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,13 +1,35 @@
# Changelog
-All notable changes to this project will be documented in this file.
+## [0.2.0] - 2026-03-26
-The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
-and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+### Added
+
+- New `CarpStudy` public API as the primary entrypoint for CARP study analysis
+- Modular service layout under `carp.core`, `participants`, `records`, `schema`, `export`, `frames`, `types`, `plotting`, and `commandline`
+- Self-contained pytest suite with committed multi-phase fixtures and optional `sleep-data` smoke coverage
+- 100% line and branch coverage enforcement for `src/carp`
+- Sphinx documentation site with autodoc and Napoleon support
+- GitHub Actions CI for linting, type-checking, tests, and docs builds
+- Tag-driven CD workflow that validates version tags, publishes to PyPI, and creates GitHub releases
+- Dedicated `test` and `docs` dependency groups
+
+### Changed
+
+- Replaced the legacy method-heavy design with a thin `CarpStudy` composition root and focused services
+- Kept the `carp` CLI command set stable while rewriting the implementation behind modular handlers
+- Switched plotting defaults to `dk.cachet.carp.location`
+- Made parquet filenames namespace-aware to avoid same-name type collisions
+- Added Google-style docstrings and expanded type annotations across the package
+- Refreshed the README, example scripts, generated type example, and notebook to use the new API
+- Normalized Ruff, MyPy, coverage, and documentation build configuration in `pyproject.toml`
+
+### Removed
-## [Unreleased]
+- Legacy `carp.reader` monolith
+- Legacy `carp.plotting.map_viz` module
+- Old `CarpDataStream`-centric example usage and stale plotting/type-generation references
-## [0.1.0] - 2024-12-02
+## [0.1.0]
### Added
diff --git a/README.md b/README.md
index 87a5fb1..574cfb4 100644
--- a/README.md
+++ b/README.md
@@ -4,202 +4,76 @@
[](https://pypi.org/project/carp-analytics-python/)
[](https://opensource.org/licenses/MIT)
-A high-performance Python library for processing and analysing data from [CARP](https://carp.computerome.dk/) (Copenhagen Research Platform) studies.
-
-> [!BETA]
-> The CARP package is at the beta phase and the APIs and methods might change more often.
-
-## Features
-
-- **Schema Discovery**: Automatically scans and infers the schema of the data
-- **Data Grouping**: Efficiently groups data by any field (e.g., data type, device ID) into separate files
-- **Parquet Export**: Convert JSON data to Parquet for faster subsequent analysis
-- **Participant Management**: Link and track participants across multiple study phases
-- **Visualization**: Generate location heatmaps and other visualizations
-- **Pandas Integration**: Seamlessly work with DataFrames
-
-## Installation
-
-```bash
-pip install carp-analytics-python
-```
-
-### With Optional Dependencies
-
-```bash
-# For pandas/parquet support
-pip install carp-analytics-python[pandas]
-
-# For visualization support
-pip install carp-analytics-python[viz]
-
-# For scientific computing (numpy, scipy, scikit-learn)
-pip install carp-analytics-python[science]
-
-# Install everything
-pip install carp-analytics-python[all]
-```
-
-### Development Installation
-
-```bash
-git clone https://github.com/carp-dk/carp-analytics-python.git
-cd carp-analytics-python
-
-# Using uv (recommended)
-uv sync
-
-# Or using pip
-pip install -e .
-```
+`carp-analytics-python` is a Python library for working with CARP study data. It focuses on streaming JSON records, participant lookup, schema discovery, export, parquet conversion, and optional plotting.
## Quick Start
```python
-from carp import CarpDataStream
+from carp import CarpStudy
-# Initialize with a data file
-data = CarpDataStream("data/study-phase-1/data-streams.json")
-
-# Scan and print the schema
-data.print_schema()
-
-# Convert to Parquet for faster analysis
-data.convert_to_parquet("output_parquet")
-
-# Load data as a DataFrame
-df = data.get_dataframe("dk.cachet.carp.stepcount", "output_parquet")
-print(df.head())
+study = CarpStudy("sleep-data/phase-1-1/data-streams.json")
+print(study.records.count())
+print(study.participants.summary_rows()[0])
```
-## Working with Participants
-
-```python
-from carp import CarpDataStream
-
-# Load data from multiple phases
-data = CarpDataStream([
- "data/phase-1/data-streams.json",
- "data/phase-2/data-streams.json",
-])
-
-# Print participant summary
-data.print_participants()
-
-# Access participant data via email
-participant = data.participant("user@example.com")
-
-# Get participant info
-print(participant.info())
-
-# Get available data types for this participant
-participant.print_data_types()
-
-# Get a DataFrame of step count data
-df = participant.dataframe("dk.cachet.carp.stepcount", "output_parquet")
-```
+## Main API
-## Data Export
+`CarpStudy` is the primary entrypoint.
```python
-# Export specific data type to JSON
-data.export_to_json("heartbeat_data.json", data_type="dk.cachet.carp.heartbeat")
+from carp import CarpStudy
-# Group data by data type
-data.group_by_field("dataStream.dataType.name", "output_by_type")
+study = CarpStudy([
+ "sleep-data/phase-1-1/data-streams.json",
+ "sleep-data/phase-2-1/data-streams.json",
+])
-# Group data by participant
-data.group_by_participant("output_by_participant")
+study.schema.scan()
+study.export.export_json("output.json", data_type="dk.cachet.carp.stepcount")
+study.frames.convert_to_parquet("output_parquet")
+study.participant("alice@example.com").info()
```
-## Visualization
+## CLI
-```python
-# Generate location heatmap for a participant
-participant = data.participant("user@example.com")
-participant.visualize.location(output_file="user_locations.html")
+```bash
+carp schema sleep-data/phase-1-1/data-streams.json
+carp count sleep-data/phase-1-1/data-streams.json
+carp participants sleep-data/phase-1-1/data-streams.json
+carp export sleep-data/phase-1-1/data-streams.json -o output.json -t dk.cachet.carp.stepcount
+carp group sleep-data/phase-1-1/data-streams.json -o grouped_output
+carp convert sleep-data/phase-1-1/data-streams.json -o output_parquet
```
-## Command Line Interface
+## Documentation
-The package includes a CLI for common operations:
+The docs are built with Sphinx, `autodoc`, and `napoleon`.
```bash
-# Show schema of data files
-carp schema data/study/data-streams.json
-
-# Convert JSON to Parquet
-carp convert data/study/data-streams.json -o output_parquet
-
-# Count items in data files
-carp count data/study/data-streams.json
-
-# List participants
-carp participants data/study/data-streams.json
-
-# Export filtered data
-carp export data/study/data-streams.json -o output.json -t dk.cachet.carp.stepcount
-
-# Group data by field
-carp group data/study/data-streams.json -f dataStream.dataType.name -o grouped_output
+python -m pip install sphinx sphinx-rtd-theme
+sphinx-build -b html docs docs/_build/html
```
-## API Reference
-
-### `CarpDataStream`
-
-The main class for working with CARP data streams.
-
-| Method | Description |
-|--------|-------------|
-| `scan_schema()` | Scan and infer the data schema |
-| `print_schema()` | Print the inferred schema as a table |
-| `convert_to_parquet(output_dir)` | Convert JSON to Parquet files |
-| `get_dataframe(data_type, parquet_dir)` | Load data as a pandas DataFrame |
-| `export_to_json(output_path, data_type)` | Export data to JSON file |
-| `group_by_field(field_path, output_dir)` | Group data by a specific field |
-| `participant(email)` | Access participant data via fluent API |
-| `print_participants()` | Print participant summary table |
-
-### `ParticipantAccessor`
-
-Fluent API for accessing individual participant data.
+## Release Automation
-| Method | Description |
-|--------|-------------|
-| `info()` | Get participant information as a dictionary |
-| `print_info()` | Print participant info as a table |
-| `all_data(data_type)` | Generator for all participant data |
-| `data_types()` | Get all unique data types |
-| `dataframe(data_type, parquet_dir)` | Get data as a pandas DataFrame |
-| `visualize.location()` | Generate location heatmap |
+Pushing a new version tag triggers the release workflow. The tag must match the
+package version in `pyproject.toml`, either bare (`0.1.0`) or `v`-prefixed (`v0.1.0`).
-## Requirements
+The release workflow reruns tests, linting, type checks, docs builds, and
+package builds before it publishes the distributions to PyPI and attaches the
+same artifacts to a GitHub release.
-- Python 3.10+
-- ijson (for streaming JSON parsing)
-- rich (for terminal output)
-- tqdm (for progress bars)
+PyPI publishing uses GitHub Actions trusted publishing. Configure a trusted
+publisher on PyPI for this repository and the `release` workflow, with the
+`pypi` environment enabled in GitHub.
-Optional:
-- pandas, pyarrow (for DataFrame and Parquet support)
-- matplotlib, folium (for visualization)
-- numpy, scipy, scikit-learn (for scientific computing)
+## Examples
-## Contributing
-
-Contributions are welcome! Please feel free to submit a Pull Request.
-
-1. Fork the repository
-2. Create your feature branch (`git checkout -b feature/featA`)
-3. Commit your changes (`git commit -m 'Add some featA'`)
-4. Push to the branch (`git push origin feature/featA`)
-5. Open a Pull Request
-
-## Licence
-
-This project is licensed under the MIT Licence - see the [Licence](LICENSE) file for details.
+```bash
+python examples/main.py sleep-data/phase-1-1/data-streams.json
+python examples/disc.py sleep-data/phase-1-1/data-streams.json
+```
-## Acknowledgments
+## Optional Dependencies
-- [CARP - Copenhagen Research Platform](https://carp.dk/)
+`pandas` and `pyarrow` enable dataframe and parquet support. `folium` enables plotting.
diff --git a/docs/_static/.gitkeep b/docs/_static/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/docs/_static/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/docs/_templates/.gitkeep b/docs/_templates/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/docs/_templates/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/docs/api.rst b/docs/api.rst
new file mode 100644
index 0000000..45844b3
--- /dev/null
+++ b/docs/api.rst
@@ -0,0 +1,142 @@
+API Reference
+=============
+
+.. automodule:: carp
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.study
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.core.models
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.core.fields
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.core.files
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.core.naming
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.core.dependencies
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.participants.parser
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.participants.directory
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.participants.view
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.participants.service
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.records.service
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.schema.service
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.export.service
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.frames.service
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.types.infer
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.types.render
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.types.service
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.plotting.prepare
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.plotting.render
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.plotting.service
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.commandline.app
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.commandline.common
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.commandline.schema
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.commandline.count
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.commandline.participants
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.commandline.export
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: carp.commandline.convert
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/architecture.rst b/docs/architecture.rst
new file mode 100644
index 0000000..4500287
--- /dev/null
+++ b/docs/architecture.rst
@@ -0,0 +1,13 @@
+Architecture
+============
+
+The package is intentionally split into small services:
+
+* ``carp.study`` composes the public ``CarpStudy`` entrypoint.
+* ``carp.participants`` handles participant parsing and lookup.
+* ``carp.records`` streams and filters JSON records.
+* ``carp.schema`` infers measurement schemas.
+* ``carp.export`` writes JSON output and grouped files.
+* ``carp.frames`` loads pandas dataframes and writes parquet files.
+* ``carp.types`` generates dataclasses from sampled records.
+* ``carp.plotting`` renders HTML maps for participant data.
diff --git a/docs/cli.rst b/docs/cli.rst
new file mode 100644
index 0000000..e15895a
--- /dev/null
+++ b/docs/cli.rst
@@ -0,0 +1,13 @@
+CLI
+===
+
+The command line interface exposes the same core flows as the Python API.
+
+.. code-block:: bash
+
+ carp schema sleep-data/phase-1-1/data-streams.json
+ carp count sleep-data/phase-1-1/data-streams.json
+ carp participants sleep-data/phase-1-1/data-streams.json
+ carp export sleep-data/phase-1-1/data-streams.json -o output.json -t dk.cachet.carp.stepcount
+ carp group sleep-data/phase-1-1/data-streams.json -o grouped_output
+ carp convert sleep-data/phase-1-1/data-streams.json -o output_parquet
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..ab221f3
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,24 @@
+"""Sphinx configuration for CARP Analytics."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+project_root = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(project_root / "src"))
+
+project = "CARP Analytics Python"
+author = "CARP Team"
+extensions = [
+ "sphinx.ext.autodoc",
+ "sphinx.ext.napoleon",
+ "sphinx.ext.viewcode",
+]
+autodoc_typehints = "description"
+napoleon_google_docstring = True
+napoleon_numpy_docstring = False
+templates_path = ["_templates"]
+exclude_patterns = ["_build"]
+html_theme = "sphinx_rtd_theme"
+html_static_path = ["_static"]
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..e024384
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,11 @@
+CARP Analytics Python
+=====================
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents
+
+ overview
+ api
+ cli
+ architecture
diff --git a/docs/overview.rst b/docs/overview.rst
new file mode 100644
index 0000000..ce5e00c
--- /dev/null
+++ b/docs/overview.rst
@@ -0,0 +1,8 @@
+Overview
+========
+
+``carp-analytics-python`` is built around :class:`carp.study.CarpStudy`.
+It provides services for records, participants, schema discovery, export,
+dataframe conversion, type generation, and plotting.
+
+The package is documented with Google-style docstrings and Sphinx autodoc.
diff --git a/examples/demo.ipynb b/examples/demo.ipynb
index a6bfc40..a8b8e60 100644
--- a/examples/demo.ipynb
+++ b/examples/demo.ipynb
@@ -1,504 +1,134 @@
{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c33366d9",
- "metadata": {},
- "outputs": [],
- "source": [
- "import sys\n",
- "from pathlib import Path\n",
- "\n",
- "if \"src\" not in sys.path:\n",
- " sys.path.append(str(Path.cwd() / \"src\"))\n",
- "\n",
- "from sleepiness import SleepinessData\n",
- "\n",
- "file_paths = [\n",
- " \"sleep-data/phase-1-1/data-streams.json\",\n",
- " \"sleep-data/phase-2-1/data-streams.json\",\n",
- " \"sleep-data/phase-3-1/data-streams.json\"\n",
- "]\n",
- "# OR\n",
- "# file_paths = \"data/phase-1-1/data-streams.json\"\n",
- "\n",
- "sd = SleepinessData(file_paths)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "52dc794a",
- "metadata": {},
- "source": [
- "## Participant Data Integration\n",
- "When loading multiple data folders, the library automatically loads `participant-data.json` from each folder and unifies participants across folders (using email/SSN as identifiers)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "18e00b45",
- "metadata": {},
- "outputs": [],
- "source": [
- "# View all participants across all loaded data folders\n",
- "sd.print_participants()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "914c2bed",
- "metadata": {},
- "source": [
- "### Data with Participant Info\n",
- "Iterate through data items enriched with participant information:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d35b3bd5",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get participant info for a specific deployment\n",
- "# for item in sd._get_item_generator():\n",
- "# deployment_id = item.get('studyDeploymentId')\n",
- "# if deployment_id:\n",
- "# participant = sd.get_participant(deployment_id)\n",
- "# if participant:\n",
- "# print(f\"Deployment: {deployment_id[:30]}...\")\n",
- "# print(f\" Unified ID: {participant.unified_participant_id}\")\n",
- "# print(f\" Email: {participant.email}\")\n",
- "# print(f\" Source folder: {participant.source_folder}\")\n",
- "# break\n",
- "\n",
- "# Get participant info\n",
- "sd.participant(\"test@example.com\").info()\n",
- "sd.participant(\"test@example.com\").print_info()\n",
- "\n",
- "# Get all data for this participant\n",
- "count = 0\n",
- "for item in sd.participant(\"test@example.com\").all_data():\n",
- " print(item)\n",
- " count += 1\n",
- " if count >= 5:\n",
- " print(\"Limit output for demo\")\n",
- " break\n",
- "\n",
- "# Filter by data type\n",
- "for item in sd.participant(\"test@example.com\").all_data(\"dk.cachet.carp.location\"):\n",
- " print(item)\n",
- "\n",
- "# See available fields\n",
- "sd.participant(\"test@example.com\").available_fields()\n",
- "sd.participant(\"test@example.com\").print_available_fields()\n",
- "\n",
- "# See data types available\n",
- "sd.participant(\"test@example.com\").data_types()\n",
- "sd.participant(\"test@example.com\").print_data_types()\n",
- "\n",
- "# Get count\n",
- "sd.participant(\"test@example.com\").count()\n",
- "\n",
- "# Get DataFrame\n",
- "df = sd.participant(\"test@example.com\").dataframe(\"dk.cachet.carp.stepcount\")\n",
- "\n",
- "# Check if exists\n",
- "sd.participant(\"test@example.com\").exists"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6145e273",
- "metadata": {},
- "source": [
- "### DataFrame with Participant Info\n",
- "Get a DataFrame enriched with participant columns:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "507cb8e8",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get DataFrame with participant columns\n",
- "df = sd.get_dataframe_with_participants(\"dk.cachet.carp.stepcount\")\n",
- "if df is not None and not df.empty:\n",
- " print(df[['participant_id', 'participant_email', 'participant_folder']].head())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d158d50b",
- "metadata": {},
- "source": [
- "### Visualize Participant Data on Map\n",
- "Generate a heatmap aggregating data for a specific participant across all their deployments:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "761607c2",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sleepiness.plotting import LocationVisualizer\n",
- "\n",
- "# Create visualizer\n",
- "viz = LocationVisualizer(sd)\n",
- "\n",
- "# Plot heatmap for a specific participant (e.g., P0002 who appears in all 3 phases)\n",
- "viz.plot_participant_heatmap(\n",
- " unified_participant_id=\"P0002\", # Choose a participant from the summary table\n",
- " output_file=\"participant_heatmap.html\",\n",
- " location_type=\"dk.cachet.carp.location\"\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f9477bd0",
- "metadata": {},
- "source": [
- "## 1. Schema Discovery\n",
- "Scan the file to understand the structure of the data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5877bce4",
- "metadata": {},
- "outputs": [],
- "source": [
- "sd.print_schema()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "77f655a6",
- "metadata": {},
- "source": [
- "### Generate Type Definitions\n",
- "You can generate a Python module with dataclasses representing the data schema. This allows for type-safe access to the data, including nested JSON objects."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e6bbdbd5",
- "metadata": {},
- "outputs": [],
- "source": [
- "import importlib\n",
- "import sleepiness.reader\n",
- "importlib.reload(sleepiness.reader)\n",
- "\n",
- "# Re-initialize sd to ensure latest code is used\n",
- "sd = sleepiness.reader.SleepinessData(file_paths)\n",
- "sd.generate_type_definitions(output_file=\"generated_types.py\", sample_size=500)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a2fd20bb",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Example usage of generated types\n",
- "try:\n",
- " import generated_types\n",
- " import importlib\n",
- " importlib.reload(generated_types)\n",
- " \n",
- " # Read one item and convert\n",
- " gen = sd._get_item_generator()\n",
- " item = next(gen)\n",
- " \n",
- " obj = generated_types.SleepinessItem.from_dict(item)\n",
- " print(f\"Converted object type: {type(obj)}\")\n",
- " if obj.dataStream and obj.dataStream.dataType:\n",
- " print(f\"Data Stream: {obj.dataStream.dataType.name}\")\n",
- "except ImportError:\n",
- " print(\"Could not import generated_types. Please restart kernel or check file.\")\n",
- "except Exception as e:\n",
- " print(f\"Error: {e}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "62ff430f",
- "metadata": {},
- "outputs": [],
- "source": [
- "item = next(sd._get_item_generator())\n",
- "obj = generated_types.SleepinessItem.from_dict(item)\n",
- "sd.generate_type_definitions(output_file=\"generated_types.py\", sample_size=500)\n",
- "\n",
- "item = next(sd._get_item_generator())\n",
- "obj = generated_types.SleepinessItem.from_dict(item)\n",
- "\n",
- "# Type-safe access\n",
- "print(obj.dataStream.dataType.name)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f243a62f",
- "metadata": {},
- "source": [
- "## 2. Count Items\n",
- "Count the total number of records in the file."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "814969c3",
- "metadata": {},
- "outputs": [],
- "source": [
- "count = sd.count_items()\n",
- "print(f\"Total items: {count}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3f357eba",
- "metadata": {},
- "source": [
- "## 3. Grouping Data\n",
- "Split the large JSON file into smaller files based on the data type."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0e151d64",
- "metadata": {},
- "source": [
- "### Explore Available Fields\n",
- "You can scan a sample of the data to list all available fields in dot-notation. This is helpful for deciding which field to group by."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5c3c6be4",
- "metadata": {},
- "outputs": [],
- "source": [
- "fields = sd.list_all_fields(sample_size=500)\n",
- "print(\"Available fields for grouping:\")\n",
- "for f in fields:\n",
- " print(f\" - {f}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "56ffebb8",
- "metadata": {},
- "outputs": [],
- "source": [
- "output_groups = \"output_groups\"\n",
- "# sd.group_by_field(\"dataStream.studyDeploymentId\", output_groups)\n",
- "sd.group_by_email(output_groups)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3f9f3497",
- "metadata": {},
- "source": [
- "## 4. Export to JSON\n",
- "Export a specific data type to a separate JSON file."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2e9aa571",
- "metadata": {},
- "outputs": [],
- "source": [
- "sd.export_to_json(\"heartbeat.json\", data_type=\"dk.cachet.carp.heartbeat\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fc1c9eb8",
- "metadata": {},
- "source": [
- "## 5. Convert to Parquet\n",
- "Convert the data to Parquet format for efficient storage and loading."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9a648c91",
- "metadata": {},
- "outputs": [],
- "source": [
- "parquet_dir = \"output_parquet\"\n",
- "sd.convert_to_parquet(parquet_dir)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b5f02117",
- "metadata": {},
- "source": [
- "## 6. Load DataFrame\n",
- "Load data into a pandas DataFrame, utilizing the Parquet files if available."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9d9112e1",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load stepcount data\n",
- "df = sd.get_dataframe(\"dk.cachet.carp.completedtask\", parquet_dir)\n",
- "\n",
- "if df is not None:\n",
- " print(f\"Loaded {len(df)} records\")\n",
- " display(df.head())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7472a455",
- "metadata": {},
- "outputs": [],
- "source": [
- "# df first row\n",
- "df.iloc[313].measurement"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b10095ea",
- "metadata": {},
- "source": [
- "## 7. Plotting\n",
- "Generate a heatmap of user locations and overlay step count data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a7d3b527",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sleepiness.plotting import LocationVisualizer\n",
- "\n",
- "# Initialize visualizer\n",
- "viz = LocationVisualizer(sd)\n",
- "\n",
- "# Pick a user ID (you can find one from the grouping step or list_all_fields)\n",
- "# For demo purposes, let's try to find a valid ID from the loaded dataframe if available, \n",
- "# or just use a hardcoded one if you know it.\n",
- "study_deployment_id = \"0efd5a7f-6428-48db-8099-8d65a62606b4\" # Example ID\n",
- "\n",
- "# Generate heatmap\n",
- "# Note: Ensure you have 'dk.cachet.carp.geolocation' and 'dk.cachet.carp.stepcount' data available\n",
- "# You might need to run convert_to_parquet first if you haven't.\n",
- "\n",
- "\n",
- "viz.plot_user_heatmap(\n",
- " study_deployment_id=study_deployment_id,\n",
- " location_type=\"dk.cachet.carp.location\", # Adjust type name if different\n",
- " step_type=\"dk.cachet.carp.stepcount\", # Adjust type name if different\n",
- " output_file=\"user_heatmap.html\"\n",
- ")\n",
- "\n",
- "# Display the map in the notebook\n",
- "# from IPython.display import IFrame\n",
- "# IFrame(src='user_heatmap.html', width=700, height=600)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "63223a42",
- "metadata": {},
- "source": [
- "### Plotting with Type-Safe Objects\n",
- "You can also convert the data to type-safe objects and pass them directly to the visualizer. This is useful if you want to manipulate the objects before plotting."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "997894b1",
- "metadata": {},
- "outputs": [],
- "source": [
- "# 1. Get DataFrames\n",
- "df_loc = sd.get_dataframe(\"dk.cachet.carp.location\", parquet_dir)\n",
- "df_steps = sd.get_dataframe(\"dk.cachet.carp.stepcount\", parquet_dir)\n",
- "\n",
- "# 2. Filter by User\n",
- "# Using the same ID as above\n",
- "if df_loc is not None and not df_loc.empty:\n",
- " df_loc_user = df_loc[df_loc['studyDeploymentId'] == study_deployment_id]\n",
- " df_steps_user = df_steps[df_steps['studyDeploymentId'] == study_deployment_id] if df_steps is not None else pd.DataFrame()\n",
- "\n",
- " # 3. Convert to Objects\n",
- " # Note: generated_types.SleepinessItem.from_dict expects a dictionary structure matching the JSON.\n",
- " # If df_loc comes from Parquet, it might have nested columns as dicts (if read correctly) or flat columns.\n",
- " # Let's assume it has nested columns or we convert it.\n",
- " \n",
- " # If the dataframe has nested dicts (e.g. 'measurement' column contains dicts):\n",
- " location_items = [generated_types.SleepinessItem.from_dict(row) for row in df_loc_user.to_dict('records')]\n",
- " step_items = [generated_types.SleepinessItem.from_dict(row) for row in df_steps_user.to_dict('records')]\n",
- " \n",
- " print(f\"Converted {len(location_items)} location items and {len(step_items)} step items.\")\n",
- "\n",
- " # 4. Plot\n",
- " viz.plot_heatmap_from_items(\n",
- " location_items=location_items,\n",
- " step_items=step_items,\n",
- " output_file=\"user_heatmap_objects.html\"\n",
- " )\n",
- " \n",
- " # Display\n",
- " # IFrame(src='user_heatmap_objects.html', width=700, height=600)\n",
- "else:\n",
- " print(\"No data found to plot.\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "carp-analytics-python (3.13.5)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.13.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CARP Analytics Notebook Example\n",
+ "\n",
+ "This notebook shows the current `CarpStudy` API with the bundled `sleep-data` dataset or the committed test fixtures."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "from pathlib import Path\n",
+ "\n",
+ "if \"src\" not in sys.path:\n",
+ " sys.path.append(str(Path.cwd() / \"src\"))\n",
+ "\n",
+ "from carp import CarpStudy\n",
+ "\n",
+ "\n",
+ "def default_paths() -> list[Path]:\n",
+ " sleep_paths = sorted(Path(\"sleep-data\").glob(\"phase-*/data-streams.json\"))\n",
+ " if sleep_paths:\n",
+ " return sleep_paths\n",
+ " return sorted(Path(\"tests/fixtures/multi_phase\").glob(\"*/data-streams.json\"))\n",
+ "\n",
+ "\n",
+ "file_paths = default_paths()\n",
+ "study = CarpStudy(file_paths)\n",
+ "file_paths"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Total records: {study.records.count():,}\")\n",
+ "print(f\"Data types: {study.records.data_types()}\")\n",
+ "study.schema.scan()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "participant_rows = study.participants.summary_rows()\n",
+ "participant_rows[:5]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "example_email = next((row[\"emails\"] for row in participant_rows if row[\"emails\"] != \"N/A\"), None)\n",
+ "participant = study.participant(example_email) if example_email else None\n",
+ "participant.info() if participant else None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ " step_frame = study.frames.get_dataframe(\"dk.cachet.carp.stepcount\")\n",
+ "    print(step_frame.head())\n",
+ "except RuntimeError as exc:\n",
+ " print(exc)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "generated_path = Path(\"examples/generated_types.py\")\n",
+ "study.types.generate(generated_path, sample_size=25)\n",
+ "sys.path.append(str(generated_path.parent.resolve()))\n",
+ "import generated_types\n",
+ "\n",
+ "first_record = next(study.records.iter_records())\n",
+ "generated_types.StudyItem.from_dict(first_record)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if participant is not None:\n",
+ " try:\n",
+ " participant.plot_location(output_file=\"examples/user_heatmap.html\")\n",
+ " except RuntimeError as exc:\n",
+ " print(exc)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
}
diff --git a/examples/disc.py b/examples/disc.py
index 4de8498..0832a0a 100644
--- a/examples/disc.py
+++ b/examples/disc.py
@@ -1,59 +1,38 @@
-# discover_schema.py
-import ijson
-from collections import defaultdict
-import yaml
-
-def discover_schema(file_path):
- schema = defaultdict(set)
-
- with open(file_path, 'rb') as f:
- parser = ijson.parse(f)
- current_path = []
- for prefix, event, value in parser:
- current_path = prefix.split('.')
- full_path = '.'.join(current_path)
-
- if event == 'map_key':
- current_path.append(value)
- continue
- elif event in ('start_map', 'start_array'):
- pass
- elif event in ('end_map', 'end_array'):
- if current_path:
- current_path.pop()
- continue
-
- # leaf value
- if value is None:
- type_name = 'null'
- elif event == 'string':
- type_name = 'string'
- elif event in ('number', 'integer'):
- type_name = 'number'
- elif event == 'boolean':
- type_name = 'boolean'
- else:
- type_name = event
-
- schema['.'.join(current_path)].add(type_name)
-
- # Convert to nice nested dict
- nested = {}
- for path, types in schema.items():
- parts = path.split('.')
- d = nested
- for part in parts[:-1]:
- if part not in d:
- d[part] = {'_type': 'object', '_children': {}}
- elif '_children' not in d[part]:
- d[part]['_children'] = {}
- d = d[part]['_children']
- key = parts[-1]
- d[key] = {'_type': list(types)} if len(types) > 1 else {'_type': list(types)[0]}
-
- return nested
-
-if __name__ == '__main__':
- import sys
- schema = discover_schema(sys.argv[1])
- print(yaml.dump(schema, default_flow_style=False, sort_keys=False))
\ No newline at end of file
+"""Compact schema-discovery example for `CarpStudy`."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+from carp import CarpStudy
+
+
+def _default_paths() -> list[Path]:
+ """Return bundled data-stream files for schema discovery."""
+
+ sleep_paths = sorted(Path("sleep-data").glob("phase-*/data-streams.json"))
+ if sleep_paths:
+ return sleep_paths
+ return sorted(Path("tests/fixtures/multi_phase").glob("*/data-streams.json"))
+
+
+def main() -> int:
+ """Load a study and print schema and field examples."""
+
+ file_paths = [Path(arg) for arg in sys.argv[1:]] or _default_paths()
+ study = CarpStudy(file_paths, load_participants=False)
+ print("Observed data types:")
+ for data_type in study.records.data_types():
+ print(f" - {data_type}")
+ print("\nSchema summary:")
+ for data_type, fields in study.schema.scan().items():
+ print(f" {data_type}: {', '.join(fields)}")
+ print("\nSample field paths:")
+ for field in study.records.list_fields(sample_size=3)[:12]:
+ print(f" - {field}")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/examples/generated_types.py b/examples/generated_types.py
index afd1dbe..7e8f3bf 100644
--- a/examples/generated_types.py
+++ b/examples/generated_types.py
@@ -1,250 +1,122 @@
-# Auto-generated type definitions
+"""Example generated dataclasses for CARP study records."""
from __future__ import annotations
-from dataclasses import dataclass
-from typing import List, Optional, Any, Dict
+
import json
+from dataclasses import dataclass
+from typing import Any
-def parse_json_field(value):
- if isinstance(value, str):
- try:
- return json.loads(value)
- except:
- return value
- return value
-
-@dataclass
-class SleepinessItem:
- sequenceId: int = None
- studyDeploymentId: str = None
- deviceRoleName: str = None
- measurement: Measurement = None
- triggerIds: List[int] = None
- syncPoint: SyncPoint = None
- dataStream: DataStream = None
- @classmethod
- def from_dict(cls, obj: Any) -> Any:
- if not isinstance(obj, dict): return obj
- instance = cls()
- val = obj.get('sequenceId')
- instance.sequenceId = val
- val = obj.get('studyDeploymentId')
- instance.studyDeploymentId = val
- val = obj.get('deviceRoleName')
- instance.deviceRoleName = val
- val = obj.get('measurement')
- if val is not None:
- instance.measurement = Measurement.from_dict(val)
- val = obj.get('triggerIds')
- instance.triggerIds = val
- val = obj.get('syncPoint')
- if val is not None:
- instance.syncPoint = SyncPoint.from_dict(val)
- val = obj.get('dataStream')
- if val is not None:
- instance.dataStream = DataStream.from_dict(val)
- return instance
-
-@dataclass
-class Measurement:
- sensorStartTime: int = None
- data: Data = None
+def parse_json_field(value: Any) -> Any:
+ """Parse JSON text when a field stores serialized payload data."""
+
+ if not isinstance(value, str):
+ return value
+ try:
+ return json.loads(value)
+ except json.JSONDecodeError:
+ return value
+
+
+@dataclass(slots=True)
+class DataType:
+ """Data-type metadata for one CARP record."""
+
+ namespace: str | None = None
+ name: str | None = None
@classmethod
def from_dict(cls, obj: Any) -> Any:
- if not isinstance(obj, dict): return obj
- instance = cls()
- val = obj.get('sensorStartTime')
- instance.sensorStartTime = val
- val = obj.get('data')
- if val is not None:
- instance.data = Data.from_dict(val)
- return instance
-
-@dataclass
-class Data:
- __type: str = None
- period: int = None
- deviceType: str = None
- deviceRoleName: str = None
- batteryLevel: int = None
- batteryStatus: str = None
- screenEvent: str = None
- type_: str = None
- confidence: int = None
- triggerId: int = None
- taskName: str = None
- destinationDeviceRoleName: str = None
- control: str = None
- steps: int = None
- time: str = None
- speed: float = None
- isMock: bool = None
- heading: float = None
- accuracy: float = None
- altitude: float = None
- latitude: float = None
- longitude: float = None
- speedAccuracy: float = None
- headingAccuracy: float = None
- verticalAccuracy: float = None
- elapsedRealtimeNanos: int = None
- elapsedRealtimeUncertaintyNanos: float = None
- date: str = None
- sunset: str = None
- country: str = None
- sunrise: str = None
- tempMax: float = None
- tempMin: float = None
- areaName: str = None
- humidity: float = None
- pressure: float = None
- windSpeed: float = None
- cloudiness: float = None
- windDegree: float = None
- temperature: float = None
- weatherMain: str = None
- weatherDescription: str = None
+ """Build a data-type object from a dictionary."""
+
+ return obj if not isinstance(obj, dict) else cls(obj.get("namespace"), obj.get("name"))
+
+
+@dataclass(slots=True)
+class DataStream:
+ """Stream metadata attached to a CARP record."""
+
+ studyDeploymentId: str | None = None
+ deviceRoleName: str | None = None
+ dataType: DataType | None = None
@classmethod
def from_dict(cls, obj: Any) -> Any:
- if not isinstance(obj, dict): return obj
- instance = cls()
- val = obj.get('__type')
- instance.__type = val
- val = obj.get('period')
- instance.period = val
- val = obj.get('deviceType')
- instance.deviceType = val
- val = obj.get('deviceRoleName')
- instance.deviceRoleName = val
- val = obj.get('batteryLevel')
- instance.batteryLevel = val
- val = obj.get('batteryStatus')
- instance.batteryStatus = val
- val = obj.get('screenEvent')
- instance.screenEvent = val
- val = obj.get('type')
- instance.type_ = val
- val = obj.get('confidence')
- instance.confidence = val
- val = obj.get('triggerId')
- instance.triggerId = val
- val = obj.get('taskName')
- instance.taskName = val
- val = obj.get('destinationDeviceRoleName')
- instance.destinationDeviceRoleName = val
- val = obj.get('control')
- instance.control = val
- val = obj.get('steps')
- instance.steps = val
- val = obj.get('time')
- instance.time = val
- val = obj.get('speed')
- instance.speed = val
- val = obj.get('isMock')
- instance.isMock = val
- val = obj.get('heading')
- instance.heading = val
- val = obj.get('accuracy')
- instance.accuracy = val
- val = obj.get('altitude')
- instance.altitude = val
- val = obj.get('latitude')
- instance.latitude = val
- val = obj.get('longitude')
- instance.longitude = val
- val = obj.get('speedAccuracy')
- instance.speedAccuracy = val
- val = obj.get('headingAccuracy')
- instance.headingAccuracy = val
- val = obj.get('verticalAccuracy')
- instance.verticalAccuracy = val
- val = obj.get('elapsedRealtimeNanos')
- instance.elapsedRealtimeNanos = val
- val = obj.get('elapsedRealtimeUncertaintyNanos')
- instance.elapsedRealtimeUncertaintyNanos = val
- val = obj.get('date')
- instance.date = val
- val = obj.get('sunset')
- instance.sunset = val
- val = obj.get('country')
- instance.country = val
- val = obj.get('sunrise')
- instance.sunrise = val
- val = obj.get('tempMax')
- instance.tempMax = val
- val = obj.get('tempMin')
- instance.tempMin = val
- val = obj.get('areaName')
- instance.areaName = val
- val = obj.get('humidity')
- instance.humidity = val
- val = obj.get('pressure')
- instance.pressure = val
- val = obj.get('windSpeed')
- instance.windSpeed = val
- val = obj.get('cloudiness')
- instance.cloudiness = val
- val = obj.get('windDegree')
- instance.windDegree = val
- val = obj.get('temperature')
- instance.temperature = val
- val = obj.get('weatherMain')
- instance.weatherMain = val
- val = obj.get('weatherDescription')
- instance.weatherDescription = val
- return instance
-
-@dataclass
-class SyncPoint:
- synchronizedOn: str = None
- sensorTimestampAtSyncPoint: int = None
- relativeClockSpeed: float = None
+ """Build stream metadata from a dictionary."""
+
+ if not isinstance(obj, dict):
+ return obj
+ return cls(
+ studyDeploymentId=obj.get("studyDeploymentId"),
+ deviceRoleName=obj.get("deviceRoleName"),
+ dataType=DataType.from_dict(obj.get("dataType")),
+ )
+
+
+@dataclass(slots=True)
+class MeasurementData:
+ """Common measurement payload used in the examples."""
+
+ steps: int | None = None
+ latitude: float | None = None
+ longitude: float | None = None
+ response_json: Any = None
@classmethod
def from_dict(cls, obj: Any) -> Any:
- if not isinstance(obj, dict): return obj
- instance = cls()
- val = obj.get('synchronizedOn')
- instance.synchronizedOn = val
- val = obj.get('sensorTimestampAtSyncPoint')
- instance.sensorTimestampAtSyncPoint = val
- val = obj.get('relativeClockSpeed')
- instance.relativeClockSpeed = val
- return instance
-
-@dataclass
-class DataStream:
- studyDeploymentId: str = None
- deviceRoleName: str = None
- dataType: DataType = None
+ """Build a measurement payload from a dictionary."""
+
+ if not isinstance(obj, dict):
+ return obj
+ return cls(
+ steps=obj.get("steps"),
+ latitude=obj.get("latitude"),
+ longitude=obj.get("longitude"),
+ response_json=parse_json_field(obj.get("response_json")),
+ )
+
+
+@dataclass(slots=True)
+class Measurement:
+ """Measurement wrapper for one CARP record."""
+
+ sensorStartTime: int | None = None
+ data: MeasurementData | None = None
@classmethod
def from_dict(cls, obj: Any) -> Any:
- if not isinstance(obj, dict): return obj
- instance = cls()
- val = obj.get('studyDeploymentId')
- instance.studyDeploymentId = val
- val = obj.get('deviceRoleName')
- instance.deviceRoleName = val
- val = obj.get('dataType')
- if val is not None:
- instance.dataType = DataType.from_dict(val)
- return instance
-
-@dataclass
-class DataType:
- namespace: str = None
- name: str = None
+ """Build a measurement object from a dictionary."""
+
+ if not isinstance(obj, dict):
+ return obj
+ return cls(
+ sensorStartTime=obj.get("sensorStartTime"),
+ data=MeasurementData.from_dict(obj.get("data")),
+ )
+
+
+@dataclass(slots=True)
+class StudyItem:
+ """Example typed CARP record used by the examples notebook."""
+
+ sequenceId: int | None = None
+ studyDeploymentId: str | None = None
+ deviceRoleName: str | None = None
+ triggerIds: list[Any] | None = None
+ measurement: Measurement | None = None
+ dataStream: DataStream | None = None
@classmethod
def from_dict(cls, obj: Any) -> Any:
- if not isinstance(obj, dict): return obj
- instance = cls()
- val = obj.get('namespace')
- instance.namespace = val
- val = obj.get('name')
- instance.name = val
- return instance
+ """Build a typed study item from a dictionary."""
+
+ if not isinstance(obj, dict):
+ return obj
+ return cls(
+ sequenceId=obj.get("sequenceId"),
+ studyDeploymentId=obj.get("studyDeploymentId"),
+ deviceRoleName=obj.get("deviceRoleName"),
+ triggerIds=obj.get("triggerIds"),
+ measurement=Measurement.from_dict(obj.get("measurement")),
+ dataStream=DataStream.from_dict(obj.get("dataStream")),
+ )
diff --git a/examples/main.py b/examples/main.py
index 24edf10..0ff30ed 100644
--- a/examples/main.py
+++ b/examples/main.py
@@ -1,41 +1,49 @@
#!/usr/bin/env python3
-"""
-Example script demonstrating basic usage of the carp-analytics-python library.
+"""End-to-end example usage for `CarpStudy`."""
-Run from the project root after installing the package:
- python examples/main.py data/study/data-streams.json
-"""
+from __future__ import annotations
-from carp import CarpDataStream
import sys
+from pathlib import Path
-def main():
- file_path = "data/study/data-streams.json"
- if len(sys.argv) > 1:
- file_path = sys.argv[1]
-
- print(f"Loading {file_path}...")
- data = CarpDataStream(file_path)
-
- # Scan and print schema
- print("Scanning schema...")
- data.print_schema()
-
- # Example: Grouping data by data type
- # output_dir = "output_groups"
- # print(f"Grouping data into {output_dir}...")
- # data.group_by_field("dataStream.dataType.name", output_dir)
-
- # Convert to Parquet
- parquet_dir = "output_parquet"
- data.convert_to_parquet(parquet_dir)
-
- # Load back as DataFrame
- df = data.get_dataframe("dk.cachet.carp.stepcount", parquet_dir)
- if df is not None:
- print(f"Loaded {len(df)} stepcount records.")
- print(df.head())
-
-
-if __name__ == '__main__':
- main()
\ No newline at end of file
+from carp import CarpStudy
+
+
+def _default_paths() -> list[Path]:
+ """Return bundled study paths for the example."""
+
+ sleep_paths = sorted(Path("sleep-data").glob("phase-*/data-streams.json"))
+ if sleep_paths:
+ return sleep_paths
+ fixture_root = Path("tests/fixtures/multi_phase")
+ return sorted(fixture_root.glob("*/data-streams.json"))
+
+
+def main() -> int:
+ """Run the example against one or more study files."""
+
+ file_paths = [Path(arg) for arg in sys.argv[1:]] or _default_paths()
+ study = CarpStudy(file_paths, load_participants=True)
+ print(f"Loaded {len(file_paths)} study file(s)")
+ print(f"Total records: {study.records.count():,}")
+ print(f"Data types: {', '.join(study.records.data_types())}")
+ rows = study.participants.summary_rows()
+ print(f"Unified participants: {len(rows)}")
+ for row in rows[:3]:
+ print(f" {row['unified_id']}: {row['emails']} ({row['deployments']} deployments)")
+ example_email = next((row["emails"] for row in rows if row["emails"] != "N/A"), None)
+ if example_email:
+ participant = study.participant(example_email)
+ print(f"Example participant: {participant.info()}")
+ try:
+ step_frame = study.frames.get_dataframe("dk.cachet.carp.stepcount")
+ except RuntimeError as exc:
+ print(f"Skipping dataframe example: {exc}")
+ else:
+ print("Step-count preview:")
+ print(step_frame.head().to_string(index=False))
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/pyproject.toml b/pyproject.toml
index 0388c4e..f657906 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,6 +80,17 @@ dev = [
"mypy>=1.5.0",
"ruff>=0.1.0",
"pre-commit>=3.4.0",
+ "sphinx>=8.1.3",
+ "sphinx-rtd-theme>=3.1.0",
+]
+test = [
+ "pandas>=2.0.0",
+ "pyarrow>=14.0.0",
+ "matplotlib>=3.7.0",
+ "folium>=0.14.0",
+]
+docs = [
+ "sphinx>=8.0.0",
]
[build-system]
@@ -101,7 +112,10 @@ packages = ["src/carp"]
[tool.ruff]
target-version = "py310"
-line-length = 100
+line-length = 140
+extend-exclude = ["examples/demo.ipynb"]
+
+[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
@@ -116,7 +130,7 @@ ignore = [
"B008", # do not perform function calls in argument defaults
]
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
known-first-party = ["carp"]
[tool.mypy]
@@ -126,16 +140,25 @@ warn_unused_configs = true
disallow_untyped_defs = true
exclude = ["examples/", "tests/"]
+[[tool.mypy.overrides]]
+module = ["ijson", "pandas", "pyarrow", "pyarrow.*", "folium", "folium.*"]
+ignore_missing_imports = true
+
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
-addopts = "-v --tb=short"
+addopts = "-v --tb=short --cov=src/carp --cov-branch --cov-report=term-missing"
+markers = [
+ "real_data: optional tests that use local sleep-data when available",
+]
[tool.coverage.run]
source = ["src/carp"]
branch = true
[tool.coverage.report]
+fail_under = 100
+show_missing = true
exclude_lines = [
"pragma: no cover",
"if TYPE_CHECKING:",
diff --git a/src/carp/__init__.py b/src/carp/__init__.py
index 40bd9bd..e24f3e6 100644
--- a/src/carp/__init__.py
+++ b/src/carp/__init__.py
@@ -1,20 +1,6 @@
-"""
-CARP Analytics Python - A high-performance library for processing CARP study data.
+"""Public package interface for CARP Analytics."""
-This library provides tools for streaming, processing, and analysing large JSON
-data streams from CARP (Copenhagen Research Platform) clinical and research studies.
-"""
+from .study import CarpStudy
-from .reader import CarpDataStream, ParticipantManager, ParticipantInfo, ParticipantAccessor
-
-__version__ = "0.1.0"
-__author__ = "Copenhagen Research Platform"
-__email__ = "support@carp.dk"
-
-__all__ = [
- "CarpDataStream",
- "ParticipantManager",
- "ParticipantInfo",
- "ParticipantAccessor",
- "__version__",
-]
+__all__ = ["CarpStudy"]
+__version__ = "0.2.0"
diff --git a/src/carp/cli.py b/src/carp/cli.py
index 586416f..e751160 100644
--- a/src/carp/cli.py
+++ b/src/carp/cli.py
@@ -1,139 +1,10 @@
-"""
-Command-line interface for CARP Analytics Python.
-"""
+"""Command-line entrypoint for CARP Analytics."""
-import argparse
-import sys
-
-from rich.console import Console
-
-console = Console()
+from __future__ import annotations
+import sys
-def main() -> int:
- """Main entry point for the CLI."""
- parser = argparse.ArgumentParser(
- prog="carp",
- description="CARP Analytics - Process and analyze data from CARP research studies",
- )
- parser.add_argument(
- "--version",
- action="store_true",
- help="Show version and exit",
- )
-
- subparsers = parser.add_subparsers(dest="command", help="Available commands")
-
- # Schema command
- schema_parser = subparsers.add_parser("schema", help="Scan and print data schema")
- schema_parser.add_argument("files", nargs="+", help="JSON data files to process")
-
- # Convert command
- convert_parser = subparsers.add_parser("convert", help="Convert JSON to Parquet")
- convert_parser.add_argument("files", nargs="+", help="JSON data files to convert")
- convert_parser.add_argument(
- "-o", "--output",
- default="output_parquet",
- help="Output directory for Parquet files (default: output_parquet)",
- )
- convert_parser.add_argument(
- "--batch-size",
- type=int,
- default=10000,
- help="Batch size for conversion (default: 10000)",
- )
-
- # Count command
- count_parser = subparsers.add_parser("count", help="Count items in data files")
- count_parser.add_argument("files", nargs="+", help="JSON data files to count")
-
- # Participants command
- participants_parser = subparsers.add_parser(
- "participants",
- help="List participants from data files",
- )
- participants_parser.add_argument("files", nargs="+", help="JSON data files to process")
-
- # Export command
- export_parser = subparsers.add_parser("export", help="Export data to JSON")
- export_parser.add_argument("files", nargs="+", help="JSON data files to process")
- export_parser.add_argument(
- "-o", "--output",
- required=True,
- help="Output JSON file path",
- )
- export_parser.add_argument(
- "-t", "--type",
- dest="data_type",
- help="Filter by data type (e.g., dk.cachet.carp.stepcount)",
- )
-
- # Group command
- group_parser = subparsers.add_parser("group", help="Group data by field")
- group_parser.add_argument("files", nargs="+", help="JSON data files to process")
- group_parser.add_argument(
- "-f", "--field",
- default="dataStream.dataType.name",
- help="Field path to group by (default: dataStream.dataType.name)",
- )
- group_parser.add_argument(
- "-o", "--output",
- default="output_grouped",
- help="Output directory (default: output_grouped)",
- )
-
- args = parser.parse_args()
-
- if args.version:
- from carp import __version__
- console.print(f"carp-analytics-python version {__version__}")
- return 0
-
- if not args.command:
- parser.print_help()
- return 0
-
- # Import here to avoid slow startup for --help
- from carp import CarpDataStream
-
- try:
- if args.command == "schema":
- sd = CarpDataStream(args.files, load_participants=False)
- sd.print_schema()
-
- elif args.command == "convert":
- sd = CarpDataStream(args.files, load_participants=False)
- sd.convert_to_parquet(args.output, batch_size=args.batch_size)
-
- elif args.command == "count":
- sd = CarpDataStream(args.files, load_participants=False)
- count = sd.count_items()
- console.print(f"[bold green]Total items: {count:,}[/bold green]")
-
- elif args.command == "participants":
- sd = CarpDataStream(args.files, load_participants=True)
- sd.print_participants()
-
- elif args.command == "export":
- sd = CarpDataStream(args.files, load_participants=False)
- sd.export_to_json(args.output, data_type=args.data_type)
-
- elif args.command == "group":
- sd = CarpDataStream(args.files, load_participants=False)
- sd.group_by_field(args.field, args.output)
-
- except FileNotFoundError as e:
- console.print(f"[bold red]Error: {e}[/bold red]")
- return 1
- except KeyboardInterrupt:
- console.print("\n[yellow]Interrupted.[/yellow]")
- return 130
- except Exception as e:
- console.print(f"[bold red]Error: {e}[/bold red]")
- return 1
-
- return 0
-
+from carp.commandline.app import main
if __name__ == "__main__":
sys.exit(main())
diff --git a/src/carp/commandline/__init__.py b/src/carp/commandline/__init__.py
new file mode 100644
index 0000000..9472d5e
--- /dev/null
+++ b/src/carp/commandline/__init__.py
@@ -0,0 +1 @@
+"""Command-line support for CARP Analytics."""
diff --git a/src/carp/commandline/app.py b/src/carp/commandline/app.py
new file mode 100644
index 0000000..90e5a9f
--- /dev/null
+++ b/src/carp/commandline/app.py
@@ -0,0 +1,56 @@
+"""Argument parsing and dispatch for the CARP CLI."""
+
+from __future__ import annotations
+
+import argparse
+
+from .common import console, print_version
+from .convert import register as register_convert
+from .count import register as register_count
+from .export import register_export, register_group
+from .participants import register as register_participants
+from .schema import register as register_schema
+
+
+def _build_parser() -> argparse.ArgumentParser:
+ """Construct the top-level CLI parser."""
+
+ parser = argparse.ArgumentParser(
+ prog="carp",
+ description="CARP Analytics - Process and analyze data from CARP research studies",
+ )
+ parser.add_argument("--version", action="store_true", help="Show version and exit")
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
+ for register in (
+ register_schema,
+ register_convert,
+ register_count,
+ register_participants,
+ register_export,
+ register_group,
+ ):
+ register(subparsers)
+ return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+ """Run the CARP command-line interface."""
+
+ parser = _build_parser()
+ args = parser.parse_args(argv)
+ if args.version:
+ return print_version()
+ if not args.command:
+ parser.print_help()
+ return 0
+ try:
+ return int(args.handler(args))
+ except FileNotFoundError as exc:
+ console.print(f"[bold red]Error: {exc}[/bold red]")
+ return 1
+ except KeyboardInterrupt:
+ console.print("\n[yellow]Interrupted.[/yellow]")
+ return 130
+ except Exception as exc:
+ console.print(f"[bold red]Error: {exc}[/bold red]")
+ return 1
diff --git a/src/carp/commandline/common.py b/src/carp/commandline/common.py
new file mode 100644
index 0000000..0f558f4
--- /dev/null
+++ b/src/carp/commandline/common.py
@@ -0,0 +1,48 @@
+"""Shared CLI helpers and presenters."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from rich.console import Console
+from rich.table import Table
+
+from carp import __version__
+from carp.study import CarpStudy
+
+console = Console()
+
+
+def build_study(files: Any, load_participants: bool = True) -> CarpStudy:
+ """Construct a study from CLI arguments."""
+
+ return CarpStudy(files, load_participants=load_participants)
+
+
+def print_version() -> int:
+ """Print the package version and return a success status."""
+
+ console.print(f"carp-analytics-python version {__version__}")
+ return 0
+
+
+def print_schema(schema_map: dict[str, list[str]]) -> None:
+ """Render a schema table."""
+
+ table = Table(title="Inferred Schema")
+ table.add_column("Data Type", style="cyan")
+ table.add_column("Fields", style="magenta")
+ for data_type, fields in schema_map.items():
+ table.add_row(data_type, ", ".join(fields))
+ console.print(table)
+
+
+def print_participants(rows: list[dict[str, str]]) -> None:
+ """Render participant summary rows."""
+
+ table = Table(title="Participants Summary")
+ for column in ("unified_id", "deployments", "folders", "emails", "ssns", "names"):
+ table.add_column(column.replace("_", " ").title())
+ for row in rows:
+ table.add_row(*(row[key] for key in ("unified_id", "deployments", "folders", "emails", "ssns", "names")))
+ console.print(table)
diff --git a/src/carp/commandline/convert.py b/src/carp/commandline/convert.py
new file mode 100644
index 0000000..fede4d2
--- /dev/null
+++ b/src/carp/commandline/convert.py
@@ -0,0 +1,28 @@
+"""CLI command for parquet conversion."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .common import build_study, console
+
+
+def register(subparsers: Any) -> None:
+ """Register the `convert` subcommand."""
+
+ parser = subparsers.add_parser("convert", help="Convert JSON to Parquet")
+ parser.add_argument("files", nargs="+", help="JSON data files to convert")
+ parser.add_argument("-o", "--output", default="output_parquet", help="Output directory")
+ parser.add_argument("--batch-size", type=int, default=10_000, help="Batch size")
+ parser.set_defaults(handler=run)
+
+
+def run(args: Any) -> int:
+ """Execute the `convert` subcommand."""
+
+ files = build_study(args.files, load_participants=False).frames.convert_to_parquet(
+ args.output,
+ batch_size=args.batch_size,
+ )
+ console.print(f"[bold green]Created {len(files)} parquet files.[/bold green]")
+ return 0
diff --git a/src/carp/commandline/count.py b/src/carp/commandline/count.py
new file mode 100644
index 0000000..37eebfa
--- /dev/null
+++ b/src/carp/commandline/count.py
@@ -0,0 +1,23 @@
+"""CLI command for record counting."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .common import build_study, console
+
+
+def register(subparsers: Any) -> None:
+ """Register the `count` subcommand."""
+
+ parser = subparsers.add_parser("count", help="Count items in data files")
+ parser.add_argument("files", nargs="+", help="JSON data files to count")
+ parser.set_defaults(handler=run)
+
+
+def run(args: Any) -> int:
+ """Execute the `count` subcommand."""
+
+ count = build_study(args.files, load_participants=False).records.count()
+ console.print(f"[bold green]Total items: {count:,}[/bold green]")
+ return 0
diff --git a/src/carp/commandline/export.py b/src/carp/commandline/export.py
new file mode 100644
index 0000000..b8be9b9
--- /dev/null
+++ b/src/carp/commandline/export.py
@@ -0,0 +1,54 @@
+"""CLI commands for exporting study data."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .common import build_study, console
+
+
+def register_export(subparsers: Any) -> None:
+ """Register the `export` subcommand."""
+
+ parser = subparsers.add_parser("export", help="Export data to JSON")
+ parser.add_argument("files", nargs="+", help="JSON data files to process")
+ parser.add_argument("-o", "--output", required=True, help="Output JSON file path")
+ parser.add_argument("-t", "--type", dest="data_type", help="Filter by data type")
+ parser.set_defaults(handler=run_export)
+
+
+def register_group(subparsers: Any) -> None:
+ """Register the `group` subcommand."""
+
+ parser = subparsers.add_parser("group", help="Group data by field")
+ parser.add_argument("files", nargs="+", help="JSON data files to process")
+ parser.add_argument(
+ "-f",
+ "--field",
+ default="dataStream.dataType.name",
+ help="Field path to group by",
+ )
+ parser.add_argument("-o", "--output", default="output_grouped", help="Output directory")
+ parser.set_defaults(handler=run_group)
+
+
+def run_export(args: Any) -> int:
+ """Execute the `export` subcommand."""
+
+ output = build_study(args.files, load_participants=False).export.export_json(
+ args.output,
+ args.data_type,
+ )
+ console.print(f"[bold green]Exported data to {output}[/bold green]")
+ return 0
+
+
+def run_group(args: Any) -> int:
+ """Execute the `group` subcommand."""
+
+ files = build_study(args.files, load_participants=False).export.group_by_field(
+ args.field,
+ args.output,
+ )
+ console.print(f"[bold green]Created {len(files)} grouped files.[/bold green]")
+ return 0
diff --git a/src/carp/commandline/participants.py b/src/carp/commandline/participants.py
new file mode 100644
index 0000000..a58ddb5
--- /dev/null
+++ b/src/carp/commandline/participants.py
@@ -0,0 +1,22 @@
+"""CLI command for participant summaries."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .common import build_study, print_participants
+
+
+def register(subparsers: Any) -> None:
+ """Register the `participants` subcommand."""
+
+ parser = subparsers.add_parser("participants", help="List participants from data files")
+ parser.add_argument("files", nargs="+", help="JSON data files to process")
+ parser.set_defaults(handler=run)
+
+
+def run(args: Any) -> int:
+ """Execute the `participants` subcommand."""
+
+ print_participants(build_study(args.files).participants.summary_rows())
+ return 0
diff --git a/src/carp/commandline/schema.py b/src/carp/commandline/schema.py
new file mode 100644
index 0000000..c0d1386
--- /dev/null
+++ b/src/carp/commandline/schema.py
@@ -0,0 +1,22 @@
+"""CLI command for schema discovery."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .common import build_study, print_schema
+
+
+def register(subparsers: Any) -> None:
+ """Register the `schema` subcommand."""
+
+ parser = subparsers.add_parser("schema", help="Scan and print data schema")
+ parser.add_argument("files", nargs="+", help="JSON data files to process")
+ parser.set_defaults(handler=run)
+
+
+def run(args: Any) -> int:
+ """Execute the `schema` subcommand."""
+
+ print_schema(build_study(args.files, load_participants=False).schema.scan())
+ return 0
diff --git a/src/carp/constants.py b/src/carp/constants.py
new file mode 100644
index 0000000..f7ad041
--- /dev/null
+++ b/src/carp/constants.py
@@ -0,0 +1,7 @@
+"""Shared constants for CARP Analytics."""
+
+DATA_STREAM_FILE = "data-streams.json"
+PARTICIPANT_FILE = "participant-data.json"
+DEFAULT_LOCATION_TYPE = "dk.cachet.carp.location"
+DEFAULT_STEP_TYPE = "dk.cachet.carp.stepcount"
+UNKNOWN_VALUE = "unknown"
diff --git a/src/carp/core/__init__.py b/src/carp/core/__init__.py
new file mode 100644
index 0000000..d99bee6
--- /dev/null
+++ b/src/carp/core/__init__.py
@@ -0,0 +1,18 @@
+"""Core utilities shared across CARP Analytics services."""
+
+from .fields import collect_field_paths, deployment_id_from_record, full_data_type
+from .files import JsonArrayWriter, iter_json_array, resolve_paths
+from .models import ParticipantInfo
+from .naming import parquet_stem, sanitize_filename
+
+__all__ = [
+ "JsonArrayWriter",
+ "ParticipantInfo",
+ "collect_field_paths",
+ "deployment_id_from_record",
+ "full_data_type",
+ "iter_json_array",
+ "parquet_stem",
+ "resolve_paths",
+ "sanitize_filename",
+]
diff --git a/src/carp/core/dependencies.py b/src/carp/core/dependencies.py
new file mode 100644
index 0000000..767a266
--- /dev/null
+++ b/src/carp/core/dependencies.py
@@ -0,0 +1,33 @@
+"""Optional dependency helpers."""
+
+from __future__ import annotations
+
+import importlib
+import importlib.util
+from typing import Any
+
+
+def module_available(module_name: str) -> bool:
+ """Return whether a module can be imported."""
+
+ return importlib.util.find_spec(module_name) is not None
+
+
+def import_or_raise(module_name: str, extra_name: str) -> Any:
+ """Import a dependency or raise a helpful runtime error.
+
+ Args:
+ module_name: Importable module name.
+ extra_name: Package extra or install hint shown to the user.
+
+ Returns:
+ The imported module.
+
+ Raises:
+ RuntimeError: If the dependency is unavailable.
+ """
+
+ try:
+ return importlib.import_module(module_name)
+ except ImportError as exc: # pragma: no cover - exercised through callers.
+ raise RuntimeError(f"{module_name} is required for this feature. Install the `{extra_name}` extras.") from exc
diff --git a/src/carp/core/fields.py b/src/carp/core/fields.py
new file mode 100644
index 0000000..2ae681a
--- /dev/null
+++ b/src/carp/core/fields.py
@@ -0,0 +1,56 @@
+"""Helpers for nested CARP record structures."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from carp.constants import UNKNOWN_VALUE
+
+
+def get_nested_value(value: Any, path: str, default: Any = None) -> Any:
+ """Resolve a dot-separated path from nested dictionaries."""
+
+ current = value
+ for part in path.split("."):
+ if not isinstance(current, dict):
+ return default
+ current = current.get(part)
+ if current is None:
+ return default
+ return current
+
+
+def collect_field_paths(value: Any, prefix: str = "") -> set[str]:
+ """Collect dot-separated field paths from nested dictionaries."""
+
+ paths: set[str] = set()
+ if isinstance(value, dict):
+ for key, child in value.items():
+ path = f"{prefix}.{key}" if prefix else key
+ paths.add(path)
+ paths.update(collect_field_paths(child, path))
+ elif isinstance(value, list):
+ if prefix:
+ paths.add(f"{prefix}[]")
+ if value:
+ paths.update(collect_field_paths(value[0], f"{prefix}[]"))
+ return paths
+
+
+def full_data_type(item: dict[str, Any]) -> str:
+    """Return the fully qualified data type for a CARP record."""
+
+    # Guard: a malformed record may carry a non-mapping dataType payload.
+    raw = get_nested_value(item, "dataStream.dataType", {})
+    data_type = raw if isinstance(raw, dict) else {}
+    return f"{data_type.get('namespace', UNKNOWN_VALUE)}.{data_type.get('name', UNKNOWN_VALUE)}"
+
+
+def deployment_id_from_record(item: dict[str, Any]) -> str | None:
+ """Return the deployment identifier for a CARP record."""
+
+ top_level = item.get("studyDeploymentId")
+ if isinstance(top_level, str):
+ return top_level
+ nested = get_nested_value(item, "dataStream.studyDeploymentId")
+ return nested if isinstance(nested, str) else None
diff --git a/src/carp/core/files.py b/src/carp/core/files.py
new file mode 100644
index 0000000..51ed658
--- /dev/null
+++ b/src/carp/core/files.py
@@ -0,0 +1,53 @@
+"""Filesystem helpers for CARP Analytics."""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Iterator, Sequence
+from pathlib import Path
+from typing import Any
+
+import ijson
+
+
+def resolve_paths(file_paths: str | Path | Sequence[str | Path]) -> tuple[Path, ...]:
+ """Validate and normalize data-stream paths."""
+
+ raw_paths = [file_paths] if isinstance(file_paths, (str, Path)) else list(file_paths)
+ resolved = tuple(Path(path) for path in raw_paths)
+ for path in resolved:
+ if not path.exists():
+ raise FileNotFoundError(f"File not found: {path}")
+ return resolved
+
+
+def iter_json_array(file_path: Path) -> Iterator[dict[str, Any]]:
+ """Stream JSON array items from disk using `ijson`."""
+
+ with file_path.open("rb") as handle:
+ yield from ijson.items(handle, "item", use_float=True)
+
+
+class JsonArrayWriter:
+ """Incrementally write JSON arrays without buffering the full payload."""
+
+ def __init__(self, output_path: Path):
+ self.output_path = output_path
+ self.output_path.parent.mkdir(parents=True, exist_ok=True)
+ self._handle = self.output_path.open("w", encoding="utf-8")
+ self._first_item = True
+ self._handle.write("[")
+
+ def write(self, item: dict[str, Any]) -> None:
+ """Append one JSON object to the array."""
+
+ if not self._first_item:
+ self._handle.write(",")
+ json.dump(item, self._handle)
+ self._first_item = False
+
+ def close(self) -> None:
+ """Finalize and close the output file."""
+
+ self._handle.write("]")
+ self._handle.close()
diff --git a/src/carp/core/models.py b/src/carp/core/models.py
new file mode 100644
index 0000000..e05b467
--- /dev/null
+++ b/src/carp/core/models.py
@@ -0,0 +1,28 @@
+"""Domain models shared by multiple subsystems."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass
+from typing import Any
+
+
+@dataclass(slots=True)
+class ParticipantInfo:
+ """Normalized participant metadata for one deployment."""
+
+ study_deployment_id: str
+ role_name: str = "Participant"
+ full_name: str | None = None
+ sex: str | None = None
+ ssn: str | None = None
+ user_id: str | None = None
+ email: str | None = None
+ consent_signed: bool = False
+ consent_timestamp: str | None = None
+ source_folder: str | None = None
+ unified_participant_id: str | None = None
+
+ def to_dict(self) -> dict[str, Any]:
+ """Return a JSON-serializable representation of the participant."""
+
+ return asdict(self)
diff --git a/src/carp/core/naming.py b/src/carp/core/naming.py
new file mode 100644
index 0000000..f143977
--- /dev/null
+++ b/src/carp/core/naming.py
@@ -0,0 +1,20 @@
+"""File and identifier naming helpers."""
+
+from __future__ import annotations
+
+from carp.constants import UNKNOWN_VALUE
+
+
+def sanitize_filename(value: str, allowed: str = "-_") -> str:
+ """Return a filesystem-safe representation of a string."""
+
+ safe = "".join(char for char in value if char.isalnum() or char in allowed).strip()
+ return safe or UNKNOWN_VALUE
+
+
+def parquet_stem(data_type: str) -> str:
+ """Return a namespace-aware parquet stem for a data type."""
+
+ namespace, _, name = data_type.rpartition(".")
+ stem = f"{namespace}__{name}" if namespace else data_type
+ return sanitize_filename(stem, allowed="-_.")
diff --git a/src/carp/export/__init__.py b/src/carp/export/__init__.py
new file mode 100644
index 0000000..4d036f8
--- /dev/null
+++ b/src/carp/export/__init__.py
@@ -0,0 +1,5 @@
+"""Data export services."""
+
+from .service import ExportService
+
+__all__ = ["ExportService"]
diff --git a/src/carp/export/service.py b/src/carp/export/service.py
new file mode 100644
index 0000000..a04ba98
--- /dev/null
+++ b/src/carp/export/service.py
@@ -0,0 +1,93 @@
+"""JSON export and grouping services."""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Iterable
+from pathlib import Path
+from typing import Any
+
+from carp.constants import UNKNOWN_VALUE
+from carp.core.fields import get_nested_value
+from carp.core.files import JsonArrayWriter
+from carp.core.naming import sanitize_filename
+
+
+class ExportService:
+ """Export CARP records to JSON arrays."""
+
+ def __init__(self, records: Any) -> None:
+ self._records = records
+
+ def export_json(self, output_path: str | Path, data_type: str | None = None) -> Path:
+ """Write matching records to a JSON array file."""
+
+ writer = JsonArrayWriter(Path(output_path))
+ try:
+ for item in self._records.iter_records(data_type):
+ writer.write(item)
+ finally:
+ writer.close()
+ return Path(output_path)
+
+ def group_by_field(self, field_path: str, output_dir: str | Path) -> list[Path]:
+ """Group records by a nested field path."""
+
+ def key_factory(item: dict[str, Any]) -> str:
+ value = get_nested_value(item, field_path, UNKNOWN_VALUE)
+ return sanitize_filename(str(value), allowed="-_.@")
+
+ return self._write_groups(Path(output_dir), self._records.iter_records(), key_factory)
+
+ def group_by_participant(self, output_dir: str | Path, data_type: str | None = None) -> list[Path]:
+ """Group records by unified participant identifier."""
+
+ def key_factory(item: dict[str, Any]) -> str:
+ participant = item.get("_participant", {})
+ return sanitize_filename(
+ str(participant.get("unified_participant_id", UNKNOWN_VALUE)),
+ allowed="-_.@",
+ )
+
+ return self._write_groups(Path(output_dir), self._records.iter_with_participants(data_type), key_factory)
+
+ def group_by_identity(
+ self,
+ field_name: str,
+ output_dir: str | Path,
+ data_type: str | None = None,
+ ) -> list[Path]:
+ """Group records by a participant identity field."""
+
+ def key_factory(item: dict[str, Any]) -> str:
+ participant = item.get("_participant", {})
+ value = participant.get(field_name) or UNKNOWN_VALUE
+ return sanitize_filename(str(value), allowed="-_.@")
+
+ return self._write_groups(
+ Path(output_dir),
+ self._records.iter_with_participants(data_type),
+ key_factory,
+ )
+
+ def _write_groups(
+ self,
+ output_dir: Path,
+ items: Iterable[dict[str, Any]],
+ key_factory: Callable[[dict[str, Any]], str],
+ ) -> list[Path]:
+ """Write grouped JSON files and return created paths."""
+
+ writers: dict[str, JsonArrayWriter] = {}
+ output_dir.mkdir(parents=True, exist_ok=True)
+ try:
+ for item in items:
+ key = key_factory(item)
+ writer = writers.get(key)
+ if writer is None:
+ writer = JsonArrayWriter(output_dir / f"{key}.json")
+ writers[key] = writer
+ writer.write(item)
+ finally:
+ for writer in writers.values():
+ writer.close()
+ return sorted(writer.output_path for writer in writers.values())
diff --git a/src/carp/frames/__init__.py b/src/carp/frames/__init__.py
new file mode 100644
index 0000000..5353942
--- /dev/null
+++ b/src/carp/frames/__init__.py
@@ -0,0 +1,5 @@
+"""Dataframe and parquet services."""
+
+from .service import FrameService
+
+__all__ = ["FrameService"]
diff --git a/src/carp/frames/service.py b/src/carp/frames/service.py
new file mode 100644
index 0000000..4b87173
--- /dev/null
+++ b/src/carp/frames/service.py
@@ -0,0 +1,139 @@
+"""Dataframe loading and parquet conversion for CARP studies."""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+from carp.core.dependencies import import_or_raise
+from carp.core.naming import parquet_stem
+
+
+class FrameService:
+ """Load CARP data as dataframes or parquet files."""
+
+ def __init__(self, records: Any, participant_directory: Any) -> None:
+ self._records = records
+ self._participants = participant_directory
+
+ def parquet_path(self, data_type: str, output_dir: str | Path) -> Path:
+ """Return the namespace-aware parquet path for a data type."""
+
+ return Path(output_dir) / f"{parquet_stem(data_type)}.parquet"
+
+ def get_dataframe(self, data_type: str, parquet_dir: str | Path | None = None) -> Any:
+ """Return a dataframe for one data type."""
+
+ pandas = import_or_raise("pandas", "pandas")
+ if parquet_dir:
+ parquet_path = self.parquet_path(data_type, parquet_dir)
+ if parquet_path.exists():
+ return pandas.read_parquet(parquet_path)
+ return pandas.DataFrame(list(self._records.iter_records(data_type)))
+
+ def get_dataframe_with_participants(
+ self,
+ data_type: str,
+ parquet_dir: str | Path | None = None,
+ ) -> Any:
+ """Return a dataframe enriched with participant metadata."""
+
+ pandas = import_or_raise("pandas", "pandas")
+ frame = self.get_dataframe(data_type, parquet_dir)
+ if frame.empty:
+ return frame
+ deployment_ids = self._deployment_series(frame)
+ participant_rows = deployment_ids.apply(self._participant_row)
+ return pandas.concat([frame, participant_rows], axis=1)
+
+ def convert_to_parquet(
+ self,
+ output_dir: str | Path,
+ batch_size: int = 10_000,
+ ) -> list[Path]:
+ """Convert the study to namespace-aware parquet files."""
+
+ pyarrow = import_or_raise("pyarrow", "pandas")
+ parquet = import_or_raise("pyarrow.parquet", "pandas")
+ output_path = Path(output_dir)
+ output_path.mkdir(parents=True, exist_ok=True)
+ buffers: dict[str, list[dict[str, Any]]] = defaultdict(list)
+ writers: dict[str, Any] = {}
+ try:
+ for item in self._records.iter_records():
+ data_type = self._records.data_type(item)
+ buffers[data_type].append(item)
+ if len(buffers[data_type]) >= batch_size:
+ self._flush_buffer(pyarrow, parquet, output_path, data_type, buffers, writers)
+ finally:
+ for data_type, buffer in buffers.items():
+ if buffer:
+ self._flush_buffer(pyarrow, parquet, output_path, data_type, buffers, writers)
+ for writer in writers.values():
+ writer.close()
+ return sorted(self.parquet_path(data_type, output_path) for data_type in writers)
+
+ def _participant_row(self, deployment_id: str | None) -> Any:
+ """Return participant columns for one deployment identifier."""
+
+ pandas = import_or_raise("pandas", "pandas")
+ participant = self._participants.get_participant(deployment_id or "")
+ if not participant:
+ return pandas.Series(
+ {
+ "participant_id": None,
+ "participant_email": None,
+ "participant_folder": None,
+ }
+ )
+ return pandas.Series(
+ {
+ "participant_id": participant.unified_participant_id,
+ "participant_email": participant.email,
+ "participant_folder": participant.source_folder,
+ }
+ )
+
+ def _deployment_series(self, frame: Any) -> Any:
+ """Return deployment identifiers from a dataframe."""
+
+ if "studyDeploymentId" in frame.columns:
+ return frame["studyDeploymentId"]
+ return frame["dataStream"].apply(lambda value: value.get("studyDeploymentId") if isinstance(value, dict) else None)
+
+ def _flush_buffer(
+ self,
+ pyarrow: Any,
+ parquet: Any,
+ output_path: Path,
+ data_type: str,
+ buffers: dict[str, list[dict[str, Any]]],
+ writers: dict[str, Any],
+ ) -> None:
+ """Flush one buffered parquet batch to disk."""
+
+ table = pyarrow.Table.from_pylist(buffers[data_type])
+ path = self.parquet_path(data_type, output_path)
+ writer = writers.get(data_type)
+ if writer is None:
+ writers[data_type] = parquet.ParquetWriter(path, table.schema)
+ writer = writers[data_type]
+ elif not table.schema.equals(writer.schema):
+ table = self._align_table(pyarrow, table, writer.schema)
+ writer.write_table(table)
+ buffers[data_type].clear()
+
+ def _align_table(self, pyarrow: Any, table: Any, schema: Any) -> Any:
+ """Align a batch to an existing parquet schema."""
+
+ columns = []
+ for field in schema:
+ if field.name not in table.column_names:
+ columns.append(pyarrow.nulls(len(table), type=field.type))
+ continue
+ column = table[field.name]
+ if not column.type.equals(field.type):
+ column = column.cast(field.type)
+ columns.append(column)
+ return pyarrow.Table.from_arrays(columns, schema=schema)
diff --git a/src/carp/participants/__init__.py b/src/carp/participants/__init__.py
new file mode 100644
index 0000000..7bb3835
--- /dev/null
+++ b/src/carp/participants/__init__.py
@@ -0,0 +1,7 @@
+"""Participant services and models."""
+
+from .directory import ParticipantDirectory
+from .service import ParticipantService
+from .view import ParticipantView
+
+__all__ = ["ParticipantDirectory", "ParticipantService", "ParticipantView"]
diff --git a/src/carp/participants/directory.py b/src/carp/participants/directory.py
new file mode 100644
index 0000000..445efa3
--- /dev/null
+++ b/src/carp/participants/directory.py
@@ -0,0 +1,152 @@
+"""Participant lookup and unification services."""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from pathlib import Path
+
+from carp.constants import PARTICIPANT_FILE
+from carp.core.models import ParticipantInfo
+
+from .parser import load_participant_file
+
+
+def _normalize(value: str | None) -> str | None:
+ """Normalize string identifiers for matching."""
+
+ if not value:
+ return None
+ clean = value.strip().lower()
+ return clean or None
+
+
+class ParticipantDirectory:
+ """Store participant metadata across one or more study phases."""
+
+ def __init__(self, participants_by_deployment: dict[str, ParticipantInfo] | None = None):
+ self.participants_by_deployment = participants_by_deployment or {}
+ self.unified_participants: dict[str, list[ParticipantInfo]] = {}
+ self._counter = 0
+ if self.participants_by_deployment:
+ self._unify()
+
+ @classmethod
+ def from_folders(cls, folders: tuple[Path, ...]) -> ParticipantDirectory:
+ """Build a participant directory from phase folders."""
+
+ participants: dict[str, ParticipantInfo] = {}
+ for folder in folders:
+ file_path = folder / PARTICIPANT_FILE
+ if file_path.exists():
+ participants.update(load_participant_file(file_path))
+ return cls(participants)
+
+ def get_participant(self, deployment_id: str) -> ParticipantInfo | None:
+ """Return one participant by deployment identifier."""
+
+ return self.participants_by_deployment.get(deployment_id)
+
+ def get_unified_participant(self, unified_id: str) -> list[ParticipantInfo]:
+ """Return all deployments for one unified participant."""
+
+ return list(self.unified_participants.get(unified_id, []))
+
+ def find_by_email(self, email: str) -> list[ParticipantInfo]:
+ """Find all participant deployments matching an email address."""
+
+ target = _normalize(email)
+ return [p for p in self.participants_by_deployment.values() if _normalize(p.email) == target]
+
+ def find_by_ssn(self, ssn: str) -> list[ParticipantInfo]:
+ """Find all participant deployments matching an SSN."""
+
+ return [p for p in self.participants_by_deployment.values() if p.ssn == ssn]
+
+ def find_by_name(self, name: str) -> list[ParticipantInfo]:
+ """Find all participant deployments matching a full name."""
+
+ target = _normalize(name)
+ return [p for p in self.participants_by_deployment.values() if _normalize(p.full_name) == target]
+
+ def deployment_ids(self, field_name: str, value: str) -> tuple[str, ...]:
+ """Return deployment identifiers for a participant lookup."""
+
+ matches = getattr(self, f"find_by_{field_name}")(value)
+ return tuple(participant.study_deployment_id for participant in matches)
+
+ def summary_rows(self) -> list[dict[str, str]]:
+ """Return human-readable participant summary rows."""
+
+ rows: list[dict[str, str]] = []
+ for unified_id, participants in self.unified_participants.items():
+ folders = sorted({p.source_folder for p in participants if p.source_folder})
+ emails = sorted({p.email for p in participants if p.email})
+ ssns = sorted({p.ssn for p in participants if p.ssn})
+ names = sorted({p.full_name for p in participants if p.full_name})
+ rows.append(
+ {
+ "unified_id": unified_id,
+ "deployments": str(len(participants)),
+ "folders": ", ".join(folders) or "N/A",
+ "emails": ", ".join(emails) or "N/A",
+ "ssns": ", ".join(ssns) or "N/A",
+ "names": ", ".join(names) or "N/A",
+ }
+ )
+ return rows
+
+ def _register_group(self, participants: list[ParticipantInfo], assigned: set[str]) -> None:
+ """Register one unified participant group."""
+
+ unified_id = f"P{self._counter:04d}"
+ self._counter += 1
+ for participant in participants:
+ participant.unified_participant_id = unified_id
+ assigned.add(participant.study_deployment_id)
+ self.unified_participants[unified_id] = participants
+
+ def _unify(self) -> None:
+ """Assign unified participant identifiers across phases."""
+
+ assigned: set[str] = set()
+ matchers = ("email", "ssn", "name")
+ grouped: dict[str, dict[str, list[ParticipantInfo]]] = {
+ "email": defaultdict(list),
+ "ssn": defaultdict(list),
+ "name": defaultdict(list),
+ }
+ for participant in self.participants_by_deployment.values():
+ if email := _normalize(participant.email):
+ grouped["email"][email].append(participant)
+ if participant.ssn:
+ grouped["ssn"][participant.ssn].append(participant)
+ if name := _normalize(participant.full_name):
+ grouped["name"][name].append(participant)
+ for matcher in matchers:
+ for participants in grouped[matcher].values():
+ pending = [participant for participant in participants if participant.study_deployment_id not in assigned]
+ if pending:
+ self._register_group(pending, assigned)
+ for participant in self.participants_by_deployment.values():
+ if participant.study_deployment_id not in assigned:
+ self._register_group([participant], assigned)
+ self._propagate()
+
+ def _propagate(self) -> None:
+ """Share the best known metadata across unified deployments."""
+
+ for participants in self.unified_participants.values():
+ fields = {
+ "full_name": next((p.full_name for p in participants if p.full_name), None),
+ "sex": next((p.sex for p in participants if p.sex), None),
+ "ssn": next((p.ssn for p in participants if p.ssn), None),
+ "email": next((p.email for p in participants if p.email), None),
+ "user_id": next((p.user_id for p in participants if p.user_id), None),
+ "consent_timestamp": next((p.consent_timestamp for p in participants if p.consent_timestamp), None),
+ }
+ signed = any(p.consent_signed for p in participants)
+ for participant in participants:
+ participant.consent_signed = signed
+ for field_name, value in fields.items():
+ if value and not getattr(participant, field_name):
+ setattr(participant, field_name, value)
diff --git a/src/carp/participants/parser.py b/src/carp/participants/parser.py
new file mode 100644
index 0000000..654d589
--- /dev/null
+++ b/src/carp/participants/parser.py
@@ -0,0 +1,78 @@
+"""Parsing helpers for `participant-data.json` files."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from carp.core.models import ParticipantInfo
+
+
+def _coerce_full_name(value: object) -> str | None:
+ """Normalize CARP full-name payloads."""
+
+ if isinstance(value, str):
+ return value or None
+ if not isinstance(value, dict):
+ return None
+ parts = [value.get(key) for key in ("firstName", "middleName", "lastName")]
+ clean = [part.strip() for part in parts if isinstance(part, str) and part.strip()]
+ return " ".join(clean) or None
+
+
+def _coerce_ssn(value: object) -> str | None:
+ """Normalize CARP SSN payloads."""
+
+ if isinstance(value, str):
+ return value or None
+ if isinstance(value, dict):
+ nested = value.get("socialSecurityNumber")
+ return str(nested) if nested else None
+ return None
+
+
+def _apply_consent(participant: ParticipantInfo, value: object) -> None:
+    """Populate consent-related participant fields."""
+
+    if not isinstance(value, dict):
+        return
+    participant.consent_signed = True
+    participant.consent_timestamp = value.get("signedTimestamp")
+    participant.user_id = value.get("userId")
+    participant.email = value.get("name")
+    if participant.full_name:
+        return
+    consent_payload = value.get("consent")
+    if not isinstance(consent_payload, str):
+        return
+    try:
+        signature = json.loads(consent_payload).get("signature", {})
+        first_name = (signature.get("firstName") or "").strip()
+        last_name = (signature.get("lastName") or "").strip()
+    except (json.JSONDecodeError, AttributeError):
+        return
+    participant.full_name = f"{first_name} {last_name}".strip() or None
+
+
+def load_participant_file(file_path: Path) -> dict[str, ParticipantInfo]:
+ """Load participant records from a single phase folder."""
+
+ participants: dict[str, ParticipantInfo] = {}
+ data = json.loads(file_path.read_text(encoding="utf-8"))
+ for entry in data:
+ deployment_id = entry.get("studyDeploymentId")
+ if not deployment_id:
+ continue
+ for role in entry.get("roles", []):
+ info = ParticipantInfo(
+ study_deployment_id=deployment_id,
+ role_name=role.get("roleName", "Participant"),
+ source_folder=file_path.parent.name,
+ )
+ role_data = role.get("data", {})
+ info.full_name = _coerce_full_name(role_data.get("dk.carp.webservices.input.full_name"))
+ info.sex = role_data.get("dk.cachet.carp.input.sex")
+ info.ssn = _coerce_ssn(role_data.get("dk.carp.webservices.input.ssn"))
+ _apply_consent(info, role_data.get("dk.carp.webservices.input.informed_consent"))
+ participants[deployment_id] = info
+ return participants
diff --git a/src/carp/participants/service.py b/src/carp/participants/service.py
new file mode 100644
index 0000000..3a1847a
--- /dev/null
+++ b/src/carp/participants/service.py
@@ -0,0 +1,51 @@
+"""High-level participant service for `CarpStudy`."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .directory import ParticipantDirectory
+from .view import ParticipantView
+
+
+class ParticipantService:
+ """Expose participant-centric queries and views."""
+
+ def __init__(self, study: Any, directory: ParticipantDirectory) -> None:
+ self._study = study
+ self._directory = directory
+
+ def view(self, email: str) -> ParticipantView:
+ """Return a participant-scoped view by email."""
+
+ return ParticipantView(self._study, email)
+
+ def by_email(self, email: str) -> list[Any]:
+ """Return participant deployments for an email address."""
+
+ return self._directory.find_by_email(email)
+
+ def by_ssn(self, ssn: str) -> list[Any]:
+ """Return participant deployments for an SSN."""
+
+ return self._directory.find_by_ssn(ssn)
+
+ def by_name(self, name: str) -> list[Any]:
+ """Return participant deployments for a full name."""
+
+ return self._directory.find_by_name(name)
+
+ def deployment_ids(self, field_name: str, value: str) -> tuple[str, ...]:
+ """Return deployment identifiers for a participant lookup."""
+
+ return self._directory.deployment_ids(field_name, value)
+
+ def unified(self, unified_id: str) -> list[Any]:
+ """Return deployments for a unified participant identifier."""
+
+ return self._directory.get_unified_participant(unified_id)
+
+ def summary_rows(self) -> list[dict[str, str]]:
+ """Return participant summary rows for presentation layers."""
+
+ return self._directory.summary_rows()
diff --git a/src/carp/participants/view.py b/src/carp/participants/view.py
new file mode 100644
index 0000000..383c28c
--- /dev/null
+++ b/src/carp/participants/view.py
@@ -0,0 +1,105 @@
+"""Participant-centric study accessors."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from typing import Any
+
+
+class ParticipantView:
+ """Provide participant-scoped access to study data."""
+
+ def __init__(self, study: Any, email: str):
+ self._study = study
+ self._email = email
+
+ @property
+ def participants(self) -> list[Any]:
+ """Return underlying participant models for the view."""
+
+ return list(self._study.participants.by_email(self._email))
+
+ @property
+ def deployment_ids(self) -> tuple[str, ...]:
+ """Return deployment identifiers for the participant."""
+
+ return tuple(self._study.participants.deployment_ids("email", self._email))
+
+ @property
+ def exists(self) -> bool:
+ """Return whether the participant exists in the study."""
+
+ return bool(self.participants)
+
+ def info(self) -> dict[str, Any] | None:
+ """Return merged participant metadata."""
+
+ if not self.participants:
+ return None
+ base = self.participants[0]
+ return {
+ "email": self._email,
+ "unified_id": base.unified_participant_id,
+ "full_name": base.full_name,
+ "ssn": base.ssn,
+ "sex": base.sex,
+ "user_id": base.user_id,
+ "consent_signed": base.consent_signed,
+ "consent_timestamp": base.consent_timestamp,
+ "folders": sorted({p.source_folder for p in self.participants if p.source_folder}),
+ "deployment_ids": sorted(self.deployment_ids),
+ "num_deployments": len(self.deployment_ids),
+ }
+
+ def iter_records(self, data_type: str | None = None) -> Iterator[dict[str, Any]]:
+ """Yield participant records with an optional data-type filter."""
+
+ yield from self._study.records.iter_records(data_type, self.deployment_ids)
+
+ def available_fields(self, sample_size: int = 100) -> list[str]:
+ """Return participant-visible field paths."""
+
+ fields: set[str] = set()
+ for index, item in enumerate(self.iter_records()):
+ if index >= sample_size:
+ break
+ fields.update(self._study.records.collect_fields(item))
+ return sorted(fields)
+
+ def data_types(self) -> list[str]:
+ """Return unique data types for the participant."""
+
+ return sorted({self._study.records.data_type(item) for item in self.iter_records()})
+
+ def count(self, data_type: str | None = None) -> int:
+ """Return the number of participant records."""
+
+ return sum(1 for _ in self.iter_records(data_type))
+
+ def dataframe(self, data_type: str, parquet_dir: str | None = None) -> Any:
+ """Return a dataframe filtered to the participant."""
+
+ frame = self._study.frames.get_dataframe(data_type, parquet_dir)
+ if frame is None or frame.empty:
+ return frame
+ deployment_ids = self._study.plots.candidate_series(
+ frame,
+ ["studyDeploymentId", "dataStream.studyDeploymentId"],
+ )
+ return frame if deployment_ids is None else frame[deployment_ids.isin(self.deployment_ids)]
+
+ def plot_location(
+ self,
+ output_file: str | None = None,
+ parquet_dir: str | None = None,
+ include_steps: bool = True,
+ ) -> str | None:
+ """Render a location plot for the participant."""
+
+ result = self._study.plots.participant(
+ self._email,
+ output_file=output_file,
+ parquet_dir=parquet_dir,
+ include_steps=include_steps,
+ )
+ return None if result is None else str(result)
diff --git a/src/carp/plotting/__init__.py b/src/carp/plotting/__init__.py
index 2a8f3b3..97cd027 100644
--- a/src/carp/plotting/__init__.py
+++ b/src/carp/plotting/__init__.py
@@ -1,10 +1,5 @@
-"""
-Visualization module for CARP Analytics data.
+"""Plotting services for CARP studies."""
-This module provides visualization tools for participant location data,
-including heatmaps and geographic visualizations.
-"""
+from .service import PlotService
-from .map_viz import LocationVisualizer, ParticipantVisualizer
-
-__all__ = ["LocationVisualizer", "ParticipantVisualizer"]
+__all__ = ["PlotService"]
diff --git a/src/carp/plotting/map_viz.py b/src/carp/plotting/map_viz.py
deleted file mode 100644
index 3c567d5..0000000
--- a/src/carp/plotting/map_viz.py
+++ /dev/null
@@ -1,416 +0,0 @@
-import pandas as pd
-import folium
-from folium.plugins import HeatMap
-from typing import Optional, List, Any, Set, TYPE_CHECKING
-
-if TYPE_CHECKING:
- from ..reader import CarpDataStream
-
-from rich.console import Console
-
-console = Console()
-
-
-class ParticipantVisualizer:
- """
- Fluent API for visualizing participant data.
- Usage: sd.participant("email").visualize.location()
- """
-
- def __init__(self, sleepiness_data: 'CarpDataStream', deployment_ids: Set[str], email: str):
- self._sd = sleepiness_data
- self._deployment_ids = deployment_ids
- self._email = email
-
- def location(
- self,
- output_file: Optional[str] = None,
- location_type: str = "dk.cachet.carp.geolocation",
- step_type: str = "dk.cachet.carp.stepcount",
- include_steps: bool = True,
- parquet_dir: Optional[str] = "output_parquet"
- ) -> Optional[str]:
- """
- Generate a location heatmap for this participant.
-
- Args:
- output_file: Output HTML file path. Defaults to "{email}_location.html"
- location_type: Data type for location data
- step_type: Data type for step count data
- include_steps: Whether to overlay step count markers
- parquet_dir: Directory with parquet files for faster loading
-
- Returns:
- Path to the generated HTML file, or None if no data found
- """
- if output_file is None:
- # Sanitize email for filename
- safe_email = self._email.replace("@", "_at_").replace(".", "_")
- output_file = f"{safe_email}_location.html"
-
- console.print(f"[bold blue]Generating location heatmap for {self._email}...[/bold blue]")
-
- # Load location data
- df_loc = self._sd.get_dataframe(location_type, parquet_dir)
-
- if df_loc is None or df_loc.empty:
- console.print(f"[bold red]No location data found for type {location_type}[/bold red]")
- return None
-
- # Filter by deployment IDs
- user_series_loc = self._get_field(df_loc, ['studyDeploymentId', 'dataStream.studyDeploymentId'])
- if user_series_loc is not None:
- df_loc = df_loc[user_series_loc.isin(self._deployment_ids)]
-
- if df_loc.empty:
- console.print(f"[bold red]No location data found for {self._email}[/bold red]")
- return None
-
- # Load step data if requested
- df_steps = pd.DataFrame()
- if include_steps:
- df_steps_raw = self._sd.get_dataframe(step_type, parquet_dir)
- if df_steps_raw is not None and not df_steps_raw.empty:
- user_series_steps = self._get_field(df_steps_raw, ['studyDeploymentId', 'dataStream.studyDeploymentId'])
- if user_series_steps is not None:
- df_steps = df_steps_raw[user_series_steps.isin(self._deployment_ids)]
-
- # Extract coordinates
- df_loc['_lat'] = self._get_field(df_loc, ['measurement.data.latitude', 'latitude'])
- df_loc['_lon'] = self._get_field(df_loc, ['measurement.data.longitude', 'longitude'])
- df_loc['_time'] = self._get_field(df_loc, ['measurement.sensorStartTime', 'sensorStartTime'])
-
- if df_loc['_lat'].isnull().all() or df_loc['_lon'].isnull().all():
- console.print("[bold red]Could not find latitude/longitude columns in location data[/bold red]")
- return None
-
- # Extract step data
- if not df_steps.empty:
- df_steps['_steps'] = self._get_field(df_steps, ['measurement.data.steps', 'steps'])
- df_steps['_time'] = self._get_field(df_steps, ['measurement.sensorStartTime', 'sensorStartTime'])
-
- # Render the map
- self._render_map(df_loc, df_steps, output_file)
- return output_file
-
- def _get_field(self, df: pd.DataFrame, candidates: List[str]) -> Optional[pd.Series]:
- """Extract a series from dataframe using candidate field paths."""
- for path in candidates:
- if path in df.columns:
- return df[path]
-
- parts = path.split('.')
- if parts[0] in df.columns:
- try:
- series = df[parts[0]]
- for part in parts[1:]:
- series = series.apply(lambda x: x.get(part) if isinstance(x, dict) else None)
- return series
- except Exception:
- pass
- return None
-
- def _render_map(self, df_loc: pd.DataFrame, df_steps: pd.DataFrame, output_file: str):
- """Render the heatmap to an HTML file."""
- df_loc = df_loc.dropna(subset=['_lat', '_lon'])
-
- if df_loc.empty:
- console.print("[bold red]No valid coordinates found after filtering[/bold red]")
- return
-
- center_lat = df_loc['_lat'].mean()
- center_lon = df_loc['_lon'].mean()
-
- m = folium.Map(location=[center_lat, center_lon], zoom_start=12)
-
- # Add heatmap layer
- heat_data = df_loc[['_lat', '_lon']].values.tolist()
- HeatMap(heat_data).add_to(m)
-
- # Add step markers
- if not df_steps.empty and '_steps' in df_steps.columns and '_time' in df_steps.columns:
- if '_time' in df_loc.columns:
- df_loc_sorted = df_loc.sort_values('_time')
- df_steps_sorted = df_steps.sort_values('_time')
-
- df_loc_sorted['_time'] = df_loc_sorted['_time'].astype('int64')
- df_steps_sorted['_time'] = df_steps_sorted['_time'].astype('int64')
-
- merged = pd.merge_asof(
- df_steps_sorted,
- df_loc_sorted[['_time', '_lat', '_lon']],
- on='_time',
- direction='nearest',
- tolerance=300_000_000 # 5 minutes in microseconds
- )
-
- for _, row in merged.iterrows():
- if pd.notnull(row['_lat']) and pd.notnull(row['_lon']) and pd.notnull(row['_steps']):
- steps = row['_steps']
- if steps > 0:
- folium.CircleMarker(
- location=[row['_lat'], row['_lon']],
- radius=min(max(steps / 10, 3), 20),
-                                popup=f"Steps: {steps}\nTime: {row['_time']}",
- color="blue",
- fill=True,
- fill_color="blue"
- ).add_to(m)
-
- m.save(output_file)
- console.print(f"[bold green]Heatmap saved to {output_file}[/bold green]")
-
-
-class LocationVisualizer:
- def __init__(self, sd: 'CarpDataStream'):
- self.sd = sd
-
- def _get_field(self, df: pd.DataFrame, candidates: List[str]) -> Optional[pd.Series]:
- """
- Tries to extract a series from the dataframe using a list of candidate field paths.
- Supports dot-notation for nested dict columns.
- """
- for path in candidates:
- if path in df.columns:
- return df[path]
-
- # Try nested
- parts = path.split('.')
- if parts[0] in df.columns:
- try:
- series = df[parts[0]]
- for part in parts[1:]:
- # Handle None/NaN
- series = series.apply(lambda x: x.get(part) if isinstance(x, dict) else None)
- return series
- except Exception:
- pass
- return None
-
- def _render_map(self, df_loc: pd.DataFrame, df_steps: pd.DataFrame, output_file: str):
- """
- Internal method to render the map from prepared dataframes.
- Expects df_loc to have _lat, _lon, _time columns.
- Expects df_steps to have _steps, _time columns.
- """
- # Drop NaNs in location
- df_loc = df_loc.dropna(subset=['_lat', '_lon'])
-
- if df_loc.empty:
- console.print("[bold red]No valid coordinates found after filtering[/bold red]")
- return
-
- # Create Map
- center_lat = df_loc['_lat'].mean()
- center_lon = df_loc['_lon'].mean()
-
- m = folium.Map(location=[center_lat, center_lon], zoom_start=12)
-
- # Add Heatmap
- heat_data = df_loc[['_lat', '_lon']].values.tolist()
- HeatMap(heat_data).add_to(m)
-
- # Add Step Markers
- if not df_steps.empty:
- if '_steps' in df_steps.columns and '_time' in df_steps.columns and '_time' in df_loc.columns:
- # Sort by time
- df_loc = df_loc.sort_values('_time')
- df_steps = df_steps.sort_values('_time')
-
- # Ensure types match
- df_loc['_time'] = df_loc['_time'].astype('int64')
- df_steps['_time'] = df_steps['_time'].astype('int64')
-
- merged = pd.merge_asof(
- df_steps,
- df_loc[['_time', '_lat', '_lon']],
- on='_time',
- direction='nearest',
- tolerance=300_000_000 # 5 minutes in microseconds
- )
-
- for idx, row in merged.iterrows():
- if pd.notnull(row['_lat']) and pd.notnull(row['_lon']) and pd.notnull(row['_steps']):
- steps = row['_steps']
- if steps > 0:
- folium.CircleMarker(
- location=[row['_lat'], row['_lon']],
- radius=min(max(steps / 10, 3), 20),
-                                popup=f"Steps: {steps}\nTime: {row['_time']}",
- color="blue",
- fill=True,
- fill_color="blue"
- ).add_to(m)
-
- # Save
- m.save(output_file)
- console.print(f"[bold green]Heatmap saved to {output_file}[/bold green]")
-
- def plot_heatmap_from_items(
- self,
- location_items: List[Any],
- step_items: Optional[List[Any]] = None,
- output_file: str = "user_heatmap.html"
- ):
- """
- Generates a heatmap from a list of type-safe objects (e.g. generated SleepinessItem).
- """
- console.print(f"[bold blue]Generating heatmap from {len(location_items)} location items...[/bold blue]")
-
- # Helper to safely get attributes
- def get_attr(obj, path):
- parts = path.split('.')
- curr = obj
- for p in parts:
- if curr is None:
- return None
- curr = getattr(curr, p, None)
- return curr
-
- # Extract Location Data
- loc_data = []
- for item in location_items:
- lat = get_attr(item, 'measurement.data.latitude')
- lon = get_attr(item, 'measurement.data.longitude')
- time = get_attr(item, 'measurement.sensorStartTime')
-
- if lat is not None and lon is not None:
- loc_data.append({'_lat': lat, '_lon': lon, '_time': time})
-
- df_loc = pd.DataFrame(loc_data)
-
- if df_loc.empty:
- console.print("[bold red]No valid coordinates found in location items[/bold red]")
- return
-
- # Extract Step Data
- df_steps = pd.DataFrame()
- if step_items:
- step_data = []
- for item in step_items:
- steps = get_attr(item, 'measurement.data.steps')
- time = get_attr(item, 'measurement.sensorStartTime')
- if steps is not None:
- step_data.append({'_steps': steps, '_time': time})
- df_steps = pd.DataFrame(step_data)
-
- self._render_map(df_loc, df_steps, output_file)
-
- def plot_user_heatmap(
- self,
- study_deployment_id: str,
- location_type: str = "dk.cachet.carp.geolocation",
- step_type: str = "dk.cachet.carp.stepcount",
- parquet_dir: Optional[str] = "output_parquet",
- output_file: str = "user_heatmap.html"
- ):
- """
- Generates a heatmap of user locations and overlays step count data.
- """
- console.print(f"[bold blue]Generating heatmap for user {study_deployment_id}...[/bold blue]")
-
- # 1. Load Data
- df_loc = self.sd.get_dataframe(location_type, parquet_dir)
- df_steps = self.sd.get_dataframe(step_type, parquet_dir)
-
- if df_loc is None or df_loc.empty:
- console.print(f"[bold red]No location data found for type {location_type}[/bold red]")
- return
-
- if df_steps is None:
- console.print(f"[yellow]No step data found for type {step_type}. Plotting location only.[/yellow]")
- df_steps = pd.DataFrame()
-
- # 2. Filter by User
- user_series_loc = self._get_field(df_loc, ['studyDeploymentId', 'dataStream.studyDeploymentId'])
- if user_series_loc is not None:
- df_loc = df_loc[user_series_loc == study_deployment_id]
-
- if df_loc.empty:
- console.print(f"[bold red]No location data found for user {study_deployment_id}[/bold red]")
- return
-
- if not df_steps.empty:
- user_series_steps = self._get_field(df_steps, ['studyDeploymentId', 'dataStream.studyDeploymentId'])
- if user_series_steps is not None:
- df_steps = df_steps[user_series_steps == study_deployment_id]
-
- # 3. Extract Coordinates and Time
- df_loc['_lat'] = self._get_field(df_loc, ['measurement.data.latitude', 'latitude'])
- df_loc['_lon'] = self._get_field(df_loc, ['measurement.data.longitude', 'longitude'])
- df_loc['_time'] = self._get_field(df_loc, ['measurement.sensorStartTime', 'sensorStartTime'])
-
- if df_loc['_lat'].isnull().all() or df_loc['_lon'].isnull().all():
- console.print("[bold red]Could not find latitude/longitude columns in location data[/bold red]")
- return
-
- # 6. Add Step Markers
- if not df_steps.empty:
- df_steps['_steps'] = self._get_field(df_steps, ['measurement.data.steps', 'steps'])
- df_steps['_time'] = self._get_field(df_steps, ['measurement.sensorStartTime', 'sensorStartTime'])
-
- self._render_map(df_loc, df_steps, output_file)
-
- def plot_participant_heatmap(
- self,
- unified_participant_id: str,
- location_type: str = "dk.cachet.carp.geolocation",
- step_type: str = "dk.cachet.carp.stepcount",
- parquet_dir: Optional[str] = "output_parquet",
- output_file: str = "participant_heatmap.html"
- ):
- """
- Generates a heatmap for a specific unified participant across all their deployments.
- This aggregates data from all phases/folders for the same participant.
- """
- # Get all deployment IDs for this participant
- participants = self.sd.participant_manager.get_unified_participant(unified_participant_id)
- if not participants:
- console.print(f"[bold red]No participant found with ID {unified_participant_id}[/bold red]")
- return
-
- deployment_ids = [p.study_deployment_id for p in participants]
- console.print(f"[bold blue]Generating heatmap for participant {unified_participant_id} "
- f"({len(deployment_ids)} deployments)...[/bold blue]")
-
- # 1. Load Data
- df_loc = self.sd.get_dataframe(location_type, parquet_dir)
- df_steps = self.sd.get_dataframe(step_type, parquet_dir)
-
- if df_loc is None or df_loc.empty:
- console.print(f"[bold red]No location data found for type {location_type}[/bold red]")
- return
-
- if df_steps is None:
- console.print(f"[yellow]No step data found for type {step_type}. Plotting location only.[/yellow]")
- df_steps = pd.DataFrame()
-
- # 2. Filter by all User deployments
- user_series_loc = self._get_field(df_loc, ['studyDeploymentId', 'dataStream.studyDeploymentId'])
- if user_series_loc is not None:
- df_loc = df_loc[user_series_loc.isin(deployment_ids)]
-
- if df_loc.empty:
- console.print(f"[bold red]No location data found for participant {unified_participant_id}[/bold red]")
- return
-
- if not df_steps.empty:
- user_series_steps = self._get_field(df_steps, ['studyDeploymentId', 'dataStream.studyDeploymentId'])
- if user_series_steps is not None:
- df_steps = df_steps[user_series_steps.isin(deployment_ids)]
-
- # 3. Extract Coordinates and Time
- df_loc['_lat'] = self._get_field(df_loc, ['measurement.data.latitude', 'latitude'])
- df_loc['_lon'] = self._get_field(df_loc, ['measurement.data.longitude', 'longitude'])
- df_loc['_time'] = self._get_field(df_loc, ['measurement.sensorStartTime', 'sensorStartTime'])
-
- if df_loc['_lat'].isnull().all() or df_loc['_lon'].isnull().all():
- console.print("[bold red]Could not find latitude/longitude columns in location data[/bold red]")
- return
-
- # 4. Add Step Markers
- if not df_steps.empty:
- df_steps['_steps'] = self._get_field(df_steps, ['measurement.data.steps', 'steps'])
- df_steps['_time'] = self._get_field(df_steps, ['measurement.sensorStartTime', 'sensorStartTime'])
-
- self._render_map(df_loc, df_steps, output_file)
diff --git a/src/carp/plotting/prepare.py b/src/carp/plotting/prepare.py
new file mode 100644
index 0000000..9dc40a4
--- /dev/null
+++ b/src/carp/plotting/prepare.py
@@ -0,0 +1,81 @@
+"""Plot-data preparation helpers."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import Any
+
+from carp.core.dependencies import import_or_raise
+
+
+def _extract_part(value: Any, part: str) -> Any:
+ """Extract one nested key from a dictionary value."""
+
+ return value.get(part) if isinstance(value, dict) else None
+
+
+def candidate_series(frame: Any, candidates: Iterable[str]) -> Any:
+ """Return the first matching dataframe series for the given candidates."""
+
+ for path in candidates:
+ if path in frame.columns:
+ return frame[path]
+ parts = path.split(".")
+ if parts[0] not in frame.columns:
+ continue
+ series = frame[parts[0]]
+ for part in parts[1:]:
+ series = series.apply(_extract_part, args=(part,))
+ return series
+ return None
+
+
+def prepare_location_frame(frame: Any) -> Any:
+ """Add normalized plotting columns to a location dataframe."""
+
+ location = frame.copy()
+ location["_lat"] = candidate_series(location, ["measurement.data.latitude", "latitude"])
+ location["_lon"] = candidate_series(location, ["measurement.data.longitude", "longitude"])
+ location["_time"] = candidate_series(
+ location,
+ ["measurement.sensorStartTime", "sensorStartTime"],
+ )
+ return location
+
+
+def prepare_step_frame(frame: Any) -> Any:
+ """Add normalized plotting columns to a step dataframe."""
+
+ steps = frame.copy()
+ steps["_steps"] = candidate_series(steps, ["measurement.data.steps", "steps"])
+ steps["_time"] = candidate_series(steps, ["measurement.sensorStartTime", "sensorStartTime"])
+ return steps
+
+
+def frames_from_items(location_items: list[Any], step_items: list[Any] | None = None) -> tuple[Any, Any]:
+ """Build plotting dataframes from type-safe objects."""
+
+ pandas = import_or_raise("pandas", "viz")
+
+ def attr_path(value: Any, path: str) -> Any:
+ current = value
+ for part in path.split("."):
+ current = getattr(current, part, None)
+ if current is None:
+ return None
+ return current
+
+ location_rows = []
+ for item in location_items:
+ latitude = attr_path(item, "measurement.data.latitude")
+ longitude = attr_path(item, "measurement.data.longitude")
+ timestamp = attr_path(item, "measurement.sensorStartTime")
+ if latitude is not None and longitude is not None:
+ location_rows.append({"_lat": latitude, "_lon": longitude, "_time": timestamp})
+ step_rows = []
+ for item in step_items or []:
+ steps = attr_path(item, "measurement.data.steps")
+ timestamp = attr_path(item, "measurement.sensorStartTime")
+ if steps is not None:
+ step_rows.append({"_steps": steps, "_time": timestamp})
+ return pandas.DataFrame(location_rows), pandas.DataFrame(step_rows)
diff --git a/src/carp/plotting/render.py b/src/carp/plotting/render.py
new file mode 100644
index 0000000..d1a32c2
--- /dev/null
+++ b/src/carp/plotting/render.py
@@ -0,0 +1,56 @@
+"""HTML map rendering helpers."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from carp.core.dependencies import import_or_raise
+
+
+def render_heatmap(location_frame: Any, step_frame: Any, output_path: str | Path) -> str | None:
+ """Render a heatmap and optional step markers to HTML."""
+
+ pandas = import_or_raise("pandas", "viz")
+ folium = import_or_raise("folium", "viz")
+ heatmap = import_or_raise("folium.plugins", "viz").HeatMap
+ if {"_lat", "_lon"} - set(location_frame.columns):
+ return None
+ location = location_frame.dropna(subset=["_lat", "_lon"])
+ if location.empty:
+ return None
+ map_view = folium.Map(location=[location["_lat"].mean(), location["_lon"].mean()], zoom_start=12)
+ heatmap(location[["_lat", "_lon"]].values.tolist()).add_to(map_view)
+ if not step_frame.empty and {"_steps", "_time"} <= set(step_frame.columns):
+ merged = _merge_steps(pandas, location, step_frame)
+ for _, row in merged.iterrows():
+ if row["_steps"] and pandas.notnull(row["_lat"]) and pandas.notnull(row["_lon"]):
+ folium.CircleMarker(
+ location=[row["_lat"], row["_lon"]],
+ radius=min(max(row["_steps"] / 10, 3), 20),
+                    popup=f"Steps: {row['_steps']}\nTime: {row['_time']}",
+ color="blue",
+ fill=True,
+ fill_color="blue",
+ ).add_to(map_view)
+ path = Path(output_path)
+ map_view.save(path)
+ return str(path)
+
+
+def _merge_steps(pandas: Any, location: Any, step_frame: Any) -> Any:
+ """Merge step markers onto the nearest location timestamps."""
+
+ if "_time" not in location.columns or "_time" not in step_frame.columns:
+ return step_frame.iloc[0:0]
+ sorted_location = location.sort_values("_time").copy()
+ sorted_steps = step_frame.sort_values("_time").copy()
+ sorted_location["_time"] = sorted_location["_time"].astype("int64")
+ sorted_steps["_time"] = sorted_steps["_time"].astype("int64")
+ return pandas.merge_asof(
+ sorted_steps,
+ sorted_location[["_time", "_lat", "_lon"]],
+ on="_time",
+ direction="nearest",
+ tolerance=300_000_000,
+ )
diff --git a/src/carp/plotting/service.py b/src/carp/plotting/service.py
new file mode 100644
index 0000000..f4db132
--- /dev/null
+++ b/src/carp/plotting/service.py
@@ -0,0 +1,130 @@
+"""High-level plotting service for study and participant data."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from carp.constants import DEFAULT_LOCATION_TYPE, DEFAULT_STEP_TYPE
+from carp.core.naming import sanitize_filename
+
+from .prepare import candidate_series, frames_from_items, prepare_location_frame, prepare_step_frame
+from .render import render_heatmap
+
+
+class PlotService:
+ """Render HTML maps from study data or typed objects."""
+
+ def __init__(self, frames: Any, participants: Any) -> None:
+ self._frames = frames
+ self._participants = participants
+ self.candidate_series = candidate_series
+
+ def participant(
+ self,
+ email: str,
+ output_file: str | None = None,
+ location_type: str = DEFAULT_LOCATION_TYPE,
+ step_type: str = DEFAULT_STEP_TYPE,
+ parquet_dir: str | None = None,
+ include_steps: bool = True,
+ ) -> str | None:
+ """Render a participant heatmap from an email address."""
+
+ view = self._participants.view(email)
+ if not view.exists:
+ return None
+ default_name = sanitize_filename(email.replace("@", "_at_"), allowed="-_.")
+ return self._plot_for_deployments(
+ view.deployment_ids,
+ output_file or f"{default_name}_location.html",
+ location_type,
+ step_type,
+ parquet_dir,
+ include_steps,
+ )
+
+ def deployment(
+ self,
+ deployment_id: str,
+ output_file: str = "deployment_heatmap.html",
+ location_type: str = DEFAULT_LOCATION_TYPE,
+ step_type: str = DEFAULT_STEP_TYPE,
+ parquet_dir: str | None = None,
+ include_steps: bool = True,
+ ) -> str | None:
+ """Render a heatmap for a single deployment."""
+
+ return self._plot_for_deployments(
+ (deployment_id,),
+ output_file,
+ location_type,
+ step_type,
+ parquet_dir,
+ include_steps,
+ )
+
+ def unified(
+ self,
+ unified_id: str,
+ output_file: str = "participant_heatmap.html",
+ location_type: str = DEFAULT_LOCATION_TYPE,
+ step_type: str = DEFAULT_STEP_TYPE,
+ parquet_dir: str | None = None,
+ include_steps: bool = True,
+ ) -> str | None:
+ """Render a heatmap for a unified participant."""
+
+ deployment_ids = tuple(participant.study_deployment_id for participant in self._participants.unified(unified_id))
+ if not deployment_ids:
+ return None
+ return self._plot_for_deployments(
+ deployment_ids,
+ output_file,
+ location_type,
+ step_type,
+ parquet_dir,
+ include_steps,
+ )
+
+ def from_items(
+ self,
+ location_items: list[Any],
+ step_items: list[Any] | None = None,
+ output_file: str = "user_heatmap.html",
+ ) -> str | None:
+ """Render a heatmap from type-safe Python objects."""
+
+ location_frame, step_frame = frames_from_items(location_items, step_items)
+ return render_heatmap(location_frame, step_frame, output_file)
+
+ def _plot_for_deployments(
+ self,
+ deployment_ids: tuple[str, ...],
+ output_file: str,
+ location_type: str,
+ step_type: str,
+ parquet_dir: str | None,
+ include_steps: bool,
+ ) -> str | None:
+ """Render a heatmap for a set of deployments."""
+
+ location_frame = self._frames.get_dataframe(location_type, parquet_dir)
+ if location_frame.empty:
+ return None
+ location_ids = candidate_series(location_frame, ["studyDeploymentId", "dataStream.studyDeploymentId"])
+ if location_ids is None:
+ return None
+ filtered_location = prepare_location_frame(location_frame[location_ids.isin(deployment_ids)])
+ if filtered_location.empty:
+ return None
+ if not include_steps:
+ return render_heatmap(filtered_location, filtered_location.iloc[0:0], output_file)
+ step_frame = self._frames.get_dataframe(step_type, parquet_dir)
+ if step_frame.empty:
+ return render_heatmap(filtered_location, step_frame, output_file)
+ step_ids = candidate_series(step_frame, ["studyDeploymentId", "dataStream.studyDeploymentId"])
+ if step_ids is None:
+ return render_heatmap(filtered_location, step_frame.iloc[0:0], Path(output_file))
+ filtered_steps = prepare_step_frame(step_frame[step_ids.isin(deployment_ids)])
+ return render_heatmap(filtered_location, filtered_steps, Path(output_file))
diff --git a/src/carp/reader.py b/src/carp/reader.py
deleted file mode 100644
index 1c18585..0000000
--- a/src/carp/reader.py
+++ /dev/null
@@ -1,1417 +0,0 @@
-import ijson
-from pathlib import Path
-from typing import Generator, Any, Dict, Optional, List, Set
-from collections import defaultdict
-from rich.console import Console
-from rich.table import Table
-from tqdm import tqdm
-import json
-from dataclasses import dataclass, field
-
-console = Console()
-
-
-@dataclass
-class ParticipantInfo:
- """Represents participant information from participant-data.json"""
-
- study_deployment_id: str
- role_name: str = "Participant"
- full_name: Optional[str] = None
- sex: Optional[str] = None
- ssn: Optional[str] = None
- user_id: Optional[str] = None
- email: Optional[str] = None
- consent_signed: bool = False
- consent_timestamp: Optional[str] = None
- source_folder: Optional[str] = None
- # Unified participant ID assigned when same participant is detected across folders
- unified_participant_id: Optional[str] = None
-
- def to_dict(self) -> Dict[str, Any]:
- return {
- "study_deployment_id": self.study_deployment_id,
- "role_name": self.role_name,
- "full_name": self.full_name,
- "sex": self.sex,
- "ssn": self.ssn,
- "user_id": self.user_id,
- "email": self.email,
- "consent_signed": self.consent_signed,
- "consent_timestamp": self.consent_timestamp,
- "source_folder": self.source_folder,
- "unified_participant_id": self.unified_participant_id,
- }
-
-
-class ParticipantManager:
- """
- Manages participant data across multiple data folders.
- Links participants across folders using SSN or user ID as identifiers.
- """
-
- def __init__(self):
- # studyDeploymentId -> ParticipantInfo
- self.participants_by_deployment: Dict[str, ParticipantInfo] = {}
- # unified_participant_id -> list of ParticipantInfo (same person across folders)
- self.unified_participants: Dict[str, List[ParticipantInfo]] = {}
- # For generating unified IDs
- self._unified_id_counter = 0
-
- def load_participant_data(self, data_folders: List[Path]):
- """
- Loads participant data from participant-data.json files in each data folder.
- """
- console.print(
- f"[bold blue]Loading participant data from {len(data_folders)} folders...[/bold blue]"
- )
-
- for folder in data_folders:
- participant_file = folder / "participant-data.json"
- if participant_file.exists():
- self._load_single_file(participant_file, folder.name)
- else:
- console.print(f"[yellow]No participant-data.json found in {folder}[/yellow]")
-
- # After loading all, unify participants
- self._unify_participants()
-
- console.print(
- f"[bold green]Loaded {len(self.participants_by_deployment)} participant deployments, "
- f"{len(self.unified_participants)} unique participants[/bold green]"
- )
-
- def _load_single_file(self, file_path: Path, folder_name: str):
- """Load participant data from a single file."""
- try:
- with open(file_path, "r") as f:
- data = json.load(f)
- except (json.JSONDecodeError, IOError) as e:
- console.print(f"[red]Error reading {file_path}: {e}[/red]")
- return
-
- for entry in data:
- deployment_id = entry.get("studyDeploymentId")
- if not deployment_id:
- continue
-
- roles = entry.get("roles", [])
- for role in roles:
- role_name = role.get("roleName", "Unknown")
- role_data = role.get("data", {})
-
- # Extract participant info from various fields
- participant = ParticipantInfo(
- study_deployment_id=deployment_id,
- role_name=role_name,
- source_folder=folder_name,
- )
-
- # Extract full name (can be a dict with firstName/lastName or a string)
- full_name_data = role_data.get("dk.carp.webservices.input.full_name")
- if full_name_data:
- if isinstance(full_name_data, dict):
- # Combine firstName, middleName, lastName
- parts = []
- if full_name_data.get("firstName"):
- parts.append(full_name_data["firstName"])
- if full_name_data.get("middleName"):
- parts.append(full_name_data["middleName"])
- if full_name_data.get("lastName"):
- parts.append(full_name_data["lastName"])
- if parts:
- participant.full_name = " ".join(parts)
- elif isinstance(full_name_data, str):
- participant.full_name = full_name_data
-
- # Extract sex
- sex_data = role_data.get("dk.cachet.carp.input.sex")
- if sex_data:
- participant.sex = sex_data
-
- # Extract SSN (can be a dict with socialSecurityNumber or a string)
- ssn_data = role_data.get("dk.carp.webservices.input.ssn")
- if ssn_data:
- if isinstance(ssn_data, dict):
- ssn_value = ssn_data.get("socialSecurityNumber")
- if ssn_value:
- participant.ssn = str(ssn_value)
- elif isinstance(ssn_data, str):
- participant.ssn = ssn_data
-
- # Extract consent info
- consent_data = role_data.get("dk.carp.webservices.input.informed_consent")
- if consent_data:
- participant.consent_signed = True
- if isinstance(consent_data, dict):
- participant.consent_timestamp = consent_data.get("signedTimestamp")
- participant.user_id = consent_data.get("userId")
- participant.email = consent_data.get(
- "name"
- ) # email is stored in "name" field
-
- # Extract name from consent signature if not already set
- if not participant.full_name:
- consent_json_str = consent_data.get("consent")
- if consent_json_str and isinstance(consent_json_str, str):
- try:
- consent_doc = json.loads(consent_json_str)
- signature = consent_doc.get("signature", {})
- if isinstance(signature, dict):
- first_name = (signature.get("firstName") or "").strip()
- last_name = (signature.get("lastName") or "").strip()
- if first_name or last_name:
- participant.full_name = (
- f"{first_name} {last_name}".strip()
- )
- except json.JSONDecodeError:
- pass
-
- self.participants_by_deployment[deployment_id] = participant
-
- def _unify_participants(self):
- """
- Identify the same participant across different folders/deployments.
- Uses email as primary identifier (most accurate), falls back to SSN, then full name.
- """
- # Group by identifier
- by_email: Dict[str, List[ParticipantInfo]] = defaultdict(list)
- by_ssn: Dict[str, List[ParticipantInfo]] = defaultdict(list)
- by_name: Dict[str, List[ParticipantInfo]] = defaultdict(list)
-
- for p in self.participants_by_deployment.values():
- # Email, SSN, name must be strings for use as dict keys
- if p.email and isinstance(p.email, str):
- by_email[p.email.lower()].append(p) # normalize email to lowercase
- if p.ssn and isinstance(p.ssn, str):
- by_ssn[p.ssn].append(p)
- if p.full_name and isinstance(p.full_name, str):
- by_name[p.full_name.strip().lower()].append(p) # normalize name
-
- # Assign unified IDs, preferring email grouping (most accurate)
- assigned: Set[str] = set() # deployment IDs already assigned
-
- # First pass: use email (most accurate identifier)
- for email, participants in by_email.items():
- unified_id = f"P{self._unified_id_counter:04d}"
- self._unified_id_counter += 1
-
- for p in participants:
- if p.study_deployment_id not in assigned:
- p.unified_participant_id = unified_id
- assigned.add(p.study_deployment_id)
-
- self.unified_participants[unified_id] = participants
-
- # Second pass: use SSN for remaining
- for ssn, participants in by_ssn.items():
- unassigned = [p for p in participants if p.study_deployment_id not in assigned]
- if not unassigned:
- continue
-
- unified_id = f"P{self._unified_id_counter:04d}"
- self._unified_id_counter += 1
-
- for p in unassigned:
- p.unified_participant_id = unified_id
- assigned.add(p.study_deployment_id)
-
- self.unified_participants[unified_id] = unassigned
-
- # Third pass: use full name for remaining
- for name, participants in by_name.items():
- unassigned = [p for p in participants if p.study_deployment_id not in assigned]
- if not unassigned:
- continue
-
- unified_id = f"P{self._unified_id_counter:04d}"
- self._unified_id_counter += 1
-
- for p in unassigned:
- p.unified_participant_id = unified_id
- assigned.add(p.study_deployment_id)
-
- self.unified_participants[unified_id] = unassigned
-
- # Remaining participants get their own unified ID
- for p in self.participants_by_deployment.values():
- if p.study_deployment_id not in assigned:
- unified_id = f"P{self._unified_id_counter:04d}"
- self._unified_id_counter += 1
- p.unified_participant_id = unified_id
- self.unified_participants[unified_id] = [p]
-
- # Propagate name/SSN data across unified participants
- # If any deployment has name/SSN, share it with all deployments of same participant
- self._propagate_participant_data()
-
- def _propagate_participant_data(self):
- """
- Propagate name, SSN, and other data to all records of the same unified participant.
- If one deployment has data that others don't, copy it to all.
- """
- for unified_id, participants in self.unified_participants.items():
- # Collect best available data from all records
- best_full_name = None
- best_ssn = None
- best_sex = None
-
- for p in participants:
- if p.full_name and isinstance(p.full_name, str) and not best_full_name:
- best_full_name = p.full_name
- if p.ssn and isinstance(p.ssn, str) and not best_ssn:
- best_ssn = p.ssn
- if p.sex and not best_sex:
- best_sex = p.sex
-
- # Apply to all records
- for p in participants:
- if best_full_name and not p.full_name:
- p.full_name = best_full_name
- if best_ssn and not p.ssn:
- p.ssn = best_ssn
- if best_sex and not p.sex:
- p.sex = best_sex
-
- def get_participant(self, study_deployment_id: str) -> Optional[ParticipantInfo]:
- """Get participant info by study deployment ID."""
- return self.participants_by_deployment.get(study_deployment_id)
-
- def get_unified_participant(self, unified_id: str) -> List[ParticipantInfo]:
- """Get all deployments for a unified participant."""
- return self.unified_participants.get(unified_id, [])
-
- def find_by_email(self, email: str) -> List[ParticipantInfo]:
- """Find all participant records matching an email address."""
- email_lower = email.lower()
- return [
- p
- for p in self.participants_by_deployment.values()
- if p.email and p.email.lower() == email_lower
- ]
-
- def find_by_ssn(self, ssn: str) -> List[ParticipantInfo]:
- """Find all participant records matching an SSN."""
- return [p for p in self.participants_by_deployment.values() if p.ssn and p.ssn == ssn]
-
- def find_by_name(self, name: str) -> List[ParticipantInfo]:
- """Find all participant records matching a full name (case-insensitive)."""
- name_lower = name.strip().lower()
- return [
- p
- for p in self.participants_by_deployment.values()
- if p.full_name and p.full_name.strip().lower() == name_lower
- ]
-
- def get_deployment_ids_by_email(self, email: str) -> List[str]:
- """Get all deployment IDs for a participant by email."""
- return [p.study_deployment_id for p in self.find_by_email(email)]
-
- def get_deployment_ids_by_ssn(self, ssn: str) -> List[str]:
- """Get all deployment IDs for a participant by SSN."""
- return [p.study_deployment_id for p in self.find_by_ssn(ssn)]
-
- def get_deployment_ids_by_name(self, name: str) -> List[str]:
- """Get all deployment IDs for a participant by name."""
- return [p.study_deployment_id for p in self.find_by_name(name)]
-
- def print_summary(self):
- """Print a summary table of participants."""
- table = Table(title="Participants Summary")
- table.add_column("Unified ID", style="cyan")
- table.add_column("Deployments", style="magenta")
- table.add_column("Folders", style="green")
- table.add_column("Email", style="yellow")
- table.add_column("SSN", style="red")
- table.add_column("Full Name", style="white")
-
- for unified_id, participants in self.unified_participants.items():
- folders = set(p.source_folder for p in participants if p.source_folder)
- emails = set(p.email for p in participants if p.email and isinstance(p.email, str))
- ssns = set(p.ssn for p in participants if p.ssn and isinstance(p.ssn, str))
- names = set(
- p.full_name for p in participants if p.full_name and isinstance(p.full_name, str)
- )
- table.add_row(
- unified_id,
- str(len(participants)),
- ", ".join(sorted(folders)),
- ", ".join(emails) if emails else "N/A",
- ", ".join(ssns) if ssns else "N/A",
- ", ".join(names) if names else "N/A",
- )
-
- console.print(table)
-
-
class ParticipantAccessor:
    """
    Fluent API for accessing participant data.

    All lookups are keyed by the participant's email address; matching
    records and deployment IDs are resolved once in __init__ and cached.

    Usage: sd.participant("email@example.com").info(), .all_data(), .available_fields()
           sd.participant("email@example.com").visualize.location()
    """

    def __init__(self, sleepiness_data: "CarpDataStream", email: str):
        # Resolve all matching records and deployment IDs up front so every
        # later call filters against the same cached ID set.
        self._sd = sleepiness_data
        self._email = email
        self._participants = sleepiness_data.participant_manager.find_by_email(email)
        self._deployment_ids = set(
            sleepiness_data.participant_manager.get_deployment_ids_by_email(email)
        )
        # Lazily-created visualizer; see the `visualize` property.
        self._visualizer = None

    @property
    def exists(self) -> bool:
        """Check if participant exists (at least one record matched the email)."""
        return len(self._participants) > 0

    @property
    def visualize(self):
        """
        Access visualization methods for this participant.
        Usage: sd.participant("email").visualize.location()

        The visualizer is imported and constructed on first access so that
        plotting dependencies are only loaded when actually used.
        """
        if self._visualizer is None:
            from .plotting import ParticipantVisualizer

            self._visualizer = ParticipantVisualizer(self._sd, self._deployment_ids, self._email)
        return self._visualizer

    def info(self) -> Optional[Dict[str, Any]]:
        """
        Get participant information as a dictionary.
        Returns combined info from all deployments for this participant,
        or None when no record matched the email.

        Scalar fields (name, SSN, sex, ...) come from the first matching
        record; folders and deployment IDs are aggregated across all records.
        """
        if not self._participants:
            return None

        # Get first participant as base for the scalar fields.
        base = self._participants[0]

        # Combine info from all records.
        all_folders = set()
        all_deployment_ids = set()

        for p in self._participants:
            if p.source_folder:
                all_folders.add(p.source_folder)
            all_deployment_ids.add(p.study_deployment_id)

        return {
            "email": self._email,
            "unified_id": base.unified_participant_id,
            "full_name": base.full_name,
            "ssn": base.ssn,
            "sex": base.sex,
            "user_id": base.user_id,
            "consent_signed": base.consent_signed,
            "consent_timestamp": base.consent_timestamp,
            "folders": sorted(all_folders),
            "deployment_ids": sorted(all_deployment_ids),
            "num_deployments": len(all_deployment_ids),
        }

    def print_info(self):
        """Print participant information in a formatted two-column table."""
        info = self.info()
        if not info:
            console.print(f"[red]No participant found with email: {self._email}[/red]")
            return

        table = Table(title=f"Participant: {self._email}")
        table.add_column("Field", style="cyan")
        table.add_column("Value", style="white")

        for key, value in info.items():
            # Lists (folders, deployment IDs) are flattened for display.
            if isinstance(value, list):
                value = ", ".join(str(v) for v in value)
            table.add_row(key, str(value) if value is not None else "N/A")

        console.print(table)

    def all_data(self, data_type: Optional[str] = None) -> Generator[Dict[str, Any], None, None]:
        """
        Get all data items for this participant, streamed lazily.
        Optionally filter by data type (e.g., "dk.cachet.carp.stepcount").
        """
        yield from self._sd._get_data_by_deployment_ids(self._deployment_ids, data_type)

    def available_fields(self, sample_size: int = 100) -> Set[str]:
        """
        Discover all available fields in this participant's data.
        Scans up to sample_size records and returns field paths in
        dot-notation (list elements are marked with "[]").
        """
        fields = set()
        count = 0

        for item in self.all_data():
            if count >= sample_size:
                break
            self._collect_fields(item, "", fields)
            count += 1

        return fields

    def _collect_fields(self, obj: Any, prefix: str, fields: Set[str]):
        """Recursively collect dot-separated field paths into `fields`."""
        if isinstance(obj, dict):
            for key, value in obj.items():
                path = f"{prefix}.{key}" if prefix else key
                fields.add(path)
                self._collect_fields(value, path, fields)
        elif isinstance(obj, list) and obj:
            # Only the first list element is sampled; heterogeneous lists
            # may therefore under-report fields.
            self._collect_fields(obj[0], f"{prefix}[]", fields)

    def print_available_fields(self, sample_size: int = 100):
        """Print all available fields, sorted, one per line."""
        fields = self.available_fields(sample_size)
        console.print(f"[bold]Available fields for {self._email}:[/bold]")
        for f in sorted(fields):
            console.print(f"  - {f}")

    def data_types(self) -> Set[str]:
        """Get all unique data type names (without namespace) for this participant.

        Requires a full pass over the participant's data.
        """
        types = set()
        for item in self.all_data():
            data_stream = item.get("dataStream", {})
            data_type = data_stream.get("dataType", {})
            type_name = data_type.get("name")
            if type_name:
                types.add(type_name)
        return types

    def print_data_types(self):
        """Print all data types available for this participant, sorted."""
        types = self.data_types()
        console.print(f"[bold]Data types for {self._email}:[/bold]")
        for t in sorted(types):
            console.print(f"  - {t}")

    def count(self, data_type: Optional[str] = None) -> int:
        """Count total data items for this participant (full streaming pass)."""
        return sum(1 for _ in self.all_data(data_type))

    def dataframe(self, data_type: str, parquet_dir: Optional[str] = None):
        """
        Get a pandas DataFrame of this participant's data for a specific type.
        Uses parquet files if available and parquet_dir is specified;
        otherwise falls back to a (slower) streaming scan of the JSON input.

        Returns None if pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError:
            console.print(
                "[red]pandas is required for dataframe(). Install with: pip install pandas[/red]"
            )
            return None

        if parquet_dir:
            # Try to load from parquet and filter to this participant's
            # deployments. An empty/missing parquet falls through to streaming.
            df = self._sd.get_dataframe(data_type, parquet_dir)
            if df is not None and not df.empty:
                return df[df["studyDeploymentId"].isin(self._deployment_ids)]

        # Fall back to streaming.
        items = list(self.all_data(data_type))
        if not items:
            return pd.DataFrame()
        return pd.DataFrame(items)
-
-
-class CarpDataStream:
- def __init__(self, file_paths: str | Path | List[str | Path], load_participants: bool = True):
- if isinstance(file_paths, (str, Path)):
- file_paths = [file_paths]
-
- self.file_paths = [Path(p) for p in file_paths]
- for p in self.file_paths:
- if not p.exists():
- raise FileNotFoundError(f"File not found: {p}")
-
- self.schema_cache = {}
- self.participant_manager = ParticipantManager()
-
- # Auto-detect and load participant data from parent folders
- if load_participants:
- self._auto_load_participants()
-
- def _auto_load_participants(self):
- """
- Automatically detect and load participant data from the data folders
- containing the input files.
- """
- data_folders = set()
- for file_path in self.file_paths:
- # Each file is typically in a phase folder like data/phase-1-1/data-streams.json
- parent = file_path.parent
- if (parent / "participant-data.json").exists():
- data_folders.add(parent)
-
- if data_folders:
- self.participant_manager.load_participant_data(list(data_folders))
-
- def load_participants_from_folders(self, folders: List[str | Path]):
- """
- Manually load participant data from specific folders.
- Useful when files are in a different location than the input data.
- """
- folder_paths = [Path(f) for f in folders]
- self.participant_manager.load_participant_data(folder_paths)
-
- def participant(self, email: str) -> ParticipantAccessor:
- """
- Access participant data via email using a fluent API.
-
- Usage:
- sd.participant("email@example.com").info()
- sd.participant("email@example.com").all_data()
- sd.participant("email@example.com").available_fields()
- sd.participant("email@example.com").data_types()
- sd.participant("email@example.com").dataframe("dk.cachet.carp.stepcount")
- """
- return ParticipantAccessor(self, email)
-
- def get_participant(self, study_deployment_id: str) -> Optional[ParticipantInfo]:
- """Get participant info by study deployment ID."""
- return self.participant_manager.get_participant(study_deployment_id)
-
- def find_participant_by_email(self, email: str) -> List[ParticipantInfo]:
- """Find all participant records matching an email address."""
- return self.participant_manager.find_by_email(email)
-
- def find_participant_by_ssn(self, ssn: str) -> List[ParticipantInfo]:
- """Find all participant records matching an SSN."""
- return self.participant_manager.find_by_ssn(ssn)
-
- def find_participant_by_name(self, name: str) -> List[ParticipantInfo]:
- """Find all participant records matching a full name."""
- return self.participant_manager.find_by_name(name)
-
- def get_data_by_email(
- self, email: str, data_type: Optional[str] = None
- ) -> Generator[Dict[str, Any], None, None]:
- """
- Get all data items for a participant identified by email.
- Optionally filter by data type.
- """
- deployment_ids = set(self.participant_manager.get_deployment_ids_by_email(email))
- yield from self._get_data_by_deployment_ids(deployment_ids, data_type)
-
- def get_data_by_ssn(
- self, ssn: str, data_type: Optional[str] = None
- ) -> Generator[Dict[str, Any], None, None]:
- """
- Get all data items for a participant identified by SSN.
- Optionally filter by data type.
- """
- deployment_ids = set(self.participant_manager.get_deployment_ids_by_ssn(ssn))
- yield from self._get_data_by_deployment_ids(deployment_ids, data_type)
-
- def get_data_by_name(
- self, name: str, data_type: Optional[str] = None
- ) -> Generator[Dict[str, Any], None, None]:
- """
- Get all data items for a participant identified by full name.
- Optionally filter by data type.
- """
- deployment_ids = set(self.participant_manager.get_deployment_ids_by_name(name))
- yield from self._get_data_by_deployment_ids(deployment_ids, data_type)
-
- def _get_data_by_deployment_ids(
- self, deployment_ids: set, data_type: Optional[str] = None
- ) -> Generator[Dict[str, Any], None, None]:
- """Internal helper to filter data by deployment IDs and optionally by type."""
- if not deployment_ids:
- return
-
- for item in self._get_item_generator():
- item_deployment_id = item.get("studyDeploymentId")
- if not item_deployment_id:
- item_deployment_id = item.get("dataStream", {}).get("studyDeploymentId")
-
- if item_deployment_id not in deployment_ids:
- continue
-
- if data_type:
- dt = item.get("dataStream", {}).get("dataType", {})
- target_namespace, target_name = data_type.rsplit(".", 1)
- if dt.get("name") != target_name or dt.get("namespace") != target_namespace:
- continue
-
- yield item
-
- def print_participants(self):
- """Print a summary of all participants."""
- self.participant_manager.print_summary()
-
    def _get_item_generator(self) -> Generator[Dict[str, Any], None, None]:
        """
        Returns a generator that yields items from the JSON files.
        Uses ijson for memory-efficient streaming, so very large files are
        never loaded fully into memory.
        """
        for file_path in self.file_paths:
            with open(file_path, "rb") as f:
                # Each file is assumed to be a single top-level JSON array;
                # the "item" prefix matches every element of that array.
                # use_float=True forces plain floats instead of Decimal,
                # avoiding Decimal schema mismatches when data reaches PyArrow.
                yield from ijson.items(f, "item", use_float=True)
-
- def _get_item_generator_with_participant(
- self, include_participant: bool = False
- ) -> Generator[Dict[str, Any], None, None]:
- """
- Returns a generator that yields items from the JSON files,
- optionally enriched with participant info.
- """
- for item in self._get_item_generator():
- if include_participant:
- deployment_id = item.get("studyDeploymentId")
- if not deployment_id:
- deployment_id = item.get("dataStream", {}).get("studyDeploymentId")
-
- if deployment_id:
- participant = self.participant_manager.get_participant(deployment_id)
- if participant:
- item = item.copy() # Don't mutate original
- item["_participant"] = participant.to_dict()
-
- yield item
-
- def get_data_with_participants(
- self, data_type: Optional[str] = None
- ) -> Generator[Dict[str, Any], None, None]:
- """
- Yields items enriched with participant information.
- If data_type is specified, filters to that type.
- """
- gen = self._get_item_generator_with_participant(include_participant=True)
-
- if data_type:
- target_namespace, target_name = data_type.rsplit(".", 1)
- for item in gen:
- dt = item.get("dataStream", {}).get("dataType", {})
- if dt.get("name") == target_name and dt.get("namespace") == target_namespace:
- yield item
- else:
- yield from gen
-
    def group_by_participant(self, output_dir: str | Path, data_type: Optional[str] = None):
        """
        Groups data by unified participant ID and exports each group to a separate JSON file.
        Useful for analyzing individual participant data across all phases.

        Writes one "<unified_id>.json" JSON array per participant into
        output_dir; items without participant info go to "unknown.json".
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        console.print(f"[bold blue]Grouping data by participant into {output_dir}...[/bold blue]")

        # One open handle per group, kept open for the whole run.
        # NOTE(review): each distinct unified ID keeps a file handle open;
        # with very many participants this could hit the OS fd limit.
        files = {}
        gen = self.get_data_with_participants(data_type)

        try:
            for item in tqdm(gen, desc="Grouping by participant"):
                participant_info = item.get("_participant", {})
                unified_id = participant_info.get("unified_participant_id", "unknown")

                # Lazily open the group's file and start its JSON array.
                if unified_id not in files:
                    f = open(output_dir / f"{unified_id}.json", "w")
                    f.write("[")
                    files[unified_id] = {"handle": f, "first": True}

                f_info = files[unified_id]
                if not f_info["first"]:
                    f_info["handle"].write(",")
                json.dump(item, f_info["handle"])
                f_info["first"] = False

        finally:
            # Always terminate every array so each file remains valid JSON,
            # even if the iteration above raised.
            for f_info in files.values():
                f_info["handle"].write("]")
                f_info["handle"].close()

        console.print(
            f"[bold green]Grouping complete! Created {len(files)} participant files.[/bold green]"
        )
-
- def group_by_email(self, output_dir: str | Path, data_type: Optional[str] = None):
- """
- Groups data by participant email and exports each group to a separate JSON file.
- """
- self._group_by_field_value(output_dir, "email", data_type)
-
- def group_by_ssn(self, output_dir: str | Path, data_type: Optional[str] = None):
- """
- Groups data by participant SSN and exports each group to a separate JSON file.
- """
- self._group_by_field_value(output_dir, "ssn", data_type)
-
- def group_by_name(self, output_dir: str | Path, data_type: Optional[str] = None):
- """
- Groups data by participant full name and exports each group to a separate JSON file.
- """
- self._group_by_field_value(output_dir, "full_name", data_type)
-
    def _group_by_field_value(
        self, output_dir: str | Path, field: str, data_type: Optional[str] = None
    ):
        """Internal helper to group data by a participant field (email, ssn, or full_name).

        Writes one "<sanitized value>.json" JSON array per distinct field
        value into output_dir; missing or non-string values fall back to
        "unknown".
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        console.print(f"[bold blue]Grouping data by {field} into {output_dir}...[/bold blue]")

        # One open handle per group, kept open for the whole run.
        files = {}
        gen = self.get_data_with_participants(data_type)

        try:
            for item in tqdm(gen, desc=f"Grouping by {field}"):
                participant_info = item.get("_participant", {})
                value = participant_info.get(field, "unknown")

                if not value or not isinstance(value, str):
                    value = "unknown"

                # Sanitize filename: keep alphanumerics plus -, _, @, .
                # (so email addresses remain mostly intact).
                safe_value = "".join(
                    c for c in value if c.isalnum() or c in ("-", "_", "@", ".")
                ).strip()
                if not safe_value:
                    safe_value = "unknown"

                # Lazily open the group's file and start its JSON array.
                if safe_value not in files:
                    f = open(output_dir / f"{safe_value}.json", "w")
                    f.write("[")
                    files[safe_value] = {"handle": f, "first": True}

                f_info = files[safe_value]
                if not f_info["first"]:
                    f_info["handle"].write(",")
                json.dump(item, f_info["handle"])
                f_info["first"] = False

        finally:
            # Always terminate every array so each file remains valid JSON.
            for f_info in files.values():
                f_info["handle"].write("]")
                f_info["handle"].close()

        console.print(f"[bold green]Grouping complete! Created {len(files)} files.[/bold green]")
-
    def get_dataframe_with_participants(
        self, data_type: str, parquet_dir: Optional[str | Path] = None
    ):
        """
        Returns a pandas DataFrame for the specified data type, enriched with participant info.
        Adds columns: participant_id, participant_email, participant_folder

        Returns None when pandas is unavailable; returns the base frame
        unchanged (possibly None/empty) when it has no deployment-ID column.
        """
        try:
            import pandas as pd
        except ImportError:
            console.print("[bold red]pandas is required for DataFrame conversion.[/bold red]")
            return None

        # Get base dataframe (from parquet if available, else a JSON scan).
        df = self.get_dataframe(data_type, parquet_dir)
        if df is None or df.empty:
            return df

        # Row-wise mapper: deployment ID -> participant columns (all None
        # when the deployment ID is unknown to the participant manager).
        def get_participant_info(deployment_id):
            p = self.participant_manager.get_participant(deployment_id)
            if p:
                return pd.Series(
                    {
                        "participant_id": p.unified_participant_id,
                        "participant_email": p.email,
                        "participant_folder": p.source_folder,
                    }
                )
            return pd.Series(
                {"participant_id": None, "participant_email": None, "participant_folder": None}
            )

        # Extract studyDeploymentId from dataStream column if it exists;
        # otherwise fall back to a top-level studyDeploymentId column.
        if "dataStream" in df.columns:
            deployment_ids = df["dataStream"].apply(
                lambda x: x.get("studyDeploymentId") if isinstance(x, dict) else None
            )
        elif "studyDeploymentId" in df.columns:
            deployment_ids = df["studyDeploymentId"]
        else:
            console.print("[yellow]Could not find studyDeploymentId column[/yellow]")
            return df

        participant_info = deployment_ids.apply(get_participant_info)
        return pd.concat([df, participant_info], axis=1)
-
    def scan_schema(self) -> Dict[str, Any]:
        """
        Scans the entire file to infer the schema of the data.
        Returns a dictionary mapping data types to their field structures.

        The result maps each fully-qualified type ("namespace.name") to the
        union of field names seen under measurement.data, and is cached on
        self.schema_cache.
        """
        schemas = defaultdict(set)

        console.print(f"[bold blue]Scanning schema for {len(self.file_paths)} files...[/bold blue]")

        # The total item count is unknown without a full pass, so the
        # progress bar runs without a total and is advanced in batches of
        # 1000, with a final flush of the remainder after the loop.
        count = 0
        with tqdm(desc="Processing items", unit=" items") as pbar:
            for item in self._get_item_generator():
                data_type = item.get("dataStream", {}).get("dataType", {}).get("name", "unknown")
                namespace = (
                    item.get("dataStream", {}).get("dataType", {}).get("namespace", "unknown")
                )
                full_type = f"{namespace}.{data_type}"

                measurement_data = item.get("measurement", {}).get("data", {})

                # Collect keys: union of all field names observed per type.
                for key in measurement_data.keys():
                    schemas[full_type].add(key)

                count += 1
                if count % 1000 == 0:
                    pbar.update(1000)
            pbar.update(count % 1000)

        # Convert sets to lists for JSON serialization/display
        self.schema_cache = {k: list(v) for k, v in schemas.items()}
        return self.schema_cache
-
- def print_schema(self):
- if not self.schema_cache:
- self.scan_schema()
-
- table = Table(title="Inferred Schema")
- table.add_column("Data Type", style="cyan")
- table.add_column("Fields", style="magenta")
-
- for dtype, fields in self.schema_cache.items():
- table.add_row(dtype, ", ".join(sorted(fields)))
-
- console.print(table)
-
- def get_data_by_type(self, target_type: str) -> Generator[Dict[str, Any], None, None]:
- """
- Yields items of a specific data type.
- """
- target_namespace, target_name = target_type.rsplit(".", 1)
-
- for item in self._get_item_generator():
- dt = item.get("dataStream", {}).get("dataType", {})
- if dt.get("name") == target_name and dt.get("namespace") == target_namespace:
- yield item
-
    def export_to_json(self, output_path: str, data_type: Optional[str] = None):
        """
        Exports data to a JSON file. Can filter by data type.

        The output is written incrementally as a single JSON array, so the
        full dataset is never held in memory.
        """
        console.print(f"[bold green]Exporting data to {output_path}...[/bold green]")

        generator = self.get_data_by_type(data_type) if data_type else self._get_item_generator()

        # NOTE(review): open() uses the platform default encoding — confirm
        # whether an explicit encoding="utf-8" is wanted here.
        with open(output_path, "w") as f:
            f.write("[")
            first = True
            for item in tqdm(generator, desc="Exporting"):
                # Comma-separate all but the first element.
                if not first:
                    f.write(",")
                json.dump(item, f)
                first = False
            f.write("]")

        console.print("[bold green]Export complete![/bold green]")
-
    def group_by_field(self, field_path: str, output_dir: str | Path):
        """
        Groups data by a specific field and exports each group to a separate JSON file.
        field_path is a dot-separated string, e.g., 'dataStream.dataType.name'.

        Items where the path cannot be resolved are written to "unknown.json".
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        console.print(f"[bold blue]Grouping data by {field_path} into {output_dir}...[/bold blue]")

        # One open handle per group value, kept open for the whole run.
        # This assumes a reasonable number of distinct values (<100 or so);
        # a very high-cardinality field would exhaust file descriptors.
        # (An LRU cache of handles would be the low-memory alternative.)
        files = {}

        try:
            for item in tqdm(self._get_item_generator(), desc="Grouping"):
                # Walk the dot-separated path; a non-dict along the way
                # resolves to None ("unknown").
                value = item
                for part in field_path.split("."):
                    if isinstance(value, dict):
                        value = value.get(part)
                    else:
                        value = None
                        break

                if value is None:
                    value = "unknown"

                value = str(value)
                # Sanitize filename: keep alphanumerics plus - and _ only.
                safe_value = "".join(c for c in value if c.isalnum() or c in ("-", "_")).strip()
                if not safe_value:
                    safe_value = "unknown"

                # Lazily open the group's file and start its JSON array.
                if safe_value not in files:
                    f = open(output_dir / f"{safe_value}.json", "w")
                    f.write("[")
                    files[safe_value] = {"handle": f, "first": True}

                f_info = files[safe_value]
                if not f_info["first"]:
                    f_info["handle"].write(",")
                json.dump(item, f_info["handle"])
                f_info["first"] = False

        finally:
            # Always terminate every array so each file remains valid JSON.
            for f_info in files.values():
                f_info["handle"].write("]")
                f_info["handle"].close()

        console.print(f"[bold green]Grouping complete! Created {len(files)} files.[/bold green]")
-
- def count_items(self) -> int:
- """
- Counts the total number of items in the JSON file.
- """
- console.print(f"[bold blue]Counting items in {len(self.file_paths)} files...[/bold blue]")
- count = 0
- for _ in tqdm(self._get_item_generator(), desc="Counting"):
- count += 1
- return count
-
    def convert_to_parquet(self, output_dir: str | Path, batch_size: int = 10000):
        """
        Converts the JSON data to Parquet files, grouped by data type.
        Requires pyarrow and pandas.

        Items are buffered per (simplified) type name and flushed in batches
        of batch_size rows; one "<type>.parquet" file is produced per type.
        """
        import importlib.util

        # Check availability without importing the heavy packages yet.
        if not importlib.util.find_spec("pyarrow") or not importlib.util.find_spec("pandas"):
            console.print(
                "[bold red]pyarrow and pandas are required for Parquet conversion. Please install them.[/bold red]"
            )
            return

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        console.print(f"[bold blue]Converting to Parquet in {output_dir}...[/bold blue]")

        writers = {}
        buffers = defaultdict(list)

        try:
            for item in tqdm(self._get_item_generator(), desc="Converting"):
                # Determine the type: only the short type name is used (the
                # namespace is dropped), sanitized for use as a filename.
                try:
                    dtype = item.get("dataStream", {}).get("dataType", {}).get("name", "unknown")
                    # Sanitize
                    safe_name = "".join(c for c in dtype if c.isalnum() or c in ("-", "_")).strip()
                    if not safe_name:
                        safe_name = "unknown"
                except (AttributeError, TypeError):
                    safe_name = "unknown"

                buffers[safe_name].append(item)

                # Flush a full batch for this type.
                if len(buffers[safe_name]) >= batch_size:
                    self._flush_buffer_to_parquet(
                        safe_name, buffers[safe_name], writers, output_dir
                    )
                    buffers[safe_name].clear()

        finally:
            # Flush remaining partial batches, then close all writers.
            for name, buf in buffers.items():
                if buf:
                    self._flush_buffer_to_parquet(name, buf, writers, output_dir)

            # Close writers
            for writer in writers.values():
                writer.close()

        console.print(
            f"[bold green]Conversion complete! Created {len(writers)} Parquet files.[/bold green]"
        )
-
    def _flush_buffer_to_parquet(self, name, buffer, writers, output_dir):
        """Write one batch of items to the per-type Parquet writer.

        name: sanitized type name (also the output file stem).
        buffer: list of item dicts to write; an empty buffer is a no-op.
        writers: dict of open ParquetWriter instances, mutated in place.
        output_dir: Path of the directory holding the .parquet files.

        Conversion errors are reported and the batch is dropped (best-effort).
        """
        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq

        if not buffer:
            return

        try:
            # Flatten nested dicts into dot-separated columns (e.g. measurement.data.steps).
            # This avoids PyArrow inferring empty struct types (like "metadata": {})
            # which Parquet cannot represent, and also eliminates schema drift caused
            # by nested fields appearing/disappearing across batches.
            df = pd.json_normalize(buffer)
            table = pa.Table.from_pandas(df)
        except Exception as e:
            console.print(f"[red]Error converting batch for {name}: {e}[/red]")
            return

        if name not in writers:
            file_path = output_dir / f"{name}.parquet"
            writers[name] = pq.ParquetWriter(file_path, table.schema)

        try:
            if not table.schema.equals(writers[name].schema):
                # Unify schemas: merge the writer's existing schema with the new batch's
                # schema so that columns present in either side are kept (new columns
                # get nulls in earlier batches, missing columns get nulls here).
                merged = pa.unify_schemas(
                    [writers[name].schema, table.schema], promote_options="permissive"
                )

                # Reopen writer with the wider schema.
                # NOTE(review): opening a new ParquetWriter on the same path
                # truncates the file, so batches written before the schema
                # change appear to be lost — confirm whether schema drift can
                # actually occur mid-conversion and whether that data loss is
                # acceptable.
                writers[name].close()
                file_path = output_dir / f"{name}.parquet"
                writers[name] = pq.ParquetWriter(file_path, merged)

                table = table.cast(merged)

            writers[name].write_table(table)
        except Exception as e:
            console.print(f"[red]Error writing batch for {name}: {e}[/red]")
-
    def get_dataframe(self, data_type: str, parquet_dir: Optional[str | Path] = None):
        """
        Returns a pandas DataFrame for the specified data type.
        If parquet_dir is provided and contains the corresponding parquet file, it loads from there.
        Otherwise, it scans the JSON file (which is slower).

        Returns None when pandas is unavailable.
        """
        try:
            import pandas as pd
        except ImportError:
            console.print(
                "[bold red]pandas is required for DataFrame conversion. Please install it.[/bold red]"
            )
            return None

        # Check Parquet first
        if parquet_dir:
            parquet_dir = Path(parquet_dir)
            # data_type might be full namespace "dk.cachet.carp.heartbeat"
            # or just "heartbeat" if we simplified names in conversion.
            # Our conversion uses simplified names, so try that first.

            simple_name = data_type.split(".")[-1]
            parquet_path = parquet_dir / f"{simple_name}.parquet"

            if parquet_path.exists():
                console.print(f"[bold blue]Loading {data_type} from {parquet_path}...[/bold blue]")
                return pd.read_parquet(parquet_path)

            # Try the sanitized full name as a fallback.
            safe_full_name = "".join(c for c in data_type if c.isalnum() or c in ("-", "_")).strip()
            parquet_path_full = parquet_dir / f"{safe_full_name}.parquet"
            if parquet_path_full.exists():
                console.print(
                    f"[bold blue]Loading {data_type} from {parquet_path_full}...[/bold blue]"
                )
                return pd.read_parquet(parquet_path_full)

        # Fallback to JSON scan (loads all matching items into memory).
        console.print(
            f"[bold yellow]Parquet file not found. Scanning JSON for {data_type}...[/bold yellow]"
        )
        data = list(tqdm(self.get_data_by_type(data_type), desc="Loading to DataFrame"))
        return pd.DataFrame(data)
-
- def list_all_fields(self, sample_size: int = 100) -> List[str]:
- """
- Scans a sample of items to find all available dot-separated field paths.
- Useful for determining what fields can be used in group_by_field.
- """
- console.print(
- f"[bold blue]Scanning first {sample_size} items to find field paths...[/bold blue]"
- )
- paths = set()
-
- def _recurse(obj, current_path):
- if isinstance(obj, dict):
- for k, v in obj.items():
- new_path = f"{current_path}.{k}" if current_path else k
- paths.add(new_path)
- _recurse(v, new_path)
-
- count = 0
- for item in self._get_item_generator():
- _recurse(item, "")
- count += 1
- if count >= sample_size:
- break
-
- return sorted(list(paths))
-
    def generate_type_definitions(
        self, output_file: str = "generated_types.py", sample_size: int = 1000
    ):
        """
        Generates a Python module with dataclasses representing the data schema.
        Detects nested JSON strings and generates types for them as well.

        output_file: path of the Python module to (over)write.
        sample_size: number of items sampled for schema inference.
        """
        console.print(f"[bold blue]Inferring schema from first {sample_size} items...[/bold blue]")
        schema = self._infer_full_schema(sample_size)

        console.print("[bold blue]Generating code...[/bold blue]")
        code = self._generate_code_from_schema(schema)

        with open(output_file, "w") as f:
            f.write(code)
        console.print(f"[bold green]Generated type definitions in {output_file}[/bold green]")
-
    def _infer_full_schema(self, sample_size: int) -> Dict[str, Any]:
        """Infer a recursive schema description from the first sample_size items.

        Each schema node is a dict carrying a "type" key ("object", "list",
        "primitive", or "Any" on conflict) plus type-specific keys:
        "fields" (object), "item_type" (list), "python_type" (primitive),
        and optional flags "nullable" and "is_json_string".
        """
        root_schema = {"type": "object", "fields": {}}

        def merge(schema, value):
            # Merge one observed value into the running schema node, in place.
            if value is None:
                # None only marks the node nullable; it never changes the type.
                schema["nullable"] = True
                return

            if isinstance(value, dict):
                # Conflicting non-object type seen before -> degrade to Any.
                if schema.get("type") and schema["type"] != "object":
                    schema["type"] = "Any"  # Conflict
                    return
                schema["type"] = "object"
                if "fields" not in schema:
                    schema["fields"] = {}

                for k, v in value.items():
                    if k not in schema["fields"]:
                        schema["fields"][k] = {}
                    merge(schema["fields"][k], v)

            elif isinstance(value, list):
                # Conflicting non-list type seen before -> degrade to Any.
                if schema.get("type") and schema["type"] != "list":
                    schema["type"] = "Any"
                    return
                schema["type"] = "list"
                if "item_type" not in schema:
                    schema["item_type"] = {}

                # All elements are merged into a single item schema.
                for item in value:
                    merge(schema["item_type"], item)

            else:
                # Primitive. Strings that look like embedded JSON objects or
                # arrays are parsed and merged as structured data instead,
                # with the node flagged is_json_string.
                is_json = False
                if isinstance(value, str):
                    try:
                        if (value.strip().startswith("{") and value.strip().endswith("}")) or (
                            value.strip().startswith("[") and value.strip().endswith("]")
                        ):
                            parsed = json.loads(value)
                            if isinstance(parsed, (dict, list)):
                                is_json = True
                                schema["is_json_string"] = True
                                merge(schema, parsed)
                                return
                    except (json.JSONDecodeError, TypeError):
                        pass

                if not is_json:
                    py_type = type(value).__name__
                    # Map python types to type hints.
                    # NOTE(review): this if/elif chain maps each name to
                    # itself, so it is currently an identity no-op — it only
                    # matters if non-builtin type names ever need remapping.
                    if py_type == "float":
                        py_type = "float"
                    elif py_type == "int":
                        py_type = "int"
                    elif py_type == "str":
                        py_type = "str"
                    elif py_type == "bool":
                        py_type = "bool"

                    if schema.get("type") == "primitive" and schema.get("python_type") != py_type:
                        # If mixing int and float, upgrade to float
                        if {schema.get("python_type"), py_type} == {"int", "float"}:
                            schema["python_type"] = "float"
                        else:
                            schema["python_type"] = "Any"
                    else:
                        schema["type"] = "primitive"
                        schema["python_type"] = py_type

        # Merge up to sample_size items into the root schema.
        count = 0
        for item in self._get_item_generator():
            merge(root_schema, item)
            count += 1
            if count >= sample_size:
                break

        return root_schema
-
- def _generate_code_from_schema(self, schema: Dict[str, Any]) -> str:
- classes = {} # name -> definition
-
- def get_type_name(schema, context_name):
- if schema.get("type") == "object":
- class_name = "".join(x[:1].upper() + x[1:] for x in context_name.split("_"))
- if not class_name:
- class_name = "Root"
-
- # Handle collision
- base_name = class_name
- counter = 1
- while (
- class_name in classes
- and classes[class_name] is not None
- and classes[class_name] != schema.get("fields")
- ):
- # Note: comparing fields is a weak check for equality, but sufficient for now
- class_name = f"{base_name}{counter}"
- counter += 1
-
- if class_name not in classes:
- classes[class_name] = None # Placeholder
- fields = []
- for k, v in schema.get("fields", {}).items():
- field_type = get_type_name(v, k)
- fields.append(
- (
- k,
- field_type,
- v.get("nullable", False),
- v.get("is_json_string", False),
- )
- )
- classes[class_name] = fields
-
- return class_name
-
- elif schema.get("type") == "list":
- item_type = get_type_name(schema.get("item_type", {}), context_name + "_item")
- return f"List[{item_type}]"
-
- elif schema.get("type") == "primitive":
- t = schema.get("python_type", "Any")
- return "Any" if t == "Any" else t
-
- return "Any"
-
- get_type_name(schema, "SleepinessItem")
-
- # Generate Code
- lines = [
- "# Auto-generated type definitions",
- "",
- "from __future__ import annotations",
- "from dataclasses import dataclass",
- "from typing import List, Optional, Any, Dict",
- "import json",
- "",
- "def parse_json_field(value):",
- " if isinstance(value, str):",
- " try:",
- " return json.loads(value)",
- " except:",
- " return value",
- " return value",
- "",
- ]
-
- for name, fields in classes.items():
- if fields is None:
- continue # Should not happen if recursion finished
-
- lines.append("@dataclass")
- lines.append(f"class {name}:")
- if not fields:
- lines.append(" pass")
-
- for fname, ftype, nullable, is_json in fields:
- safe_fname = fname
- if safe_fname in (
- "from",
- "class",
- "def",
- "return",
- "import",
- "type",
- "global",
- "for",
- "if",
- "else",
- "while",
- ):
- safe_fname = f"{fname}_"
-
- type_hint = ftype
- if nullable:
- type_hint = f"Optional[{type_hint}]"
-
- lines.append(f" {safe_fname}: {type_hint} = None")
-
- # Add from_dict method
- lines.append("")
- lines.append(" @classmethod")
- lines.append(" def from_dict(cls, obj: Any) -> Any:")
- lines.append(" if not isinstance(obj, dict): return obj")
- lines.append(" instance = cls()")
- for fname, ftype, nullable, is_json in fields:
- safe_fname = fname
- if safe_fname in (
- "from",
- "class",
- "def",
- "return",
- "import",
- "type",
- "global",
- "for",
- "if",
- "else",
- "while",
- ):
- safe_fname = f"{fname}_"
-
- base_type = ftype
- is_list = False
- if ftype.startswith("List[") and ftype.endswith("]"):
- base_type = ftype[5:-1]
- is_list = True
-
- is_custom_class = base_type in classes
-
- lines.append(f" val = obj.get('{fname}')")
- if is_json:
- lines.append(" if isinstance(val, str): val = parse_json_field(val)")
-
- if is_custom_class:
- if is_list:
- lines.append(" if val is not None and isinstance(val, list):")
- lines.append(
- f" instance.{safe_fname} = [{base_type}.from_dict(x) for x in val]"
- )
- else:
- lines.append(" if val is not None:")
- lines.append(
- f" instance.{safe_fname} = {base_type}.from_dict(val)"
- )
- else:
- lines.append(f" instance.{safe_fname} = val")
-
- lines.append(" return instance")
- lines.append("")
-
- return "\n".join(lines)
diff --git a/src/carp/records/__init__.py b/src/carp/records/__init__.py
new file mode 100644
index 0000000..639e1be
--- /dev/null
+++ b/src/carp/records/__init__.py
@@ -0,0 +1,5 @@
+"""Record iteration services."""
+
+from .service import RecordService
+
+__all__ = ["RecordService"]
diff --git a/src/carp/records/service.py b/src/carp/records/service.py
new file mode 100644
index 0000000..8637457
--- /dev/null
+++ b/src/carp/records/service.py
@@ -0,0 +1,81 @@
+"""CARP record iteration, filtering, and inspection."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable, Iterator
+from typing import Any
+
+from carp.core.fields import collect_field_paths, deployment_id_from_record, full_data_type
+from carp.core.files import iter_json_array
+
+
+class RecordService:
+ """Stream and filter CARP records."""
+
+ def __init__(self, file_paths: tuple[Any, ...], participant_directory: Any) -> None:
+ self._file_paths = file_paths
+ self._participants = participant_directory
+
+ def iter_records(
+ self,
+ data_type: str | None = None,
+ deployment_ids: Iterable[str] | None = None,
+ ) -> Iterator[dict[str, Any]]:
+ """Yield records matching optional data-type and deployment filters."""
+
+ allowed_ids = set(deployment_ids or [])
+ for file_path in self._file_paths:
+ for item in iter_json_array(file_path):
+ if allowed_ids and deployment_id_from_record(item) not in allowed_ids:
+ continue
+ if data_type and full_data_type(item) != data_type:
+ continue
+ yield item
+
+ def iter_with_participants(self, data_type: str | None = None) -> Iterator[dict[str, Any]]:
+        """Yield records, enriched with participant metadata when available."""
+
+ for item in self.iter_records(data_type):
+ participant = self._participants.get_participant(deployment_id_from_record(item) or "")
+ if not participant:
+ yield item
+ continue
+ enriched = dict(item)
+ enriched["_participant"] = participant.to_dict()
+ yield enriched
+
+ def count(
+ self,
+ data_type: str | None = None,
+ deployment_ids: Iterable[str] | None = None,
+ ) -> int:
+ """Return the number of matching records."""
+
+ return sum(1 for _ in self.iter_records(data_type, deployment_ids))
+
+ def list_fields(self, sample_size: int = 100) -> list[str]:
+        """Return sorted field paths discovered in the first ``sample_size`` records."""
+
+ fields: set[str] = set()
+ for index, item in enumerate(self.iter_records()):
+ if index >= sample_size:
+ break
+ fields.update(self.collect_fields(item))
+ return sorted(fields)
+
+ def data_types(self) -> list[str]:
+ """Return all observed record data types."""
+
+ return sorted({self.data_type(item) for item in self.iter_records()})
+
+ @staticmethod
+ def collect_fields(item: dict[str, Any]) -> set[str]:
+ """Collect field paths for one record."""
+
+ return collect_field_paths(item)
+
+ @staticmethod
+ def data_type(item: dict[str, Any]) -> str:
+ """Return the fully qualified data type for one record."""
+
+ return full_data_type(item)
diff --git a/src/carp/schema/__init__.py b/src/carp/schema/__init__.py
new file mode 100644
index 0000000..29e6fac
--- /dev/null
+++ b/src/carp/schema/__init__.py
@@ -0,0 +1,5 @@
+"""Schema discovery services."""
+
+from .service import SchemaService
+
+__all__ = ["SchemaService"]
diff --git a/src/carp/schema/service.py b/src/carp/schema/service.py
new file mode 100644
index 0000000..f3583ab
--- /dev/null
+++ b/src/carp/schema/service.py
@@ -0,0 +1,30 @@
+"""Schema discovery for CARP studies."""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import Any
+
+
+class SchemaService:
+ """Infer lightweight measurement schemas grouped by data type."""
+
+ def __init__(self, records: Any) -> None:
+ self._records = records
+ self._cache: dict[str, list[str]] = {}
+
+ def scan(self) -> dict[str, list[str]]:
+ """Return inferred measurement keys grouped by data type."""
+
+ schemas: dict[str, set[str]] = defaultdict(set)
+ for item in self._records.iter_records():
+ measurement = item.get("measurement", {}).get("data", {})
+ for key in measurement.keys():
+ schemas[self._records.data_type(item)].add(key)
+ self._cache = {key: sorted(values) for key, values in sorted(schemas.items())}
+ return self._cache
+
+ def cached(self) -> dict[str, list[str]]:
+ """Return the cached schema, scanning the study if needed."""
+
+ return self._cache or self.scan()
diff --git a/src/carp/study.py b/src/carp/study.py
new file mode 100644
index 0000000..88ea2c0
--- /dev/null
+++ b/src/carp/study.py
@@ -0,0 +1,47 @@
+"""Composition root for the modular CARP Analytics API."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from carp.constants import PARTICIPANT_FILE
+from carp.core.files import resolve_paths
+from carp.export import ExportService
+from carp.frames import FrameService
+from carp.participants import ParticipantDirectory, ParticipantService
+from carp.plotting import PlotService
+from carp.records import RecordService
+from carp.schema import SchemaService
+from carp.types import TypeDefinitionService
+
+
+def _discover_participant_folders(file_paths: tuple[Path, ...]) -> tuple[Path, ...]:
+ """Return phase folders that contain participant metadata."""
+
+ folders = {path.parent for path in file_paths if (path.parent / PARTICIPANT_FILE).exists()}
+ return tuple(sorted(folders))
+
+
+class CarpStudy:
+ """Primary public entrypoint for working with CARP study data."""
+
+ def __init__(
+ self,
+ file_paths: str | Path | tuple[str | Path, ...] | list[str | Path],
+ load_participants: bool = True,
+ ):
+ self.file_paths = resolve_paths(file_paths)
+ participant_folders = _discover_participant_folders(self.file_paths) if load_participants else ()
+ self._directory = ParticipantDirectory.from_folders(participant_folders)
+ self.records = RecordService(self.file_paths, self._directory)
+ self.participants = ParticipantService(self, self._directory)
+ self.schema = SchemaService(self.records)
+ self.export = ExportService(self.records)
+ self.frames = FrameService(self.records, self._directory)
+ self.types = TypeDefinitionService(self.records)
+ self.plots = PlotService(self.frames, self.participants)
+
+ def participant(self, email: str) -> object:
+ """Return a participant-scoped view by email."""
+
+ return self.participants.view(email)
diff --git a/src/carp/types/__init__.py b/src/carp/types/__init__.py
new file mode 100644
index 0000000..240dfa7
--- /dev/null
+++ b/src/carp/types/__init__.py
@@ -0,0 +1,5 @@
+"""Type-generation services."""
+
+from .service import TypeDefinitionService
+
+__all__ = ["TypeDefinitionService"]
diff --git a/src/carp/types/infer.py b/src/carp/types/infer.py
new file mode 100644
index 0000000..f235e7d
--- /dev/null
+++ b/src/carp/types/infer.py
@@ -0,0 +1,64 @@
+"""Schema inference helpers for generated type definitions."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+def _maybe_json_string(value: object) -> dict[str, Any] | list[Any] | None:
+    """Return the parsed object for JSON object/array strings, else ``None``."""
+
+ if not isinstance(value, str):
+ return None
+ stripped = value.strip()
+ if not stripped or stripped[0] not in "[{" or stripped[-1] not in "]}":
+ return None
+ try:
+ parsed = json.loads(stripped)
+ except json.JSONDecodeError:
+ return None
+ return parsed if isinstance(parsed, (dict, list)) else None
+
+
+def merge_schema(schema: dict[str, Any], value: Any) -> None:
+ """Merge a Python value into an inferred schema."""
+
+ if value is None:
+ schema["nullable"] = True
+ return
+ parsed = _maybe_json_string(value)
+ if parsed is not None:
+ schema["is_json_string"] = True
+ merge_schema(schema, parsed)
+ return
+ if isinstance(value, dict):
+ schema["type"] = "object"
+ fields = schema.setdefault("fields", {})
+ for key, child in value.items():
+ merge_schema(fields.setdefault(key, {}), child)
+ return
+ if isinstance(value, list):
+ schema["type"] = "list"
+ item_type = schema.setdefault("item_type", {})
+ for child in value:
+ merge_schema(item_type, child)
+ return
+ python_type = type(value).__name__
+ if schema.get("type") == "primitive" and schema.get("python_type") != python_type:
+ pair = {schema.get("python_type"), python_type}
+ schema["python_type"] = "float" if pair == {"int", "float"} else "Any"
+ return
+ schema["type"] = "primitive"
+ schema["python_type"] = python_type
+
+
+def infer_schema(records: Any, sample_size: int) -> dict[str, Any]:
+ """Infer a schema from sampled study records."""
+
+ root = {"type": "object", "fields": {}}
+ for index, item in enumerate(records):
+ if index >= sample_size:
+ break
+ merge_schema(root, item)
+ return root
diff --git a/src/carp/types/render.py b/src/carp/types/render.py
new file mode 100644
index 0000000..6a3896c
--- /dev/null
+++ b/src/carp/types/render.py
@@ -0,0 +1,97 @@
+"""Code rendering for inferred type definitions."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def render_types(schema: dict[str, Any], root_name: str = "StudyItem") -> str:
+ """Render dataclass code from an inferred schema."""
+
+ classes: dict[str, list[tuple[str, str, bool, bool]] | None] = {}
+
+ def type_name(node: dict[str, Any], context: str) -> str:
+ if node.get("type") == "object":
+ class_name = "".join(part[:1].upper() + part[1:] for part in context.split("_")) or root_name
+ while class_name in classes:
+ class_name = f"{class_name}Item"
+ classes[class_name] = None
+ fields = []
+ for key, value in node.get("fields", {}).items():
+ fields.append(
+ (
+ key,
+ type_name(value, key),
+ value.get("nullable", False),
+ value.get("is_json_string", False),
+ )
+ )
+ classes[class_name] = fields
+ return class_name
+ if node.get("type") == "list":
+ return f"list[{type_name(node.get('item_type', {}), context + '_item')}]"
+ if node.get("type") == "primitive":
+ return str(node.get("python_type", "Any"))
+ return "Any"
+
+ type_name(schema, root_name)
+ lines = [
+ '"""Auto-generated type definitions for CARP data."""',
+ "",
+ "from __future__ import annotations",
+ "",
+ "import json",
+ "from dataclasses import dataclass",
+ "from typing import Any",
+ "",
+ "",
+ "def parse_json_field(value: Any) -> Any:",
+ ' """Parse JSON-like string fields when possible."""',
+ "",
+ " if not isinstance(value, str):",
+ " return value",
+ " try:",
+ " return json.loads(value)",
+ " except json.JSONDecodeError:",
+ " return value",
+ "",
+ ]
+ for class_name, fields in classes.items():
+ lines.extend(["@dataclass(slots=True)", f"class {class_name}:", f' """Generated dataclass for `{class_name}`."""'])
+ if not fields:
+ lines.extend([" pass", ""])
+ continue
+ for name, annotation, nullable, _ in fields:
+ type_hint = f"{annotation} | None" if nullable else annotation
+ safe_name = f"{name}_" if name in {"class", "from", "type"} else name
+ lines.append(f" {safe_name}: {type_hint} = None")
+ lines.extend(
+ [
+ "",
+ " @classmethod",
+ " def from_dict(cls, obj: Any) -> Any:",
+ ' """Build an instance from a dictionary."""',
+ "",
+ " if not isinstance(obj, dict):",
+ " return obj",
+ " instance = cls()",
+ ]
+ )
+ for name, annotation, _, is_json in fields:
+ safe_name = f"{name}_" if name in {"class", "from", "type"} else name
+ base_type = annotation.removeprefix("list[").removesuffix("]")
+ lines.append(f" value = obj.get('{name}')")
+ if is_json:
+ lines.append(" value = parse_json_field(value)")
+ if annotation.startswith("list[") and base_type in classes:
+ lines.extend(
+ [
+ " if isinstance(value, list):",
+ f" value = [{base_type}.from_dict(item) for item in value]",
+ ]
+ )
+ elif base_type in classes:
+ lines.extend([" if value is not None:", f" value = {base_type}.from_dict(value)"])
+ lines.append(f" instance.{safe_name} = value")
+ lines.extend([" return instance", ""])
+ return "\n".join(lines)
diff --git a/src/carp/types/service.py b/src/carp/types/service.py
new file mode 100644
index 0000000..a274261
--- /dev/null
+++ b/src/carp/types/service.py
@@ -0,0 +1,28 @@
+"""Type-definition generation services."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from .infer import infer_schema
+from .render import render_types
+
+
+class TypeDefinitionService:
+ """Generate typed Python models from sampled CARP records."""
+
+ def __init__(self, records: Any) -> None:
+ self._records = records
+
+ def generate(
+ self,
+ output_file: str | Path = "generated_types.py",
+ sample_size: int = 1_000,
+ ) -> Path:
+ """Generate a Python module containing inferred dataclasses."""
+
+ schema = infer_schema(self._records.iter_records(), sample_size)
+ output_path = Path(output_file)
+ output_path.write_text(render_types(schema), encoding="utf-8")
+ return output_path
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..49c7d1b
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,33 @@
+"""Shared pytest fixtures for CARP Analytics."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from carp import CarpStudy
+
+
+@pytest.fixture()
+def fixture_root() -> Path:
+ """Return the self-contained multi-phase fixture root."""
+
+ return Path(__file__).parent / "fixtures" / "multi_phase"
+
+
+@pytest.fixture()
+def study_paths(fixture_root: Path) -> list[Path]:
+ """Return the default synthetic study file paths."""
+
+ return [
+ fixture_root / "phase_a" / "data-streams.json",
+ fixture_root / "phase_b" / "data-streams.json",
+ ]
+
+
+@pytest.fixture()
+def study(study_paths: list[Path]) -> CarpStudy:
+ """Return a study backed by self-contained fixtures."""
+
+ return CarpStudy(study_paths)
diff --git a/tests/fixtures/multi_phase/phase_a/data-streams.json b/tests/fixtures/multi_phase/phase_a/data-streams.json
new file mode 100644
index 0000000..111918d
--- /dev/null
+++ b/tests/fixtures/multi_phase/phase_a/data-streams.json
@@ -0,0 +1,109 @@
+[
+ {
+ "studyDeploymentId": "deploy-email-a",
+ "dataStream": {
+ "studyDeploymentId": "deploy-email-a",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "stepcount"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 1000,
+ "data": {
+ "steps": 100
+ }
+ },
+ "sequenceId": 1,
+ "syncPoint": 1,
+ "triggerIds": [],
+ "deviceRoleName": "Phone"
+ },
+ {
+ "studyDeploymentId": "deploy-email-a",
+ "dataStream": {
+ "studyDeploymentId": "deploy-email-a",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "location"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 1000,
+ "data": {
+ "latitude": 55.1,
+ "longitude": 12.1
+ }
+ },
+ "sequenceId": 2,
+ "syncPoint": 1,
+ "triggerIds": [],
+ "deviceRoleName": "Phone"
+ },
+ {
+ "studyDeploymentId": "deploy-ssn-a",
+ "dataStream": {
+ "studyDeploymentId": "deploy-ssn-a",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "stepcount"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 2000,
+ "data": {
+ "steps": 50
+ }
+ },
+ "sequenceId": 3,
+ "syncPoint": 1,
+ "triggerIds": [],
+ "deviceRoleName": "Phone"
+ },
+ {
+ "dataStream": {
+ "studyDeploymentId": "deploy-name-a",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "survey"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 3000,
+ "data": {
+ "response_json": "{\"score\": 3, \"tags\": [\"rested\", \"calm\"]}"
+ }
+ },
+ "sequenceId": 4,
+ "syncPoint": 1,
+ "triggerIds": [
+ "survey"
+ ],
+ "deviceRoleName": "Phone"
+ },
+ {
+ "studyDeploymentId": "deploy-unknown-a",
+ "dataStream": {
+ "studyDeploymentId": "deploy-unknown-a",
+ "dataType": {
+ "namespace": "com.acme",
+ "name": "stepcount"
+ },
+ "deviceRoleName": "Watch"
+ },
+ "measurement": {
+ "sensorStartTime": 4000,
+ "data": {
+ "steps": 9
+ }
+ },
+ "sequenceId": 5,
+ "syncPoint": 1,
+ "triggerIds": [],
+ "deviceRoleName": "Watch"
+ }
+]
diff --git a/tests/fixtures/multi_phase/phase_a/participant-data.json b/tests/fixtures/multi_phase/phase_a/participant-data.json
new file mode 100644
index 0000000..3f4d24d
--- /dev/null
+++ b/tests/fixtures/multi_phase/phase_a/participant-data.json
@@ -0,0 +1,64 @@
+[
+ {
+ "studyDeploymentId": "deploy-email-a",
+ "roles": [
+ {
+ "roleName": "Participant",
+ "data": {
+ "dk.carp.webservices.input.full_name": {
+ "firstName": "Alice",
+ "lastName": "Example"
+ },
+ "dk.carp.webservices.input.informed_consent": {
+ "signedTimestamp": "2024-01-01T00:00:00Z",
+ "userId": "user-email-a",
+ "name": "alice@example.com",
+ "consent": "{\"signature\": {\"firstName\": \"Alice\", \"lastName\": \"Example\"}}"
+ }
+ }
+ }
+ ],
+ "common": {}
+ },
+ {
+ "studyDeploymentId": "deploy-ssn-a",
+ "roles": [
+ {
+ "roleName": "Participant",
+ "data": {
+ "dk.carp.webservices.input.full_name": "Bob Example",
+ "dk.carp.webservices.input.ssn": {
+ "socialSecurityNumber": "1111"
+ },
+ "dk.cachet.carp.input.sex": "male"
+ }
+ }
+ ],
+ "common": {}
+ },
+ {
+ "studyDeploymentId": "deploy-name-a",
+ "roles": [
+ {
+ "roleName": "Participant",
+ "data": {
+ "dk.carp.webservices.input.full_name": {
+ "firstName": "Charlie",
+ "lastName": "Example"
+ }
+ }
+ }
+ ],
+ "common": {}
+ },
+ {
+ "studyDeploymentId": "deploy-unknown-a",
+ "roles": [
+ {
+ "roleName": "Participant",
+ "data": {}
+ }
+ ],
+ "common": {}
+ }
+]
diff --git a/tests/fixtures/multi_phase/phase_b/data-streams.json b/tests/fixtures/multi_phase/phase_b/data-streams.json
new file mode 100644
index 0000000..167d50d
--- /dev/null
+++ b/tests/fixtures/multi_phase/phase_b/data-streams.json
@@ -0,0 +1,133 @@
+[
+ {
+ "studyDeploymentId": "deploy-email-b",
+ "dataStream": {
+ "studyDeploymentId": "deploy-email-b",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "stepcount"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 5000,
+ "data": {
+ "steps": 150,
+ "cadence": 90
+ }
+ },
+ "sequenceId": 6,
+ "syncPoint": 1,
+ "triggerIds": [],
+ "deviceRoleName": "Phone"
+ },
+ {
+ "studyDeploymentId": "deploy-email-b",
+ "dataStream": {
+ "studyDeploymentId": "deploy-email-b",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "location"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 5000,
+ "data": {
+ "latitude": 55.2,
+ "longitude": 12.2
+ }
+ },
+ "sequenceId": 7,
+ "syncPoint": 1,
+ "triggerIds": [],
+ "deviceRoleName": "Phone"
+ },
+ {
+ "studyDeploymentId": "deploy-ssn-b",
+ "dataStream": {
+ "studyDeploymentId": "deploy-ssn-b",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "stepcount"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 6000,
+ "data": {
+ "steps": 70
+ }
+ },
+ "sequenceId": 8,
+ "syncPoint": 1,
+ "triggerIds": [],
+ "deviceRoleName": "Phone"
+ },
+ {
+ "studyDeploymentId": "deploy-name-b",
+ "dataStream": {
+ "studyDeploymentId": "deploy-name-b",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "survey"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 7000,
+ "data": {
+ "response_json": "{\"score\": 5}"
+ }
+ },
+ "sequenceId": 9,
+ "syncPoint": 1,
+ "triggerIds": [
+ "survey"
+ ],
+ "deviceRoleName": "Phone"
+ },
+ {
+ "studyDeploymentId": "deploy-name-b",
+ "dataStream": {
+ "studyDeploymentId": "deploy-name-b",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "location"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 7100,
+ "data": {
+ "latitude": 56.0,
+ "longitude": 13.0
+ }
+ },
+ "sequenceId": 10,
+ "syncPoint": 1,
+ "triggerIds": [],
+ "deviceRoleName": "Phone"
+ },
+ {
+ "studyDeploymentId": "deploy-orphan",
+ "dataStream": {
+ "studyDeploymentId": "deploy-orphan",
+ "dataType": {
+ "namespace": "dk.cachet.carp",
+ "name": "weather"
+ },
+ "deviceRoleName": "Phone"
+ },
+ "measurement": {
+ "sensorStartTime": 8000,
+ "data": {
+ "temperature": 21
+ }
+ },
+ "sequenceId": 11,
+ "syncPoint": 1,
+ "triggerIds": [],
+ "deviceRoleName": "Phone"
+ }
+]
diff --git a/tests/fixtures/multi_phase/phase_b/participant-data.json b/tests/fixtures/multi_phase/phase_b/participant-data.json
new file mode 100644
index 0000000..5cd0b16
--- /dev/null
+++ b/tests/fixtures/multi_phase/phase_b/participant-data.json
@@ -0,0 +1,46 @@
+[
+ {
+ "studyDeploymentId": "deploy-email-b",
+ "roles": [
+ {
+ "roleName": "Participant",
+ "data": {
+ "dk.carp.webservices.input.informed_consent": {
+ "signedTimestamp": "2024-01-02T00:00:00Z",
+ "userId": "user-email-b",
+ "name": "alice@example.com",
+ "consent": "{\"signature\": {\"firstName\": \"Alice\", \"lastName\": \"Example\"}}"
+ }
+ }
+ }
+ ],
+ "common": {}
+ },
+ {
+ "studyDeploymentId": "deploy-ssn-b",
+ "roles": [
+ {
+ "roleName": "Participant",
+ "data": {
+ "dk.carp.webservices.input.full_name": "Robert Example",
+ "dk.carp.webservices.input.ssn": {
+ "socialSecurityNumber": "1111"
+ }
+ }
+ }
+ ],
+ "common": {}
+ },
+ {
+ "studyDeploymentId": "deploy-name-b",
+ "roles": [
+ {
+ "roleName": "Participant",
+ "data": {
+ "dk.carp.webservices.input.full_name": "Charlie Example"
+ }
+ }
+ ],
+ "common": {}
+ }
+]
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..5896922
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,57 @@
+"""Tests for CLI wiring and command execution."""
+
+from __future__ import annotations
+
+from argparse import Namespace
+
+from carp.commandline import app as cli_app
+
+
+def test_cli_commands_and_help(capsys, study_paths, tmp_path) -> None:
+ """Exercise the public CLI commands."""
+
+ assert cli_app.main(["--version"]) == 0
+ assert cli_app.main([]) == 0
+ assert cli_app.main(["schema", *map(str, study_paths)]) == 0
+ assert cli_app.main(["count", *map(str, study_paths)]) == 0
+ assert cli_app.main(["participants", *map(str, study_paths)]) == 0
+ assert cli_app.main(
+ ["export", *map(str, study_paths), "-o", str(tmp_path / "export.json"), "-t", "dk.cachet.carp.location"]
+ ) == 0
+ assert cli_app.main(["group", *map(str, study_paths), "-o", str(tmp_path / "grouped")]) == 0
+ captured = capsys.readouterr().out
+ assert "carp-analytics-python version" in captured
+ assert "Total items" in captured
+
+
+def test_cli_convert_and_error_paths(monkeypatch, capsys, study_paths, tmp_path) -> None:
+ """Exercise CLI conversion and exception-handling branches."""
+
+ assert cli_app.main(["convert", *map(str, study_paths), "-o", str(tmp_path / "parquet"), "--batch-size", "1"]) == 0
+ assert cli_app.main(["count", "missing.json"]) == 1
+
+ class FakeParser:
+ """Minimal fake parser for exception tests."""
+
+ def parse_args(self, _argv):
+ return Namespace(version=False, command="test", handler=lambda _args: (_ for _ in ()).throw(KeyboardInterrupt()))
+
+ def print_help(self):
+ return None
+
+ monkeypatch.setattr(cli_app, "_build_parser", lambda: FakeParser())
+ assert cli_app.main(["ignored"]) == 130
+ monkeypatch.setattr(
+ cli_app,
+ "_build_parser",
+ lambda: type(
+ "BrokenParser",
+ (),
+ {
+ "parse_args": lambda self, _argv: Namespace(version=False, command="x", handler=lambda _args: (_ for _ in ()).throw(ValueError("boom"))),
+ "print_help": lambda self: None,
+ },
+ )(),
+ )
+ assert cli_app.main(["ignored"]) == 1
+ assert "Error: boom" in capsys.readouterr().out
diff --git a/tests/test_core.py b/tests/test_core.py
new file mode 100644
index 0000000..110303a
--- /dev/null
+++ b/tests/test_core.py
@@ -0,0 +1,55 @@
+"""Tests for shared CARP helpers."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from carp.core.dependencies import import_or_raise, module_available
+from carp.core.fields import collect_field_paths, deployment_id_from_record, get_nested_value
+from carp.core.files import JsonArrayWriter, iter_json_array, resolve_paths
+from carp.core.naming import parquet_stem, sanitize_filename
+from carp.participants.directory import ParticipantDirectory
+from carp.participants.parser import load_participant_file
+
+
+def test_core_helpers_cover_nested_values_and_paths(study_paths: list[Path]) -> None:
+ """Exercise shared path and field helpers."""
+
+ record = next(iter_json_array(study_paths[0]))
+ assert resolve_paths(study_paths) == tuple(study_paths)
+ assert get_nested_value(record, "measurement.data.steps") == 100
+ assert get_nested_value(record, "missing.value", "fallback") == "fallback"
+ assert deployment_id_from_record(record) == "deploy-email-a"
+ assert "measurement.data.steps" in collect_field_paths(record)
+ assert sanitize_filename("alice@example.com", allowed="-_.@") == "alice@example.com"
+ assert parquet_stem("dk.cachet.carp.stepcount") == "dk.cachet.carp__stepcount"
+
+
+def test_json_array_writer_and_module_helpers(tmp_path: Path) -> None:
+ """Exercise JSON writing and optional dependency errors."""
+
+ output_path = tmp_path / "output.json"
+ writer = JsonArrayWriter(output_path)
+ writer.write({"value": 1})
+ writer.write({"value": 2})
+ writer.close()
+ assert output_path.read_text(encoding="utf-8") == '[{"value": 1},{"value": 2}]'
+ assert module_available("json") is True
+ with pytest.raises(RuntimeError):
+ import_or_raise("module_that_does_not_exist_for_tests", "test")
+
+
+def test_participant_loader_handles_invalid_consent(tmp_path: Path) -> None:
+ """Exercise parser branches for invalid consent payloads and missing folders."""
+
+ participant_file = tmp_path / "participant-data.json"
+ participant_file.write_text(
+ '[{"studyDeploymentId":"x","roles":[{"roleName":"Participant","data":{"dk.carp.webservices.input.informed_consent":"broken"}}]}]',
+ encoding="utf-8",
+ )
+ loaded = load_participant_file(participant_file)
+ assert loaded["x"].consent_signed is False
+ empty_directory = ParticipantDirectory.from_folders((tmp_path / "missing",))
+ assert empty_directory.summary_rows() == []
diff --git a/tests/test_edge_frames_plotting.py b/tests/test_edge_frames_plotting.py
new file mode 100644
index 0000000..f8eb23e
--- /dev/null
+++ b/tests/test_edge_frames_plotting.py
@@ -0,0 +1,84 @@
+"""Additional edge-case coverage for frames and plotting."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+from carp.core.dependencies import import_or_raise
+from carp.core.fields import collect_field_paths
+from carp.plotting.prepare import candidate_series, frames_from_items, prepare_location_frame, prepare_step_frame
+from carp.plotting.render import _merge_steps, render_heatmap
+
+
+def test_frame_service_edge_branches(study, tmp_path) -> None:
+ """Exercise dataframe and parquet helper branches."""
+
+ pandas = import_or_raise("pandas", "test")
+ pyarrow = import_or_raise("pyarrow", "test")
+ assert study.records.list_fields(sample_size=0) == []
+ assert collect_field_paths([]) == set()
+ assert "items[]" in collect_field_paths({"items": []})
+ assert study.frames.get_dataframe_with_participants("missing.type").empty
+ nested = pandas.DataFrame({"dataStream": [{"studyDeploymentId": "nested-id"}]})
+ assert study.frames._deployment_series(nested).tolist() == ["nested-id"]
+ assert study.frames._participant_row("deploy-email-a")["participant_email"] == "alice@example.com"
+ aligned = study.frames._align_table(
+ pyarrow,
+ pyarrow.Table.from_pylist([{"steps": 1}]),
+ pyarrow.schema([("steps", pyarrow.float64()), ("cadence", pyarrow.int64())]),
+ )
+ assert aligned.column_names == ["steps", "cadence"]
+ assert aligned["steps"][0].as_py() == 1.0
+ assert aligned["cadence"][0].as_py() is None
+ assert study.participant("alice@example.com").dataframe("missing.type").empty
+ assert study.participant("alice@example.com").available_fields(sample_size=0) == []
+ assert study.frames.convert_to_parquet(tmp_path / "flush", batch_size=50)
+ assert study.frames.get_dataframe("missing.type", tmp_path / "flush").empty
+
+
+def test_plotting_helpers_and_edge_paths(study, tmp_path, monkeypatch) -> None:
+ """Exercise helper functions and low-probability plotting branches."""
+
+ pandas = import_or_raise("pandas", "test")
+ location_items = [
+ SimpleNamespace(
+ measurement=SimpleNamespace(
+ data=SimpleNamespace(latitude=1.0, longitude=2.0),
+ sensorStartTime=10,
+ )
+ ),
+ SimpleNamespace(measurement=None),
+ ]
+ step_items = [
+ SimpleNamespace(measurement=SimpleNamespace(data=SimpleNamespace(steps=3), sensorStartTime=10)),
+ SimpleNamespace(measurement=SimpleNamespace(data=SimpleNamespace(steps=None), sensorStartTime=11)),
+ ]
+ location_frame, step_frame = frames_from_items(location_items, step_items)
+ assert not location_frame.empty and not step_frame.empty
+ assert candidate_series(pandas.DataFrame({"value": [1]}), ["missing", "value"]).tolist() == [1]
+ assert candidate_series(pandas.DataFrame({"nested": [{"a": {"b": 1}}]}), ["nested.a.b"]).tolist() == [1]
+ assert candidate_series(pandas.DataFrame({"value": [1]}), ["missing.path"]) is None
+ assert list(prepare_location_frame(study.frames.get_dataframe("dk.cachet.carp.location"))["_lat"]) == [55.1, 55.2, 56.0]
+ assert list(prepare_step_frame(study.frames.get_dataframe("dk.cachet.carp.stepcount"))["_steps"]) == [100, 50, 150, 70]
+ assert render_heatmap(location_frame.iloc[0:0], step_frame, tmp_path / "empty.html") is None
+ assert render_heatmap(location_frame, pandas.DataFrame({"_steps": [0], "_time": [10], "_lat": [1.0], "_lon": [2.0]}), tmp_path / "zero.html") is not None
+ assert _merge_steps(pandas, location_frame, pandas.DataFrame({"_steps": [1]})).empty
+ assert study.plots.unified("missing") is None
+ assert study.plots.deployment("missing", output_file=str(tmp_path / "missing.html")) is None
+ assert study.plots.deployment("deploy-email-a", location_type="missing.type", output_file=str(tmp_path / "noloc.html")) is None
+ assert study.plots.deployment("deploy-email-a", step_type="missing.type", output_file=str(tmp_path / "nosteps.html")) is not None
+ assert study.plots.from_items(location_items, step_items, output_file=str(tmp_path / "objects.html")) is not None
+ monkeypatch.setattr(study.plots, "candidate_series", lambda *_args, **_kwargs: None)
+ assert study.participant("alice@example.com").dataframe("dk.cachet.carp.stepcount").shape[0] == 4
+
+ calls = {"count": 0}
+
+    def staged_series(frame, *_args, **_kwargs):
+        # First call yields a deployment series; later calls simulate a missing column.
+        calls["count"] += 1
+        if calls["count"] == 1:
+            return pandas.Series(["deploy-email-a"] * len(frame), index=frame.index)
+        return None
+
+ monkeypatch.setattr("carp.plotting.service.candidate_series", staged_series)
+ assert study.plots.deployment("deploy-email-a", output_file=str(tmp_path / "staged.html")) is not None
diff --git a/tests/test_edge_types_cli.py b/tests/test_edge_types_cli.py
new file mode 100644
index 0000000..34690ab
--- /dev/null
+++ b/tests/test_edge_types_cli.py
@@ -0,0 +1,103 @@
+"""Additional edge-case coverage for CLI and type-generation helpers."""
+
+from __future__ import annotations
+
+import runpy
+from pathlib import Path
+
+from carp import CarpStudy
+from carp.core.fields import get_nested_value
+from carp.participants.parser import load_participant_file
+from carp.types.infer import _maybe_json_string, infer_schema, merge_schema
+from carp.types.render import render_types
+
+
+def test_cli_module_entrypoint(monkeypatch) -> None:
+ """Execute the module-level CLI entrypoint."""
+
+ exit_codes = []
+ monkeypatch.setattr("carp.commandline.app.main", lambda: 7)
+ monkeypatch.setattr("sys.exit", lambda code: exit_codes.append(code))
+ runpy.run_module("carp.cli", run_name="__main__")
+ assert exit_codes == [7]
+
+
+def test_parser_and_schema_edge_branches(study_paths: list[Path], tmp_path: Path) -> None:
+ """Exercise parser branches not covered by the default fixture."""
+
+ assert get_nested_value({"a": 1}, "a.b", "fallback") == "fallback"
+ assert CarpStudy(study_paths).schema.cached()["dk.cachet.carp.location"] == ["latitude", "longitude"]
+
+ participant_file = tmp_path / "participant-data.json"
+ participant_file.write_text(
+ """
+ [
+ {"roles": [{"data": {}}]},
+ {
+ "studyDeploymentId": "string-ssn",
+ "roles": [
+ {
+ "roleName": "Participant",
+ "data": {
+ "dk.carp.webservices.input.ssn": "2222",
+ "dk.carp.webservices.input.informed_consent": {
+ "name": "eve@example.com",
+ "consent": "{broken json}",
+ "note": 1
+ }
+ }
+ }
+ ]
+ },
+ {
+ "studyDeploymentId": "non-string-consent",
+ "roles": [
+ {
+ "roleName": "Participant",
+ "data": {
+ "dk.carp.webservices.input.informed_consent": {
+ "name": "nonstr@example.com",
+ "consent": 1
+ }
+ }
+ }
+ ]
+ }
+ ]
+ """,
+ encoding="utf-8",
+ )
+ loaded = load_participant_file(participant_file)
+ assert loaded["string-ssn"].ssn == "2222"
+ assert loaded["string-ssn"].email == "eve@example.com"
+ assert loaded["string-ssn"].full_name is None
+ assert loaded["non-string-consent"].email == "nonstr@example.com"
+
+
+def test_type_inference_and_rendering_edge_branches() -> None:
+ """Exercise edge branches in schema inference and code rendering."""
+
+ assert _maybe_json_string("plain text") is None
+ assert _maybe_json_string("{broken}") is None
+ schema = {}
+ merge_schema(schema, None)
+ merge_schema(schema, {"value": [1, 2.0]})
+ assert schema["nullable"] is True
+ assert infer_schema(iter([{"a": 1}, {"a": 2}]), sample_size=0)["fields"] == {}
+
+ rendered = render_types(
+ {
+ "type": "object",
+ "fields": {
+ "child": {"type": "object", "fields": {}},
+ "other": {"type": "object", "fields": {"child": {"type": "object", "fields": {"value": {"type": "primitive", "python_type": "int"}}}}},
+ "matching": {"type": "object", "fields": {"child": {"type": "object", "fields": {}}}},
+ "items": {"type": "list", "item_type": {"type": "object", "fields": {"from": {"type": "primitive", "python_type": "str"}}}},
+ "mystery": {},
+ },
+ }
+ )
+ assert "class Child:" in rendered
+ assert "class ChildItem:" in rendered
+ assert "from_: str = None" in rendered
+ assert "mystery: Any = None" in rendered
diff --git a/tests/test_export.py b/tests/test_export.py
new file mode 100644
index 0000000..8f618f1
--- /dev/null
+++ b/tests/test_export.py
@@ -0,0 +1,24 @@
+"""Tests for JSON export and grouping flows."""
+
+from __future__ import annotations
+
+import json
+
+
+def test_export_json_and_group_by_field(study, tmp_path) -> None:
+ """Exercise JSON export and field-based grouping."""
+
+ export_path = study.export.export_json(tmp_path / "records.json", "dk.cachet.carp.location")
+ payload = json.loads(export_path.read_text(encoding="utf-8"))
+ assert len(payload) == 3
+ grouped = study.export.group_by_field("dataStream.dataType.namespace", tmp_path / "grouped")
+ assert {path.name for path in grouped} == {"com.acme.json", "dk.cachet.carp.json"}
+
+
+def test_group_by_participant_and_identity(study, tmp_path) -> None:
+ """Exercise participant-aware grouping flows."""
+
+ participant_files = study.export.group_by_participant(tmp_path / "participants")
+ identity_files = study.export.group_by_identity("email", tmp_path / "emails")
+ assert len(participant_files) == 5
+ assert {path.name for path in identity_files} == {"alice@example.com.json", "unknown.json"}
diff --git a/tests/test_frames.py b/tests/test_frames.py
new file mode 100644
index 0000000..5198f84
--- /dev/null
+++ b/tests/test_frames.py
@@ -0,0 +1,29 @@
+"""Tests for dataframe and parquet services."""
+
+from __future__ import annotations
+
+
+def test_dataframe_loading_and_participant_columns(study) -> None:
+ """Exercise dataframe loading from JSON and participant enrichment."""
+
+ frame = study.frames.get_dataframe("dk.cachet.carp.stepcount")
+ assert frame.shape[0] == 4
+ enriched = study.frames.get_dataframe_with_participants("dk.cachet.carp.weather")
+ assert enriched.loc[0, "participant_id"] is None
+ assert study.frames.parquet_path("dk.cachet.carp.stepcount", "out").name == "dk.cachet.carp__stepcount.parquet"
+
+
+def test_parquet_conversion_and_reload(study, tmp_path) -> None:
+ """Exercise namespace-aware parquet conversion and reload."""
+
+ output_dir = tmp_path / "parquet"
+ created = study.frames.convert_to_parquet(output_dir, batch_size=1)
+ assert {path.name for path in created} == {
+ "com.acme__stepcount.parquet",
+ "dk.cachet.carp__location.parquet",
+ "dk.cachet.carp__stepcount.parquet",
+ "dk.cachet.carp__survey.parquet",
+ "dk.cachet.carp__weather.parquet",
+ }
+ frame = study.frames.get_dataframe("dk.cachet.carp.stepcount", output_dir)
+ assert set(frame.columns) >= {"studyDeploymentId", "measurement"}
diff --git a/tests/test_participants.py b/tests/test_participants.py
new file mode 100644
index 0000000..c2e5db2
--- /dev/null
+++ b/tests/test_participants.py
@@ -0,0 +1,42 @@
+"""Tests for participant lookup and unified views."""
+
+from __future__ import annotations
+
+from carp.participants.view import ParticipantView
+
+
+def test_participant_lookups_and_summary(study) -> None:
+ """Exercise participant lookup methods and summary rows."""
+
+ assert len(study.participants.by_email("alice@example.com")) == 2
+ assert len(study.participants.by_ssn("1111")) == 2
+ assert len(study.participants.by_name("Charlie Example")) == 2
+ summary_rows = study.participants.summary_rows()
+ assert len(summary_rows) == 4
+ assert any(row["emails"] == "alice@example.com" for row in summary_rows)
+
+
+def test_participant_view_info_fields_and_dataframe(study, tmp_path) -> None:
+ """Exercise the participant-scoped view object."""
+
+ participant = study.participant("alice@example.com")
+ assert isinstance(participant, ParticipantView)
+ info = participant.info()
+ assert info is not None
+ assert info["num_deployments"] == 2
+ assert participant.count() == 4
+ assert participant.data_types() == ["dk.cachet.carp.location", "dk.cachet.carp.stepcount"]
+ assert "measurement.data.latitude" in participant.available_fields()
+ assert "measurement.data.steps" in participant.available_fields()
+ assert participant.dataframe("dk.cachet.carp.stepcount").shape[0] == 2
+ assert participant.plot_location(output_file=str(tmp_path / "participant.html")) is not None
+
+
+def test_missing_participant_view_and_unified_lookup(study) -> None:
+ """Exercise missing participants and unified participant lookups."""
+
+ missing = study.participant("nobody@example.com")
+ assert missing.exists is False
+ assert missing.info() is None
+ unified_id = study.participant("alice@example.com").info()["unified_id"]
+ assert len(study.participants.unified(unified_id)) == 2
diff --git a/tests/test_real_data.py b/tests/test_real_data.py
new file mode 100644
index 0000000..82945e3
--- /dev/null
+++ b/tests/test_real_data.py
@@ -0,0 +1,23 @@
+"""Optional real-data integration tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from carp import CarpStudy
+
+SLEEP_DATA_ROOT = Path(__file__).resolve().parents[1] / "sleep-data"
+
+
+@pytest.mark.skipif(not SLEEP_DATA_ROOT.exists(), reason="sleep-data is not available")
+def test_real_data_smoke() -> None:
+ """Exercise stable invariants on local real study data."""
+
+ file_paths = sorted(SLEEP_DATA_ROOT.glob("phase-*/data-streams.json"))
+ study = CarpStudy(file_paths)
+ assert study.records.count() > 0
+ assert len(study.records.data_types()) >= 3
+ assert len(study.schema.scan()) >= 3
+ assert len(study.participants.summary_rows()) >= 1
diff --git a/tests/test_records_schema.py b/tests/test_records_schema.py
new file mode 100644
index 0000000..55bbd13
--- /dev/null
+++ b/tests/test_records_schema.py
@@ -0,0 +1,29 @@
+"""Tests for record iteration and schema discovery."""
+
+from __future__ import annotations
+
+
+def test_record_filters_and_participant_enrichment(study) -> None:
+ """Exercise record filtering and participant enrichment."""
+
+ assert study.records.count() == 11
+ assert study.records.count("dk.cachet.carp.stepcount") == 4
+ filtered = list(study.records.iter_records(deployment_ids=("deploy-email-a",)))
+ assert len(filtered) == 2
+ enriched = list(study.records.iter_with_participants("dk.cachet.carp.stepcount"))
+ assert all("_participant" in item for item in enriched)
+
+
+def test_record_field_listing_data_types_and_schema_cache(study) -> None:
+ """Exercise schema discovery and deployment-id fallback paths."""
+
+ data_types = study.records.data_types()
+ assert "com.acme.stepcount" in data_types
+ assert "dk.cachet.carp.survey" in data_types
+ assert "triggerIds[]" in study.records.list_fields()
+ survey = list(study.records.iter_records("dk.cachet.carp.survey"))
+ assert len(survey) == 2
+ assert study.records.count(deployment_ids=("deploy-name-a", "deploy-name-b")) == 3
+ schema = study.schema.scan()
+ assert schema["dk.cachet.carp.stepcount"] == ["cadence", "steps"]
+ assert study.schema.cached() == schema
diff --git a/tests/test_structure.py b/tests/test_structure.py
new file mode 100644
index 0000000..8eab46b
--- /dev/null
+++ b/tests/test_structure.py
@@ -0,0 +1,22 @@
+"""Structural repository tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def test_python_files_stay_under_two_hundred_lines() -> None:
+ """Enforce the 200-line limit for Python source and test files."""
+
+ root = Path(__file__).resolve().parents[1]
+ python_files = [
+ path
+ for path in root.rglob("*.py")
+ if all(part not in {".venv.nosync", "dist", "__pycache__"} for part in path.parts)
+ ]
+ offenders = []
+ for path in python_files:
+ line_count = len(path.read_text(encoding="utf-8").splitlines())
+ if line_count > 200:
+ offenders.append((path.relative_to(root), line_count))
+ assert offenders == []
diff --git a/tests/test_types_plotting.py b/tests/test_types_plotting.py
new file mode 100644
index 0000000..b25deb7
--- /dev/null
+++ b/tests/test_types_plotting.py
@@ -0,0 +1,44 @@
+"""Tests for generated types and plotting services."""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+
+
+def test_generate_type_definitions(study, tmp_path) -> None:
+ """Exercise generated type definitions for JSON-string payloads."""
+
+ output_path = study.types.generate(tmp_path / "generated_types.py", sample_size=11)
+ code = output_path.read_text(encoding="utf-8")
+ assert "parse_json_field" in code
+ assert "class StudyItem" in code
+ spec = importlib.util.spec_from_file_location("generated_types", output_path)
+ module = importlib.util.module_from_spec(spec)
+ assert spec.loader is not None
+ sys.modules[spec.name] = module
+ spec.loader.exec_module(module)
+ payload = {"measurement": {"data": {"response_json": '{"score": 1}'}}}
+ instance = module.StudyItem.from_dict(payload)
+ assert instance.measurement.data.response_json.score == 1
+
+
+def test_plot_service_outputs_html(study, tmp_path) -> None:
+ """Exercise participant, deployment, unified, and item-based plots."""
+
+ participant_path = study.plots.participant("alice@example.com", output_file=str(tmp_path / "alice.html"))
+ assert participant_path is not None
+ assert "leaflet" in (tmp_path / "alice.html").read_text(encoding="utf-8").lower()
+ unified_id = study.participant("alice@example.com").info()["unified_id"]
+ assert study.plots.unified(unified_id, output_file=str(tmp_path / "unified.html")) is not None
+ assert study.plots.deployment("deploy-email-a", output_file=str(tmp_path / "solo.html"), include_steps=False) is not None
+ location_items = []
+ assert study.plots.from_items(location_items, output_file=str(tmp_path / "none.html")) is None
+
+
+def test_plot_service_handles_missing_filters(study, monkeypatch, tmp_path) -> None:
+ """Exercise plot branches for missing participants and missing columns."""
+
+ assert study.plots.participant("missing@example.com") is None
+ monkeypatch.setattr("carp.plotting.service.candidate_series", lambda *_args, **_kwargs: None)
+ assert study.plots.deployment("deploy-email-a", output_file=str(tmp_path / "missing.html")) is None
diff --git a/uv.lock b/uv.lock
index f4509f9..78780d6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -7,6 +7,24 @@ resolution-markers = [
"python_full_version < '3.11'",
]
+[[package]]
+name = "alabaster"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/d9c74d0daf3f742840fd818d69cfae176fa332022fd44e3469487d5a9420/alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e", size = 24210, upload-time = "2024-07-26T18:15:03.762Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" },
+]
+
+[[package]]
+name = "babel"
+version = "2.18.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7d/b2/51899539b6ceeeb420d40ed3cd4b7a40519404f9baf3d4ac99dc413a834b/babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d", size = 9959554, upload-time = "2026-02-01T12:30:56.078Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845, upload-time = "2026-02-01T12:30:53.445Z" },
+]
+
[[package]]
name = "branca"
version = "0.8.2"
@@ -65,6 +83,21 @@ dev = [
{ name = "pytest" },
{ name = "pytest-cov" },
{ name = "ruff" },
+ { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" },
+ { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+ { name = "sphinx-rtd-theme" },
+]
+docs = [
+ { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" },
+ { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+]
+test = [
+ { name = "folium" },
+ { name = "matplotlib" },
+ { name = "pandas" },
+ { name = "pyarrow" },
]
[package.metadata]
@@ -91,6 +124,15 @@ dev = [
{ name = "pytest", specifier = ">=7.4.0" },
{ name = "pytest-cov", specifier = ">=4.1.0" },
{ name = "ruff", specifier = ">=0.1.0" },
+ { name = "sphinx", specifier = ">=8.1.3" },
+ { name = "sphinx-rtd-theme", specifier = ">=3.1.0" },
+]
+docs = [{ name = "sphinx", specifier = ">=8.0.0" }]
+test = [
+ { name = "folium", specifier = ">=0.14.0" },
+ { name = "matplotlib", specifier = ">=3.7.0" },
+ { name = "pandas", specifier = ">=2.0.0" },
+ { name = "pyarrow", specifier = ">=14.0.0" },
]
[[package]]
@@ -487,6 +529,31 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" },
]
+[[package]]
+name = "docutils"
+version = "0.21.2"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version < '3.11'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444, upload-time = "2024-04-23T18:57:18.24Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" },
+]
+
+[[package]]
+name = "docutils"
+version = "0.22.4"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.12'",
+ "python_full_version == '3.11.*'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ae/b6/03bb70946330e88ffec97aefd3ea75ba575cb2e762061e0e62a213befee8/docutils-0.22.4.tar.gz", hash = "sha256:4db53b1fde9abecbb74d91230d32ab626d94f6badfc575d6db9194a49df29968", size = 2291750, upload-time = "2025-12-18T19:00:26.443Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl", hash = "sha256:d0013f540772d1420576855455d050a2180186c91c15779301ac2ccb3eeb68de", size = 633196, upload-time = "2025-12-18T19:00:18.077Z" },
+]
+
[[package]]
name = "exceptiongroup"
version = "1.3.1"
@@ -691,6 +758,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/f2/53b6e9bdd2a91202066764eaa74b572ba4dede0fe47a5a26f4de34b7541a/ijson-3.4.0.post0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a0fedf09c0f6ffa2a99e7e7fd9c5f3caf74e655c1ee015a0797383e99382ebc3", size = 54657, upload-time = "2025-10-10T05:29:24.482Z" },
]
+[[package]]
+name = "imagesize"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6c/e6/7bf14eeb8f8b7251141944835abd42eb20a658d89084b7e1f3e5fe394090/imagesize-2.0.0.tar.gz", hash = "sha256:8e8358c4a05c304f1fccf7ff96f036e7243a189e9e42e90851993c558cfe9ee3", size = 1773045, upload-time = "2026-03-03T14:18:29.941Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/5f/53/fb7122b71361a0d121b669dcf3d31244ef75badbbb724af388948de543e2/imagesize-2.0.0-py2.py3-none-any.whl", hash = "sha256:5667c5bbb57ab3f1fa4bc366f4fbc971db3d5ed011fd2715fd8001f782718d96", size = 9441, upload-time = "2026-03-03T14:18:27.892Z" },
+]
+
[[package]]
name = "iniconfig"
version = "2.3.0"
@@ -1790,6 +1866,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/25/7a/b0178788f8dc6cafce37a212c99565fa1fe7872c70c6c9c1e1a372d9d88f/rich-14.2.0-py3-none-any.whl", hash = "sha256:76bc51fe2e57d2b1be1f96c524b890b816e334ab4c1e45888799bfaab0021edd", size = 243393, upload-time = "2025-10-09T14:16:51.245Z" },
]
+[[package]]
+name = "roman-numerals"
+version = "4.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ae/f9/41dc953bbeb056c17d5f7a519f50fdf010bd0553be2d630bc69d1e022703/roman_numerals-4.1.0.tar.gz", hash = "sha256:1af8b147eb1405d5839e78aeb93131690495fe9da5c91856cb33ad55a7f1e5b2", size = 9077, upload-time = "2025-12-17T18:25:34.381Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/04/54/6f679c435d28e0a568d8e8a7c0a93a09010818634c3c3907fc98d8983770/roman_numerals-4.1.0-py3-none-any.whl", hash = "sha256:647ba99caddc2cc1e55a51e4360689115551bf4476d90e8162cf8c345fe233c7", size = 7676, upload-time = "2025-12-17T18:25:33.098Z" },
+]
+
[[package]]
name = "ruff"
version = "0.14.7"
@@ -2005,6 +2090,193 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
]
+[[package]]
+name = "snowballstemmer"
+version = "3.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/75/a7/9810d872919697c9d01295633f5d574fb416d47e535f258272ca1f01f447/snowballstemmer-3.0.1.tar.gz", hash = "sha256:6d5eeeec8e9f84d4d56b847692bacf79bc2c8e90c7f80ca4444ff8b6f2e52895", size = 105575, upload-time = "2025-05-09T16:34:51.843Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c8/78/3565d011c61f5a43488987ee32b6f3f656e7f107ac2782dd57bdd7d91d9a/snowballstemmer-3.0.1-py3-none-any.whl", hash = "sha256:6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064", size = 103274, upload-time = "2025-05-09T16:34:50.371Z" },
+]
+
+[[package]]
+name = "sphinx"
+version = "8.1.3"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version < '3.11'",
+]
+dependencies = [
+ { name = "alabaster", marker = "python_full_version < '3.11'" },
+ { name = "babel", marker = "python_full_version < '3.11'" },
+ { name = "colorama", marker = "python_full_version < '3.11' and sys_platform == 'win32'" },
+ { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "imagesize", marker = "python_full_version < '3.11'" },
+ { name = "jinja2", marker = "python_full_version < '3.11'" },
+ { name = "packaging", marker = "python_full_version < '3.11'" },
+ { name = "pygments", marker = "python_full_version < '3.11'" },
+ { name = "requests", marker = "python_full_version < '3.11'" },
+ { name = "snowballstemmer", marker = "python_full_version < '3.11'" },
+ { name = "sphinxcontrib-applehelp", marker = "python_full_version < '3.11'" },
+ { name = "sphinxcontrib-devhelp", marker = "python_full_version < '3.11'" },
+ { name = "sphinxcontrib-htmlhelp", marker = "python_full_version < '3.11'" },
+ { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.11'" },
+ { name = "sphinxcontrib-qthelp", marker = "python_full_version < '3.11'" },
+ { name = "sphinxcontrib-serializinghtml", marker = "python_full_version < '3.11'" },
+ { name = "tomli", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611, upload-time = "2024-10-13T20:27:13.93Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/26/60/1ddff83a56d33aaf6f10ec8ce84b4c007d9368b21008876fceda7e7381ef/sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2", size = 3487125, upload-time = "2024-10-13T20:27:10.448Z" },
+]
+
+[[package]]
+name = "sphinx"
+version = "9.0.4"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version == '3.11.*'",
+]
+dependencies = [
+ { name = "alabaster", marker = "python_full_version == '3.11.*'" },
+ { name = "babel", marker = "python_full_version == '3.11.*'" },
+ { name = "colorama", marker = "python_full_version == '3.11.*' and sys_platform == 'win32'" },
+ { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" },
+ { name = "imagesize", marker = "python_full_version == '3.11.*'" },
+ { name = "jinja2", marker = "python_full_version == '3.11.*'" },
+ { name = "packaging", marker = "python_full_version == '3.11.*'" },
+ { name = "pygments", marker = "python_full_version == '3.11.*'" },
+ { name = "requests", marker = "python_full_version == '3.11.*'" },
+ { name = "roman-numerals", marker = "python_full_version == '3.11.*'" },
+ { name = "snowballstemmer", marker = "python_full_version == '3.11.*'" },
+ { name = "sphinxcontrib-applehelp", marker = "python_full_version == '3.11.*'" },
+ { name = "sphinxcontrib-devhelp", marker = "python_full_version == '3.11.*'" },
+ { name = "sphinxcontrib-htmlhelp", marker = "python_full_version == '3.11.*'" },
+ { name = "sphinxcontrib-jsmath", marker = "python_full_version == '3.11.*'" },
+ { name = "sphinxcontrib-qthelp", marker = "python_full_version == '3.11.*'" },
+ { name = "sphinxcontrib-serializinghtml", marker = "python_full_version == '3.11.*'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/50/a8c6ccc36d5eacdfd7913ddccd15a9cee03ecafc5ee2bc40e1f168d85022/sphinx-9.0.4.tar.gz", hash = "sha256:594ef59d042972abbc581d8baa577404abe4e6c3b04ef61bd7fc2acbd51f3fa3", size = 8710502, upload-time = "2025-12-04T07:45:27.343Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c6/3f/4bbd76424c393caead2e1eb89777f575dee5c8653e2d4b6afd7a564f5974/sphinx-9.0.4-py3-none-any.whl", hash = "sha256:5bebc595a5e943ea248b99c13814c1c5e10b3ece718976824ffa7959ff95fffb", size = 3917713, upload-time = "2025-12-04T07:45:24.944Z" },
+]
+
+[[package]]
+name = "sphinx"
+version = "9.1.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.12'",
+]
+dependencies = [
+ { name = "alabaster", marker = "python_full_version >= '3.12'" },
+ { name = "babel", marker = "python_full_version >= '3.12'" },
+ { name = "colorama", marker = "python_full_version >= '3.12' and sys_platform == 'win32'" },
+ { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+ { name = "imagesize", marker = "python_full_version >= '3.12'" },
+ { name = "jinja2", marker = "python_full_version >= '3.12'" },
+ { name = "packaging", marker = "python_full_version >= '3.12'" },
+ { name = "pygments", marker = "python_full_version >= '3.12'" },
+ { name = "requests", marker = "python_full_version >= '3.12'" },
+ { name = "roman-numerals", marker = "python_full_version >= '3.12'" },
+ { name = "snowballstemmer", marker = "python_full_version >= '3.12'" },
+ { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.12'" },
+ { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.12'" },
+ { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.12'" },
+ { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.12'" },
+ { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.12'" },
+ { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.12'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cd/bd/f08eb0f4eed5c83f1ba2a3bd18f7745a2b1525fad70660a1c00224ec468a/sphinx-9.1.0.tar.gz", hash = "sha256:7741722357dd75f8190766926071fed3bdc211c74dd2d7d4df5404da95930ddb", size = 8718324, upload-time = "2025-12-31T15:09:27.646Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/73/f7/b1884cb3188ab181fc81fa00c266699dab600f927a964df02ec3d5d1916a/sphinx-9.1.0-py3-none-any.whl", hash = "sha256:c84fdd4e782504495fe4f2c0b3413d6c2bf388589bb352d439b2a3bb99991978", size = 3921742, upload-time = "2025-12-31T15:09:25.561Z" },
+]
+
+[[package]]
+name = "sphinx-rtd-theme"
+version = "3.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+ { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" },
+ { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+ { name = "sphinxcontrib-jquery" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/84/68/a1bfbf38c0f7bccc9b10bbf76b94606f64acb1552ae394f0b8285bfaea25/sphinx_rtd_theme-3.1.0.tar.gz", hash = "sha256:b44276f2c276e909239a4f6c955aa667aaafeb78597923b1c60babc76db78e4c", size = 7620915, upload-time = "2026-01-12T16:03:31.17Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/87/c7/b5c8015d823bfda1a346adb2c634a2101d50bb75d421eb6dcb31acd25ebc/sphinx_rtd_theme-3.1.0-py2.py3-none-any.whl", hash = "sha256:1785824ae8e6632060490f67cf3a72d404a85d2d9fc26bce3619944de5682b89", size = 7655617, upload-time = "2026-01-12T16:03:28.101Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-applehelp"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053, upload-time = "2024-07-29T01:09:00.465Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300, upload-time = "2024-07-29T01:08:58.99Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-devhelp"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967, upload-time = "2024-07-29T01:09:23.417Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530, upload-time = "2024-07-29T01:09:21.945Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-htmlhelp"
+version = "2.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617, upload-time = "2024-07-29T01:09:37.889Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705, upload-time = "2024-07-29T01:09:36.407Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-jquery"
+version = "4.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" },
+ { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/de/f3/aa67467e051df70a6330fe7770894b3e4f09436dea6881ae0b4f3d87cad8/sphinxcontrib-jquery-4.1.tar.gz", hash = "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a", size = 122331, upload-time = "2023-03-14T15:01:01.944Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/76/85/749bd22d1a68db7291c89e2ebca53f4306c3f205853cf31e9de279034c3c/sphinxcontrib_jquery-4.1-py2.py3-none-any.whl", hash = "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae", size = 121104, upload-time = "2023-03-14T15:01:00.356Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-jsmath"
+version = "1.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787, upload-time = "2019-01-21T16:10:16.347Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071, upload-time = "2019-01-21T16:10:14.333Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-qthelp"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165, upload-time = "2024-07-29T01:09:56.435Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743, upload-time = "2024-07-29T01:09:54.885Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-serializinghtml"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080, upload-time = "2024-07-29T01:10:09.332Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" },
+]
+
[[package]]
name = "threadpoolctl"
version = "3.6.0"