diff --git a/.claude/hooks/session-start.sh b/.claude/hooks/session-start.sh
new file mode 100755
index 0000000..d96ed93
--- /dev/null
+++ b/.claude/hooks/session-start.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -euo pipefail
+
+# Only run in remote (Claude Code on the web) environments
+if [ "${CLAUDE_CODE_REMOTE:-}" != "true" ]; then
+ exit 0
+fi
+
+# Install system dependencies required by pygraphviz
+if ! dpkg -s libgraphviz-dev >/dev/null 2>&1; then
+ sudo apt-get update -qq
+ sudo apt-get install -y -qq graphviz libgraphviz-dev >/dev/null 2>&1
+fi
+
+# Install Python dependencies using uv
+cd "$CLAUDE_PROJECT_DIR"
+uv sync --group dev
diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 0000000..e06b033
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,14 @@
+{
+ "hooks": {
+ "SessionStart": [
+ {
+ "hooks": [
+ {
+ "type": "command",
+ "command": "$CLAUDE_PROJECT_DIR/.claude/hooks/session-start.sh"
+ }
+ ]
+ }
+ ]
+ }
+}
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..bbb1df8
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,39 @@
+name: Deploy docs
+
+on:
+ push:
+ branches: [main]
+ workflow_dispatch:
+
+permissions:
+ contents: read
+ pages: write
+ id-token: write
+
+concurrency:
+ group: "pages"
+ cancel-in-progress: false
+
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+ environment:
+ name: github-pages
+ url: ${{ steps.deployment.outputs.page_url }}
+ steps:
+ - uses: actions/checkout@v4
+
+ - uses: astral-sh/setup-uv@v4
+
+ - name: Install docs dependencies
+ run: uv sync --group docs
+
+ - name: Build documentation
+ run: uv run mkdocs build
+
+ - uses: actions/upload-pages-artifact@v3
+ with:
+ path: site/
+
+ - id: deployment
+ uses: actions/deploy-pages@v4
diff --git a/.gitignore b/.gitignore
index f49339f..1bbd3f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -221,3 +221,6 @@ dj_*_conf.json
# pixi environments
.pixi/*
!.pixi/config.toml
+
+# Nested repo clone (pre-existing artifact)
+orcapod-python/
diff --git a/.zed/rules b/.zed/rules
index c1acd66..d3c9941 100644
--- a/.zed/rules
+++ b/.zed/rules
@@ -1,3 +1,8 @@
+## Naming convention
+
+Always write "orcapod" with a lowercase p — never "OrcaPod" or "Orcapod". This applies
+everywhere: documentation, docstrings, code comments, commit messages, and user-facing text.
+
## Running commands
Always run Python commands via `uv run`, e.g.:
@@ -7,6 +12,14 @@ Always run Python commands via `uv run`, e.g.:
Never use `python`, `pytest`, or `python3` directly.
+## Branch hygiene
+
+Periodically check the target branch (typically dev) for updates and incorporate them into
+your working branch. Before pushing, fetch and rebase onto the latest target branch to avoid
+divergence and merge conflicts. If cherry-picking is needed due to unrelated commit history,
+prefer cherry-picking your commits onto a fresh branch from the target rather than resolving
+massive rebase conflicts.
+
## Updating agent instructions
When adding or changing any instruction, update BOTH:
@@ -57,37 +70,49 @@ Examples:
## Project layout
src/orcapod/
- types.py — Schema, ColumnConfig, ContentHash
+ types.py — Schema, ColumnConfig, ContentHash, PipelineConfig,
+ NodeConfig, ExecutorType, CacheMode
system_constants.py — Column prefixes and separators
errors.py — InputValidationError, DuplicateTagError, FieldNotResolvableError
config.py — Config dataclass
+ channels.py — Async channel primitives (Channel, BroadcastChannel,
+ ReadableChannel, WritableChannel, ChannelClosed)
contexts/ — DataContext (semantic_hasher, arrow_hasher, type_converter)
protocols/
hashing_protocols.py — PipelineElementProtocol, ContentIdentifiableProtocol
+ database_protocols.py — ArrowDatabaseProtocol
+ pipeline_protocols.py — Pipeline-level protocols
+ semantic_types_protocols.py — TypeConverterProtocol
core_protocols/ — StreamProtocol, PodProtocol, SourceProtocol,
PacketFunctionProtocol, DatagramProtocol, TagProtocol,
- PacketProtocol, TrackerProtocol
+ PacketProtocol, TrackerProtocol, AsyncExecutableProtocol,
+ PacketFunctionExecutorProtocol, OperatorPodProtocol,
+ LabelableProtocol, TemporalProtocol, TraceableProtocol
core/
- base.py — ContentIdentifiableBase, PipelineElementBase, TraceableBase
- static_output_pod.py — StaticOutputPod (operator base), DynamicPodStream
- function_pod.py — FunctionPod, FunctionPodStream, FunctionNode
+ base.py — LabelableMixin, DataContextMixin, TraceableBase
+ function_pod.py — FunctionPod, FunctionPodStream, @function_pod decorator
packet_function.py — PacketFunctionBase, PythonPacketFunction, CachedPacketFunction
- operator_node.py — OperatorNode (DB-backed operator execution)
- tracker.py — Invocation tracking
+ tracker.py — BasicTrackerManager, GraphTracker
datagrams/
datagram.py — Datagram (unified dict/Arrow backing, lazy conversion)
tag_packet.py — Tag (+ system tags), Packet (+ source info)
sources/
base.py — RootSource (abstract, no upstream)
arrow_table_source.py — Core source — all other sources delegate to it
- derived_source.py — DerivedSource (backed by FunctionNode/OperatorNode DB)
+ persistent_source.py — PersistentSource (DB-backed caching wrapper)
+ derived_source.py — DerivedSource (backed by node DB records)
csv_source.py, dict_source.py, list_source.py,
data_frame_source.py, delta_table_source.py — Delegating wrappers
source_registry.py — SourceRegistry for provenance resolution
streams/
base.py — StreamBase (abstract)
arrow_table_stream.py — ArrowTableStream (concrete, immutable)
+ nodes/
+ function_node.py — FunctionNode, PersistentFunctionNode
+ operator_node.py — OperatorNode, PersistentOperatorNode
+ source_node.py — SourceNode (leaf stream in graph)
operators/
+ static_output_pod.py — StaticOutputOperatorPod, DynamicPodStream
base.py — UnaryOperator, BinaryOperator, NonZeroInputOperator
join.py — Join (N-ary inner join, commutative)
merge_join.py — MergeJoin (binary, colliding cols → sorted list[T])
@@ -96,27 +121,61 @@ src/orcapod/
column_selection.py — Select/Drop Tag/Packet columns
mappers.py — MapTags, MapPackets (rename columns)
filters.py — PolarsFilter
+ executors/
+ base.py — PacketFunctionExecutorBase (ABC)
+ local.py — LocalExecutor (default in-process)
+ ray.py — RayExecutor (dispatch to Ray cluster)
+ pipeline/
+ graph.py — Pipeline (extends GraphTracker, compiles to persistent nodes)
+ nodes.py — PersistentSourceNode (DB-backed leaf wrapper)
+ orchestrator.py — AsyncPipelineOrchestrator (channel-based concurrent execution)
hashing/
- semantic_hashing/ — BaseSemanticHasher, type handlers
- semantic_types/ — Type conversion (Python ↔ Arrow)
- databases/ — ArrowDatabaseProtocol implementations (Delta Lake, in-memory)
+ file_hashers.py — BasicFileHasher, CachedFileHasher
+ arrow_hashers.py — Arrow-specific hashing
+ arrow_serialization.py — Arrow serialization utilities
+ arrow_utils.py — Arrow manipulation for hashing
+ defaults.py — Factory functions for default hashers
+ hash_utils.py — hash_file(), get_function_components()
+ string_cachers.py — String caching strategies
+ versioned_hashers.py — Versioned hasher support
+ visitors.py — Visitor pattern for hashing
+ semantic_hashing/ — BaseSemanticHasher, type handlers, TypeHandlerRegistry
+ semantic_types/ — Type conversion (Python ↔ Arrow), UniversalTypeConverter,
+ SemanticTypeRegistry, type inference
+ databases/
+ delta_lake_databases.py — DeltaTableDatabase
+ in_memory_databases.py — InMemoryArrowDatabase
+ noop_database.py — NoOpArrowDatabase
+ file_utils.py — File utilities for database operations
+ execution_engines/
+ ray_execution_engine.py — RayEngine (execution on Ray clusters)
utils/
arrow_data_utils.py — System tag manipulation, source info, column helpers
arrow_utils.py — Arrow table utilities
schema_utils.py — Schema extraction, union, intersection, compatibility
lazy_module.py — LazyModule for deferred heavy imports
+ function_info.py — Function introspection utilities
+ git_utils.py — Git metadata extraction
+ name.py — Name utilities
+ object_spec.py — Object specification/serialization
+ polars_data_utils.py — Polars-specific utilities
tests/
test_core/
datagrams/ — Lazy conversion, dict/Arrow round-trip
- sources/ — Source construction, protocol conformance, DerivedSource
- streams/ — ArrowTableStream behavior
- function_pod/ — FunctionPod, FunctionNode, pipeline hash integration
+ sources/ — Source construction, protocol conformance, DerivedSource,
+ PersistentSource
+ streams/ — ArrowTableStream behavior, convenience methods
+ function_pod/ — FunctionPod, FunctionNode, pipeline hash integration,
+ @function_pod decorator
operators/ — All operators, OperatorNode, MergeJoin
- packet_function/ — PacketFunction, CachedPacketFunction
- test_hashing/ — Semantic hasher, hash stability
+ packet_function/ — PacketFunction, CachedPacketFunction, executor
+ test_channels/ — Async channels, async_execute for operators/nodes/pods,
+ native async operators, pipeline integration
+ test_pipeline/ — Pipeline compilation, AsyncPipelineOrchestrator
+ test_hashing/ — Semantic hasher, hash stability, file hashers, string cachers
test_databases/ — Delta Lake, in-memory, no-op databases
- test_semantic_types/ — Type converter tests
+ test_semantic_types/ — Type converter, semantic registry, struct converters
---
@@ -126,8 +185,12 @@ See orcapod-design.md at the project root for the full design specification.
### Core data flow
+Pull-based (synchronous):
RootSource → ArrowTableStream → [Operator / FunctionPod] → ArrowTableStream → ...
+Push-based (async pipeline):
+ Pipeline.compile() → AsyncPipelineOrchestrator.run() → channels → persistent nodes → DB
+
Every stream is an immutable sequence of (Tag, Packet) pairs backed by a PyArrow Table.
Tag columns are join keys and metadata; packet columns are the data payload.
@@ -143,22 +206,53 @@ Key methods: output_schema(), keys(), iter_packets(), as_table().
Source (core/sources/) — produces a stream from external data. ArrowTableSource is the core
implementation; CSV/Delta/DataFrame/Dict/List sources all delegate to it internally. Each
-source adds source-info columns and a system tag column. DerivedSource wraps a
-FunctionNode/OperatorNode's DB records as a new source.
+source adds source-info columns and a system tag column. DerivedSource wraps a node's DB
+records as a new source. PersistentSource wraps any RootSource with DB-backed caching
+(deduped by per-row content hash).
Function Pod (core/function_pod.py) — wraps a PacketFunction that transforms individual
-packets. Never inspects tags. Two execution models:
-- FunctionPod → FunctionPodStream: lazy, in-memory
-- FunctionNode: DB-backed, two-phase (yield cached results first, then compute missing)
+packets. Never inspects tags. Supports async functions via PythonPacketFunction. The
+@function_pod decorator creates FunctionPod instances directly from Python functions.
+
+Node (core/nodes/) — graph-aware wrappers that participate in the computation DAG:
+- SourceNode — leaf stream in the graph (wraps a StreamProtocol)
+- FunctionNode / PersistentFunctionNode — packet function invocations (persistent variant
+ is DB-backed with two-phase execution: yield cached, then compute missing)
+- OperatorNode / PersistentOperatorNode — operator invocations (persistent variant
+ is DB-backed with deduplication)
Operator (core/operators/) — structural pod transforming streams without synthesizing new
-packet values. All subclass StaticOutputPod:
+packet values. All subclass StaticOutputOperatorPod. Each operator also implements
+AsyncExecutableProtocol for push-based channel execution:
- UnaryOperator — 1 input (Batch, Select/Drop columns, Map, Filter)
- BinaryOperator — 2 inputs (MergeJoin, SemiJoin)
- NonZeroInputOperator — 1+ inputs (Join)
-OperatorNode (core/operator_node.py) — DB-backed operator execution, analogous to
-FunctionNode.
+Executor (core/executors/) — pluggable execution backends for packet functions:
+- LocalExecutor — default in-process execution
+- RayExecutor — dispatches to a Ray cluster
+
+Channel (channels.py) — async primitives for push-based pipeline execution:
+- Channel[T] — bounded async channel with backpressure and close signaling
+- BroadcastChannel[T] — fan-out channel for multiple consumers
+- ReadableChannel[T] / WritableChannel[T] — consumer/producer protocols
+
+Pipeline (pipeline/) — persistent, async-capable pipeline infrastructure:
+- Pipeline — extends GraphTracker; records invocations during a with block, then compile()
+ replaces every node with its persistent variant (leaf streams → PersistentSourceNode,
+ function nodes → PersistentFunctionNode, operator nodes → PersistentOperatorNode)
+- AsyncPipelineOrchestrator — executes a compiled pipeline using channels; walks the
+ persistent node graph in topological order, creates bounded channels between nodes,
+ launches all nodes concurrently via asyncio.TaskGroup
+
+### Async execution model
+
+All pipeline nodes implement AsyncExecutableProtocol:
+ async def async_execute(inputs, output) → None
+
+The orchestrator wires channels between nodes and launches tasks without knowing node types.
+PipelineConfig controls buffer sizes (channel_buffer_size) and concurrency limits
+(default_max_concurrency). Per-node overrides are set via NodeConfig.
### Strict operator / function pod boundary
@@ -199,18 +293,28 @@ Prefixes are computed from SystemConstant in system_constants.py.
- LazyModule("pyarrow") — deferred import for heavy deps. Used in
if TYPE_CHECKING: / else: blocks.
- Argument symmetry — operators return frozenset (commutative) or tuple (ordered).
-- StaticOutputPod.process() → DynamicPodStream — wraps static_process() with staleness
- detection and automatic recomputation.
+- StaticOutputOperatorPod.process() → DynamicPodStream — wraps static_process() with
+ staleness detection and automatic recomputation.
- Source delegation — CSVSource, DictSource, etc. create an internal ArrowTableSource.
+- Pipeline context manager — records non-persistent nodes during with block, then compile()
+ promotes them to persistent variants with DB backing.
+- AsyncExecutableProtocol — unified interface for all pipeline nodes. The orchestrator
+ wires channels and launches tasks without knowing node types.
+- GraphTracker — tracks operator/function pod invocations in a NetworkX DAG; Pipeline
+ extends it to add compilation and persistence.
### Important implementation details
- ArrowTableSource raises ValueError if any tag_columns are not in the table.
- ArrowTableStream requires at least one packet column; raises ValueError otherwise.
-- FunctionNode Phase 1 returns ALL records in the shared pipeline_path DB table.
+- PersistentFunctionNode Phase 1 returns ALL records in the shared pipeline_path DB table.
Phase 2 skips inputs whose hash is already in the DB.
- Empty data → ArrowTableSource raises ValueError("Table is empty").
- DerivedSource before run() → raises ValueError (no computed records).
- Join requires non-overlapping packet columns; raises InputValidationError on collision.
- MergeJoin requires colliding columns to have identical types; merges into sorted list[T].
- Operators predict output schema (including system tag names) without computation.
+- CachedFileHasher uses mtime+size cache busting to detect file changes without re-hashing.
+- PersistentSource cache is always on; returns the union of all cached data across runs.
+- AsyncPipelineOrchestrator uses BroadcastChannel for fan-out (one node feeding multiple
+ downstream consumers).
diff --git a/CLAUDE.md b/CLAUDE.md
index 79612da..1f2b78c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,5 +1,10 @@
# Claude Code instructions for orcapod-python
+## Naming convention
+
+Always write "orcapod" with a **lowercase p** — never "OrcaPod" or "Orcapod". This applies
+everywhere: documentation, docstrings, code comments, commit messages, and user-facing text.
+
## Running commands
Always run Python commands via `uv run`, e.g.:
@@ -11,6 +16,14 @@ uv run python -c "..."
Never use `python`, `pytest`, or `python3` directly.
+## Branch hygiene
+
+Periodically check the target branch (typically `dev`) for updates and incorporate them into
+your working branch. Before pushing, fetch and rebase onto the latest target branch to avoid
+divergence and merge conflicts. If cherry-picking is needed due to unrelated commit history,
+prefer cherry-picking your commits onto a fresh branch from the target rather than resolving
+massive rebase conflicts.
+
## Updating agent instructions
When adding or changing any instruction, update BOTH:
@@ -64,66 +77,112 @@ Examples:
```
src/orcapod/
-├── types.py # Schema, ColumnConfig, ContentHash
+├── types.py # Schema, ColumnConfig, ContentHash, PipelineConfig,
+│ # NodeConfig, ExecutorType, CacheMode
├── system_constants.py # Column prefixes and separators
├── errors.py # InputValidationError, DuplicateTagError, FieldNotResolvableError
├── config.py # Config dataclass
+├── channels.py # Async channel primitives (Channel, BroadcastChannel,
+│ # ReadableChannel, WritableChannel, ChannelClosed)
├── contexts/ # DataContext (semantic_hasher, arrow_hasher, type_converter)
├── protocols/
│ ├── hashing_protocols.py # PipelineElementProtocol, ContentIdentifiableProtocol
+│ ├── database_protocols.py # ArrowDatabaseProtocol
+│ ├── pipeline_protocols.py # Pipeline-level protocols
+│ ├── semantic_types_protocols.py # TypeConverterProtocol
│ └── core_protocols/ # StreamProtocol, PodProtocol, SourceProtocol,
│ # PacketFunctionProtocol, DatagramProtocol, TagProtocol,
-│ # PacketProtocol, TrackerProtocol
+│ # PacketProtocol, TrackerProtocol, AsyncExecutableProtocol,
+│ # PacketFunctionExecutorProtocol, OperatorPodProtocol,
+│ # LabelableProtocol, TemporalProtocol, TraceableProtocol
├── core/
-│ ├── base.py # ContentIdentifiableBase, PipelineElementBase, TraceableBase
-│ ├── static_output_pod.py # StaticOutputPod (operator base), DynamicPodStream
-│ ├── function_pod.py # FunctionPod, FunctionPodStream, FunctionNode
+│ ├── base.py # LabelableMixin, DataContextMixin, TraceableBase
+│ ├── function_pod.py # FunctionPod, FunctionPodStream, @function_pod decorator
│ ├── packet_function.py # PacketFunctionBase, PythonPacketFunction, CachedPacketFunction
-│ ├── operator_node.py # OperatorNode (DB-backed operator execution)
-│ ├── tracker.py # Invocation tracking
+│ ├── tracker.py # BasicTrackerManager, GraphTracker
│ ├── datagrams/
│ │ ├── datagram.py # Datagram (unified dict/Arrow backing, lazy conversion)
│ │ └── tag_packet.py # Tag (+ system tags), Packet (+ source info)
│ ├── sources/
│ │ ├── base.py # RootSource (abstract, no upstream)
│ │ ├── arrow_table_source.py # Core source — all other sources delegate to it
-│ │ ├── derived_source.py # DerivedSource (backed by FunctionNode/OperatorNode DB)
+│ │ ├── persistent_source.py # PersistentSource (DB-backed caching wrapper)
+│ │ ├── derived_source.py # DerivedSource (backed by node DB records)
│ │ ├── csv_source.py, dict_source.py, list_source.py,
│ │ │ data_frame_source.py, delta_table_source.py # Delegating wrappers
│ │ └── source_registry.py # SourceRegistry for provenance resolution
│ ├── streams/
│ │ ├── base.py # StreamBase (abstract)
│ │ └── arrow_table_stream.py # ArrowTableStream (concrete, immutable)
-│ └── operators/
-│ ├── base.py # UnaryOperator, BinaryOperator, NonZeroInputOperator
-│ ├── join.py # Join (N-ary inner join, commutative)
-│ ├── merge_join.py # MergeJoin (binary, colliding cols → sorted list[T])
-│ ├── semijoin.py # SemiJoin (binary, non-commutative)
-│ ├── batch.py # Batch (group rows, types become list[T])
-│ ├── column_selection.py # Select/Drop Tag/Packet columns
-│ ├── mappers.py # MapTags, MapPackets (rename columns)
-│ └── filters.py # PolarsFilter
+│ ├── nodes/
+│ │ ├── function_node.py # FunctionNode, PersistentFunctionNode
+│ │ ├── operator_node.py # OperatorNode, PersistentOperatorNode
+│ │ └── source_node.py # SourceNode (leaf stream in graph)
+│ ├── operators/
+│ │ ├── static_output_pod.py # StaticOutputOperatorPod, DynamicPodStream
+│ │ ├── base.py # UnaryOperator, BinaryOperator, NonZeroInputOperator
+│ │ ├── join.py # Join (N-ary inner join, commutative)
+│ │ ├── merge_join.py # MergeJoin (binary, colliding cols → sorted list[T])
+│ │ ├── semijoin.py # SemiJoin (binary, non-commutative)
+│ │ ├── batch.py # Batch (group rows, types become list[T])
+│ │ ├── column_selection.py # Select/Drop Tag/Packet columns
+│ │ ├── mappers.py # MapTags, MapPackets (rename columns)
+│ │ └── filters.py # PolarsFilter
+│ └── executors/
+│ ├── base.py # PacketFunctionExecutorBase (ABC)
+│ ├── local.py # LocalExecutor (default in-process)
+│ └── ray.py # RayExecutor (dispatch to Ray cluster)
+├── pipeline/
+│ ├── graph.py # Pipeline (extends GraphTracker, compiles to persistent nodes)
+│ ├── nodes.py # PersistentSourceNode (DB-backed leaf wrapper)
+│ └── orchestrator.py # AsyncPipelineOrchestrator (channel-based concurrent execution)
├── hashing/
-│ └── semantic_hashing/ # BaseSemanticHasher, type handlers
-├── semantic_types/ # Type conversion (Python ↔ Arrow)
-├── databases/ # ArrowDatabaseProtocol implementations (Delta Lake, in-memory)
+│ ├── file_hashers.py # BasicFileHasher, CachedFileHasher
+│ ├── arrow_hashers.py # Arrow-specific hashing
+│ ├── arrow_serialization.py # Arrow serialization utilities
+│ ├── arrow_utils.py # Arrow manipulation for hashing
+│ ├── defaults.py # Factory functions for default hashers
+│ ├── hash_utils.py # hash_file(), get_function_components()
+│ ├── string_cachers.py # String caching strategies
+│ ├── versioned_hashers.py # Versioned hasher support
+│ ├── visitors.py # Visitor pattern for hashing
+│ └── semantic_hashing/ # BaseSemanticHasher, type handlers, TypeHandlerRegistry
+├── semantic_types/ # Type conversion (Python ↔ Arrow), UniversalTypeConverter,
+│ # SemanticTypeRegistry, type inference
+├── databases/ # ArrowDatabaseProtocol implementations
+│ ├── delta_lake_databases.py # DeltaTableDatabase
+│ ├── in_memory_databases.py # InMemoryArrowDatabase
+│ ├── noop_database.py # NoOpArrowDatabase
+│ └── file_utils.py # File utilities for database operations
+├── execution_engines/
+│ └── ray_execution_engine.py # RayEngine (execution on Ray clusters)
└── utils/
├── arrow_data_utils.py # System tag manipulation, source info, column helpers
├── arrow_utils.py # Arrow table utilities
├── schema_utils.py # Schema extraction, union, intersection, compatibility
- └── lazy_module.py # LazyModule for deferred heavy imports
+ ├── lazy_module.py # LazyModule for deferred heavy imports
+ ├── function_info.py # Function introspection utilities
+ ├── git_utils.py # Git metadata extraction
+ ├── name.py # Name utilities
+ ├── object_spec.py # Object specification/serialization
+ └── polars_data_utils.py # Polars-specific utilities
tests/
├── test_core/
│ ├── datagrams/ # Lazy conversion, dict/Arrow round-trip
-│ ├── sources/ # Source construction, protocol conformance, DerivedSource
-│ ├── streams/ # ArrowTableStream behavior
-│ ├── function_pod/ # FunctionPod, FunctionNode, pipeline hash integration
+│ ├── sources/ # Source construction, protocol conformance, DerivedSource,
+│ │ # PersistentSource
+│ ├── streams/ # ArrowTableStream behavior, convenience methods
+│ ├── function_pod/ # FunctionPod, FunctionNode, pipeline hash integration,
+│ │ # @function_pod decorator
│ ├── operators/ # All operators, OperatorNode, MergeJoin
-│ └── packet_function/ # PacketFunction, CachedPacketFunction
-├── test_hashing/ # Semantic hasher, hash stability
+│ └── packet_function/ # PacketFunction, CachedPacketFunction, executor
+├── test_channels/ # Async channels, async_execute for operators/nodes/pods,
+│ # native async operators, pipeline integration
+├── test_pipeline/ # Pipeline compilation, AsyncPipelineOrchestrator
+├── test_hashing/ # Semantic hasher, hash stability, file hashers, string cachers
├── test_databases/ # Delta Lake, in-memory, no-op databases
-└── test_semantic_types/ # Type converter tests
+└── test_semantic_types/ # Type converter, semantic registry, struct converters
```
---
@@ -134,10 +193,16 @@ See `orcapod-design.md` at the project root for the full design specification.
### Core data flow
+**Pull-based (synchronous):**
```
RootSource → ArrowTableStream → [Operator / FunctionPod] → ArrowTableStream → ...
```
+**Push-based (async pipeline):**
+```
+Pipeline.compile() → AsyncPipelineOrchestrator.run() → channels → persistent nodes → DB
+```
+
Every stream is an immutable sequence of (Tag, Packet) pairs backed by a PyArrow Table.
Tag columns are join keys and metadata; packet columns are the data payload.
@@ -153,22 +218,59 @@ Key methods: `output_schema()`, `keys()`, `iter_packets()`, `as_table()`.
**Source** (`core/sources/`) — produces a stream from external data. `ArrowTableSource` is the
core implementation; CSV/Delta/DataFrame/Dict/List sources all delegate to it internally. Each
-source adds source-info columns and a system tag column. `DerivedSource` wraps a
-FunctionNode/OperatorNode's DB records as a new source.
+source adds source-info columns and a system tag column. `DerivedSource` wraps a node's DB
+records as a new source. `PersistentSource` wraps any `RootSource` with DB-backed caching
+(deduped by per-row content hash).
**Function Pod** (`core/function_pod.py`) — wraps a `PacketFunction` that transforms individual
-packets. Never inspects tags. Two execution models:
-- `FunctionPod` → `FunctionPodStream`: lazy, in-memory
-- `FunctionNode`: DB-backed, two-phase (yield cached results first, then compute missing)
+packets. Never inspects tags. Supports async functions via `PythonPacketFunction`. The
+`@function_pod` decorator creates `FunctionPod` instances directly from Python functions.
+
+**Node** (`core/nodes/`) — graph-aware wrappers that participate in the computation DAG:
+- `SourceNode` — leaf stream in the graph (wraps a `StreamProtocol`)
+- `FunctionNode` / `PersistentFunctionNode` — packet function invocations (persistent variant
+ is DB-backed with two-phase execution: yield cached, then compute missing)
+- `OperatorNode` / `PersistentOperatorNode` — operator invocations (persistent variant
+ is DB-backed with deduplication)
**Operator** (`core/operators/`) — structural pod transforming streams without synthesizing new
-packet values. All subclass `StaticOutputPod`:
+packet values. All subclass `StaticOutputOperatorPod`. Each operator also implements
+`AsyncExecutableProtocol` for push-based channel execution:
- `UnaryOperator` — 1 input (Batch, Select/Drop columns, Map, Filter)
- `BinaryOperator` — 2 inputs (MergeJoin, SemiJoin)
- `NonZeroInputOperator` — 1+ inputs (Join)
-**OperatorNode** (`core/operator_node.py`) — DB-backed operator execution, analogous to
-FunctionNode.
+**Executor** (`core/executors/`) — pluggable execution backends for packet functions:
+- `LocalExecutor` — default in-process execution
+- `RayExecutor` — dispatches to a Ray cluster
+
+**Channel** (`channels.py`) — async primitives for push-based pipeline execution:
+- `Channel[T]` — bounded async channel with backpressure and close signaling
+- `BroadcastChannel[T]` — fan-out channel for multiple consumers
+- `ReadableChannel[T]` / `WritableChannel[T]` — consumer/producer protocols
+
+**Pipeline** (`pipeline/`) — persistent, async-capable pipeline infrastructure:
+- `Pipeline` — extends `GraphTracker`; records operator/function pod invocations during a
+ `with` block, then `compile()` replaces every node with its persistent variant
+ (leaf streams → `PersistentSourceNode`, function nodes → `PersistentFunctionNode`,
+ operator nodes → `PersistentOperatorNode`)
+- `AsyncPipelineOrchestrator` — executes a compiled pipeline using channels; walks the
+ persistent node graph in topological order, creates bounded channels between nodes,
+ launches all nodes concurrently via `asyncio.TaskGroup`
+
+### Async execution model
+
+All pipeline nodes implement `AsyncExecutableProtocol`:
+```python
+async def async_execute(
+ inputs: Sequence[ReadableChannel[tuple[TagProtocol, PacketProtocol]]],
+ output: WritableChannel[tuple[TagProtocol, PacketProtocol]],
+) -> None
+```
+
+The orchestrator wires channels between nodes and launches tasks without knowing node types.
+`PipelineConfig` controls buffer sizes (`channel_buffer_size`) and concurrency limits
+(`default_max_concurrency`). Per-node overrides are set via `NodeConfig`.
### Strict operator / function pod boundary
@@ -233,18 +335,24 @@ and `as_table()` methods. `all_info=True` sets everything to True.
`if TYPE_CHECKING:` / `else:` blocks at module level.
- **Argument symmetry** — each operator declares `argument_symmetry(streams)` returning
`frozenset` (commutative) or `tuple` (ordered). Determines how upstream hashes combine.
-- **`StaticOutputPod.process()` → `DynamicPodStream`** — wraps `static_process()` output
- with timestamp-based staleness detection and automatic recomputation.
+- **`StaticOutputOperatorPod.process()` → `DynamicPodStream`** — wraps `static_process()`
+ output with timestamp-based staleness detection and automatic recomputation.
- **Source delegation** — CSVSource, DictSource, etc. all create an internal
`ArrowTableSource` and delegate every method to it.
+- **`Pipeline` context manager** — records non-persistent nodes during `with` block, then
+ `compile()` promotes them to persistent variants with DB backing.
+- **`AsyncExecutableProtocol`** — unified interface for all pipeline nodes. The orchestrator
+ wires channels and launches tasks without knowing node types.
+- **`GraphTracker`** — tracks operator/function pod invocations in a NetworkX DAG; `Pipeline`
+ extends it to add compilation and persistence.
### Important implementation details
- `ArrowTableSource.__init__` raises `ValueError` if any `tag_columns` are not in the table.
- `ArrowTableStream` requires at least one packet column; raises `ValueError` otherwise.
-- `FunctionNode.iter_packets()` Phase 1 returns ALL records in the shared `pipeline_path`
- DB table (not filtered to current inputs). Phase 2 skips inputs whose hash is already
- in the DB.
+- `PersistentFunctionNode.iter_packets()` Phase 1 returns ALL records in the shared
+ `pipeline_path` DB table (not filtered to current inputs). Phase 2 skips inputs whose hash
+ is already in the DB.
- Empty data → `ArrowTableSource` raises `ValueError("Table is empty")`.
- `DerivedSource` before `run()` → raises `ValueError` (no computed records).
- Join requires non-overlapping packet columns; raises `InputValidationError` on collision.
@@ -252,3 +360,7 @@ and `as_table()` methods. `all_info=True` sets everything to True.
`list[T]` with source columns reordered to match.
- Operators predict their output schema (including system tag column names) without
performing the actual computation.
+- `CachedFileHasher` uses mtime+size cache busting to detect file changes without re-hashing.
+- `PersistentSource` cache is always on; returns the union of all cached data across runs.
+- `AsyncPipelineOrchestrator` uses `BroadcastChannel` for fan-out (one node feeding multiple
+ downstream consumers).
diff --git a/README.md b/README.md
index 9641750..948a142 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,115 @@
-# Orcapod Python
-Orcapod's Python library for developing reproducbile scientific pipelines.
+# orcapod
-## Continuous Integration
+[![Python](https://img.shields.io/pypi/pyversions/orcapod)](https://www.python.org/downloads/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-This project uses GitHub Actions for continuous integration:
+**Intuitive and powerful library for highly reproducible scientific data pipelines.**
-- **Run Tests**: A workflow that runs tests on Ubuntu with multiple Python versions.
+orcapod is a Python framework for building data pipelines with built-in provenance tracking, content-addressable caching, and deterministic computation. Every value produced by an orcapod pipeline is traceable back to its original source, every computation is memoizable, and every result is verifiable.
-### Running Tests Locally
+## Key Features
-To run tests locally:
+- **Full Provenance Tracking** — Every value carries metadata tracing it back to its originating source and record.
+- **Content-Addressable Caching** — Identical computations are never repeated. Results are automatically shared across compatible pipeline runs.
+- **Immutable Data Flow** — Streams are immutable (Tag, Packet) sequences backed by Apache Arrow tables.
+- **Strict Operator / Function Pod Boundary** — Operators transform structure without inspecting data. Function pods transform data without inspecting tags.
+- **Schema as a First-Class Citizen** — Every stream is self-describing with schemas predicted at construction time.
+- **Incremental Computation** — Database-backed nodes compute only what's missing.
+- **Pluggable Execution** — Synchronous, async channels, or distributed via Ray — results are identical.
+
+## Quick Example
+
+```python
+import pyarrow as pa
+from orcapod import ArrowTableSource, FunctionPod
+from orcapod.core.packet_function import PythonPacketFunction
+from orcapod.core.operators import Join
+
+# Create sources with tag (join key) and packet (data) columns
+patients = ArrowTableSource(
+ pa.table({"patient_id": ["p1", "p2", "p3"], "age": [30, 45, 60]}),
+ tag_columns=["patient_id"],
+)
+labs = ArrowTableSource(
+ pa.table({"patient_id": ["p1", "p2", "p3"], "cholesterol": [180, 220, 260]}),
+ tag_columns=["patient_id"],
+)
+
+# Join on shared tag columns
+joined = Join()(patients, labs)
+
+# Apply a computation to each packet
+def risk_score(age: int, cholesterol: int) -> float:
+ return age * 0.5 + cholesterol * 0.3
+
+risk_fn = PythonPacketFunction(risk_score, output_keys="risk")
+result = FunctionPod(packet_function=risk_fn)(joined)
+
+# Iterate results
+for tag, packet in result.iter_packets():
+ print(f"{tag.as_dict()} → {packet.as_dict()}")
+```
+
+## Installation
```bash
-# Install the package with test dependencies
-pip install -e ".[test]"
+# From source with uv (recommended)
+git clone https://github.com/walkerlab/orcapod-python.git
+cd orcapod-python
+uv sync
-# Run tests with coverage
-pytest -v --cov=src --cov-report=term-missing
+# Or with pip
+pip install -e .
+```
+
+### Optional Dependencies
+
+```bash
+pip install orcapod[ray] # Distributed execution via Ray
+pip install orcapod[redis] # Redis-backed caching
+pip install orcapod[all] # Everything
```
-### Development Setup
+## Documentation
-For development, you can install all optional dependencies:
+Full documentation is available at the [orcapod docs site](https://walkerlab.github.io/orcapod-python/).
+
+- [Getting Started](https://walkerlab.github.io/orcapod-python/getting-started/installation/) — Installation and quickstart
+- [Concepts](https://walkerlab.github.io/orcapod-python/concepts/architecture/) — Architecture and design principles
+- [User Guide](https://walkerlab.github.io/orcapod-python/user-guide/sources/) — Detailed guides for each component
+- [API Reference](https://walkerlab.github.io/orcapod-python/api/) — Auto-generated API documentation
+
+## Development
```bash
-# Install all development dependencies
-pip install -e ".[test,dev]"
-# or
-pip install -r requirements-dev.txt
+# Install dev dependencies
+uv sync --group dev
+
+# Run tests
+uv run pytest tests/
+
+# Run tests with coverage
+uv run pytest tests/ --cov=src --cov-report=term-missing
+
+# Build documentation locally
+uv sync --group docs
+uv run mkdocs serve
```
+
+## Architecture at a Glance
+
+```
+Source → Stream → [Operator / FunctionPod] → Stream → ...
+```
+
+| Abstraction | Role |
+|-------------|------|
+| **Source** | Load external data, establish provenance |
+| **Stream** | Immutable (Tag, Packet) sequence over shared schema |
+| **Operator** | Structural transformation (join, filter, select, rename) |
+| **Function Pod** | Data transformation (compute new values) |
+| **Pipeline** | Orchestrate, persist, and incrementally recompute |
+
+## License
+
+MIT License — see [LICENSE](LICENSE) for details.
diff --git a/docs/CNAME b/docs/CNAME
new file mode 100644
index 0000000..956369b
--- /dev/null
+++ b/docs/CNAME
@@ -0,0 +1 @@
+orcapod.org
diff --git a/docs/CONTRIBUTING_DOCS.md b/docs/CONTRIBUTING_DOCS.md
new file mode 100644
index 0000000..f9a735a
--- /dev/null
+++ b/docs/CONTRIBUTING_DOCS.md
@@ -0,0 +1,295 @@
+# Documentation Site Setup & Maintenance
+
+This guide covers how the orcapod documentation site is built, deployed, and maintained.
+Follow these instructions to replicate the setup from scratch, troubleshoot deployment
+issues, or make changes to the documentation infrastructure.
+
+---
+
+## Overview
+
+| Component | Technology |
+|-----------|-----------|
+| Documentation framework | [MkDocs](https://www.mkdocs.org/) with [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/) |
+| API docs generation | [mkdocstrings](https://mkdocstrings.github.io/) (Python handler) |
+| Hosting | [GitHub Pages](https://pages.github.com/) |
+| Deployment | [GitHub Actions](https://github.com/features/actions) (`.github/workflows/docs.yml`) |
+| Custom domain | `orcapod.org` |
+
+---
+
+## Local Development
+
+### Install dependencies
+
+
+```bash
+uv sync --group docs
+```
+
+### Live preview
+
+
+```bash
+uv run mkdocs serve
+```
+
+Opens a local server at `http://127.0.0.1:8000` with hot-reload on file changes.
+
+### Build static site
+
+
+```bash
+uv run mkdocs build
+```
+
+Outputs static HTML to the `site/` directory (gitignored).
+
+---
+
+## File Structure
+
+```
+orcapod-python/
+├── mkdocs.yml # MkDocs configuration (nav, theme, plugins)
+├── docs/
+│ ├── CNAME # Custom domain file (deployed to GitHub Pages root)
+│ ├── CONTRIBUTING_DOCS.md # This file
+│ ├── index.md # Homepage
+│ ├── getting-started/
+│ │ ├── installation.md
+│ │ ├── quickstart.md
+│ │ └── first-pipeline.md
+│ ├── concepts/
+│ │ ├── architecture.md
+│ │ ├── datagrams.md
+│ │ ├── streams.md
+│ │ ├── identity.md
+│ │ ├── provenance.md
+│ │ └── schema.md
+│ ├── user-guide/
+│ │ ├── sources.md
+│ │ ├── function-pods.md
+│ │ ├── operators.md
+│ │ ├── pipelines.md
+│ │ ├── caching.md
+│ │ └── execution.md
+│ └── api/
+│ ├── index.md
+│ ├── types.md
+│ ├── sources.md
+│ ├── streams.md
+│ ├── datagrams.md
+│ ├── function-pods.md
+│ ├── packet-functions.md
+│ ├── operators.md
+│ ├── nodes.md
+│ ├── pipeline.md
+│ ├── databases.md
+│ ├── errors.md
+│ └── configuration.md
+├── .github/workflows/
+│ └── docs.yml # GitHub Actions deployment workflow
+└── pyproject.toml # docs dependency group defined here
+```
+
+---
+
+## GitHub Pages Setup (from scratch)
+
+### Step 1: Enable GitHub Pages
+
+1. Go to **https://github.com/walkerlab/orcapod-python/settings/pages**
+2. Under **Source**, select **GitHub Actions** (not "Deploy from a branch")
+3. Click **Save**
+
+That's all that's needed on the GitHub side. The workflow in `.github/workflows/docs.yml`
+handles building and deploying.
+
+### Step 2: Verify the workflow
+
+The workflow triggers on:
+
+- **Push to `main`** — automatic deployment on every merge
+- **`workflow_dispatch`** — manual trigger from the Actions tab
+
+To manually trigger:
+
+1. Go to **https://github.com/walkerlab/orcapod-python/actions**
+2. Select the **Deploy docs** workflow
+3. Click **Run workflow** → **Run workflow**
+
+### Step 3: Verify deployment
+
+After the first successful run:
+
+- The site is live at `https://walkerlab.github.io/orcapod-python/`
+- Check the **Environments** section on the repo homepage for the deployment URL
+
+---
+
+## Custom Domain Setup (orcapod.org)
+
+### Step 1: Configure DNS records
+
+At your domain registrar's DNS management panel (e.g., Cloudflare, Namecheap, Route 53),
+add the following records:
+
+#### A records (apex domain — `orcapod.org`)
+
+| Type | Name | Value | TTL |
+|------|------|-------|-----|
+| A | `@` | `185.199.108.153` | 3600 |
+| A | `@` | `185.199.109.153` | 3600 |
+| A | `@` | `185.199.110.153` | 3600 |
+| A | `@` | `185.199.111.153` | 3600 |
+
+These are GitHub Pages' IP addresses. All four are required for redundancy.
+
+#### CNAME record (www subdomain — optional but recommended)
+
+| Type | Name | Value | TTL |
+|------|------|-------|-----|
+| CNAME | `www` | `walkerlab.github.io` | 3600 |
+
+This redirects `www.orcapod.org` to the GitHub Pages site.
+
+### Step 2: Configure GitHub Pages custom domain
+
+1. Go to **https://github.com/walkerlab/orcapod-python/settings/pages**
+2. Under **Custom domain**, enter `orcapod.org`
+3. Click **Save**
+4. GitHub will run a DNS check — this may take a few minutes
+5. Once the DNS check passes, check **Enforce HTTPS**
+
+### Step 3: CNAME file in the repository
+
+The file `docs/CNAME` contains the custom domain (`orcapod.org`). MkDocs copies this file
+to the root of the built site, which tells GitHub Pages to serve the site at the custom
+domain.
+
+**Important:** Do not delete `docs/CNAME`. If this file is missing, GitHub Pages will revert
+to serving at `walkerlab.github.io/orcapod-python/` and the custom domain will stop working
+after the next deployment.
+
+### Step 4: Verify
+
+
+```bash
+# Check DNS propagation (may take up to 24 hours, usually minutes)
+dig orcapod.org +short
+# Should return:
+# 185.199.108.153
+# 185.199.109.153
+# 185.199.110.153
+# 185.199.111.153
+
+# Check HTTPS
+curl -I https://orcapod.org
+# Should return HTTP/2 200
+```
+
+---
+
+## Troubleshooting
+
+### Site not updating after push
+
+1. Check **Actions tab** → look for the latest "Deploy docs" run
+2. If the run failed, click into it to see the error logs
+3. Common issues:
+ - **mkdocstrings import error** — a module referenced in an API doc page doesn't exist
+ or has an import error. Check the build log for the specific module path.
+ - **Missing dependency** — add it to the `docs` group in `pyproject.toml`
+
+### Custom domain shows 404
+
+1. Verify `docs/CNAME` exists and contains `orcapod.org`
+2. Check GitHub Pages settings → Custom domain should show `orcapod.org`
+3. Re-save the custom domain in settings to re-trigger DNS verification
+4. Verify DNS records: `dig orcapod.org +short` should show GitHub's IPs
+
+### Custom domain shows GitHub Pages 404 (not your site)
+
+The CNAME file may have been removed during a deployment. Verify `docs/CNAME` exists in
+the repository and redeploy.
+
+### HTTPS not available
+
+- HTTPS is only available after DNS propagation completes and GitHub verifies ownership
+- Check **Settings > Pages** — if the DNS check shows a warning, wait and try again
+- GitHub provisions TLS certificates via Let's Encrypt, which can take up to 1 hour after
+ DNS verification
+
+### API docs page shows "Module not found"
+
+The mkdocstrings directive references a Python module path. If you see an error like:
+
+```
+ERROR - mkdocstrings: No module named 'orcapod.some.module'
+```
+
+1. Check that the module path in the `.md` file matches the actual Python module path
+2. Verify the module has no import-time errors: `uv run python -c "import orcapod.some.module"`
+3. Check that `src` is listed in `mkdocstrings` handler paths in `mkdocs.yml`
+
+### Build works locally but fails in CI
+
+1. Check Python version — the CI uses whatever `uv` resolves; ensure compatibility
+2. Check for system dependencies — some packages (e.g., `pygraphviz`) need system libraries
+ that may not be available in the CI runner
+
+---
+
+## Making Changes
+
+### Adding a new documentation page
+
+1. Create the `.md` file in the appropriate `docs/` subdirectory
+2. Add it to the `nav` section in `mkdocs.yml`
+3. Preview locally with `uv run mkdocs serve`
+
+### Adding API docs for a new module
+
+Add a mkdocstrings directive in the appropriate `docs/api/` file:
+
+```markdown
+## MyNewClass
+
+::: orcapod.module.path.MyNewClass
+ options:
+ members:
+ - method_a
+ - method_b
+```
+
+The `members` list controls which methods are documented. Omit it to show all public members
+that have docstrings.
+
+### Updating the navigation
+
+Edit the `nav` section in `mkdocs.yml`. The structure maps directly to the site's sidebar
+navigation.
+
+### Changing the theme or plugins
+
+Edit `mkdocs.yml`. See the [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/)
+for available options.
+
+---
+
+## Dependencies
+
+Documentation dependencies are managed in the `docs` dependency group in `pyproject.toml`:
+
+```toml
+[dependency-groups]
+docs = [
+ "mkdocs>=1.6.0",
+ "mkdocs-material>=9.5.0",
+ "mkdocstrings[python]>=0.27.0",
+ "pymdown-extensions>=10.7",
+]
+```
+
+To update: edit the versions in `pyproject.toml` and run `uv sync --group docs`.
diff --git a/docs/api/configuration.md b/docs/api/configuration.md
new file mode 100644
index 0000000..2786483
--- /dev/null
+++ b/docs/api/configuration.md
@@ -0,0 +1,7 @@
+# Configuration
+
+Global configuration for hashing and identity parameters.
+
+## Config
+
+::: orcapod.config.Config
diff --git a/docs/api/databases.md b/docs/api/databases.md
new file mode 100644
index 0000000..823f90e
--- /dev/null
+++ b/docs/api/databases.md
@@ -0,0 +1,15 @@
+# Databases
+
+Database backends for persistent storage of pipeline results.
+
+## InMemoryArrowDatabase
+
+::: orcapod.databases.in_memory_databases.InMemoryArrowDatabase
+
+## DeltaTableDatabase
+
+::: orcapod.databases.delta_lake_databases.DeltaTableDatabase
+
+## NoOpArrowDatabase
+
+::: orcapod.databases.in_memory_databases.NoOpArrowDatabase
diff --git a/docs/api/datagrams.md b/docs/api/datagrams.md
new file mode 100644
index 0000000..2c39eee
--- /dev/null
+++ b/docs/api/datagrams.md
@@ -0,0 +1,50 @@
+# Datagrams
+
+Immutable data containers with lazy dict/Arrow conversion.
+
+## Datagram
+
+::: orcapod.core.datagrams.datagram.Datagram
+ options:
+ members:
+ - keys
+ - schema
+ - as_dict
+ - as_table
+ - content_hash
+ - select
+ - drop
+ - rename
+ - update
+ - with_columns
+ - copy
+ - get_meta_value
+ - datagram_id
+
+## Tag
+
+::: orcapod.core.datagrams.tag_packet.Tag
+ options:
+ members:
+ - keys
+ - schema
+ - as_dict
+ - as_table
+ - system_tags
+ - as_datagram
+ - copy
+
+## Packet
+
+::: orcapod.core.datagrams.tag_packet.Packet
+ options:
+ members:
+ - keys
+ - schema
+ - as_dict
+ - as_table
+ - source_info
+ - with_source_info
+ - rename
+ - as_datagram
+ - copy
diff --git a/docs/api/errors.md b/docs/api/errors.md
new file mode 100644
index 0000000..0905272
--- /dev/null
+++ b/docs/api/errors.md
@@ -0,0 +1,15 @@
+# Errors
+
+Exception classes used throughout orcapod.
+
+## InputValidationError
+
+::: orcapod.errors.InputValidationError
+
+## DuplicateTagError
+
+::: orcapod.errors.DuplicateTagError
+
+## FieldNotResolvableError
+
+::: orcapod.errors.FieldNotResolvableError
diff --git a/docs/api/function-pods.md b/docs/api/function-pods.md
new file mode 100644
index 0000000..5e2b075
--- /dev/null
+++ b/docs/api/function-pods.md
@@ -0,0 +1,31 @@
+# Function Pods
+
+Pods that apply packet functions to stream data.
+
+## FunctionPod
+
+::: orcapod.core.function_pod.FunctionPod
+ options:
+ members:
+ - process
+ - process_packet
+ - validate_inputs
+ - output_schema
+ - packet_function
+
+## FunctionPodStream
+
+::: orcapod.core.function_pod.FunctionPodStream
+ options:
+ members:
+ - iter_packets
+ - as_table
+ - output_schema
+ - keys
+ - clear_cache
+ - content_hash
+ - pipeline_hash
+
+## function_pod (Decorator)
+
+::: orcapod.core.function_pod.function_pod
diff --git a/docs/api/index.md b/docs/api/index.md
new file mode 100644
index 0000000..146d4c6
--- /dev/null
+++ b/docs/api/index.md
@@ -0,0 +1,38 @@
+# API Reference
+
+This section provides auto-generated API documentation from the orcapod source code. Browse
+the reference by module:
+
+## Core Types
+
+- **[Types](types.md)** — `Schema`, `ColumnConfig`, `ContentHash`, `DataType`, `CacheMode`,
+ `ExecutorType`, `NodeConfig`, `PipelineConfig`
+- **[Errors](errors.md)** — `InputValidationError`, `DuplicateTagError`, `FieldNotResolvableError`
+- **[Configuration](configuration.md)** — `Config` dataclass
+
+## Data Containers
+
+- **[Datagrams](datagrams.md)** — `Datagram`, `Tag`, `Packet`
+- **[Streams](streams.md)** — `ArrowTableStream`
+
+## Data Sources
+
+- **[Sources](sources.md)** — `ArrowTableSource`, `DictSource`, `ListSource`,
+ `DataFrameSource`, `DeltaTableSource`, `CSVSource`, `DerivedSource`
+
+## Computation
+
+- **[Packet Functions](packet-functions.md)** — `PythonPacketFunction`,
+ `PacketFunctionBase`, `CachedPacketFunction`
+- **[Function Pods](function-pods.md)** — `FunctionPod`, `FunctionPodStream`,
+ `function_pod` decorator
+- **[Operators](operators.md)** — `Join`, `MergeJoin`, `SemiJoin`, `Batch`,
+ `SelectTagColumns`, `SelectPacketColumns`, `DropTagColumns`, `DropPacketColumns`,
+ `MapTags`, `MapPackets`, `PolarsFilter`
+
+## Execution
+
+- **[Operator & Function Nodes](nodes.md)** — `FunctionNode`, `PersistentFunctionNode`,
+ `OperatorNode`, `PersistentOperatorNode`
+- **[Pipeline](pipeline.md)** — `Pipeline`, `PersistentSourceNode`
+- **[Databases](databases.md)** — `InMemoryArrowDatabase`, `DeltaTableDatabase`
diff --git a/docs/api/nodes.md b/docs/api/nodes.md
new file mode 100644
index 0000000..05a3af9
--- /dev/null
+++ b/docs/api/nodes.md
@@ -0,0 +1,58 @@
+# Operator & Function Nodes
+
+Database-backed execution nodes for persistent computation.
+
+## FunctionNode
+
+::: orcapod.core.function_pod.FunctionNode
+ options:
+ members:
+ - iter_packets
+ - as_table
+ - output_schema
+ - keys
+ - clear_cache
+ - content_hash
+ - pipeline_hash
+ - run
+
+## PersistentFunctionNode
+
+::: orcapod.core.function_pod.PersistentFunctionNode
+ options:
+ members:
+ - iter_packets
+ - as_table
+ - output_schema
+ - keys
+ - run
+ - process_packet
+ - add_pipeline_record
+ - get_all_records
+ - pipeline_path
+ - as_source
+
+## OperatorNode
+
+::: orcapod.core.operator_node.OperatorNode
+ options:
+ members:
+ - iter_packets
+ - as_table
+ - output_schema
+ - keys
+ - run
+ - clear_cache
+ - content_hash
+ - pipeline_hash
+
+## PersistentOperatorNode
+
+::: orcapod.core.operator_node.PersistentOperatorNode
+ options:
+ members:
+ - run
+ - get_all_records
+ - as_source
+ - cache_mode
+ - pipeline_path
diff --git a/docs/api/operators.md b/docs/api/operators.md
new file mode 100644
index 0000000..c2cbbfb
--- /dev/null
+++ b/docs/api/operators.md
@@ -0,0 +1,87 @@
+# Operators
+
+Structural transformers that reshape streams without synthesizing new values.
+
+## Join
+
+::: orcapod.core.operators.join.Join
+
+## MergeJoin
+
+::: orcapod.core.operators.merge_join.MergeJoin
+
+## SemiJoin
+
+::: orcapod.core.operators.semijoin.SemiJoin
+
+## Batch
+
+::: orcapod.core.operators.batch.Batch
+
+## SelectTagColumns
+
+::: orcapod.core.operators.column_selection.SelectTagColumns
+
+## SelectPacketColumns
+
+::: orcapod.core.operators.column_selection.SelectPacketColumns
+
+## DropTagColumns
+
+::: orcapod.core.operators.column_selection.DropTagColumns
+
+## DropPacketColumns
+
+::: orcapod.core.operators.column_selection.DropPacketColumns
+
+## MapTags
+
+::: orcapod.core.operators.mappers.MapTags
+
+## MapPackets
+
+::: orcapod.core.operators.mappers.MapPackets
+
+## PolarsFilter
+
+::: orcapod.core.operators.filters.PolarsFilter
+
+## Base Classes
+
+### UnaryOperator
+
+::: orcapod.core.operators.base.UnaryOperator
+ options:
+ members:
+ - validate_unary_input
+ - unary_static_process
+ - unary_output_schema
+
+### BinaryOperator
+
+::: orcapod.core.operators.base.BinaryOperator
+ options:
+ members:
+ - validate_binary_inputs
+ - binary_static_process
+ - binary_output_schema
+ - is_commutative
+
+### NonZeroInputOperator
+
+::: orcapod.core.operators.base.NonZeroInputOperator
+ options:
+ members:
+ - validate_nonzero_inputs
+
+### StaticOutputPod
+
+::: orcapod.core.static_output_pod.StaticOutputPod
+ options:
+ members:
+ - process
+ - validate_inputs
+ - argument_symmetry
+ - output_schema
+ - static_process
+ - async_execute
diff --git a/docs/api/packet-functions.md b/docs/api/packet-functions.md
new file mode 100644
index 0000000..cabec6a
--- /dev/null
+++ b/docs/api/packet-functions.md
@@ -0,0 +1,45 @@
+# Packet Functions
+
+Stateless computations that transform individual packets.
+
+## PythonPacketFunction
+
+::: orcapod.core.packet_function.PythonPacketFunction
+ options:
+ members:
+ - direct_call
+ - direct_async_call
+ - call
+ - async_call
+ - input_packet_schema
+ - output_packet_schema
+ - canonical_function_name
+ - is_active
+ - set_active
+ - identity_structure
+ - pipeline_identity_structure
+
+## PacketFunctionBase (Abstract Base)
+
+::: orcapod.core.packet_function.PacketFunctionBase
+ options:
+ members:
+ - call
+ - async_call
+ - direct_call
+ - direct_async_call
+ - executor
+ - major_version
+ - packet_function_type_id
+ - canonical_function_name
+ - output_packet_schema_hash
+ - uri
+
+## CachedPacketFunction
+
+::: orcapod.core.packet_function.CachedPacketFunction
+ options:
+ members:
+ - get_cached_output_for_packet
+ - record_packet
+ - get_all_cached_outputs
diff --git a/docs/api/pipeline.md b/docs/api/pipeline.md
new file mode 100644
index 0000000..af5d4f2
--- /dev/null
+++ b/docs/api/pipeline.md
@@ -0,0 +1,21 @@
+# Pipeline
+
+Pipeline orchestration and compilation.
+
+## Pipeline
+
+::: orcapod.pipeline.graph.Pipeline
+ options:
+ members:
+ - compile
+ - run
+ - compiled_nodes
+
+## PersistentSourceNode
+
+::: orcapod.pipeline.nodes.PersistentSourceNode
+ options:
+ members:
+ - run
+ - get_all_records
+ - cache_path
diff --git a/docs/api/sources.md b/docs/api/sources.md
new file mode 100644
index 0000000..64b47b6
--- /dev/null
+++ b/docs/api/sources.md
@@ -0,0 +1,60 @@
+# Sources
+
+Data source implementations for loading external data into orcapod streams.
+
+## ArrowTableSource
+
+::: orcapod.core.sources.arrow_table_source.ArrowTableSource
+ options:
+ members:
+ - resolve_field
+ - output_schema
+ - keys
+ - as_table
+ - as_stream
+ - iter_packets
+ - content_hash
+ - pipeline_hash
+ - source_id
+ - pipeline_identity_structure
+
+## DictSource
+
+::: orcapod.core.sources.dict_source.DictSource
+
+## ListSource
+
+::: orcapod.core.sources.list_source.ListSource
+
+## DataFrameSource
+
+::: orcapod.core.sources.data_frame_source.DataFrameSource
+
+## DeltaTableSource
+
+::: orcapod.core.sources.delta_table_source.DeltaTableSource
+
+## CSVSource
+
+::: orcapod.core.sources.csv_source.CSVSource
+
+## DerivedSource
+
+::: orcapod.core.sources.derived_source.DerivedSource
+
+## RootSource (Abstract Base)
+
+::: orcapod.core.sources.base.RootSource
+ options:
+ members:
+ - resolve_field
+ - pipeline_identity_structure
+ - source_id
+
+## PersistentSource
+
+::: orcapod.core.sources.persistent_source.PersistentSource
+
+## SourceRegistry
+
+::: orcapod.core.sources.source_registry.SourceRegistry
diff --git a/docs/api/streams.md b/docs/api/streams.md
new file mode 100644
index 0000000..3a40338
--- /dev/null
+++ b/docs/api/streams.md
@@ -0,0 +1,32 @@
+# Streams
+
+Immutable stream implementations for carrying (Tag, Packet) pairs.
+
+## ArrowTableStream
+
+::: orcapod.core.streams.arrow_table_stream.ArrowTableStream
+ options:
+ members:
+ - output_schema
+ - keys
+ - iter_packets
+ - as_table
+ - content_hash
+ - pipeline_hash
+ - clear_cache
+
+## StreamBase (Abstract Base)
+
+::: orcapod.core.streams.base.StreamBase
+ options:
+ members:
+ - output_schema
+ - keys
+ - iter_packets
+ - as_table
+ - content_hash
+ - pipeline_hash
+ - producer
+ - upstreams
+ - last_modified
+ - is_stale
diff --git a/docs/api/types.md b/docs/api/types.md
new file mode 100644
index 0000000..aba39ba
--- /dev/null
+++ b/docs/api/types.md
@@ -0,0 +1,67 @@
+# Types
+
+Core type definitions used throughout orcapod.
+
+## Schema
+
+::: orcapod.types.Schema
+ options:
+ members:
+ - merge
+ - with_values
+ - select
+ - drop
+ - is_compatible_with
+ - empty
+ - optional_fields
+ - required_fields
+
+## ColumnConfig
+
+::: orcapod.types.ColumnConfig
+ options:
+ members:
+ - all
+ - data_only
+ - handle_config
+
+## ContentHash
+
+::: orcapod.types.ContentHash
+ options:
+ members:
+ - to_hex
+ - to_int
+ - to_uuid
+ - to_base64
+ - to_string
+ - from_string
+ - display_name
+
+## CacheMode
+
+::: orcapod.types.CacheMode
+
+## ExecutorType
+
+::: orcapod.types.ExecutorType
+
+## NodeConfig
+
+::: orcapod.types.NodeConfig
+
+## PipelineConfig
+
+::: orcapod.types.PipelineConfig
+
+## Type Aliases
+
+The following type aliases are used throughout the API:
+
+| Alias | Definition | Description |
+|-------|-----------|-------------|
+| `DataType` | `type \| UnionType` | A Python type or union of types |
+| `TagValue` | `int \| str \| None \| Collection` | Valid tag column values |
+| `DataValue` | Scalar, path, or nested collections | Valid packet column values |
+| `PacketLike` | `dict[str, DataValue]` | Dict mapping field names to values |
+| `SchemaLike` | `dict[str, DataType]` | Dict mapping field names to types |
diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md
new file mode 100644
index 0000000..9aea240
--- /dev/null
+++ b/docs/concepts/architecture.md
@@ -0,0 +1,121 @@
+# Architecture Overview
+
+orcapod is built around a small number of composable abstractions that enforce a strict
+separation between data transformation and structural manipulation. This page provides a
+high-level map of how the pieces fit together.
+
+## Core Data Flow
+
+```mermaid
+graph LR
+ S1[Source A] --> ST1[Stream]
+ S2[Source B] --> ST2[Stream]
+    ST1 --> OP[Operator<br/>Join / Filter / ...]
+ ST2 --> OP
+ OP --> ST3[Stream]
+ ST3 --> FP[Function Pod]
+ FP --> ST4[Stream]
+ ST4 --> NEXT[...]
+```
+
+Every pipeline follows this pattern:
+
+1. **Sources** load external data and annotate it with provenance metadata.
+2. **Streams** carry data as immutable (Tag, Packet) pairs.
+3. **Operators** reshape streams (join, filter, select, rename, batch) without creating new values.
+4. **Function Pods** apply computations to individual packets, producing new values with tracked provenance.
+
+## The Five Core Abstractions
+
+### Datagram
+
+The universal immutable data container. Holds named columns with explicit types and supports
+lazy conversion between Python dict and Apache Arrow representations. Comes in two forms:
+
+- **Tag** — metadata columns for routing, filtering, and joining. Carries hidden system tags
+ for provenance.
+- **Packet** — data payload columns. Carries source-info provenance tokens per column.
+
+### Stream
+
+An immutable sequence of (Tag, Packet) pairs over a shared schema. The fundamental data-flow
+abstraction — every source emits one, every operator consumes and produces them.
+
+### Source
+
+Produces a stream from external data with no upstream dependencies. Establishes provenance
+by annotating each row with source identity and record identity.
+
+### Function Pod
+
+Wraps a stateless **packet function** that transforms individual packets. Never inspects tags.
+Used when the computation synthesizes new values.
+
+### Operator
+
+A structural transformer that reshapes streams without synthesizing new packet values. Every
+output value is traceable to a concrete input value. Used for joins, filters, projections,
+renames, and batching.
+
+## The Operator / Function Pod Boundary
+
+This is orcapod's most important architectural constraint:
+
+| | Operator | Function Pod |
+|---|---|---|
+| Inspects packet content | Never | Yes |
+| Inspects / uses tags | Yes | No |
+| Can rename columns | Yes | No |
+| Synthesizes new values | No | Yes |
+| Stream arity | Configurable | Single in, single out |
+| Cached by content hash | No | Yes |
+
+This strict separation keeps provenance clean. Operators are provenance-transparent (no new
+values, no provenance footprint). Function pods are provenance-tracked (new values always
+carry source-info pointing back to the function).
+
+## Two Parallel Identity Chains
+
+Every pipeline element maintains two hashes:
+
+1. **`content_hash()`** — data-inclusive. Changes when data changes. Used for deduplication
+ and memoization.
+2. **`pipeline_hash()`** — schema and topology only. Ignores data content. Used for database
+ path scoping so different sources with identical schemas share tables.
+
+See [Identity & Hashing](identity.md) for the full specification.
+
+## Execution Models
+
+orcapod supports multiple execution strategies that produce identical results:
+
+| Model | Mechanism | Use Case |
+|-------|-----------|----------|
+| Lazy in-memory | `FunctionPod` → `FunctionPodStream` | Exploration, one-off computations |
+| Static with recomputation | `StaticOutputPod` → `DynamicPodStream` | Operator output with staleness detection |
+| DB-backed incremental | `FunctionNode` / `OperatorNode` | Production pipelines with caching |
+| Async push-based | `async_execute()` with channels | Pipeline-level parallelism |
+
+See [Execution Models](../user-guide/execution.md) for details.
+
+## Pipeline Compilation
+
+The `Pipeline` class automatically captures computation graphs and upgrades all nodes
+to their persistent variants:
+
+```mermaid
+graph TD
+ subgraph "Recording Phase"
+ A[Source] --> B[Join]
+ B --> C[FunctionPod]
+ end
+ subgraph "After Compilation"
+ D[PersistentSourceNode] --> E[PersistentOperatorNode]
+ E --> F[PersistentFunctionNode]
+ end
+ A -.->|compile| D
+ B -.->|compile| E
+ C -.->|compile| F
+```
+
+See [Pipelines](../user-guide/pipelines.md) for the full pipeline lifecycle.
diff --git a/docs/concepts/datagrams.md b/docs/concepts/datagrams.md
new file mode 100644
index 0000000..8491b6c
--- /dev/null
+++ b/docs/concepts/datagrams.md
@@ -0,0 +1,129 @@
+# Datagrams, Tags & Packets
+
+Datagrams are orcapod's universal immutable data containers. They hold named columns with
+explicit type information and support lazy conversion between Python dict and Apache Arrow
+representations.
+
+## Datagram
+
+A `Datagram` is the base container. It can be constructed from either a Python dict or an
+Arrow table/record batch:
+
+```python
+from orcapod.core.datagrams import Datagram
+
+# From a dict
+dg = Datagram({"name": "Alice", "age": 30})
+
+# Access as dict (always available)
+print(dg.as_dict()) # {'name': 'Alice', 'age': 30}
+
+# Access as Arrow table (lazily computed and cached)
+table = dg.as_table()
+
+# Schema introspection
+print(dg.schema()) # Schema({'name': str, 'age': int})
+print(dg.keys()) # ('name', 'age')
+```
+
+### Lazy Conversion
+
+Datagrams convert between dict and Arrow representations lazily:
+
+- If created from a dict, the Arrow table is computed on first `.as_table()` call and cached.
+- If created from an Arrow table, the dict is computed on first `.as_dict()` call and cached.
+- Content hashing always uses the Arrow representation for determinism.
+- Value access always uses the Python dict for convenience.
+
+### Immutability
+
+Datagrams are immutable. Operations like `select()`, `drop()`, `rename()`, and `update()`
+return new datagrams:
+
+```python
+from orcapod.core.datagrams import Datagram
+
+dg = Datagram({"a": 1, "b": 2, "c": 3})
+
+selected = dg.select("a", "b") # Datagram({'a': 1, 'b': 2})
+dropped = dg.drop("c") # Datagram({'a': 1, 'b': 2})
+renamed = dg.rename({"a": "alpha"}) # Datagram({'alpha': 1, 'b': 2, 'c': 3})
+```
+
+## Tag
+
+A **Tag** is a datagram specialization for metadata columns. Tags are used for routing,
+filtering, joining, and annotation. They carry additional **system tags** — framework-managed
+hidden provenance columns.
+
+
+```python
+from orcapod.core.datagrams import Tag
+
+tag = Tag({"patient_id": "p1", "visit": "v1"})
+
+# Regular keys (user-visible)
+print(tag.keys()) # ('patient_id', 'visit')
+
+# System tags (hidden provenance columns)
+print(tag.system_tags()) # {...}
+```
+
+### Key Properties of Tags
+
+- **Non-authoritative** — never used for cache lookup or pod identity computation.
+- **Auto-propagated** — tags flow forward through the pipeline automatically.
+- **Join keys** — operators join streams by matching tag columns.
+
+### Tag Merging in Joins
+
+When streams are joined:
+
+- **Shared tag keys** act as the join predicate — values must match.
+- **Non-shared tag keys** propagate freely into the joined output.
+
+## Packet
+
+A **Packet** is a datagram specialization for data payload columns. Packets carry additional
+**source info** — per-column provenance tokens tracing each value back to its originating
+source and record.
+
+
+```python
+from orcapod.core.datagrams import Packet
+
+packet = Packet({"age": 30, "cholesterol": 180})
+
+# Source info (provenance pointers)
+print(packet.source_info())
+# {'age': 'source_abc::row_0::age', 'cholesterol': 'source_abc::row_0::cholesterol'}
+```
+
+### Source Info Format
+
+Each packet column carries a source info string:
+
+```
+{source_id}::{record_id}::{column_name}
+```
+
+- `source_id` — canonical identifier of the originating source
+- `record_id` — row identifier (positional like `row_0` or column-based like `user_id=abc123`)
+- `column_name` — the original column name
+
+Source info is **immutable through the pipeline** — set once when a source creates the data
+and preserved through all downstream transformations.
+
+## Column Naming Conventions
+
+orcapod uses column name prefixes to distinguish metadata from user data:
+
+| Prefix | Meaning | Example |
+|--------|---------|---------|
+| `__` | System metadata | `__packet_id`, `__pod_version` |
+| `_source_` | Source info provenance | `_source_age` |
+| `_tag::` | System tag | `_tag::source_id::abc123` |
+| `_context_key` | Data context | `_context_key` |
+
+These prefixes are controlled by `ColumnConfig` and excluded from standard output by default.
+See [Schema & Column Configuration](schema.md) for details.
diff --git a/docs/concepts/identity.md b/docs/concepts/identity.md
new file mode 100644
index 0000000..55d360e
--- /dev/null
+++ b/docs/concepts/identity.md
@@ -0,0 +1,120 @@
+# Identity & Hashing
+
+orcapod maintains two parallel identity chains implemented as recursive Merkle-like hash
+trees. These hashes are central to caching, deduplication, and database scoping.
+
+## Two Identity Chains
+
+### Content Hash (`content_hash()`)
+
+Data-inclusive identity capturing the precise semantic content of an object. Changes when
+data changes. Used for deduplication and memoization.
+
+| Component | What Gets Hashed |
+|-----------|-----------------|
+| RootSource | Class name + tag columns + table content hash |
+| PacketFunction | URI (canonical name + output schema hash + version + type ID) |
+| FunctionPodStream | Function pod + argument symmetry of inputs |
+| Operator | Operator class + identity structure |
+| ArrowTableStream | Producer + upstreams (or table content if no producer) |
+| Datagram | Arrow table content |
+| DerivedSource | Origin node's content hash |
+
+### Pipeline Hash (`pipeline_hash()`)
+
+Schema-and-topology-only identity. Excludes data content so that different sources with
+identical schemas share database tables. Used for database path scoping.
+
+| Component | What Gets Hashed |
+|-----------|-----------------|
+| RootSource | `(tag_schema, packet_schema)` — base case |
+| PacketFunction | Raw packet function object (via content hash) |
+| FunctionPodStream | Function pod + input stream pipeline hashes |
+| Operator | Operator class + argument symmetry (pipeline hashes of inputs) |
+| ArrowTableStream | Producer + upstreams pipeline hashes (or schema if no producer) |
+| DerivedSource | Inherited from RootSource: `(tag_schema, packet_schema)` |
+
+### Why Two Hashes?
+
+Consider a medical pipeline that processes patient data from different clinics. Both clinics
+produce tables with schema `{patient_id: str, age: int, cholesterol: int}` but different
+data.
+
+- **Content hash** differs because the data differs — each clinic's results are cached
+ separately.
+- **Pipeline hash** is identical because the schema and topology match — both clinics' data
+ can share the same database table, distinguished by system tags.
+
+## The ContentHash Type
+
+All hashes are represented as `ContentHash` — a frozen dataclass pairing a method identifier
+with raw digest bytes:
+
+
+```python
+from orcapod.types import ContentHash
+
+hash_val = source.content_hash()
+
+# Various representations
+print(hash_val.to_hex()) # Hexadecimal string
+print(hash_val.to_int()) # Integer
+print(hash_val.to_uuid()) # UUID
+print(hash_val.to_base64()) # Base64 string
+print(hash_val.to_string()) # "{method}:{hex_digest}"
+```
+
+The method name (e.g., `"object_v0.1"`, `"arrow_v2.1"`) enables detecting version mismatches
+across hash configurations.
+
+## Semantic Hashing
+
+Content hashes use a `BaseSemanticHasher` that:
+
+1. Recursively expands structures (dicts, lists, tuples).
+2. Dispatches to type-specific handlers for each leaf value.
+3. Terminates at `ContentHash` leaves (preventing hash-of-hash inflation).
+
+This ensures that structurally identical objects produce identical hashes regardless of how
+they were constructed.
+
+## The Resolver Pattern
+
+Pipeline hash uses a **resolver pattern** — a callback that routes objects to the correct
+hash method:
+
+- `PipelineElementProtocol` objects → `pipeline_hash()`
+- Other `ContentIdentifiable` objects → `content_hash()`
+
+This ensures the correct identity chain is used for nested objects within a single hash
+computation.
+
+## Argument Symmetry
+
+Each pod declares how upstream hashes are combined:
+
+- **Commutative** (`frozenset`) — upstream hashes sorted before combining. Used when input
+ order is semantically irrelevant (Join, MergeJoin).
+- **Non-commutative** (`tuple`) — upstream hashes combined in declared order. Used when
+ input position is significant (SemiJoin).
+- **Partial symmetry** — nesting expresses mixed constraints.
+
+
+```python
+# Commutative: Join(A, B) == Join(B, A)
+join.argument_symmetry(streams) # returns frozenset
+
+# Non-commutative: SemiJoin(A, B) != SemiJoin(B, A)
+semi_join.argument_symmetry(streams) # returns tuple
+```
+
+## Packet Function URI
+
+Every packet function has a unique signature:
+
+```
+(canonical_function_name, output_schema_hash, major_version, packet_function_type_id)
+```
+
+For Python functions, the identity structure additionally includes the function's bytecode
+hash, input parameter signature, and Git version information.
diff --git a/docs/concepts/provenance.md b/docs/concepts/provenance.md
new file mode 100644
index 0000000..9727df7
--- /dev/null
+++ b/docs/concepts/provenance.md
@@ -0,0 +1,136 @@
+# System Tags & Provenance
+
+orcapod provides two complementary provenance mechanisms: **source info** for tracking
+value-level lineage, and **system tags** for tracking row-level lineage through structural
+operations.
+
+## Source Info
+
+Every packet column carries a **source info** string — a provenance pointer to the source
+and record that produced the value:
+
+```
+{source_id}::{record_id}::{column_name}
+```
+
+For example:
+
+```
+customers_2024::row_42::age
+```
+
+Source info is:
+
+- **Set once** — when a source creates the data.
+- **Immutable** — preserved through all downstream operations, including column renames.
+- **Column-level** — each column independently tracks its origin.
+
+## System Tags
+
+System tags are **framework-managed, hidden provenance columns** automatically attached to
+every row. They maintain perfect traceability from any result back to its original source
+rows.
+
+### How System Tags Are Created
+
+Each source automatically adds a pair of system tag columns:
+
+```
+_tag::source_id::{schema_hash} → the source's canonical source_id
+_tag::record_id::{schema_hash} → the row identifier within that source
+```
+
+For example, a source with schema hash `schema1`:
+
+```
+_tag::source_id::schema1 = "customers_2024"
+_tag::record_id::schema1 = "row_42"
+```
+
+### Three Evolution Rules
+
+System tags evolve differently depending on the operation:
+
+#### 1. Name-Preserving (~90% of operations)
+
+Single-stream operations: filter, select, rename, batch, map.
+
+System tag column names and values pass through **unchanged**. The operation doesn't affect
+provenance tracking.
+
+#### 2. Name-Extending (multi-input operations)
+
+Joins and merges. Each input's system tag column name is extended with the node's pipeline
+hash and canonical position:
+
+```
+Before join:
+ Stream A: _tag::source_id::schema1
+ Stream B: _tag::source_id::schema1
+
+After join (pipeline_hash=abc123):
+ _tag::source_id::schema1::abc123:0 (from Stream A)
+ _tag::source_id::schema1::abc123:1 (from Stream B)
+```
+
+For commutative operations, inputs are sorted by `pipeline_hash` to ensure identical column
+names regardless of wiring order.
+
+#### 3. Type-Evolving (aggregation operations)
+
+Batch and grouping operations. Column names are unchanged but types evolve:
+
+```
+Before batch: _tag::source_id::schema1 (type: str)
+After batch: _tag::source_id::schema1 (type: list[str])
+```
+
+## The Provenance Graph
+
+orcapod's provenance graph is a **bipartite graph of sources and function pods**:
+
+```mermaid
+graph LR
+ S1[Source A] -->|source_info| FP1[FunctionPod 1]
+ S2[Source B] -->|source_info| FP1
+ FP1 -->|source_info| FP2[FunctionPod 2]
+```
+
+Operators do not appear in the provenance graph because they never synthesize new values.
+This means:
+
+- **Operators can be refactored** without invalidating data provenance.
+- **Provenance queries are simpler** — trace source info pointers between function pod table
+ entries.
+- **Provenance is robust** — lineage is determined by what generated the data, not by how it was routed.
+
+## Inspecting Provenance
+
+Use `ColumnConfig` to include provenance columns in output:
+
+
+```python
+from orcapod.types import ColumnConfig
+
+# Source info columns (value-level provenance)
+table = stream.as_table(columns=ColumnConfig(source=True))
+
+# System tag columns (row-level provenance)
+table = stream.as_table(columns=ColumnConfig(system_tags=True))
+
+# Everything
+table = stream.as_table(columns=ColumnConfig.all())
+```
+
+## Schema Prediction
+
+Operators predict output system tag column names at schema time — without performing the
+actual computation — by computing `pipeline_hash` values and canonical positions. This is
+exposed via:
+
+
+```python
+tag_schema, packet_schema = operator_stream.output_schema(
+ columns=ColumnConfig(system_tags=True)
+)
+```
diff --git a/docs/concepts/schema.md b/docs/concepts/schema.md
new file mode 100644
index 0000000..0b80808
--- /dev/null
+++ b/docs/concepts/schema.md
@@ -0,0 +1,116 @@
+# Schema & Column Configuration
+
+Every stream in orcapod is self-describing. Schemas are embedded explicitly at every level
+rather than resolved against a central registry.
+
+## Schema
+
+A `Schema` is an immutable mapping from field names to Python types, with support for
+optional fields:
+
+```python
+from orcapod.types import Schema
+
+schema = Schema({"name": str, "age": int, "email": str}, optional_fields={"email"})
+
+print(schema) # Schema({'name': str, 'age': int, 'email': str})
+print(schema.optional_fields) # frozenset({'email'})
+print(schema.required_fields) # frozenset({'name', 'age'})
+```
+
+### Schema Operations
+
+```python
+from orcapod.types import Schema
+
+a = Schema({"x": int, "y": str})
+b = Schema({"y": str, "z": float})
+
+# Merge (union) — raises on type conflicts
+merged = a.merge(b) # Schema({'x': int, 'y': str, 'z': float})
+
+# Select specific fields
+selected = a.select("x") # Schema({'x': int})
+
+# Drop specific fields
+dropped = a.drop("y") # Schema({'x': int})
+
+# Compatibility check
+a.is_compatible_with(b) # True if shared keys have compatible types
+```
+
+### Output Schema
+
+Every stream and pod exposes `output_schema()` returning a tuple:
+
+
+```python
+tag_schema, packet_schema = stream.output_schema()
+```
+
+- `tag_schema` — the schema of tag (metadata) columns.
+- `packet_schema` — the schema of packet (data) columns.
+
+## ColumnConfig
+
+`ColumnConfig` controls which column groups are included in schema and data output. By
+default, metadata columns are excluded for clean output.
+
+```python
+from orcapod.types import ColumnConfig
+
+# Default: only user data columns
+config = ColumnConfig()
+
+# Include specific metadata
+config = ColumnConfig(source=True) # Include source-info columns
+config = ColumnConfig(system_tags=True) # Include system tag columns
+config = ColumnConfig(meta=True) # Include system metadata (__packet_id, etc.)
+
+# Include everything
+config = ColumnConfig.all()
+
+# Explicitly data-only
+config = ColumnConfig.data_only()
+```
+
+### ColumnConfig Fields
+
+| Field | Default | Controls |
+|-------|---------|----------|
+| `meta` | `False` | System metadata columns (`__` prefix) |
+| `context` | `False` | Data context column (`_context_key`) |
+| `source` | `False` | Source-info provenance columns (`_source_` prefix) |
+| `system_tags` | `False` | System tag columns (`_tag::` prefix) |
+| `content_hash` | `False` | Per-row content hash column |
+| `sort_by_tags` | `False` | Whether to sort output by tag columns |
+
+### Using ColumnConfig
+
+Pass `ColumnConfig` to `output_schema()` and `as_table()`:
+
+
+```python
+# Schema with source info columns
+tag_schema, packet_schema = stream.output_schema(
+ columns=ColumnConfig(source=True)
+)
+
+# Materialized table with everything
+table = stream.as_table(columns=ColumnConfig.all())
+```
+
+## Column Naming Conventions
+
+orcapod uses prefixes to distinguish column types:
+
+| Prefix | Category | Example | ColumnConfig Field |
+|--------|----------|---------|--------------------|
+| `__` | System metadata | `__packet_id` | `meta` |
+| `_source_` | Source info | `_source_age` | `source` |
+| `_tag::` | System tag | `_tag::source_id::abc` | `system_tags` |
+| `_context_key` | Data context | `_context_key` | `context` |
+| *(no prefix)* | User data | `age`, `name` | Always included |
+
+These prefixes are defined in `SystemConstant` (`system_constants.py`) and computed from
+a shared `constants` singleton.
diff --git a/docs/concepts/streams.md b/docs/concepts/streams.md
new file mode 100644
index 0000000..478c916
--- /dev/null
+++ b/docs/concepts/streams.md
@@ -0,0 +1,87 @@
+# Streams
+
+Streams are orcapod's fundamental data-flow abstraction. A stream is an immutable sequence
+of (Tag, Packet) pairs over a shared schema. Every source emits a stream, every operator
+consumes and produces streams, and every function pod iterates over them.
+
+## ArrowTableStream
+
+The concrete stream implementation is `ArrowTableStream`, backed by an immutable PyArrow
+Table with explicit tag/packet column assignment.
+
+```python
+import pyarrow as pa
+from orcapod.core.streams import ArrowTableStream
+
+table = pa.table({
+ "id": pa.array(["a", "b", "c"], type=pa.large_string()),
+ "value": pa.array([1, 2, 3], type=pa.int64()),
+})
+
+stream = ArrowTableStream(table, tag_columns=["id"])
+```
+
+## Schema Introspection
+
+Every stream exposes its schema as a tuple of `(tag_schema, packet_schema)`:
+
+
+```python
+tag_schema, packet_schema = stream.output_schema()
+print(tag_schema) # Schema({'id': str})
+print(packet_schema) # Schema({'value': int})
+```
+
+## Iterating
+
+Streams provide lazy iteration over (Tag, Packet) pairs:
+
+
+```python
+for tag, packet in stream.iter_packets():
+ print(f"Tag: {tag.as_dict()}, Packet: {packet.as_dict()}")
+```
+
+## Materialization
+
+Materialize a stream as a PyArrow table:
+
+
+```python
+table = stream.as_table()
+print(table.to_pandas())
+```
+
+Use `ColumnConfig` to control which metadata columns are included:
+
+
+```python
+from orcapod.types import ColumnConfig
+
+# Data columns only (default)
+table = stream.as_table()
+
+# Include source-info provenance columns
+table = stream.as_table(columns=ColumnConfig(source=True))
+
+# Include everything
+table = stream.as_table(columns=ColumnConfig.all())
+```
+
+## Key Methods
+
+| Method | Description |
+|--------|-------------|
+| `output_schema()` | Returns `(tag_schema, packet_schema)` |
+| `keys()` | Returns the tag column names |
+| `iter_packets()` | Lazy iteration over `(Tag, Packet)` pairs |
+| `as_table()` | Materialize as a PyArrow table |
+| `content_hash()` | Data-inclusive identity hash |
+| `pipeline_hash()` | Schema-and-topology-only identity hash |
+
+## Stream Properties
+
+- **Immutable** — once created, a stream's data never changes.
+- **Lazy** — iteration and materialization are deferred until requested.
+- **Self-describing** — streams carry their schema explicitly, not by reference to a registry.
+- **At least one packet column** — a stream with only tag columns raises `ValueError`.
diff --git a/docs/conftest.py b/docs/conftest.py
new file mode 100644
index 0000000..1cfad13
--- /dev/null
+++ b/docs/conftest.py
@@ -0,0 +1,80 @@
+"""Conftest for pytest-codeblocks doc tests.
+
+Patches the pytest-codeblocks exec namespace to include ``__name__`` so that
+functions defined inside code blocks get a proper ``__module__`` attribute.
+Without this, orcapod's ``get_function_signature`` fails because
+``function.__module__`` is ``None``.
+"""
+
+from __future__ import annotations
+
+try:
+    from pytest_codeblocks import plugin as _codeblocks_plugin
+
+    _original_runtest = _codeblocks_plugin.TestBlock.runtest  # kept for reference/debugging; never restored
+
+    def _patched_runtest(self):
+        """Run a doc code block with ``__name__`` set in the exec globals (mirrors upstream runtest)."""
+        import contextlib
+        import io
+        import re as re_mod
+        import subprocess
+
+        assert self.obj is not None
+        output = None
+
+        if self.obj.importorskip is not None:  # honor the block's importorskip marker before running it
+            import pytest
+
+            try:
+                __import__(self.obj.importorskip)
+            except (ImportError, ModuleNotFoundError):
+                pytest.skip()
+
+        if self.obj.syntax == "python":
+            with contextlib.redirect_stdout(io.StringIO()) as s:  # capture stdout for expected-output comparison
+                try:
+                    exec(
+                        self.obj.code,
+                        {"__MODULE__": "__main__", "__name__": "__main__"},  # "__name__" gives defs a real __module__; "__MODULE__" looks unused — TODO confirm
+                    )
+                except Exception as e:
+                    raise RuntimeError(  # re-raise with the failing block's source for readable failures
+                        f"{self.name}, line {self.obj.lineno}:\n```\n"
+                        + self.obj.code
+                        + "```\n\n"
+                        + f"{e}"
+                    )
+                output = s.getvalue()
+        else:
+            assert self.obj.syntax in ["sh", "bash"]
+            executable = {
+                "sh": None,  # None -> system default shell
+                "bash": "/bin/bash",
+                "zsh": "/bin/zsh",  # unreachable given the assert above; presumably kept to match upstream — verify
+            }[self.obj.syntax]
+            ret = subprocess.run(
+                self.obj.code,
+                shell=True,
+                check=True,  # a non-zero exit status fails the doc test
+                stdout=subprocess.PIPE,
+                executable=executable,
+            )
+            output = ret.stdout.decode()
+
+        if output is not None and self.obj.expected_output is not None:
+            str0 = self.obj.expected_output
+            str1 = output
+            if getattr(self.obj, "expected_output_ignore_whitespace", False):  # attr may be absent on some plugin versions
+                str0 = re_mod.sub(r"\s+", "", str0)
+                str1 = re_mod.sub(r"\s+", "", str1)
+            if str0 != str1:
+                raise RuntimeError(
+                    f"{self.name}, line {self.obj.lineno}:\n```\n"
+                    + f"Expected output\n```\n{self.obj.expected_output}```\n"
+                    + f"but got\n```\n{output}```"
+                )
+
+    _codeblocks_plugin.TestBlock.runtest = _patched_runtest  # monkey-patch applied when pytest imports this conftest
+except ImportError:
+    pass  # pytest-codeblocks not installed; nothing to patch
diff --git a/docs/getting-started/first-pipeline.md b/docs/getting-started/first-pipeline.md
new file mode 100644
index 0000000..b9d70d8
--- /dev/null
+++ b/docs/getting-started/first-pipeline.md
@@ -0,0 +1,190 @@
+# Building Your First Pipeline
+
+This guide walks through building a persistent, incremental pipeline using orcapod's
+`Pipeline` class. You'll learn how to:
+
+- Define a multi-step computation graph
+- Persist results to a database
+- Re-run with incremental computation
+
+## The Pipeline Class
+
+A `Pipeline` wraps your computation graph with automatic persistence. Inside a `with pipeline:`
+block, all source, operator, and function pod invocations are tracked and automatically
+upgraded to their persistent variants when the context exits.
+
+```python
+import pyarrow as pa
+from orcapod import ArrowTableSource, FunctionPod
+from orcapod.core.packet_function import PythonPacketFunction
+from orcapod.databases import InMemoryArrowDatabase
+from orcapod.pipeline import Pipeline
+
+# Define sources
+patients = ArrowTableSource(
+ pa.table({
+ "patient_id": pa.array(["p1", "p2", "p3"], type=pa.large_string()),
+ "age": pa.array([30, 45, 60], type=pa.int64()),
+ }),
+ tag_columns=["patient_id"],
+)
+
+labs = ArrowTableSource(
+ pa.table({
+ "patient_id": pa.array(["p1", "p2", "p3"], type=pa.large_string()),
+ "cholesterol": pa.array([180, 220, 260], type=pa.int64()),
+ }),
+ tag_columns=["patient_id"],
+)
+
+# Define computation
+def risk_score(age: int, cholesterol: int) -> float:
+ return age * 0.5 + cholesterol * 0.3
+
+risk_fn = PythonPacketFunction(risk_score, output_keys="risk")
+risk_pod = FunctionPod(packet_function=risk_fn)
+
+# Build the pipeline
+db = InMemoryArrowDatabase()
+pipeline = Pipeline(name="risk_pipeline", pipeline_database=db)
+
+with pipeline:
+ joined = patients.join(labs, label="join_data")
+ risk_pod(joined, label="compute_risk")
+
+pipeline.run()
+```
+
+## Labels and Node Access
+
+Every operation inside a pipeline context can be given a `label`. After compilation and
+execution, access nodes by label as attributes:
+
+
+```python
+# Access results by label
+risk_table = pipeline.compute_risk.as_table()
+print(risk_table.to_pandas()[["patient_id", "risk"]])
+# patient_id risk
+# 0 p1 69.0
+# 1 p2 88.5
+# 2 p3 108.0
+```
+
+## Convenience Methods
+
+Streams and sources expose convenience methods for common operators, making pipeline
+construction more fluent:
+
+
+```python
+with pipeline:
+ joined = patients.join(labs, label="join_data")
+ selected = joined.select_packet_columns(["age"], label="select_age")
+ renamed = selected.map_packets({"age": "patient_age"}, label="rename")
+ risk_pod(renamed, label="compute")
+```
+
+Available convenience methods:
+
+| Method | Operator |
+|--------|----------|
+| `.join(other)` | `Join` |
+| `.semi_join(other)` | `SemiJoin` |
+| `.map_tags(mapping)` | `MapTags` |
+| `.map_packets(mapping)` | `MapPackets` |
+| `.select_tag_columns(cols)` | `SelectTagColumns` |
+| `.select_packet_columns(cols)` | `SelectPacketColumns` |
+| `.drop_tag_columns(cols)` | `DropTagColumns` |
+| `.drop_packet_columns(cols)` | `DropPacketColumns` |
+| `.batch(batch_size=N)` | `Batch` |
+| `.polars_filter(col="val")` | `PolarsFilter` |
+
+## Persistent Storage with Delta Lake
+
+For durable persistence, use `DeltaTableDatabase` instead of `InMemoryArrowDatabase`:
+
+
+```python
+from pathlib import Path
+from orcapod.databases import DeltaTableDatabase
+
+db = DeltaTableDatabase(base_path=Path("./my_pipeline_db"))
+pipeline = Pipeline(name="risk_pipeline", pipeline_database=db)
+
+with pipeline:
+ joined = patients.join(labs, label="join_data")
+ risk_pod(joined, label="compute_risk")
+
+pipeline.run()
+```
+
+Results are stored as Delta Lake tables on disk and survive across process restarts.
+
+## Incremental Computation
+
+When you re-run a pipeline with new data, only the new rows are computed:
+
+
+```python
+# First run: 3 patients
+pipeline.run() # computes all 3
+
+# Add a new patient to the source
+patients_v2 = ArrowTableSource(
+ pa.table({
+ "patient_id": pa.array(["p1", "p2", "p3", "p4"], type=pa.large_string()),
+ "age": pa.array([30, 45, 60, 25], type=pa.int64()),
+ }),
+ tag_columns=["patient_id"],
+)
+
+labs_v2 = ArrowTableSource(
+ pa.table({
+ "patient_id": pa.array(["p1", "p2", "p3", "p4"], type=pa.large_string()),
+ "cholesterol": pa.array([180, 220, 260, 200], type=pa.int64()),
+ }),
+ tag_columns=["patient_id"],
+)
+
+# Rebuild pipeline with updated sources
+pipeline2 = Pipeline(name="risk_pipeline", pipeline_database=db)
+with pipeline2:
+ joined = patients_v2.join(labs_v2, label="join_data")
+ risk_pod(joined, label="compute_risk")
+
+pipeline2.run() # only p4 is computed; p1-p3 come from cache
+```
+
+## Compiled Node Types
+
+When a pipeline compiles, each node is replaced with its persistent variant:
+
+| Original | Persistent Variant | Cache Scoping |
+|----------|-------------------|---------------|
+| Leaf stream | `PersistentSourceNode` | Content hash |
+| Operator call | `PersistentOperatorNode` | Content hash |
+| Function pod call | `PersistentFunctionNode` | Pipeline hash (schema + topology) |
+
+## Separate Function Database
+
+To isolate function pod result caches from the main pipeline database, provide a separate `function_database`:
+
+
+```python
+pipeline_db = DeltaTableDatabase(base_path=Path("./pipeline"))
+function_db = DeltaTableDatabase(base_path=Path("./functions"))
+
+pipeline = Pipeline(
+ name="risk_pipeline",
+ pipeline_database=pipeline_db,
+ function_database=function_db,
+)
+```
+
+## Next Steps
+
+- [User Guide: Pipelines](../user-guide/pipelines.md) — Advanced pipeline patterns
+ and composition
+- [User Guide: Caching & Persistence](../user-guide/caching.md) — Deep dive into
+ orcapod's three-tier caching strategy
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
new file mode 100644
index 0000000..b7b9e52
--- /dev/null
+++ b/docs/getting-started/installation.md
@@ -0,0 +1,83 @@
+# Installation
+
+## Requirements
+
+- Python 3.11 or later
+- [uv](https://docs.astral.sh/uv/) (recommended package manager)
+
+## Install from Source
+
+Clone the repository and install with `uv`:
+
+
+```bash
+git clone https://github.com/walkerlab/orcapod-python.git
+cd orcapod-python
+uv sync
+```
+
+Or install with pip:
+
+
+```bash
+pip install -e .
+```
+
+## Optional Dependencies
+
+orcapod has optional dependency groups for extended functionality:
+
+=== "Redis"
+
+ ```bash
+ pip install orcapod[redis]
+ ```
+
+ Enables Redis-backed caching.
+
+=== "Ray"
+
+ ```bash
+ pip install orcapod[ray]
+ ```
+
+ Enables distributed execution via Ray.
+
+=== "All"
+
+ ```bash
+ pip install orcapod[all]
+ ```
+
+ Installs all optional dependencies.
+
+## Development Setup
+
+For contributing to orcapod:
+
+
+```bash
+git clone https://github.com/walkerlab/orcapod-python.git
+cd orcapod-python
+uv sync --group dev
+```
+
+Verify your installation:
+
+
+```bash
+uv run pytest tests/ -x -q
+```
+
+## Core Dependencies
+
+orcapod builds on several key libraries:
+
+| Library | Purpose |
+|---------|---------|
+| [PyArrow](https://arrow.apache.org/docs/python/) | Columnar data representation and Arrow table backing |
+| [Polars](https://pola.rs/) | DataFrame filtering (used by `PolarsFilter` operator) |
+| [Delta Lake](https://delta.io/) | Persistent database storage via Delta tables |
+| [xxhash](https://github.com/Cyan4973/xxHash) | Fast content hashing |
+| [NetworkX](https://networkx.org/) | Pipeline graph compilation and topological sorting |
+| [Graphviz](https://graphviz.org/) | Pipeline visualization |
diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md
new file mode 100644
index 0000000..a1a3ca6
--- /dev/null
+++ b/docs/getting-started/quickstart.md
@@ -0,0 +1,168 @@
+# Quickstart
+
+This guide introduces orcapod's core concepts through a hands-on example. You'll create
+sources, join them, apply a computation, and inspect the results — all with automatic
+provenance tracking.
+
+## Creating Sources
+
+Sources are the entry points for data in orcapod. The simplest way to create one is from a
+Python dictionary:
+
+```python
+from orcapod import DictSource
+
+patients = DictSource(
+ data=[
+ {"patient_id": "p1", "age": 30},
+ {"patient_id": "p2", "age": 45},
+ {"patient_id": "p3", "age": 60},
+ ],
+ tag_columns=["patient_id"],
+)
+```
+
+The `tag_columns` parameter specifies which columns are **tags** (metadata used for joining
+and routing) versus **packets** (the data payload). Here, `patient_id` is a tag and `age`
+is a packet column.
+
+orcapod supports many source types:
+
+
+```python
+import pyarrow as pa
+from orcapod import ArrowTableSource, ListSource
+
+# From a PyArrow table
+arrow_src = ArrowTableSource(
+ pa.table({"id": ["a", "b"], "value": [1, 2]}),
+ tag_columns=["id"],
+)
+
+# From a list of objects with a tag function (see ListSource docs for details)
+# list_src = ListSource(name="images", data=[img1, img2], ...)
+```
+
+## Exploring Streams
+
+Every source produces a **stream** — an immutable sequence of (Tag, Packet) pairs:
+
+
+```python
+# Sources implement the stream protocol directly
+stream = patients
+
+# Check the schema
+tag_schema, packet_schema = stream.output_schema()
+print(f"Tags: {tag_schema}") # Schema({'patient_id': str})
+print(f"Packets: {packet_schema}") # Schema({'age': int})
+
+# Iterate over entries
+for tag, packet in stream.iter_packets():
+ print(f" {tag.as_dict()} → {packet.as_dict()}")
+```
+
+## Joining Streams
+
+Use the **Join** operator to combine streams on their shared tag columns:
+
+
+```python
+from orcapod.core.operators import Join
+
+labs = DictSource(
+ data=[
+ {"patient_id": "p1", "cholesterol": 180},
+ {"patient_id": "p2", "cholesterol": 220},
+ {"patient_id": "p3", "cholesterol": 260},
+ ],
+ tag_columns=["patient_id"],
+)
+
+joined = Join()(patients, labs)
+
+# The joined stream has both age and cholesterol as packet columns
+tag_schema, packet_schema = joined.output_schema()
+print(f"Packets: {packet_schema}") # Schema({'age': int, 'cholesterol': int})
+```
+
+## Applying Computations
+
+**Function pods** apply stateless computations to individual packets. Define a regular Python
+function and wrap it:
+
+
+```python
+from orcapod import FunctionPod
+from orcapod.core.packet_function import PythonPacketFunction
+
+def risk_score(age: int, cholesterol: int) -> float:
+ """Compute a simple risk score."""
+ return age * 0.5 + cholesterol * 0.3
+
+risk_fn = PythonPacketFunction(risk_score, output_keys="risk")
+risk_pod = FunctionPod(packet_function=risk_fn)
+
+# Apply the function pod to the joined stream
+result = risk_pod(joined)
+
+for tag, packet in result.iter_packets():
+ print(f" {tag.as_dict()} → {packet.as_dict()}")
+# {'patient_id': 'p1'} → {'risk': 69.0}
+# {'patient_id': 'p2'} → {'risk': 88.5}
+# {'patient_id': 'p3'} → {'risk': 108.0}
+```
+
+You can also use the decorator syntax:
+
+
+```python
+from orcapod import function_pod
+
+@function_pod(output_keys="risk")
+def compute_risk(age: int, cholesterol: int) -> float:
+ return age * 0.5 + cholesterol * 0.3
+
+result = compute_risk.pod(joined)
+```
+
+## Materializing Results
+
+Streams are lazy — data is only computed when you request it. Materialize a stream
+as a PyArrow table:
+
+
+```python
+table = result.as_table()
+print(table.to_pandas())
+# patient_id risk
+# 0 p1 69.0
+# 1 p2 88.5
+# 2 p3 108.0
+```
+
+## Inspecting Provenance
+
+Every value in orcapod is traceable. Use `ColumnConfig` to inspect provenance metadata:
+
+
+```python
+from orcapod.types import ColumnConfig
+
+# Include source-info columns
+table = result.as_table(columns=ColumnConfig(source=True))
+print(table.column_names)
+# [..., '_source_risk', ...]
+
+# Include system tags for full lineage
+table = result.as_table(columns=ColumnConfig(system_tags=True))
+print(table.column_names)
+# [..., '_tag::source_id::...', '_tag::record_id::...', ...]
+```
+
+## Next Steps
+
+- [Building Your First Pipeline](first-pipeline.md) — Learn how to orchestrate multi-step
+ pipelines with persistence and incremental computation.
+- [Concepts: Architecture Overview](../concepts/architecture.md) — Understand the design
+ principles behind orcapod.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..fa87034
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,111 @@
+# orcapod
+
+**An intuitive and powerful library for highly reproducible scientific data pipelines.**
+
+orcapod is a Python framework for building data pipelines with built-in provenance tracking,
+content-addressable caching, and deterministic computation. Every value produced by an orcapod
+pipeline is traceable back to its original source, every computation is memoizable, and every
+result is verifiable.
+
+---
+
+## Key Features
+
+- **Full Provenance Tracking** — Every value carries metadata tracing it back to its originating source and record. Operator topology is captured in system tags, forming a complete lineage chain.
+
+- **Content-Addressable Caching** — Computations are identified by their content hash. Identical computations are never repeated, and results are automatically shared across compatible pipeline runs.
+
+- **Immutable Data Flow** — Streams are immutable sequences of (Tag, Packet) pairs backed by Apache Arrow tables. Data flows forward through the pipeline without side effects.
+
+- **Strict Operator / Function Pod Boundary** — Operators transform structure (joins, filters, renames) without inspecting packet data. Function pods transform data without inspecting tags. This separation keeps provenance clean and reasoning simple.
+
+- **Schema as a First-Class Citizen** — Every stream is self-describing. Schemas are predicted at construction time, not discovered at runtime, enabling early validation and deterministic system tag naming.
+
+- **Incremental Computation** — Database-backed nodes compute only what's missing. Add new data to a source and re-run: only the new rows are processed.
+
+- **Pluggable Execution** — Run pipelines synchronously for debugging or with async push-based channels for production. Swap in a Ray executor for distributed computation. Results are identical regardless of execution strategy.
+
+---
+
+## How It Works
+
+```
+Source → Stream → [Operator / FunctionPod] → Stream → ...
+```
+
+1. **Sources** load data from external systems (CSV, Delta Lake, DataFrames, dicts) and annotate each row with provenance metadata.
+
+2. **Streams** carry the data as immutable (Tag, Packet) pairs over a shared schema.
+
+3. **Operators** reshape streams — join, filter, batch, select, rename — without creating new values.
+
+4. **Function Pods** apply packet functions that transform individual packets, producing new computed values with tracked provenance.
+
+5. **Pipelines** orchestrate the full graph, automatically persisting results and enabling incremental re-computation.
+
+---
+
+## Quick Example
+
+```python
+import pyarrow as pa
+from orcapod import ArrowTableSource, FunctionPod
+from orcapod.core.packet_function import PythonPacketFunction
+from orcapod.core.operators import Join
+
+# Create sources
+patients = ArrowTableSource(
+ pa.table({
+ "patient_id": ["p1", "p2", "p3"],
+ "age": [30, 45, 60],
+ }),
+ tag_columns=["patient_id"],
+)
+
+labs = ArrowTableSource(
+ pa.table({
+ "patient_id": ["p1", "p2", "p3"],
+ "cholesterol": [180, 220, 260],
+ }),
+ tag_columns=["patient_id"],
+)
+
+# Join sources on shared tag columns
+joined = Join()(patients, labs)
+
+# Define and apply a packet function
+def risk_score(age: int, cholesterol: int) -> float:
+ return age * 0.5 + cholesterol * 0.3
+
+risk_fn = PythonPacketFunction(risk_score, output_keys="risk")
+risk_pod = FunctionPod(packet_function=risk_fn)
+result = risk_pod(joined)
+
+# Iterate over results
+for tag, packet in result.iter_packets():
+ print(f"{tag.as_dict()} → risk={packet.as_dict()['risk']}")
+```
+
+---
+
+## Next Steps
+
+