diff --git a/.codex b/.codex
new file mode 100644
index 0000000..e69de29
diff --git a/README.md b/README.md
index f9ca4a4..49d082f 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ A professional Python API framework for building agent-based applications with F
 - **[Configuration Guide](./docs/configuration.md)** - All configuration options explained
 - **[Deployment Guide](./docs/deployment.md)** - Docker, Kubernetes, and cloud deployment
 - **[Authentication Guide](./docs/authentication.md)** - JWT and custom authentication
+- **[Rate Limiting Guide](./docs/rate-limiting.md)** - Memory, Redis, and custom rate-limit backends
 - **[ID Generation Guide](./docs/id-generation.md)** - Snowflake ID generation
 - **[Thread Name Generator Guide](./docs/thread-name-generator.md)** - Thread naming strategies
 
@@ -19,6 +20,21 @@ A professional Python API framework for building agent-based applications with F
 pip install 10xscale-agentflow-cli
 ```
 
+Redis rate limiting is optional. Install the Redis extra only when you configure
+`rate_limit.backend` as `redis`:
+
+```bash
+pip install "10xscale-agentflow-cli[redis]"
+```
+
+JWT auth and document text extraction are optional too. Install only the extra
+you need:
+
+```bash
+pip install "10xscale-agentflow-cli[jwt]"
+pip install "10xscale-agentflow-cli[media]"
+```
+
 ### Initialize a New Project
 
 ```bash
@@ -53,6 +69,7 @@ agentflow build --docker-compose
 - ✅ **State Graph Orchestration** - Build complex agent workflows with LangGraph
 - ✅ **FastAPI Backend** - High-performance async web framework
 - ✅ **Authentication** - Built-in JWT auth and custom authentication support
+- ✅ **Rate Limiting** - Sliding-window limits with memory, Redis, and custom backends
 - ✅ **ID Generation** - Distributed Snowflake ID generation
 - ✅ **Thread Management** - Intelligent thread naming and conversation management
 - ✅ **Docker Ready** - Generate production-ready Docker files
@@ -174,6 +191,7 @@ The configuration file (`agentflow.json`) defines your agent, authentication, an
 | `injectq` | string\|null | Path to InjectQ container |
 | `store` | string\|null | Path to data store |
 | `redis` | string\|null | Redis connection URL |
+| `rate_limit` | object\|null | Sliding-window rate limiting configuration |
 | `thread_name_generator` | string\|null | Path to custom thread name generator |
 
 See the [Configuration Guide](./docs/configuration.md) for complete details.
@@ -544,4 +562,3 @@ Developed by [10xScale](https://10xscale.ai) and maintained by the community.
 ---
 
 **Made with ❤️ for the AI agent development community**
-
diff --git a/agentflow.json b/agentflow.json
index 82df8be..36732de 100644
--- a/agentflow.json
+++ b/agentflow.json
@@ -2,5 +2,14 @@
   "agent": "graph.react:app",
   "thread_name_generator": "graph.thread_name_generator:MyNameGenerator",
   "env": ".env",
-  "auth": null
+  "auth": null,
+  "rate_limit": {
+    "enabled": true,
+    "backend": "memory",
+    "requests": 100,
+    "window": 60,
+    "by": "ip",
+    "trusted_proxy_headers": false,
+    "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"]
+  }
 }
diff --git a/agentflow_cli/cli/templates/skills/agent-skills/SKILL.md b/agentflow_cli/cli/templates/skills/agent-skills/SKILL.md
index 0d73205..404b0b2 100644
--- a/agentflow_cli/cli/templates/skills/agent-skills/SKILL.md
+++ b/agentflow_cli/cli/templates/skills/agent-skills/SKILL.md
@@ -26,6 +26,7 @@ metadata:
     - references/api-configuration.md
     - references/auth-and-authorization.md
     - references/api-settings-and-middleware.md
+    - references/rate-limiting.md
     - references/rest-api-and-errors.md
     - references/id-and-thread-name-generators.md
     - references/client-auth-and-errors.md
@@ -78,6 +79,7 @@ Treat `agentflow-docs/docs` as the first source of truth for public package name
    - `agentflow.json` and dependency loading: `references/api-configuration.md`
    - API auth and authorization: `references/auth-and-authorization.md`
    - API environment, settings, and middleware: `references/api-settings-and-middleware.md`
+   - Rate limiting (config, backends, headers, custom backend): `references/rate-limiting.md`
    - REST routes and error behavior: `references/rest-api-and-errors.md`
    - API Snowflake IDs and thread naming: `references/id-and-thread-name-generators.md`
    - TypeScript auth helpers and structured errors: `references/client-auth-and-errors.md`
diff --git a/agentflow_cli/cli/templates/skills/agent-skills/references/api-configuration.md b/agentflow_cli/cli/templates/skills/agent-skills/references/api-configuration.md
index 072f061..a30c6e1 100644
--- a/agentflow_cli/cli/templates/skills/agent-skills/references/api-configuration.md
+++ b/agentflow_cli/cli/templates/skills/agent-skills/references/api-configuration.md
@@ -23,7 +23,15 @@ Common full shape:
   "thread_name_generator": "graph.thread_name_generator:MyNameGenerator",
   "authorization": "graph.auth:my_authorization_backend",
   "env": ".env",
-  "auth": "jwt"
+  "auth": "jwt",
+  "rate_limit": {
+    "enabled": true,
+    "backend": "memory",
+    "requests": 100,
+    "window": 60,
+    "by": "ip",
+    "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"]
+  }
 }
 ```
 
@@ -37,6 +45,7 @@ Common full shape:
 - `authorization`: optional import path to an authorization backend.
 - `env`: optional `.env` path loaded before graph import.
 - `auth`: `null`, `"jwt"`, or `{"method": "custom", "path": "module:backend"}`.
+- `rate_limit`: optional sliding-window rate limiter config object; omit or set to `null` to disable. See `references/rate-limiting.md` for the full field reference.
 
 ## Loading Order
 
@@ -62,3 +71,4 @@ Common full shape:
 - App startup: `agentflow-api/agentflow_cli/src/app/main.py`
 - Main docs: `agentflow-docs/docs/reference/api-cli/configuration.md`
 - How-to: `agentflow-docs/docs/how-to/api-cli/configure-agentflow-json.md`
+- Rate limit config details: `references/rate-limiting.md`
diff --git a/agentflow_cli/cli/templates/skills/agent-skills/references/api-settings-and-middleware.md b/agentflow_cli/cli/templates/skills/agent-skills/references/api-settings-and-middleware.md
index b151a70..e88c7fd 100644
--- a/agentflow_cli/cli/templates/skills/agent-skills/references/api-settings-and-middleware.md
+++ b/agentflow_cli/cli/templates/skills/agent-skills/references/api-settings-and-middleware.md
@@ -48,6 +48,11 @@ Active middleware areas:
 - Request ID assignment.
 - Selective gzip behavior; streaming endpoints should avoid gzip buffering when configured.
 - Worker middleware where used by deployment.
+- Rate limiting: sliding-window limiter controlled by the `rate_limit` block in `agentflow.json`.
+  Uses an in-process `memory` backend by default; use the `redis` backend (requires
+  `pip install "10xscale-agentflow-cli[redis]"`) for multi-worker or multi-instance deployments.
+  Excluded paths, identity mode (`ip` or `global`), and `fail_open` behavior are all configurable.
+  See `references/rate-limiting.md` for the full option reference.
 
 ## Production Warnings
 
@@ -76,7 +81,10 @@ Production mode warns about unsafe defaults such as:
 - Middleware setup: `agentflow-api/agentflow_cli/src/app/core/config/setup_middleware.py`
 - Request limits: `agentflow-api/agentflow_cli/src/app/core/middleware/request_limits.py`
 - Security headers: `agentflow-api/agentflow_cli/src/app/core/middleware/security_headers.py`
+- Rate limit middleware: `agentflow-api/agentflow_cli/src/app/core/middleware/rate_limit/`
+- Rate limit base class: `agentflow-api/agentflow_cli/src/app/core/middleware/rate_limit/base.py`
 - Sentry: `agentflow-api/agentflow_cli/src/app/core/config/sentry_config.py`
 - Log sanitizer: `agentflow-api/agentflow_cli/src/app/core/utils/log_sanitizer.py`
 - Main docs: `agentflow-docs/docs/reference/api-cli/environment.md`
 - Production docs: `agentflow-docs/docs/how-to/production/environment-variables.md`
+- Rate limiting docs: `agentflow-api/docs/rate-limiting.md`
diff --git a/agentflow_cli/cli/templates/skills/agent-skills/references/callbacks-and-command.md b/agentflow_cli/cli/templates/skills/agent-skills/references/callbacks-and-command.md
index 94ad1ee..5cc0347 100644
--- a/agentflow_cli/cli/templates/skills/agent-skills/references/callbacks-and-command.md
+++ b/agentflow_cli/cli/templates/skills/agent-skills/references/callbacks-and-command.md
@@ -43,6 +43,166 @@ app = graph.compile(callback_manager=callback_manager)
 
 Use validators for safety checks, business rules, input policy, and prompt-injection detection.
 
+## Graph Lifecycle Hooks
+
+Graph lifecycle hooks fire at **graph orchestration level** — before/after the entire graph run, on checkpoints, on interrupts, on resume, and after each node transition. They complement the invocation-level hooks above (`before_invoke` / `after_invoke` / `on_error`), which fire inside a single node's AI/Tool/MCP call.
+
+### GraphLifecycleContext
+
+All 7 hooks receive this as their first argument. It provides run-identifying metadata.
+
+```python
+@dataclass
+class GraphLifecycleContext:
+    config: dict[str, Any]       # full config dict passed to invoke/stream
+    timestamp: str               # ISO8601 start time from config["timestamp"]
+    metadata: dict[str, Any] | None = None  # open-ended extra context
+
+    @property
+    def thread_id(self) -> str | None: ...
+
+    @property
+    def run_id(self) -> str | None: ...
+```
+
+### GraphLifecycleHook
+
+Subclass `GraphLifecycleHook` and override only the methods you need. All methods are async and default to no-ops.
+
+```python
+from agentflow.utils.callbacks import GraphLifecycleHook, GraphLifecycleContext, CallbackManager
+from agentflow.core.state import AgentState
+from agentflow.core.state.message import Message
+
+class GraphLifecycleHook(ABC):
+    async def on_graph_start(
+        self, context: GraphLifecycleContext, state: AgentState
+    ) -> AgentState | None: ...
+    # Fires: after state is loaded, before the first node executes.
+    # Return modified AgentState to replace the initial state, or None.
+
+    async def on_graph_end(
+        self, context: GraphLifecycleContext, final_state: AgentState,
+        messages: list[Message], total_steps: int
+    ) -> AgentState | None: ...
+    # Fires: after execution loop completes, before final persistence.
+    # Return modified AgentState or None.
+
+    async def on_graph_error(
+        self, context: GraphLifecycleContext, error: Exception,
+        partial_state: AgentState, messages: list[Message], step: int, node_name: str
+    ) -> tuple[AgentState, str] | None: ...
+    # Fires: when an unhandled exception escapes the graph loop.
+    # Return (AgentState, error_message) to change persisted error snapshot, or None.
+    # Cannot suppress the error — always re-raised after this hook.
+
+    async def on_interrupt(
+        self, context: GraphLifecycleContext, interrupted_node: str,
+        interrupt_type: str,   # "before" | "after" | "stop" | "remote_tool"
+        state: AgentState
+    ) -> AgentState | None: ...
+    # Fires: before interrupt state is persisted (covers before/after/stop/remote_tool types).
+    # Return modified AgentState or None.
+
+    async def on_resume(
+        self, context: GraphLifecycleContext, resumed_node: str,
+        state: AgentState, resume_data: dict[str, Any]
+    ) -> AgentState | None: ...
+    # Fires: when a paused graph is resumed, before clear_interrupt().
+    # resume_data is mutable in-place. Return modified AgentState or None.
+
+    async def on_checkpoint(
+        self, context: GraphLifecycleContext, state: AgentState,
+        messages: list[Message], is_context_trimmed: bool
+    ) -> tuple[AgentState, list[Message]] | AgentState | None: ...
+    # Fires: immediately before state/messages are persisted (every checkpoint, not just final).
+    # Return (AgentState, messages), AgentState, or None.
+
+    async def on_state_update(
+        self, context: GraphLifecycleContext, node_name: str,
+        old_state: AgentState, new_state: AgentState, step: int
+    ) -> AgentState | None: ...
+    # Fires: after each node produces a result and state is merged.
+    # Return modified AgentState or None.
+```
+
+### Registration
+
+```python
+callback_mgr = CallbackManager()
+callback_mgr.register_lifecycle_hook(MyHook())
+
+# Combine with existing invocation-level hooks on the same manager:
+callback_mgr.register_after_invoke(InvocationType.AI, my_ai_callback)
+
+app = graph.compile(callback_manager=callback_mgr)
+```
+
+### Hook Summary
+
+| Hook | Returns | Fires N times | Fire location |
+|---|---|---|---|
+| `on_graph_start` | `AgentState \| None` | 1 per run | After state load, before loop |
+| `on_graph_end` | `AgentState \| None` | 1 per successful run | After `state.complete()`, before final `sync_data()` |
+| `on_graph_error` | `tuple[AgentState, str] \| None` | 1 per failed run | In except block, before error `sync_data()` |
+| `on_interrupt` | `AgentState \| None` | 0–N per run | Before interrupt checkpoint persistence |
+| `on_resume` | `AgentState \| None` | 0–1 per call | Before `clear_interrupt()` |
+| `on_checkpoint` | `(AgentState, list[Message]) \| AgentState \| None` | 1–N per run | Before every durable checkpoint write |
+| `on_state_update` | `AgentState \| None` | N per run (once per node) | After each node result is merged |
+
+### Example
+
+```python
+class ObservabilityHook(GraphLifecycleHook):
+    async def on_graph_start(self, ctx, state):
+        self._span = tracer.start_span(f"graph.run.{ctx.thread_id}")
+        return None
+
+    async def on_graph_end(self, ctx, final_state, messages, total_steps):
+        self._span.set_attribute("steps", total_steps)
+        self._span.end()
+
+    async def on_graph_error(self, ctx, error, partial_state, messages, step, node_name):
+        self._span.record_exception(error)
+        self._span.end()
+        alert_oncall(f"Graph failed at node {node_name}: {error}")
+        return None
+
+    async def on_interrupt(self, ctx, interrupted_node, interrupt_type, state):
+        notify_frontend(ctx.thread_id, status="waiting_for_input", node=interrupted_node)
+        return None
+
+    async def on_resume(self, ctx, resumed_node, state, resume_data):
+        notify_frontend(ctx.thread_id, status="resuming", node=resumed_node)
+        return None
+
+    async def on_checkpoint(self, ctx, state, messages, is_context_trimmed):
+        metrics.increment("agentflow.checkpoints", tags={"thread": ctx.thread_id})
+        return None
+
+    async def on_state_update(self, ctx, node_name, old_state, new_state, step):
+        diff = compute_diff(old_state, new_state)
+        stream_diff_to_frontend(ctx.thread_id, diff)
+        return None
+
+
+callback_mgr = CallbackManager()
+callback_mgr.register_lifecycle_hook(ObservabilityHook())
+app = graph.compile(callback_manager=callback_mgr)
+```
+
+### Common Use Cases by Hook
+
+- **`on_graph_start`**: inject trace IDs, pre-populate state from external DB, set rate-limit budgets, initialize OpenTelemetry spans.
+- **`on_graph_end`**: send completion notifications (Slack/email), record step/message count metrics, archive transcripts, trigger downstream webhooks.
+- **`on_graph_error`**: alert PagerDuty/Sentry, log structured failure diagnostics, close OTel spans with error status.
+- **`on_interrupt`**: push "waiting for approval" notifications to frontend/mobile, start timeout timers, update task queue status.
+- **`on_resume`**: cancel timeout timers, validate resume payload, record interrupt→resume cycle for audit trail.
+- **`on_checkpoint`**: redact sensitive data before persistence, replicate to secondary store, invalidate caches, compliance audit logging (SOC2/HIPAA).
+- **`on_state_update`**: real-time state diffing for frontend streaming, per-node invariant assertions, security scanning of state content.
+
+---
+
 ## Command
 
 `Command` lets a node combine state/message updates with control flow. Use it when the next node depends on runtime logic inside the node and is awkward to express as a static conditional edge.
@@ -94,9 +254,13 @@ def router_node(state, config):
 
 ## Source Map
 
-- Callback system: `agentflow/agentflow/utils/callbacks.py`
+- Callback system (invocation hooks + graph lifecycle hooks): `agentflow/agentflow/utils/callbacks.py`
 - Default validators: `agentflow/agentflow/utils/validators.py`
 - Graph compile callback argument: `agentflow/agentflow/core/graph/state_graph.py`
 - Command API: `agentflow/agentflow/utils/command.py`
 - Command execution paths: `agentflow/agentflow/core/graph/compiled_graph.py`
+- Lifecycle hook fire points — invoke path: `agentflow/agentflow/core/graph/utils/invoke_handler.py`
+- Lifecycle hook fire points — stream path: `agentflow/agentflow/core/graph/utils/stream_handler.py`
+- Lifecycle hook fire points — interrupt/resume: `agentflow/agentflow/core/graph/utils/heandler_utils.py`
+- Lifecycle hook fire points — checkpoint: `agentflow/agentflow/core/graph/utils/utils.py`
 - Legacy docs: `agentflow-docs/docs-mkdocs-legacy/reference/library/Command.md`
diff --git a/agentflow_cli/cli/templates/skills/agent-skills/references/rate-limiting.md b/agentflow_cli/cli/templates/skills/agent-skills/references/rate-limiting.md
new file mode 100644
index 0000000..7db1a51
--- /dev/null
+++ b/agentflow_cli/cli/templates/skills/agent-skills/references/rate-limiting.md
@@ -0,0 +1,194 @@
+# Rate Limiting
+
+Use this when adding, configuring, or debugging AgentFlow's built-in sliding-window rate limiter.
+
+## Overview
+
+AgentFlow provides a sliding-window rate limiter configured via the `rate_limit` block in
+`agentflow.json`. The limiter is disabled by default — add the block to activate it.
+
+## Quick Start
+
+In-memory backend for local development or single-process services:
+
+```json
+{
+  "agent": "graph.react:app",
+  "rate_limit": {
+    "enabled": true,
+    "backend": "memory",
+    "requests": 100,
+    "window": 60,
+    "by": "ip",
+    "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"]
+  }
+}
+```
+
+Each client IP may make `requests` calls every `window` seconds.
+
+## Redis Backend (Production)
+
+Redis stores counters centrally so the limit is enforced across multiple workers,
+containers, or servers. Install the optional extra first:
+
+```bash
+pip install "10xscale-agentflow-cli[redis]"
+```
+
+```json
+{
+  "agent": "graph.react:app",
+  "rate_limit": {
+    "enabled": true,
+    "backend": "redis",
+    "requests": 1000,
+    "window": 60,
+    "by": "ip",
+    "trusted_proxy_headers": true,
+    "exclude_paths": ["/health", "/metrics", "/docs", "/redoc", "/openapi.json"],
+    "redis": {
+      "url": "${RATE_LIMIT_REDIS_URL}",
+      "prefix": "agentflow:rate-limit"
+    },
+    "fail_open": true
+  }
+}
+```
+
+Set the environment variable:
+
+```bash
+RATE_LIMIT_REDIS_URL=redis://localhost:6379/0
+```
+
+The Redis backend uses an atomic Lua script with sorted sets — check and record happen as a
+single Redis operation, which prevents concurrent requests from racing past the limit.
+
+## Configuration Reference
+
+| Field | Type | Default | Description |
+| --- | --- | --- | --- |
+| `enabled` | boolean | `true` | Enables the middleware when the `rate_limit` block exists. |
+| `backend` | string | `"memory"` | `memory`, `redis`, or `custom`. |
+| `requests` | integer | `100` | Maximum requests allowed in each window. |
+| `window` | integer | `60` | Window size in seconds. |
+| `by` | string | `"ip"` | Limit by client IP (`"ip"`) or one shared quota (`"global"`). |
+| `exclude_paths` | string array | `[]` | Paths that bypass rate limiting entirely. |
+| `trusted_proxy_headers` | boolean | `false` | Use `X-Forwarded-For` as the client IP (only behind a trusted proxy). |
+| `redis.url` | string | `null` | Redis URL; required for the Redis backend. Supports `${ENV_VAR}` expansion. |
+| `redis.prefix` | string | `"agentflow:rate-limit"` | Prefix for all Redis keys. |
+| `fail_open` | boolean | `true` | On Redis errors, allow (`true`) or deny (`false`) requests. |
+
+## Identity Modes
+
+Per-IP limit (most public APIs):
+
+```json
+{ "rate_limit": { "requests": 100, "window": 60, "by": "ip" } }
+```
+
+Global limit (one shared quota for the whole service):
+
+```json
+{ "rate_limit": { "requests": 5000, "window": 60, "by": "global" } }
+```
+
+Only enable `trusted_proxy_headers` when your app sits behind a trusted proxy that strips
+untrusted `X-Forwarded-For` headers from direct clients.
+
+## Response Headers
+
+Every response includes:
+
+| Header | Description |
+| --- | --- |
+| `X-RateLimit-Limit` | Configured request limit. |
+| `X-RateLimit-Remaining` | Requests remaining in the current window. |
+| `X-RateLimit-Reset` | Unix timestamp for the window reset estimate. |
+| `X-RateLimit-Reset-After` | Seconds until the window reset estimate. |
+| `Retry-After` | Present on `429` responses only. |
+
+When the limit is exceeded, AgentFlow returns `429 Too Many Requests`:
+
+```json
+{
+  "error": {
+    "code": "RATE_LIMIT_EXCEEDED",
+    "message": "Too many requests. Limit: 100 per 60s. Retry after 12s.",
+    "limit": 100,
+    "window_seconds": 60,
+    "retry_after_seconds": 12
+  },
+  "metadata": {
+    "request_id": "request-id",
+    "status": "error"
+  }
+}
+```
+
+## Custom Backend
+
+Implement `BaseRateLimitBackend` and bind the instance through InjectQ, then set
+`"backend": "custom"` in `agentflow.json`.
+
+```python
+from agentflow_cli.src.app.core.middleware.rate_limit import (
+    BaseRateLimitBackend,
+    RateLimitDecision,
+)
+
+
+class MyRateLimitBackend(BaseRateLimitBackend):
+    async def check(self, key: str, *, limit: int, window: int) -> RateLimitDecision:
+        allowed = True
+        remaining = limit - 1
+        reset_after = window
+        return RateLimitDecision(
+            allowed=allowed,
+            remaining=remaining,
+            reset_after=reset_after,
+        )
+
+    async def close(self) -> None:
+        return None
+```
+
+```json
+{
+  "rate_limit": {
+    "enabled": true,
+    "backend": "custom",
+    "requests": 100,
+    "window": 60,
+    "by": "ip"
+  }
+}
+```
+
+## Choosing a Backend
+
+| Scenario | Backend |
+| --- | --- |
+| Local development, tests, demos | `memory` |
+| Single-process production | `memory` |
+| Gunicorn/Uvicorn with multiple workers | `redis` |
+| Docker / Kubernetes (multiple replicas) | `redis` |
+| Custom storage or quotas | `custom` |
+
+## Rules
+
+- Do not enable `trusted_proxy_headers` unless a reverse proxy strips untrusted forwarding headers.
+- Always add health-check and observability paths to `exclude_paths`.
+- In production with Redis set `fail_open: false` only when hard enforcement is required; otherwise `true` prevents availability issues during Redis outages.
+- The `redis` backend requires the `redis` extra: `pip install "10xscale-agentflow-cli[redis]"`.
+- The `RATE_LIMIT_REDIS_URL` value supports `${ENV_VAR}` expansion — keep secrets out of committed config.
+
+## Source Map
+
+- Middleware: `agentflow-api/agentflow_cli/src/app/core/middleware/rate_limit/`
+- Base class: `agentflow-api/agentflow_cli/src/app/core/middleware/rate_limit/base.py`
+- Rate-limit config model: `agentflow-api/agentflow_cli/src/app/core/config/graph_config.py`
+- Middleware setup: `agentflow-api/agentflow_cli/src/app/core/config/setup_middleware.py`
+- Docs: `agentflow-api/docs/rate-limiting.md`
+- Configuration reference: `agentflow-api/docs/configuration.md` — "Rate Limiting" section
diff --git a/agentflow_cli/cli/templates/skills/copilot/agentflow.instructions.md b/agentflow_cli/cli/templates/skills/copilot/agentflow.instructions.md
index a863ea5..d0a2399 100644
--- a/agentflow_cli/cli/templates/skills/copilot/agentflow.instructions.md
+++ b/agentflow_cli/cli/templates/skills/copilot/agentflow.instructions.md
@@ -32,6 +32,7 @@ Never use repository folder names (e.g. `agentflow-cli`) in install commands or
 - Tools need docstrings and type annotations so model-facing schemas are useful.
 - Injectable parameters (`state`, `config`, `tool_call_id`) are hidden from the model schema.
 - For production, avoid process-local storage for shared state — use durable checkpointer/store backends.
+- Add observability, audit, or business-logic side effects by registering a `GraphLifecycleHook` on `CallbackManager` — do not wrap `ainvoke()` / `astream()` calls in application code to achieve the same result.
 
 ## Where to look when you need more detail
 
@@ -44,7 +45,9 @@ For deeper context on any subsystem, read the matching reference under `.github/
 - Multimodal media, long-term memory stores
 - Streaming, SSE, runtime publishers, A2A/ACP protocols
 - API server, REST routes, auth, errors, settings, middleware
+- Rate limiting: sliding-window config, memory/Redis/custom backends, response headers, 429 behavior: `references/rate-limiting.md`
 - TypeScript client: invoke, stream, threads, memory, files, A2UI
+- Observability, validators, graph lifecycle hooks (`GraphLifecycleHook`), and runtime jumps (`Command`): `references/callbacks-and-command.md`
 
 ## Verifying behavior
 
diff --git a/agentflow_cli/src/app/core/auth/jwt_auth.py b/agentflow_cli/src/app/core/auth/jwt_auth.py
index a49face..b58b928 100644
--- a/agentflow_cli/src/app/core/auth/jwt_auth.py
+++ b/agentflow_cli/src/app/core/auth/jwt_auth.py
@@ -1,6 +1,5 @@
 from typing import Any
 
-import jwt
 from fastapi import Request, Response
 from fastapi.security import HTTPAuthorizationCredentials
 
@@ -10,6 +9,12 @@
 from agentflow_cli.src.app.core.exceptions import UserAccountError
 
 
+try:
+    import jwt
+except ImportError:  # pragma: no cover
+    jwt = None  # type: ignore[assignment]
+
+
 class JwtAuth(BaseAuth):
     def authenticate(
         self,
@@ -51,6 +56,12 @@ def authenticate(
 
         token = credential.credentials
 
+        if jwt is None:
+            raise ImportError(
+                "PyJWT is required for JWT authentication. "
+                'Install with `pip install "10xscale-agentflow-cli[jwt]"`'
+            )
+
         if jwt_secret_key is None or jwt_algorithm is None:
             raise UserAccountError(
                 message="JWT settings are not configured",
diff --git a/agentflow_cli/src/app/core/config/graph_config.py b/agentflow_cli/src/app/core/config/graph_config.py
index a4d0e6d..344278d 100644
--- a/agentflow_cli/src/app/core/config/graph_config.py
+++ b/agentflow_cli/src/app/core/config/graph_config.py
@@ -1,10 +1,147 @@
 import json
 import os
+from dataclasses import dataclass
 from pathlib import Path
 
 from dotenv import load_dotenv
 
 
+def _parse_bool(value: object, *, field: str) -> bool:
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, str):
+        normalized = value.strip().lower()
+        if normalized in {"1", "true", "yes", "on"}:
+            return True
+        if normalized in {"0", "false", "no", "off"}:
+            return False
+    raise ValueError(f"{field} must be a boolean")
+
+
+def _expand_env(value: str | None) -> str | None:
+    if value is None:
+        return None
+    expanded = os.path.expandvars(value)
+    if expanded == value and (value.startswith("$") or "${" in value):
+        raise ValueError(f"Unresolved environment variable in value: {value}")
+    return expanded
+
+
+@dataclass
+class RateLimitConfig:
+    """Rate limit configuration parsed from agentflow.json.
+
+    Example (memory backend, default)::
+
+        "rate_limit": {
+            "enabled": true,
+            "backend": "memory",
+            "requests": 100,
+            "window": 60,
+            "by": "ip",
+            "trusted_proxy_headers": false,
+            "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"]
+        }
+
+    Example (Redis backend)::
+
+        "rate_limit": {
+            "enabled": true,
+            "backend": "redis",
+            "requests": 100,
+            "window": 60,
+            "by": "ip",
+            "trusted_proxy_headers": true,
+            "exclude_paths": ["/health"],
+            "redis": {
+                "url": "redis://localhost:6379/0",
+                "prefix": "agentflow:rate-limit"
+            },
+            "fail_open": true
+        }
+
+    Example (custom backend)::
+
+        "rate_limit": {
+            "enabled": true,
+            "backend": "custom",
+            "requests": 100,
+            "window": 60,
+            "by": "ip"
+        }
+
+    For custom backends, bind a ``BaseRateLimitBackend`` instance in InjectQ.
+    """
+
+    enabled: bool
+    requests: int
+    window: int
+    by: str  # "ip" | "global"
+    backend: str  # "memory" | "redis" | "custom"
+    redis_url: str | None
+    redis_prefix: str
+    exclude_paths: tuple[str, ...]
+    trusted_proxy_headers: bool  # honour X-Forwarded-For only when True
+    fail_open: bool  # on backend error: True=allow, False=deny
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "RateLimitConfig":
+        if not isinstance(data, dict):
+            raise ValueError("rate_limit must be an object")
+
+        enabled = _parse_bool(data.get("enabled", True), field="rate_limit.enabled")
+        requests = int(data.get("requests", 100))
+        window = int(data.get("window", 60))
+        by = data.get("by", "ip")
+        backend = data.get("backend", "memory")
+        trusted_proxy_headers = _parse_bool(
+            data.get("trusted_proxy_headers", False),
+            field="rate_limit.trusted_proxy_headers",
+        )
+        exclude_paths_raw = data.get("exclude_paths", [])
+        if not isinstance(exclude_paths_raw, list | tuple):
+            raise ValueError("rate_limit.exclude_paths must be a list of paths")
+        exclude_paths = tuple(str(path) for path in exclude_paths_raw)
+        fail_open = _parse_bool(data.get("fail_open", True), field="rate_limit.fail_open")
+
+        # Redis sub-object: {"url": "...", "prefix": "..."}
+        redis_obj = data.get("redis") or {}
+        if isinstance(redis_obj, str):
+            # Allow shorthand: "redis": "redis://..."
+            redis_url: str | None = _expand_env(redis_obj)
+            redis_prefix = "agentflow:rate-limit"
+        elif isinstance(redis_obj, dict):
+            redis_url = _expand_env(redis_obj.get("url") or None)
+            redis_prefix = str(redis_obj.get("prefix", "agentflow:rate-limit"))
+        else:
+            raise ValueError("rate_limit.redis must be an object or Redis URL string")
+
+        # Validation
+        if by not in ("ip", "global"):
+            raise ValueError(f"rate_limit.by must be 'ip' or 'global', got '{by}'")
+        if backend not in ("memory", "redis", "custom"):
+            raise ValueError(
+                f"rate_limit.backend must be 'memory', 'redis', or 'custom', got '{backend}'"
+            )
+        if requests <= 0:
+            raise ValueError("rate_limit.requests must be a positive integer")
+        if window <= 0:
+            raise ValueError("rate_limit.window must be a positive integer")
+
+        return cls(
+            enabled=enabled,
+            requests=requests,
+            window=window,
+            by=by,
+            backend=backend,
+            redis_url=redis_url,
+            redis_prefix=redis_prefix,
+            exclude_paths=exclude_paths,
+            trusted_proxy_headers=trusted_proxy_headers,
+            fail_open=fail_open,
+        )
+
+
 class GraphConfig:
     def __init__(self, path: str = "agentflow.json"):
         with Path(path).open() as f:
@@ -84,3 +221,28 @@ def auth_config(self) -> dict | None:
                 }
 
         raise ValueError(f"Unsupported auth method: {res}")
+
+    @property
+    def rate_limit(self) -> RateLimitConfig | None:
+        """
+        Get rate limit configuration from agentflow.json.
+
+        Returns:
+            RateLimitConfig if 'rate_limit' key is present and enabled, else None.
+
+        Example agentflow.json entry::
+
+            "rate_limit": {
+                "enabled": true,
+                "requests": 100,
+                "window": 60,
+                "by": "ip"
+            }
+        """
+        data = self.data.get("rate_limit", None)
+        if data is None:
+            return None
+        config = RateLimitConfig.from_dict(data)
+        if not config.enabled:
+            return None
+        return config
diff --git a/agentflow_cli/src/app/core/config/setup_middleware.py b/agentflow_cli/src/app/core/config/setup_middleware.py
index cf3c03f..b1da167 100644
--- a/agentflow_cli/src/app/core/config/setup_middleware.py
+++ b/agentflow_cli/src/app/core/config/setup_middleware.py
@@ -4,14 +4,17 @@
 from fastapi import FastAPI
 from fastapi.middleware.gzip import GZipMiddleware
 from fastapi.middleware.trustedhost import TrustedHostMiddleware
+from injectq import InjectQ
 from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.middleware.cors import CORSMiddleware
 from starlette.requests import Request
 from starlette.types import ASGIApp, Receive, Scope, Send
 
+from agentflow_cli.src.app.core.middleware.rate_limit import RateLimitMiddleware, build_backend
 from agentflow_cli.src.app.core.middleware.request_limits import RequestSizeLimitMiddleware
 from agentflow_cli.src.app.core.middleware.security_headers import SecurityHeadersMiddleware
 
+from .graph_config import GraphConfig
 from .sentry_config import init_sentry
 from .settings import get_settings, logger
 
@@ -94,17 +97,24 @@ async def dispatch(self, request: Request, call_next):
         return response
 
 
-def setup_middleware(app: FastAPI):
+def setup_middleware(
+    app: FastAPI,
+    graph_config: GraphConfig | None = None,
+    container: InjectQ | None = None,
+):
     """
     Set up middleware for the FastAPI application.
 
     Args:
         app (FastAPI): The FastAPI application instance.
+        graph_config (GraphConfig | None): Optional graph configuration used to
+            enable dynamic rate limiting from ``agentflow.json``.
 
     Middleware:
         - CORS: Configured based on settings.ORIGINS.
         - TrustedHost: Configured with allowed hosts from settings.ALLOWED_HOST.
         - GZip: Applied with a minimum size of 1000 bytes (excludes streaming endpoints).
+        - RateLimit: Applied when ``rate_limit`` is configured in ``agentflow.json``.
     """
     settings = get_settings()
     # init cors
@@ -142,6 +152,30 @@ def setup_middleware(app: FastAPI):
     # Use SelectiveGZipMiddleware to exclude streaming endpoints from compression
     # Streaming endpoints need immediate data transmission without buffering
     app.add_middleware(SelectiveGZipMiddleware, minimum_size=1000)
+
+    # Apply rate limiting only when configured in agentflow.json
+    if graph_config is not None:
+        rate_limit_config = graph_config.rate_limit
+        if rate_limit_config is not None:
+            backend = build_backend(rate_limit_config, container=container)
+            # Store on app.state so lifespan can close it cleanly
+            app.state.rate_limit_backend = backend
+            app.add_middleware(
+                RateLimitMiddleware,
+                config=rate_limit_config,
+                backend=backend,
+            )
+            logger.info(
+                "Rate limiting enabled: backend=%s, %d req/%ds, by=%s, "
+                "exclude_paths=%s, trusted_proxy_headers=%s",
+                rate_limit_config.backend,
+                rate_limit_config.requests,
+                rate_limit_config.window,
+                rate_limit_config.by,
+                rate_limit_config.exclude_paths or "(none)",
+                rate_limit_config.trusted_proxy_headers,
+            )
+
     logger.debug("Middleware set up")
 
     # Initialize Sentry
diff --git a/agentflow_cli/src/app/core/middleware/__init__.py b/agentflow_cli/src/app/core/middleware/__init__.py
index 8491a5b..84f98b7 100644
--- a/agentflow_cli/src/app/core/middleware/__init__.py
+++ b/agentflow_cli/src/app/core/middleware/__init__.py
@@ -1,10 +1,24 @@
 """Middleware modules for agentflow-cli."""
 
+from .rate_limit import (
+    BaseRateLimitBackend,
+    MemoryRateLimitBackend,
+    RateLimitDecision,
+    RateLimitMiddleware,
+    RedisRateLimitBackend,
+    build_backend,
+)
 from .request_limits import RequestSizeLimitMiddleware
 from .security_headers import SecurityHeadersMiddleware, create_security_headers_middleware
 
 
 __all__ = [
+    "BaseRateLimitBackend",
+    "RateLimitMiddleware",
+    "RateLimitDecision",
+    "MemoryRateLimitBackend",
+    "RedisRateLimitBackend",
+    "build_backend",
     "RequestSizeLimitMiddleware",
     "SecurityHeadersMiddleware",
     "create_security_headers_middleware",
diff --git a/agentflow_cli/src/app/core/middleware/rate_limit/__init__.py b/agentflow_cli/src/app/core/middleware/rate_limit/__init__.py
new file mode 100644
index 0000000..84c13c8
--- /dev/null
+++ b/agentflow_cli/src/app/core/middleware/rate_limit/__init__.py
@@ -0,0 +1,15 @@
+from .base import BaseRateLimitBackend, RateLimitDecision
+from .factory import build_backend
+from .memory import MemoryRateLimitBackend
+from .middleware import RateLimitMiddleware
+from .redis import RedisRateLimitBackend
+
+
+__all__ = [
+    "BaseRateLimitBackend",
+    "MemoryRateLimitBackend",
+    "RateLimitDecision",
+    "RateLimitMiddleware",
+    "RedisRateLimitBackend",
+    "build_backend",
+]
diff --git a/agentflow_cli/src/app/core/middleware/rate_limit/base.py b/agentflow_cli/src/app/core/middleware/rate_limit/base.py
new file mode 100644
index 0000000..cd3ea4f
--- /dev/null
+++ b/agentflow_cli/src/app/core/middleware/rate_limit/base.py
@@ -0,0 +1,27 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class RateLimitDecision:
+    """Result of a single rate-limit check."""
+
+    allowed: bool
+    remaining: int
+    reset_after: int
+
+
+class BaseRateLimitBackend(ABC):
+    """Abstract base class for rate-limit backends.
+
+    Users can implement this class and bind an instance into InjectQ when they
+    need a backend other than the built-in memory or Redis implementations.
+    """
+
+    @abstractmethod
+    async def check(self, key: str, *, limit: int, window: int) -> RateLimitDecision:
+        """Atomically check and record a request for *key*."""
+
+    @abstractmethod
+    async def close(self) -> None:
+        """Release resources held by the backend, if any."""
diff --git a/agentflow_cli/src/app/core/middleware/rate_limit/factory.py b/agentflow_cli/src/app/core/middleware/rate_limit/factory.py
new file mode 100644
index 0000000..e4615b9
--- /dev/null
+++ b/agentflow_cli/src/app/core/middleware/rate_limit/factory.py
@@ -0,0 +1,98 @@
+import logging
+from typing import TYPE_CHECKING, Any
+
+from injectq import InjectQ
+
+from agentflow_cli.src.app.core.config.graph_config import RateLimitConfig
+
+from .base import BaseRateLimitBackend
+from .memory import MemoryRateLimitBackend
+from .redis import RedisRateLimitBackend
+
+
+if TYPE_CHECKING:
+    from redis.asyncio import Redis
+
+
+logger = logging.getLogger("agentflow_api.rate_limit")
+
+
+def build_backend(
+    config: RateLimitConfig,
+    container: InjectQ | None = None,
+) -> BaseRateLimitBackend:
+    """Build or resolve the configured rate-limit backend.
+
+    Custom backends are provided by binding a ``BaseRateLimitBackend`` instance
+    into InjectQ, matching the style used by auth/authorization.
+    """
+    container = container or InjectQ.get_instance()
+
+    injected_backend = container.try_get(BaseRateLimitBackend)
+    if config.backend == "custom":
+        if not injected_backend:
+            raise ValueError(
+                "rate_limit.backend='custom' requires a BaseRateLimitBackend "
+                "instance bound in InjectQ"
+            )
+        return injected_backend
+
+    if config.backend == "redis":
+        return _build_redis_backend(config, container)
+
+    logger.info("Rate-limit backend: memory (in-process, not shared across workers)")
+    return MemoryRateLimitBackend()
+
+
+def _build_redis_backend(
+    config: RateLimitConfig,
+    container: InjectQ,
+) -> RedisRateLimitBackend:
+    redis = _get_redis_from_container(container)
+    if redis is not None:
+        logger.info(
+            "Rate-limit backend: Redis from InjectQ (prefix=%s, fail_open=%s)",
+            config.redis_prefix,
+            config.fail_open,
+        )
+        return RedisRateLimitBackend(
+            redis=redis,
+            prefix=config.redis_prefix,
+            fail_open=config.fail_open,
+            close_redis=False,
+        )
+
+    if not config.redis_url:
+        raise ValueError("rate_limit.redis.url is required when no Redis client is bound")
+
+    backend = RedisRateLimitBackend.from_url(
+        redis_url=config.redis_url,
+        prefix=config.redis_prefix,
+        fail_open=config.fail_open,
+    )
+    _bind_redis(container, backend._redis)
+    logger.info(
+        "Rate-limit backend: created Redis and bound it in InjectQ (prefix=%s, fail_open=%s)",
+        config.redis_prefix,
+        config.fail_open,
+    )
+    return backend
+
+
+def _get_redis_from_container(container: InjectQ) -> Any | None:
+    redis = container.try_get("redis")
+    if redis is not None:
+        return redis
+
+    redis = container.try_get("redis_client")
+    if redis is not None:
+        return redis
+
+    return container.try_get("Redis")
+
+
+def _bind_redis(container: InjectQ, redis: "Redis") -> None:
+    if Redis is not None:
+        container.bind_instance(Redis, redis)
+    container.bind_instance("redis", redis)
+    container.bind_instance("redis_client", redis)
diff --git a/agentflow_cli/src/app/core/middleware/rate_limit/memory.py b/agentflow_cli/src/app/core/middleware/rate_limit/memory.py
new file mode 100644
index 0000000..d8033c6
--- /dev/null
+++ b/agentflow_cli/src/app/core/middleware/rate_limit/memory.py
@@ -0,0 +1,98 @@
+import asyncio
+import logging
+import time
+from collections import deque
+
+from .base import BaseRateLimitBackend, RateLimitDecision
+
+
+logger = logging.getLogger("agentflow_api.rate_limit")
+
+_NUM_STRIPES = 64
+
+
+class MemoryRateLimitBackend(BaseRateLimitBackend):
+    """In-process sliding-window rate limiter.
+
+    This is useful for development and single-process deployments. It is not
+    shared across workers or containers; use Redis for distributed enforcement.
+    """
+
+    _SWEEP_INTERVAL = 2000
+
+    def __init__(self, max_unique_keys: int = 50_000) -> None:
+        self._buckets: dict[str, deque[float]] = {}
+        self._last_seen: dict[str, float] = {}
+        self._locks = [asyncio.Lock() for _ in range(_NUM_STRIPES)]
+        self._sweep_lock = asyncio.Lock()
+        self._bg_tasks: set[asyncio.Task] = set()
+        self._check_count = 0
+        self._max_unique_keys = max_unique_keys
+
+    def _stripe_lock(self, key: str) -> asyncio.Lock:
+        return self._locks[hash(key) % _NUM_STRIPES]
+
+    def _schedule_sweep(self, window: int) -> None:
+        task = asyncio.ensure_future(self._sweep(window))
+        self._bg_tasks.add(task)
+        task.add_done_callback(self._bg_tasks.discard)
+
+    def _evict_oldest_bucket(self) -> None:
+        if not self._last_seen:
+            return
+        oldest_key = min(self._last_seen, key=self._last_seen.__getitem__)
+        self._buckets.pop(oldest_key, None)
+        self._last_seen.pop(oldest_key, None)
+
+    async def _sweep(self, window: int) -> None:
+        async with self._sweep_lock:
+            cutoff = time.monotonic() - window * 2
+            stale = [k for k, ts in self._last_seen.items() if ts < cutoff]
+            for key in stale:
+                self._buckets.pop(key, None)
+                self._last_seen.pop(key, None)
+            if stale:
+                logger.debug("Rate-limit memory sweep removed %d stale buckets", len(stale))
+
+    async def check(self, key: str, *, limit: int, window: int) -> RateLimitDecision:
+        now = time.monotonic()
+        window_start = now - window
+
+        self._check_count += 1
+        if self._check_count % self._SWEEP_INTERVAL == 0:
+            self._schedule_sweep(window)
+
+        async with self._stripe_lock(key):
+            if key not in self._buckets and len(self._buckets) >= self._max_unique_keys:
+                logger.warning(
+                    "Rate-limit bucket cap (%d) reached; running emergency sweep",
+                    self._max_unique_keys,
+                )
+                await self._sweep(window)
+                if len(self._buckets) >= self._max_unique_keys:
+                    self._evict_oldest_bucket()
+
+            bucket = self._buckets.setdefault(key, deque())
+            self._last_seen[key] = now
+
+            while bucket and bucket[0] < window_start:
+                bucket.popleft()
+
+            count = len(bucket)
+            if count >= limit:
+                reset_after = max(1, int(bucket[0] + window - now) + 1)
+                return RateLimitDecision(allowed=False, remaining=0, reset_after=reset_after)
+
+            bucket.append(now)
+            return RateLimitDecision(
+                allowed=True,
+                remaining=limit - len(bucket),
+                reset_after=window,
+            )
+
+    async def close(self) -> None:
+        for task in self._bg_tasks:
+            task.cancel()
+        if self._bg_tasks:
+            await asyncio.gather(*self._bg_tasks, return_exceptions=True)
+        self._bg_tasks.clear()
diff --git a/agentflow_cli/src/app/core/middleware/rate_limit/middleware.py b/agentflow_cli/src/app/core/middleware/rate_limit/middleware.py
new file mode 100644
index 0000000..336ce13
--- /dev/null
+++ b/agentflow_cli/src/app/core/middleware/rate_limit/middleware.py
@@ -0,0 +1,93 @@
+import time
+
+from fastapi import status
+from fastapi.responses import JSONResponse
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.requests import Request
+
+from agentflow_cli.src.app.core import logger
+from agentflow_cli.src.app.core.config.graph_config import RateLimitConfig
+
+from .base import BaseRateLimitBackend
+
+
+class RateLimitMiddleware(BaseHTTPMiddleware):
+    """Backend-agnostic rate-limit middleware."""
+
+    def __init__(
+        self,
+        app,
+        config: RateLimitConfig,
+        backend: BaseRateLimitBackend,
+    ) -> None:
+        super().__init__(app)
+        self.config = config
+        self.backend = backend
+        self._exclude = frozenset(config.exclude_paths)
+
+    def _client_key(self, request: Request) -> str:
+        if self.config.by == "global":
+            return "__global__"
+
+        if self.config.trusted_proxy_headers:
+            forwarded_for = request.headers.get("X-Forwarded-For")
+            if forwarded_for:
+                return forwarded_for.split(",")[0].strip()
+
+        client = request.client
+        return client.host if client else "unknown"
+
+    async def dispatch(self, request: Request, call_next):
+        if request.url.path in self._exclude:
+            return await call_next(request)
+
+        key = self._client_key(request)
+        decision = await self.backend.check(
+            key,
+            limit=self.config.requests,
+            window=self.config.window,
+        )
+        reset_at_epoch = int(time.time()) + decision.reset_after
+
+        if not decision.allowed:
+            request_id = getattr(request.state, "request_id", "unknown")
+            logger.warning(
+                "Rate limit exceeded for %s on %s %s",
+                key,
+                request.method,
+                request.url.path,
+            )
+            return JSONResponse(
+                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+                content={
+                    "error": {
+                        "code": "RATE_LIMIT_EXCEEDED",
+                        "message": (
+                            f"Too many requests. Limit: {self.config.requests} "
+                            f"per {self.config.window}s. "
+                            f"Retry after {decision.reset_after}s."
+                        ),
+                        "limit": self.config.requests,
+                        "window_seconds": self.config.window,
+                        "retry_after_seconds": decision.reset_after,
+                    },
+                    "metadata": {
+                        "request_id": request_id,
+                        "status": "error",
+                    },
+                },
+                headers={
+                    "Retry-After": str(decision.reset_after),
+                    "X-RateLimit-Limit": str(self.config.requests),
+                    "X-RateLimit-Remaining": "0",
+                    "X-RateLimit-Reset": str(reset_at_epoch),
+                    "X-RateLimit-Reset-After": str(decision.reset_after),
+                },
+            )
+
+        response = await call_next(request)
+        response.headers["X-RateLimit-Limit"] = str(self.config.requests)
+        response.headers["X-RateLimit-Remaining"] = str(decision.remaining)
+        response.headers["X-RateLimit-Reset"] = str(reset_at_epoch)
+        response.headers["X-RateLimit-Reset-After"] = str(decision.reset_after)
+        return response
diff --git a/agentflow_cli/src/app/core/middleware/rate_limit/redis.py b/agentflow_cli/src/app/core/middleware/rate_limit/redis.py
new file mode 100644
index 0000000..3ff0fb3
--- /dev/null
+++ b/agentflow_cli/src/app/core/middleware/rate_limit/redis.py
@@ -0,0 +1,115 @@
+import logging
+import time
+import uuid
+from typing import Any
+
+from .base import BaseRateLimitBackend, RateLimitDecision
+
+
+try:
+    from redis.asyncio import Redis as AsyncRedis  # type: ignore[import]
+
+    _REDIS_AVAILABLE = True
+except ImportError:
+    _REDIS_AVAILABLE = False
+    AsyncRedis = None  # type: ignore[assignment,misc]
+
+
+logger = logging.getLogger("agentflow_api.rate_limit")
+_REDIS_EXTRA_INSTALL = 'pip install "10xscale-agentflow-cli[redis]"'
+
+# Atomic sliding-window check using a Redis sorted set.
+#
+# The script:
+# 1. Removes timestamps older than the current window.
+# 2. Counts the remaining requests for the key.
+# 3. If under the limit, adds the current request as a unique sorted-set member.
+# 4. Sets an expiry so idle keys clean themselves up.
+# 5. Returns whether the request is allowed, remaining quota, and reset time.
+_SLIDING_WINDOW_LUA = """
+local key        = KEYS[1]
+local now_ms     = tonumber(ARGV[1])
+local window_ms  = tonumber(ARGV[2])
+local limit      = tonumber(ARGV[3])
+local member     = ARGV[4]
+
+local window_start = now_ms - window_ms
+
+redis.call('ZREMRANGEBYSCORE', key, '-inf', tostring(window_start))
+
+local count = tonumber(redis.call('ZCARD', key))
+
+if count < limit then
+    redis.call('ZADD', key, tostring(now_ms), member)
+    redis.call('EXPIRE', key, math.ceil(window_ms / 1000) + 1)
+    return {1, limit - count - 1, math.ceil(window_ms / 1000)}
+else
+    local oldest = redis.call('ZRANGE', key, 0, 0, 'WITHSCORES')
+    local reset_after
+    if #oldest >= 2 then
+        local oldest_ms = tonumber(oldest[2])
+        reset_after = math.ceil((oldest_ms + window_ms - now_ms) / 1000) + 1
+    else
+        reset_after = math.ceil(window_ms / 1000)
+    end
+    return {0, 0, math.max(reset_after, 1)}
+end
+"""
+
+
+class RedisRateLimitBackend(BaseRateLimitBackend):
+    """Distributed sliding-window rate limiter backed by Redis."""
+
+    def __init__(
+        self,
+        redis: Any,
+        prefix: str,
+        fail_open: bool = True,
+        close_redis: bool = False,
+    ) -> None:
+        self._redis = redis
+        self._prefix = prefix
+        self._fail_open = fail_open
+        self._close_redis = close_redis
+        self._script = self._redis.register_script(_SLIDING_WINDOW_LUA)
+
+    @classmethod
+    def from_url(
+        cls,
+        redis_url: str,
+        prefix: str,
+        fail_open: bool = True,
+    ) -> "RedisRateLimitBackend":
+        if not _REDIS_AVAILABLE or AsyncRedis is None:
+            raise ImportError(
+                "Redis backend requires the 'redis' package. "
+                f"Install the optional Redis extra with: {_REDIS_EXTRA_INSTALL}"
+            )
+        redis = AsyncRedis.from_url(redis_url, decode_responses=False)
+        return cls(redis=redis, prefix=prefix, fail_open=fail_open, close_redis=True)
+
+    async def check(self, key: str, *, limit: int, window: int) -> RateLimitDecision:
+        redis_key = f"{self._prefix}:{key}"
+        now_ms = int(time.time() * 1000)
+        window_ms = window * 1000
+        member = f"{now_ms}:{uuid.uuid4().hex}"
+
+        try:
+            result = await self._script(
+                keys=[redis_key],
+                args=[now_ms, window_ms, limit, member],
+            )
+            return RateLimitDecision(
+                allowed=bool(result[0]),
+                remaining=int(result[1]),
+                reset_after=int(result[2]),
+            )
+        except Exception:
+            logger.exception("Redis rate-limit backend error")
+            if self._fail_open:
+                return RateLimitDecision(allowed=True, remaining=0, reset_after=window)
+            return RateLimitDecision(allowed=False, remaining=0, reset_after=window)
+
+    async def close(self) -> None:
+        if self._close_redis:
+            await self._redis.aclose()
diff --git a/agentflow_cli/src/app/main.py b/agentflow_cli/src/app/main.py
index aef1173..9d39f1e 100644
--- a/agentflow_cli/src/app/main.py
+++ b/agentflow_cli/src/app/main.py
@@ -86,6 +86,11 @@ async def lifespan(app: FastAPI):
         # release all the resources
         await graph.aclose()
 
+    # Close rate-limit backend (e.g. Redis connection pool)
+    backend = getattr(app.state, "rate_limit_backend", None)
+    if backend is not None:
+        await backend.close()
+
 
 app = FastAPI(
     title=settings.APP_NAME,
@@ -99,7 +104,7 @@ async def lifespan(app: FastAPI):
     root_path=settings.ROOT_PATH,
 )
 
-setup_middleware(app)
+setup_middleware(app, graph_config=graph_config, container=container)
 
 # attach_injector(app, injector=injector)
 setup_fastapi(container=container, app=app)
diff --git a/agentflow_cli/src/app/utils/media/extractor.py b/agentflow_cli/src/app/utils/media/extractor.py
index 8aec11d..0c54a42 100644
--- a/agentflow_cli/src/app/utils/media/extractor.py
+++ b/agentflow_cli/src/app/utils/media/extractor.py
@@ -37,7 +37,7 @@ def __init__(self, extractor: Any | None = None):
         if AsyncTextExtractor is None:
             raise ImportError(
                 "textxtract is required for document extraction. "
-                "Install with `pip install textxtract[pdf,docx,html,xml,md]`"
+                'Install with `pip install "10xscale-agentflow-cli[media]"`'
             )
 
         self.extractor = AsyncTextExtractor()
diff --git a/docs/cli-guide.md b/docs/cli-guide.md
index 9a274e0..da042d0 100644
--- a/docs/cli-guide.md
+++ b/docs/cli-guide.md
@@ -11,7 +11,7 @@ pip install 10xscale-agentflow-cli
 For development with all optional dependencies:
 
 ```bash
-pip install "10xscale-agentflow-cli[redis,sentry,firebase,snowflakekit,gcloud]"
+pip install "10xscale-agentflow-cli[redis,jwt,media,sentry,firebase,snowflakekit,gcloud]"
 ```
 
 ## Quick Start
diff --git a/docs/configuration.md b/docs/configuration.md
index 54f82fb..2db6c55 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -9,6 +9,7 @@ This document provides a complete reference for configuring your AgentFlow appli
 - [Authentication](#authentication)
 - [Dependency Injection](#dependency-injection)
 - [Storage & Persistence](#storage--persistence)
+- [Rate Limiting](#rate-limiting)
 - [Environment Variables](#environment-variables)
 - [Application Settings](#application-settings)
 - [Examples](#examples)
@@ -43,6 +44,7 @@ agentflow api --config /path/to/config.json
   "injectq": null,
   "store": null,
   "redis": null,
+  "rate_limit": null,
   "thread_name_generator": null
 }
 ```
@@ -363,6 +365,82 @@ REDIS_URL=redis://localhost:6379
 
 ---
 
+## Rate Limiting
+
+### `rate_limit`
+
+Configures AgentFlow's built-in sliding-window rate limiter.
+
+**Type:** `object | null`
+
+**Default:** `null`
+
+**Local development example:**
+```json
+{
+  "rate_limit": {
+    "enabled": true,
+    "backend": "memory",
+    "requests": 100,
+    "window": 60,
+    "by": "ip",
+    "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"]
+  }
+}
+```
+
+**Production Redis example:**
+
+Install the optional Redis extra before using `backend: "redis"`:
+
+```bash
+pip install "10xscale-agentflow-cli[redis]"
+```
+
+```json
+{
+  "rate_limit": {
+    "enabled": true,
+    "backend": "redis",
+    "requests": 1000,
+    "window": 60,
+    "by": "ip",
+    "trusted_proxy_headers": true,
+    "exclude_paths": ["/health", "/metrics", "/docs", "/redoc", "/openapi.json"],
+    "redis": {
+      "url": "${RATE_LIMIT_REDIS_URL}",
+      "prefix": "agentflow:rate-limit"
+    },
+    "fail_open": true
+  }
+}
+```
+
+**Options:**
+
+| Field | Type | Default | Description |
+| --- | --- | --- | --- |
+| `enabled` | boolean | `true` | Enables rate limiting when the block exists. |
+| `backend` | string | `"memory"` | `memory`, `redis`, or `custom`. |
+| `requests` | integer | `100` | Maximum requests per window. |
+| `window` | integer | `60` | Window duration in seconds. |
+| `by` | string | `"ip"` | `ip` or `global`. |
+| `exclude_paths` | string array | `[]` | Paths that bypass rate limiting. |
+| `trusted_proxy_headers` | boolean | `false` | Use `X-Forwarded-For` only behind a trusted proxy. |
+| `redis.url` | string | `null` | Redis URL for the Redis backend. |
+| `redis.prefix` | string | `"agentflow:rate-limit"` | Redis key prefix. |
+| `fail_open` | boolean | `true` | On Redis errors, allow requests when `true`; deny when `false`. |
+
+Use the `memory` backend for local development or one-process services. Use the
+`redis` backend for production deployments with multiple workers, containers, or
+servers. Redis is optional and is not installed unless you install the `redis`
+extra.
+
+See the [Rate Limiting Guide](./rate-limiting.md) for usage details, response
+headers, and custom backend examples.
+
+---
+
 ## Thread Name Generation
 
 ### `thread_name_generator`
diff --git a/docs/deployment.md b/docs/deployment.md
index 8221aaf..4fed583 100644
--- a/docs/deployment.md
+++ b/docs/deployment.md
@@ -640,7 +640,7 @@ echo "secrets.yaml" >> .gitignore
 pip install --upgrade 10xscale-agentflow-cli
 
 # 7. Enable rate limiting
-# Use nginx, Traefik, or API Gateway
+# Use AgentFlow's Redis backend, or enforce limits at nginx, Traefik, or an API gateway
 ```
 
 ### Performance
diff --git a/docs/rate-limiting.md b/docs/rate-limiting.md
new file mode 100644
index 0000000..d28425f
--- /dev/null
+++ b/docs/rate-limiting.md
@@ -0,0 +1,192 @@
+# Rate Limiting
+
+AgentFlow can protect your API with a sliding-window rate limiter configured from
+`agentflow.json`. The limiter is disabled until you add a `rate_limit` block.
+
+## Quick Start
+
+For local development or a single-process deployment, use the in-memory backend:
+
+```json
+{
+  "agent": "graph.react:app",
+  "rate_limit": {
+    "enabled": true,
+    "backend": "memory",
+    "requests": 100,
+    "window": 60,
+    "by": "ip",
+    "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"]
+  }
+}
+```
+
+This allows each client IP to make `100` requests every `60` seconds.
+
+## Production With Redis
+
+Use Redis when your API runs with multiple workers, containers, or servers.
+Redis stores the counters centrally, so the limit is enforced across the whole
+deployment.
+
+Redis support is optional. Install AgentFlow with the Redis extra before using
+`backend: "redis"`:
+
+```bash
+pip install "10xscale-agentflow-cli[redis]"
+```
+
+Configure Redis in `agentflow.json`:
+
+```json
+{
+  "agent": "graph.react:app",
+  "rate_limit": {
+    "enabled": true,
+    "backend": "redis",
+    "requests": 1000,
+    "window": 60,
+    "by": "ip",
+    "trusted_proxy_headers": true,
+    "exclude_paths": ["/health", "/metrics", "/docs", "/redoc", "/openapi.json"],
+    "redis": {
+      "url": "${RATE_LIMIT_REDIS_URL}",
+      "prefix": "agentflow:rate-limit"
+    },
+    "fail_open": true
+  }
+}
+```
+
+Then set the environment variable:
+
+```bash
+RATE_LIMIT_REDIS_URL=redis://localhost:6379/0
+```
+
+The Redis backend uses an atomic Lua script with sorted sets. That means the
+check and the request recording happen as one Redis operation, which prevents
+concurrent requests from racing past the configured limit.
+
+## Configuration Reference
+
+| Field | Type | Default | Description |
+| --- | --- | --- | --- |
+| `enabled` | boolean | `true` | Enables the middleware when the `rate_limit` block exists. |
+| `backend` | string | `"memory"` | `memory`, `redis`, or `custom`. |
+| `requests` | integer | `100` | Maximum requests allowed in each window. |
+| `window` | integer | `60` | Window size in seconds. |
+| `by` | string | `"ip"` | Limit by client IP or use `"global"` for one shared quota. |
+| `exclude_paths` | string array | `[]` | Paths that bypass rate limiting. |
+| `trusted_proxy_headers` | boolean | `false` | Whether to use `X-Forwarded-For` as the client IP. |
+| `redis.url` | string | `null` | Redis URL for the Redis backend. Required unless a Redis client is injected. |
+| `redis.prefix` | string | `"agentflow:rate-limit"` | Prefix for Redis keys. |
+| `fail_open` | boolean | `true` | For Redis errors, allow requests when `true` or deny them when `false`. |
+
+## Identity Modes
+
+Use per-IP limits for most public APIs:
+
+```json
+{
+  "rate_limit": {
+    "requests": 100,
+    "window": 60,
+    "by": "ip"
+  }
+}
+```
+
+Use a global limit when you want one shared quota for the whole service:
+
+```json
+{
+  "rate_limit": {
+    "requests": 5000,
+    "window": 60,
+    "by": "global"
+  }
+}
+```
+
+Only enable `trusted_proxy_headers` when your app is behind a trusted proxy or
+load balancer that strips untrusted forwarding headers from direct clients.
+
+## Response Headers
+
+Every limited response includes:
+
+| Header | Description |
+| --- | --- |
+| `X-RateLimit-Limit` | Configured request limit. |
+| `X-RateLimit-Remaining` | Requests remaining in the current window. |
+| `X-RateLimit-Reset` | Unix timestamp for the current reset estimate. |
+| `X-RateLimit-Reset-After` | Seconds until the current reset estimate. |
+| `Retry-After` | Present on `429` responses. |
+
+When the limit is exceeded, AgentFlow returns `429 Too Many Requests`:
+
+```json
+{
+  "error": {
+    "code": "RATE_LIMIT_EXCEEDED",
+    "message": "Too many requests. Limit: 100 per 60s. Retry after 12s.",
+    "limit": 100,
+    "window_seconds": 60,
+    "retry_after_seconds": 12
+  },
+  "metadata": {
+    "request_id": "request-id",
+    "status": "error"
+  }
+}
+```
+
+## Custom Backend
+
+Use a custom backend when you want to store rate-limit state somewhere else.
+Implement `BaseRateLimitBackend`, then bind an instance in InjectQ.
+
+```python
+from agentflow_cli.src.app.core.middleware.rate_limit import (
+    BaseRateLimitBackend,
+    RateLimitDecision,
+)
+
+
+class MyRateLimitBackend(BaseRateLimitBackend):
+    async def check(self, key: str, *, limit: int, window: int) -> RateLimitDecision:
+        allowed = True
+        remaining = limit - 1
+        reset_after = window
+        return RateLimitDecision(
+            allowed=allowed,
+            remaining=remaining,
+            reset_after=reset_after,
+        )
+
+    async def close(self) -> None:
+        return None
+```
+
+Configure AgentFlow to use the custom backend:
+
+```json
+{
+  "rate_limit": {
+    "enabled": true,
+    "backend": "custom",
+    "requests": 100,
+    "window": 60,
+    "by": "ip"
+  }
+}
+```
+
+## Choosing A Backend
+
+Use `memory` for local development, tests, demos, and one-process services.
+
+Use `redis` for production APIs, Gunicorn/Uvicorn deployments with multiple
+workers, Docker/Kubernetes deployments, and any setup with more than one API
+instance. Redis is not needed for the `memory` or `custom` backends.
diff --git a/mkdocs.yaml b/mkdocs.yaml
deleted file mode 100644
index add176d..0000000
--- a/mkdocs.yaml
+++ /dev/null
@@ -1,86 +0,0 @@
-site_name: AgentFlow-CLI
-site_description: "A lightweight Python framework for building intelligent agents and multi-agent workflows."
-# Required for Material's instant navigation and previews
-site_url: https://10xhub.github.io/agentflow-cli/
-repo_url: https://github.com/10xhub/agentflow-cli
-repo_name: 10xhub/agentflow-cli
-
-theme:
-  name: material
-  language: en
-  logo: null
-  favicon: null
-  features:
-    - navigation.pagination
-    - navigation.goto_top
-    - navigation.path
-    - navigation.tabs
-    - navigation.top
-    - search-suggest
-    - search.highlight
-    - search.share
-    - content.code.copy
-
-
-plugins:
-  - search
-  - autorefs
-  - section-index
-  - mkdocstrings:
-      default_handler: python
-      handlers:
-        python:
-          options:
-            docstring_style: google
-            show_inheritance: true
-            show_signature: true
-            show_source: true
-            allow_inspection: true
-            show_typehints: true
-            show_toc: true
-            show_bases: true
-            show_inheritance_diagram: true
-            parameter_headings: true
-            group_by_category: true
-            show_category_heading: true
-            show_symbol_type_heading: true
-            summary: true
-            load_external_modules: true
-            inherited_members: true
-            show_submodules: true
-            show_if_no_docstring: true
-            separate_signature: true
-
-  - gen-files:
-      scripts:
-        - scripts/generate_docs.py
-
-markdown_extensions:
-  - admonition
-  - pymdownx.details
-  - pymdownx.superfences
-  - pymdownx.highlight:
-      anchor_linenums: true
-      line_spans: __span
-      pygments_lang_class: true
-  - pymdownx.inlinehilite
-  - pymdownx.snippets
-  - pymdownx.superfences
-  - pymdownx.superfences:
-      custom_fences:
-        - name: mermaid
-          class: mermaid
-
-nav:
-  - Home: index.md
-  - Getting Started:
-      - CLI Guide: cli-guide.md
-      - Configuration: configuration.md
-  - Features:
-      - Authentication: authentication.md
-      - ID Generation: id-generation.md
-      - Thread Name Generator: thread-name-generator.md
-  - Deployment:
-      - Deployment Guide: deployment.md
-  - Reference:
-      - CLI Reference: cli.md
diff --git a/pyproject.toml b/pyproject.toml
index 7bb3286..4cb6197 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,8 +44,6 @@ dependencies = [
     "uvicorn",
     "typer",
     "python-dotenv",
-    "PyJWT",
-    "textxtract[pdf,docx,html,xml,md]>=0.2.0",
 ]
 
 [project.urls]
@@ -68,6 +66,12 @@ snowflakekit = [
 redis = [
     "redis>=5.0.7",
 ]
+jwt = [
+    "PyJWT",
+]
+media = [
+    "textxtract[pdf,docx,html,xml,md]>=0.2.0",
+]
 gcloud = [
     "google-cloud-logging",
 ]
@@ -245,4 +249,7 @@ dev = [
     "lib==4.0.0",
     "markdown-it-py==3.0.0",
     "requests==2.32.3",
+    "redis>=5.0.7",
+    "PyJWT",
+    "textxtract[pdf,docx,html,xml,md]>=0.2.0",
 ]
diff --git a/requirements.txt b/requirements.txt
index 81386ab..8232623 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,8 +22,6 @@ pytest-env>=1.1.5
 pytest-xdist>=3.8.0
 pre-commit>=3.8.0
 ruff==0.5.2
-mkdocs-gen-files==0.5.0
-mkdocstrings==0.25.2
 mypy-extensions==1.0.0
 httpx==0.27.0
 lib==4.0.0
diff --git a/tests/unit_tests/test_rate_limit.py b/tests/unit_tests/test_rate_limit.py
new file mode 100644
index 0000000..6c0e698
--- /dev/null
+++ b/tests/unit_tests/test_rate_limit.py
@@ -0,0 +1,235 @@
+# ruff: noqa: S101, PLR2004, SLF001
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+from injectq import InjectQ
+
+from agentflow_cli.src.app.core.config.graph_config import RateLimitConfig
+from agentflow_cli.src.app.core.middleware.rate_limit import (
+    BaseRateLimitBackend,
+    MemoryRateLimitBackend,
+    RateLimitDecision,
+    RateLimitMiddleware,
+    RedisRateLimitBackend,
+    build_backend,
+)
+
+
+def _config(**overrides) -> RateLimitConfig:
+    data = {
+        "enabled": True,
+        "backend": "memory",
+        "requests": 2,
+        "window": 60,
+        "by": "ip",
+    }
+    data.update(overrides)
+    return RateLimitConfig.from_dict(data)
+
+
+@pytest.mark.asyncio
+async def test_memory_backend_enforces_limit():
+    backend = MemoryRateLimitBackend()
+
+    first = await backend.check("client", limit=2, window=60)
+    second = await backend.check("client", limit=2, window=60)
+    third = await backend.check("client", limit=2, window=60)
+
+    assert first.allowed is True
+    assert second.allowed is True
+    assert third.allowed is False
+    assert third.remaining == 0
+
+
+@pytest.mark.asyncio
+async def test_memory_backend_unique_key_cap_evicts():
+    backend = MemoryRateLimitBackend(max_unique_keys=2)
+
+    await backend.check("a", limit=10, window=60)
+    await backend.check("b", limit=10, window=60)
+    await backend.check("c", limit=10, window=60)
+
+    assert len(backend._buckets) <= 2
+    assert "c" in backend._buckets
+
+
+@pytest.mark.asyncio
+async def test_redis_backend_uses_unique_members_for_same_millisecond(monkeypatch):
+    calls = []
+
+    async def fake_script(*, keys, args):
+        calls.append((keys, args))
+        return [1, 1, 60]
+
+    backend = object.__new__(RedisRateLimitBackend)
+    backend._prefix = "agentflow:test"
+    backend._fail_open = True
+    backend._script = fake_script
+
+    monkeypatch.setattr("time.time", lambda: 123.456)
+
+    await backend.check("client", limit=2, window=60)
+    await backend.check("client", limit=2, window=60)
+
+    first_member = calls[0][1][3]
+    second_member = calls[1][1][3]
+    assert first_member.startswith("123456:")
+    assert second_member.startswith("123456:")
+    assert first_member != second_member
+
+
+def test_rate_limit_config_parses_boolean_strings_and_expands_redis_url(monkeypatch):
+    monkeypatch.setenv("RATE_LIMIT_REDIS_URL", "redis://localhost:6379/4")
+
+    config = RateLimitConfig.from_dict(
+        {
+            "enabled": "true",
+            "backend": "redis",
+            "requests": 5,
+            "window": 10,
+            "by": "global",
+            "trusted_proxy_headers": "false",
+            "fail_open": "no",
+            "redis": {"url": "${RATE_LIMIT_REDIS_URL}", "prefix": "agentflow:test"},
+        }
+    )
+
+    assert config.enabled is True
+    assert config.trusted_proxy_headers is False
+    assert config.fail_open is False
+    assert config.redis_url == "redis://localhost:6379/4"
+
+
+def test_rate_limit_config_allows_custom_backend_without_path():
+    config = RateLimitConfig.from_dict({"backend": "custom"})
+
+    assert config.backend == "custom"
+
+
+def test_rate_limit_config_rejects_invalid_boolean_string():
+    with pytest.raises(ValueError, match="rate_limit.fail_open must be a boolean"):
+        RateLimitConfig.from_dict({"fail_open": "sometimes"})
+
+
+def test_rate_limit_middleware_ignores_forwarded_for_by_default():
+    app = FastAPI()
+    backend = MemoryRateLimitBackend()
+    app.add_middleware(RateLimitMiddleware, config=_config(), backend=backend)
+
+    @app.get("/")
+    def root():
+        return {"ok": True}
+
+    client = TestClient(app)
+    assert client.get("/", headers={"X-Forwarded-For": "1.1.1.1"}).status_code == 200
+    assert client.get("/", headers={"X-Forwarded-For": "2.2.2.2"}).status_code == 200
+    assert client.get("/", headers={"X-Forwarded-For": "3.3.3.3"}).status_code == 429
+
+
+def test_rate_limit_middleware_uses_forwarded_for_when_trusted():
+    app = FastAPI()
+    backend = MemoryRateLimitBackend()
+    app.add_middleware(
+        RateLimitMiddleware,
+        config=_config(trusted_proxy_headers=True),
+        backend=backend,
+    )
+
+    @app.get("/")
+    def root():
+        return {"ok": True}
+
+    client = TestClient(app)
+    assert client.get("/", headers={"X-Forwarded-For": "1.1.1.1"}).status_code == 200
+    assert client.get("/", headers={"X-Forwarded-For": "2.2.2.2"}).status_code == 200
+    assert client.get("/", headers={"X-Forwarded-For": "3.3.3.3"}).status_code == 200
+
+
+def test_rate_limit_middleware_excludes_paths():
+    app = FastAPI()
+    backend = MemoryRateLimitBackend()
+    app.add_middleware(
+        RateLimitMiddleware,
+        config=_config(requests=1, exclude_paths=["/health"]),
+        backend=backend,
+    )
+
+    @app.get("/health")
+    def health():
+        return {"ok": True}
+
+    client = TestClient(app)
+    assert client.get("/health").status_code == 200
+    assert client.get("/health").status_code == 200
+
+
+@pytest.mark.asyncio
+async def test_custom_backend_resolves_from_injectq():
+    class MyRateLimitBackend(BaseRateLimitBackend):
+        async def check(self, key: str, *, limit: int, window: int) -> RateLimitDecision:
+            return RateLimitDecision(allowed=True, remaining=limit - 1, reset_after=window)
+
+        async def close(self) -> None:
+            return None
+
+    container = InjectQ()
+    custom_backend = MyRateLimitBackend()
+    container.bind_instance(BaseRateLimitBackend, custom_backend)
+
+    backend = build_backend(_config(backend="custom"), container=container)
+
+    assert backend is custom_backend
+    assert (await backend.check("client", limit=2, window=60)).allowed is True
+
+
+def test_custom_backend_requires_injectq_binding():
+    with pytest.raises(ValueError, match="BaseRateLimitBackend"):
+        build_backend(_config(backend="custom"), container=InjectQ())
+
+
+def test_redis_backend_reuses_injectq_redis_client():
+    class FakeRedis:
+        def register_script(self, script):
+            async def fake_script(*, keys, args):
+                return [1, 0, 60]
+
+            return fake_script
+
+    container = InjectQ()
+    redis = FakeRedis()
+    container.bind_instance("redis", redis)
+
+    backend = build_backend(
+        _config(
+            backend="redis",
+            redis={"prefix": "agentflow:test"},
+        ),
+        container=container,
+    )
+
+    assert isinstance(backend, RedisRateLimitBackend)
+    assert backend._redis is redis
+    assert backend._close_redis is False
+
+
+def test_redis_backend_requires_url_when_no_injected_client():
+    with pytest.raises(ValueError, match="redis.url"):
+        build_backend(_config(backend="redis"), container=InjectQ())
+
+
+def test_redis_backend_from_url_requires_optional_extra(monkeypatch):
+    monkeypatch.setattr(
+        "agentflow_cli.src.app.core.middleware.rate_limit.redis._REDIS_AVAILABLE",
+        False,
+    )
+    monkeypatch.setattr(
+        "agentflow_cli.src.app.core.middleware.rate_limit.redis.AsyncRedis",
+        None,
+    )
+
+    with pytest.raises(ImportError, match=r"10xscale-agentflow-cli\[redis\]"):
+        RedisRateLimitBackend.from_url(
+            redis_url="redis://localhost:6379/0",
+            prefix="agentflow:test",
+        )
diff --git a/uv.lock b/uv.lock
index 08ff4dd..391bb37 100644
--- a/uv.lock
+++ b/uv.lock
@@ -23,7 +23,7 @@ wheels = [
 
 [[package]]
 name = "10xscale-agentflow-cli"
-version = "0.3.0"
+version = "0.3.1"
 source = { editable = "." }
 dependencies = [
     { name = "10xscale-agentflow" },
@@ -32,10 +32,8 @@ dependencies = [
     { name = "orjson" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
-    { name = "pyjwt" },
     { name = "python-dotenv" },
     { name = "python-multipart" },
-    { name = "textxtract", extra = ["docx", "html", "md", "pdf", "xml"] },
     { name = "typer" },
     { name = "uvicorn" },
 ]
@@ -48,6 +46,12 @@ firebase = [
 gcloud = [
     { name = "google-cloud-logging" },
 ]
+jwt = [
+    { name = "pyjwt" },
+]
+media = [
+    { name = "textxtract", extra = ["docx", "html", "md", "pdf", "xml"] },
+]
 redis = [
     { name = "redis" },
 ]
@@ -67,14 +71,17 @@ dev = [
     { name = "mkdocstrings" },
     { name = "mypy-extensions" },
     { name = "pre-commit" },
+    { name = "pyjwt" },
     { name = "pytest" },
     { name = "pytest-asyncio" },
     { name = "pytest-cov" },
     { name = "pytest-env" },
     { name = "pytest-xdist" },
+    { name = "redis" },
     { name = "requests" },
     { name = "ruff" },
     { name = "snowflakekit" },
+    { name = "textxtract", extra = ["docx", "html", "md", "pdf", "xml"] },
 ]
 
 [package.metadata]
@@ -88,17 +95,17 @@ requires-dist = [
     { name = "orjson" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
-    { name = "pyjwt" },
+    { name = "pyjwt", marker = "extra == 'jwt'" },
     { name = "python-dotenv" },
     { name = "python-multipart" },
     { name = "redis", marker = "extra == 'redis'", specifier = ">=5.0.7" },
     { name = "sentry-sdk", marker = "extra == 'sentry'", specifier = ">=2.10.0" },
     { name = "snowflakekit", marker = "extra == 'snowflakekit'" },
-    { name = "textxtract", extras = ["pdf", "docx", "html", "xml", "md"], specifier = ">=0.2.0" },
+    { name = "textxtract", extras = ["pdf", "docx", "html", "xml", "md"], marker = "extra == 'media'", specifier = ">=0.2.0" },
     { name = "typer" },
     { name = "uvicorn" },
 ]
-provides-extras = ["sentry", "firebase", "snowflakekit", "redis", "gcloud"]
+provides-extras = ["sentry", "firebase", "snowflakekit", "redis", "jwt", "media", "gcloud"]
 
 [package.metadata.requires-dev]
 dev = [
@@ -109,14 +116,17 @@ dev = [
     { name = "mkdocstrings", specifier = "==0.25.2" },
     { name = "mypy-extensions", specifier = "==1.0.0" },
     { name = "pre-commit", specifier = ">=3.8.0" },
+    { name = "pyjwt" },
     { name = "pytest", specifier = ">=8.4.2" },
     { name = "pytest-asyncio", specifier = ">=1.2.0" },
     { name = "pytest-cov", specifier = ">=7.0.0" },
     { name = "pytest-env", specifier = ">=1.1.5" },
     { name = "pytest-xdist", specifier = ">=3.8.0" },
+    { name = "redis", specifier = ">=5.0.7" },
     { name = "requests", specifier = "==2.32.3" },
     { name = "ruff", specifier = "==0.5.2" },
     { name = "snowflakekit" },
+    { name = "textxtract", extras = ["pdf", "docx", "html", "xml", "md"], specifier = ">=0.2.0" },
 ]
 
 [[package]]