40 changes: 40 additions & 0 deletions .github/workflows/docs.yml
@@ -0,0 +1,40 @@
name: Build and Deploy Docs

on:
push:
branches:
- development
- main
- master
workflow_dispatch:

permissions:
contents: write

jobs:
docs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.x"

- name: Install MkDocs
run: |
python -m pip install --upgrade pip
pip install 'mkdocs<2' 'mkdocs-material<10'

- name: Build site
run: mkdocs build --strict

- name: Deploy to gh-pages
uses: peaceiris/actions-gh-pages@v3
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./site
publish_branch: gh-pages
force_orphan: true
4 changes: 4 additions & 0 deletions .gitignore
@@ -58,10 +58,13 @@ htmlcov/
# OCR test outputs
test_ocr_*_output/
*_demo_output/
artifacts/

# OCR model weights (if downloaded locally)
nanonets/
ocr_models/
deepseek-ocr-2-model/
models/

# Noise analysis reports
glossapi_noise_analysis_report.md
@@ -78,4 +81,5 @@ dependency_setup/.venvs/
deepseek-ocr/DeepSeek-OCR-empty/
# Local DeepSeek checkout and repro scripts (keep out of master)
deepseek-ocr/
deepseek-ocr-2/
repro_rapidocr_onnx/
20 changes: 20 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,20 @@
# Contributing to GlossAPI

## Working branches and PR flow
- Open PRs against the `development` branch.
- `development` is merged into `master` when a) the changes have been exercised in real use a few times and b) we reach a clear checkpoint.

## Some design principles
- Corpus methods should be easy to use and descriptive.
- Python files should be readable and well organized (check folder structure).
- Metadata should be written to two distinct parquet files depending on whether it is relevant to the end user ("metadata") or to debugging during pipeline runs. The principle of reading/writing to these parquet files should be maintained throughout. The rest of the metadata is implicitly encoded in the output folders at each stage of the pipeline.
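The two-parquet principle above can be sketched as a small splitter. This is an illustrative sketch only: the column names (`filename`, `url`, `needs_ocr`, and the debug fields) are assumptions, not the actual GlossAPI schema.

```python
# Hedged sketch of the two-parquet split: user-facing "metadata" columns
# vs pipeline-debug columns. Column names are illustrative assumptions.
USER_FACING_COLUMNS = {"filename", "url", "needs_ocr"}

def split_metadata(rows):
    """Split row dicts into user-facing metadata vs pipeline-debug fields."""
    user_rows, debug_rows = [], []
    for row in rows:
        user_rows.append({k: v for k, v in row.items() if k in USER_FACING_COLUMNS})
        debug_rows.append({k: v for k, v in row.items() if k not in USER_FACING_COLUMNS})
    return user_rows, debug_rows
```

Each list would then be written to its own parquet file, and every stage reads/writes through the same split.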

## Pipeline awareness and folder layout
- Tie any pipeline change to the artifacts it produces. Common touchpoints:
- `Corpus.extract()` writes source PDFs under `downloads/` and a manifest at `download_results/download_results.parquet` (fields like `needs_ocr`).
- `Corpus.clean()` emits `markdown/` and `clean_markdown/`, keeping `.processing_state.pkl` plus `problematic_files/` and `timeout_files/` subfolders.
- `Corpus.ocr()` and `Corpus.section()` populate `json/` (Docling JSON, formula index, metrics) and `sections/sections_for_annotation.parquet`.
- When relocating outputs or adding new ones, update assertions in `tests/test_pipeline_smoke.py` and the folder references in `docs/pipeline.md` so the layout stays discoverable.
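A quick way to keep the layout discoverable is a helper that reports which expected artifacts are missing from an output root. The list below is distilled from the touchpoints above and is a reading aid, not the authoritative layout assertion from `tests/test_pipeline_smoke.py`.

```python
from pathlib import Path

# Expected artifacts per the touchpoints above; illustrative, not exhaustive.
EXPECTED_ARTIFACTS = [
    "downloads",
    "download_results/download_results.parquet",
    "markdown",
    "clean_markdown",
    "json",
    "sections/sections_for_annotation.parquet",
]

def missing_artifacts(output_root):
    """Return the relative paths that are absent under output_root."""
    root = Path(output_root)
    return [rel for rel in EXPECTED_ARTIFACTS if not (root / rel).exists()]
```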

## Keep changes small
- Avoid large refactors or sweeping interface changes; aim for narrowly scoped PRs and discuss big shifts before starting.
117 changes: 94 additions & 23 deletions README.md
@@ -4,7 +4,7 @@ GlossAPI is a GPU-ready document processing pipeline from [GFOSS](https://gfoss.

## Why GlossAPI
- Handles download → extraction → cleaning → sectioning in one pipeline.
- Ships safe PyPDFium extraction plus Docling/RapidOCR for high-throughput OCR.
- Ships safe PyPDFium extraction plus Docling for structured extraction and DeepSeek-OCR-2 for OCR remediation.
- Rust-powered cleaner/noise metrics keep Markdown quality predictable.
- Greek-first metadata and section classification tuned for academic corpora.
- Modular Corpus API lets you resume from any stage or plug into existing flows.
@@ -40,56 +40,127 @@ PY

## Automated Environment Profiles

Use `dependency_setup/setup_glossapi.sh` to provision a virtualenv with the right dependency stack for the three supported modes:
Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `dependency_setup/setup_deepseek_uv.sh` for the dedicated DeepSeek OCR runtime:

```bash
# Vanilla pipeline (no GPU OCR extras)
./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests
# Docling / main GlossAPI environment
./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests

# Docling + RapidOCR mode
./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests

# DeepSeek OCR mode (requires weights under /path/to/deepseek-ocr/DeepSeek-OCR)
./dependency_setup/setup_glossapi.sh \
--mode deepseek \
# DeepSeek OCR runtime (uv-managed)
./dependency_setup/setup_deepseek_uv.sh \
--venv dependency_setup/.venvs/deepseek \
--weights-dir /path/to/deepseek-ocr \
--model-root /path/to/deepseek-ocr-2-model \
--download-model \
--run-tests --smoke-test
```

Pass `--download-deepseek` if you need the script to fetch weights automatically; otherwise it looks for `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Check `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation history. The script also installs the Rust extensions in editable mode so local changes are picked up immediately.
`setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. `setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`.

If you want a guided install that asks which phases you plan to use, run:

```bash
python install_glossapi.py
```

That wizard keeps browser-gated download support (`playwright`) and the dedicated DeepSeek OCR runtime out of the main environment unless you explicitly select them.

## Browser-Gated Download Mode

`Corpus.download(...)` now supports three high-level routes for file acquisition:

- `download_mode="standard"`: direct HTTP downloader only
- `download_mode="auto"`: direct HTTP first, then browser-assisted recovery when the response is a recoverable browser-gated interstitial
- `download_mode="browser"`: go straight to browser-assisted acquisition for known browser-gated file endpoints

Use `browser_mode=True` as a legacy alias for `download_mode="browser"`.
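The decision the three modes imply can be sketched in a few lines. This is a hedged sketch of the routing logic, not the actual GlossAPI internals; `choose_route` and `response_looks_gated` are illustrative names.

```python
# Minimal sketch of the routing decision implied by the three modes.
def choose_route(download_mode: str, response_looks_gated: bool) -> str:
    if download_mode == "browser":
        return "browser"  # go straight to browser-assisted acquisition
    if download_mode == "auto" and response_looks_gated:
        return "browser"  # recoverable interstitial -> browser-assisted retry
    return "standard"     # direct HTTP downloader only
```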

### Policy-driven routing

If you know which domains require browser bootstrap, route them with a policy file instead of probing every URL:

```yaml
default:
downloader: standard

rules:
- match:
domains: [eur-lex.europa.eu]
downloader: browser

- match:
url_regex: "https://example.org/protected/.*"
downloader: auto
```

```python
from glossapi import Corpus

corpus = Corpus(input_dir="out", output_dir="out")
corpus.download(
input_parquet="input_urls.parquet",
download_policy_file="download_policy.yml",
)
```

### Operational notes

- Browser mode is for browser-gated file endpoints, not viewer-only sources.
- Browser sessions are cached per domain so a successful bootstrap can be reused across multiple files.
- Successful downloads still land in `downloads/`; extraction continues to consume only real files from that directory.
- Viewer-style sources still fail cleanly in `download_results/*.parquet` and do not create fake files.
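The per-domain session reuse mentioned above amounts to a keyed cache: bootstrap once, reuse for every subsequent file on that domain. The sketch below is illustrative only and is not the actual cache implementation.

```python
# Illustrative sketch of per-domain browser-session reuse.
_sessions: dict = {}

def session_for(domain, bootstrap):
    """Reuse a bootstrapped browser session for repeat downloads on a domain."""
    if domain not in _sessions:
        _sessions[domain] = bootstrap(domain)  # pay the bootstrap cost once
    return _sessions[domain]
```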

### Regression strategy

The checked-in browser download tests use mocked browser/session flows and fake PDF bytes rather than hard-coded live URLs.

For manual smoke checks against live browser-gated sources, build an ad hoc parquet locally and run it outside the committed test suite.

**DeepSeek runtime checklist**
- Run `python -m glossapi.ocr.deepseek.preflight` (from your DeepSeek venv) to fail fast if the CLI would fall back to the stub.
- Export these to force the real CLI and avoid silent stub output:
- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR.
- Export these to force the real runtime and avoid silent stub output:
- `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1`
- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`
- `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py`
- `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python`
- `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR`
- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib`
- CUDA toolkit with `nvcc` available (FlashInfer/vLLM JIT falls back poorly without it); set `CUDA_HOME` and prepend `$CUDA_HOME/bin` to `PATH`.
- If FlashInfer is problematic, disable with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`.
- To avoid FP8 KV cache issues, export `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1` (propagates `--no-fp8-kv`).
- Tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`.
- `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python`
- `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py`
- `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2`
- The default fallback locations already point at the in-repo Transformers runner and `${REPO_ROOT}/deepseek-ocr-2-model/DeepSeek-OCR-2`.
- `flash-attn` is optional. The runner uses `flash_attention_2` when available and falls back to `eager` otherwise.
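The checklist above can be collected into one snippet sourced before an OCR run. All `/path/to/...` values are placeholders for your local checkout; adapt them before use.

```shell
# Hedged example of exporting the checklist variables before running OCR.
# Every /path/to/... value is a placeholder, not a real path.
export GLOSSAPI_DEEPSEEK_ALLOW_CLI=1
export GLOSSAPI_DEEPSEEK_ALLOW_STUB=0
export GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python
export GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py
export GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2

# Then fail fast before a long OCR run:
# python -m glossapi.ocr.deepseek.preflight
```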

## Choose Your Install Path

| Scenario | Commands | Notes |
| --- | --- | --- |
| Pip users | `pip install glossapi` | Fast vanilla evaluation with minimal dependencies. |
| Mode automation (recommended) | `./dependency_setup/setup_glossapi.sh --mode {vanilla\|rapidocr\|deepseek}` | Creates an isolated venv per mode, installs Rust crates, and can run the relevant pytest subset. |
| Docling environment | `./dependency_setup/setup_glossapi.sh --mode docling` | Creates the main GlossAPI venv for extraction, cleaning, sectioning, and enrichment. |
| DeepSeek environment | `./dependency_setup/setup_deepseek_uv.sh` | Creates a separate uv-managed OCR runtime pinned to the tested Transformers/Torch stack. |
| Manual editable install | `pip install -e .` after cloning | Keep this if you prefer to manage dependencies by hand. |
| Conda-based stacks | `scripts/setup_conda.sh` | Provisions Python 3.10 env + Rust + editable install for Amazon Linux/SageMaker. |

See the refreshed docs (`docs/index.md`) for detailed environment notes, CUDA/ORT combinations, and troubleshooting tips.

## Repo Landmarks
- `docs/code_map.md`: fast map from pipeline ideas to implementing classes and files.
- `docs/pipeline.md`: stage contracts, key parameters, and artifact outputs.
- `samples/lightweight_pdf_corpus/`: 20 one-page PDFs with manifest + expected Markdown.
- `src/glossapi/`: Corpus pipeline, cleaners, and orchestration logic.
- `tests/test_pipeline_smoke.py`: Minimal regression entry point (uses the lightweight corpus).
- `docs/`: MkDocs site with onboarding, pipeline recipes, and configuration guides.

## Pipeline map

Use this as the shortest path from a documentation concept to the public call that implements it.

| Stage | Main call | Important parameters | Writes |
| --- | --- | --- | --- |
| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, `download_mode="standard"\|"auto"\|"browser"`, `download_policy_file`, downloader kwargs | `downloads/`, `download_results/*.parquet` |
| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `export_doc_json`, `emit_formula_index` | `markdown/<stem>.md`, `json/<stem>.docling.json(.zst)`, `json/metrics/*.json` |
| Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/<stem>.md`, updated parquet metrics/flags |
| OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/<stem>.md`, optional `json/<stem>.latex_map.jsonl` |
| Section | `Corpus.section()` | uses cleaner/parquet outputs to choose inputs | `sections/sections_for_annotation.parquet` |
| Annotate | `Corpus.annotate(...)` | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` |
| Triage math density | `Corpus.triage_math()` | no required args | updated `download_results/*.parquet` routing columns |
| JSONL export | `Corpus.jsonl(...)` | `output_path` | merged training/export JSONL |

## Contributing
- Run `pytest tests/test_pipeline_smoke.py` for a fast end-to-end check.
- Regenerate the lightweight corpus via `generate_pdfs.py` and commit the updated PDFs + manifest together.
33 changes: 15 additions & 18 deletions dependency_setup/deepseek_gpu_smoke.py
@@ -3,9 +3,9 @@
Minimal DeepSeek OCR integration smoke test.

This script runs the GlossAPI DeepSeek backend on a tiny sample PDF and
verifies that real Markdown output is produced. It requires the DeepSeek-OCR
weights to be available under ``../deepseek-ocr/DeepSeek-OCR`` relative to
the repository root (override via ``DEEPSEEK_MODEL_DIR``).
verifies that real Markdown output is produced. It requires the DeepSeek-OCR-2
weights to be available under ``deepseek-ocr-2-model/DeepSeek-OCR-2`` in the
repository root (override via ``DEEPSEEK_MODEL_DIR``).
"""
from __future__ import annotations

@@ -20,15 +20,16 @@

REPO_ROOT = Path(__file__).resolve().parents[1]
SAMPLES_DIR = REPO_ROOT / "samples" / "lightweight_pdf_corpus" / "pdfs"
DEFAULT_MODEL_ROOT = (REPO_ROOT / ".." / "deepseek-ocr").resolve()
DEFAULT_MODEL_ROOT = (REPO_ROOT / "deepseek-ocr-2-model").resolve()


def ensure_model_available(model_root: Path) -> None:
expected = model_root / "DeepSeek-OCR" / "model-00001-of-000001.safetensors"
direct_root = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2")
expected = direct_root / "model-00001-of-000001.safetensors"
if not expected.exists() or expected.stat().st_size < 1_000_000:
raise FileNotFoundError(
f"Expected DeepSeek-OCR weights at {expected}. "
"Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR) "
f"Expected DeepSeek-OCR-2 weights at {expected}. "
"Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR-2) "
"or set DEEPSEEK_MODEL_DIR to the directory that contains them."
)

@@ -37,7 +38,8 @@ def run_smoke(model_root: Path) -> None:
from glossapi import Corpus

ensure_model_available(model_root)
sample_pdf = SAMPLES_DIR / "sample01_plain.pdf"
model_dir = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2")
sample_pdf = SAMPLES_DIR / "alpha.pdf"
if not sample_pdf.exists():
raise FileNotFoundError(f"Sample PDF not found: {sample_pdf}")

@@ -67,22 +69,17 @@ def run_smoke(model_root: Path) -> None:
parquet_path = dl_dir / "download_results.parquet"
df.to_parquet(parquet_path, index=False)

os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1")
os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0")
os.environ.setdefault(
"GLOSSAPI_DEEPSEEK_VLLM_SCRIPT",
str(model_root / "run_pdf_ocr_vllm.py"),
"GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT",
str(REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py"),
)
os.environ.setdefault(
"GLOSSAPI_DEEPSEEK_PYTHON",
sys.executable,
)
ld_extra = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") or str(
model_root / "libjpeg-turbo" / "lib"
)
os.environ["GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH"] = ld_extra
os.environ["LD_LIBRARY_PATH"] = (
f"{ld_extra}:{os.environ.get('LD_LIBRARY_PATH','')}".rstrip(":")
)
os.environ.setdefault("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir))

corpus = Corpus(input_dir=input_dir, output_dir=output_dir)
corpus.ocr(
@@ -100,7 +97,7 @@


def main() -> None:
model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR")
model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR")
if model_dir_env:
model_root = Path(model_dir_env).expanduser().resolve()
else:
28 changes: 28 additions & 0 deletions dependency_setup/deepseek_uv/pyproject.toml
@@ -0,0 +1,28 @@
[project]
name = "glossapi-deepseek-runtime"
version = "0.1.0"
description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution"
requires-python = ">=3.11,<3.13"
dependencies = [
"glossapi[docling,deepseek]",
"torch==2.6.0",
"torchvision==0.21.0",
"torchaudio==2.6.0",
]

[dependency-groups]
test = [
"pytest",
"fpdf2",
]

[tool.uv.sources]
glossapi = { path = "../..", editable = true }
torch = { index = "pytorch-cu118" }
torchvision = { index = "pytorch-cu118" }
torchaudio = { index = "pytorch-cu118" }

[[tool.uv.index]]
name = "pytorch-cu118"
url = "https://download.pytorch.org/whl/cu118"
explicit = true