diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..4719481 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,40 @@ +name: Build and Deploy Docs + +on: + push: + branches: + - development + - main + - master + workflow_dispatch: + +permissions: + contents: write + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install MkDocs + run: | + python -m pip install --upgrade pip + pip install 'mkdocs<2' 'mkdocs-material<10' + + - name: Build site + run: mkdocs build --strict + + - name: Deploy to gh-pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./site + publish_branch: gh-pages + force_orphan: true diff --git a/.gitignore b/.gitignore index 8c98a88..74f3edc 100644 --- a/.gitignore +++ b/.gitignore @@ -58,10 +58,13 @@ htmlcov/ # OCR test outputs test_ocr_*_output/ *_demo_output/ +artifacts/ # OCR model weights (if downloaded locally) nanonets/ ocr_models/ +deepseek-ocr-2-model/ +models/ # Noise analysis reports glossapi_noise_analysis_report.md @@ -78,4 +81,4 @@ dependency_setup/.venvs/ deepseek-ocr/DeepSeek-OCR-empty/ # Local DeepSeek checkout and repro scripts (keep out of master) deepseek-ocr/ -repro_rapidocr_onnx/ +deepseek-ocr-2/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..979e757 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,20 @@ +# Contributing to GlossAPI + +## Working branches and PR flow +- Open PRs are pushed against the `development` branch. +- Development is merged with master when a) everything has been effectively used a few times and b) we reach a clear checkpoint. + +## Some design principles +- Corpus methods should be easy to use and descriptive. +- Python files should be readable and well organized (check folder structure). 
+- Metadata should be written to two distinct parquet files depending on their relevance to the end user ("metadata") or debugging during pipeline runs. The principle of reading/writing to these parquet files should be maintained throughout. The rest of the metadata is implicitly encoded in the output folders at each stage of the pipeline. + +## Pipeline awareness and folder layout +- Tie any pipeline change to the artifacts it produces. Common touchpoints: + - `Corpus.extract()` writes source PDFs under `downloads/` and a manifest at `download_results/download_results.parquet` (fields like `needs_ocr`). + - `Corpus.clean()` emits `markdown/` and `clean_markdown/`, keeping `.processing_state.pkl` plus `problematic_files/` and `timeout_files/` subfolders. + - `Corpus.ocr()` and `Corpus.section()` populate `json/` (Docling JSON, formula index, metrics) and `sections/sections_for_annotation.parquet`. +- When relocating outputs or adding new ones, update assertions in `tests/test_pipeline_smoke.py` and the folder references in `docs/pipeline.md` so the layout stays discoverable. + +## Keep changes small +- Avoid large refactors or sweeping interface changes; aim for narrowly scoped PRs and discuss big shifts before starting. diff --git a/README.md b/README.md index ebc6baf..04be81a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ GlossAPI is a GPU-ready document processing pipeline from [GFOSS](https://gfoss. ## Why GlossAPI - Handles download → extraction → cleaning → sectioning in one pipeline. -- Ships safe PyPDFium extraction plus Docling/RapidOCR for high-throughput OCR. +- Ships safe PyPDFium extraction plus Docling for structured extraction and DeepSeek-OCR-2 for OCR remediation. - Rust-powered cleaner/noise metrics keep Markdown quality predictable. - Greek-first metadata and section classification tuned for academic corpora. - Modular Corpus API lets you resume from any stage or plug into existing flows. 
@@ -40,56 +40,128 @@ PY ## Automated Environment Profiles -Use `dependency_setup/setup_glossapi.sh` to provision a virtualenv with the right dependency stack for the three supported modes: +Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `dependency_setup/setup_deepseek_uv.sh` for the dedicated DeepSeek OCR runtime: ```bash -# Vanilla pipeline (no GPU OCR extras) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests +# Docling / main GlossAPI environment +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests -# Docling + RapidOCR mode -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests - -# DeepSeek OCR mode (requires weights under /path/to/deepseek-ocr/DeepSeek-OCR) -./dependency_setup/setup_glossapi.sh \ - --mode deepseek \ +# DeepSeek OCR runtime (uv-managed) +./dependency_setup/setup_deepseek_uv.sh \ --venv dependency_setup/.venvs/deepseek \ - --weights-dir /path/to/deepseek-ocr \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ --run-tests --smoke-test ``` -Pass `--download-deepseek` if you need the script to fetch weights automatically; otherwise it looks for `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Check `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation history. The script also installs the Rust extensions in editable mode so local changes are picked up immediately. +`setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. `setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`. +The uv-managed DeepSeek runtime is OCR-only on purpose: it installs `glossapi[deepseek]` and does not carry the Docling layout stack. 
+ +If you want a guided install that asks which phases you plan to use, run: + +```bash +python install_glossapi.py +``` + +That wizard keeps browser-gated download support (`playwright`) and the dedicated DeepSeek OCR runtime out of the main environment unless you explicitly select them. + +## Browser-Gated Download Mode + +`Corpus.download(...)` now supports three high-level routes for file acquisition: + +- `download_mode="standard"`: direct HTTP downloader only +- `download_mode="auto"`: direct HTTP first, then browser-assisted recovery when the response is a recoverable browser-gated interstitial +- `download_mode="browser"`: go straight to browser-assisted acquisition for known browser-gated file endpoints + +Use `browser_mode=True` as a legacy alias for `download_mode="browser"`. + +### Policy-driven routing + +If you know which domains require browser bootstrap, route them with a policy file instead of probing every URL: + +```yaml +default: + downloader: standard + +rules: + - match: + domains: [eur-lex.europa.eu] + downloader: browser + + - match: + url_regex: "https://example.org/protected/.*" + downloader: auto +``` + +```python +from glossapi import Corpus + +corpus = Corpus(input_dir="out", output_dir="out") +corpus.download( + input_parquet="input_urls.parquet", + download_policy_file="download_policy.yml", +) +``` + +### Operational notes + +- Browser mode is for browser-gated file endpoints, not viewer-only sources. +- Browser sessions are cached per domain so a successful bootstrap can be reused across multiple files. +- Successful downloads still land in `downloads/`; extraction continues to consume only real files from that directory. +- Viewer-style sources still fail cleanly in `download_results/*.parquet` and do not create fake files. + +### Regression strategy + +The checked-in browser download tests use mocked browser/session flows and fake PDF bytes rather than hard-coded live URLs. 
+ +For manual smoke checks against live browser-gated sources, build an ad hoc parquet locally and run it outside the committed test suite. **DeepSeek runtime checklist** -- Run `python -m glossapi.ocr.deepseek.preflight` (from your DeepSeek venv) to fail fast if the CLI would fall back to the stub. -- Export these to force the real CLI and avoid silent stub output: +- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR. +- Export these to force the real runtime and avoid silent stub output: - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - - `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py` - - `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python` - - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR` - - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib` -- CUDA toolkit with `nvcc` available (FlashInfer/vLLM JIT falls back poorly without it); set `CUDA_HOME` and prepend `$CUDA_HOME/bin` to `PATH`. -- If FlashInfer is problematic, disable with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`. -- To avoid FP8 KV cache issues, export `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1` (propagates `--no-fp8-kv`). -- Tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`. + - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` + - `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py` + - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- The default fallback locations already point at the in-repo Transformers runner and `${REPO_ROOT}/deepseek-ocr-2-model/DeepSeek-OCR-2`. +- `flash-attn` is optional. The runner uses `flash_attention_2` when available and falls back to `eager` otherwise. ## Choose Your Install Path | Scenario | Commands | Notes | | --- | --- | --- | | Pip users | `pip install glossapi` | Fast vanilla evaluation with minimal dependencies. 
| -| Mode automation (recommended) | `./dependency_setup/setup_glossapi.sh --mode {vanilla\|rapidocr\|deepseek}` | Creates an isolated venv per mode, installs Rust crates, and can run the relevant pytest subset. | +| Docling environment | `./dependency_setup/setup_glossapi.sh --mode docling` | Creates the main GlossAPI venv for extraction, cleaning, sectioning, and enrichment. | +| DeepSeek environment | `./dependency_setup/setup_deepseek_uv.sh` | Creates a separate uv-managed OCR runtime pinned to the tested Transformers/Torch stack. | | Manual editable install | `pip install -e .` after cloning | Keep this if you prefer to manage dependencies by hand. | | Conda-based stacks | `scripts/setup_conda.sh` | Provisions Python 3.10 env + Rust + editable install for Amazon Linux/SageMaker. | See the refreshed docs (`docs/index.md`) for detailed environment notes, CUDA/ORT combinations, and troubleshooting tips. ## Repo Landmarks +- `docs/code_map.md`: fast map from pipeline ideas to implementing classes and files. +- `docs/pipeline.md`: stage contracts, key parameters, and artifact outputs. - `samples/lightweight_pdf_corpus/`: 20 one-page PDFs with manifest + expected Markdown. - `src/glossapi/`: Corpus pipeline, cleaners, and orchestration logic. - `tests/test_pipeline_smoke.py`: Minimal regression entry point (uses the lightweight corpus). - `docs/`: MkDocs site with onboarding, pipeline recipes, and configuration guides. +## Pipeline map + +Use this as the shortest path from a documentation concept to the public call that implements it. 
+ +| Stage | Main call | Important parameters | Writes | +| --- | --- | --- | --- | +| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, `download_mode="standard"\|"auto"\|"browser"`, `download_policy_file`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `use_gpus`, `workers_per_device`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | +| Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, updated parquet metrics/flags | +| OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/.md`, optional `json/.latex_map.jsonl` | +| Section | `Corpus.section()` | uses cleaner/parquet outputs to choose inputs | `sections/sections_for_annotation.parquet` | +| Annotate | `Corpus.annotate(...)` | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` | +| Triage math density | `Corpus.triage_math()` | no required args | updated `download_results/*.parquet` routing columns | +| JSONL export | `Corpus.jsonl(...)` | `output_path` | merged training/export JSONL | + ## Contributing - Run `pytest tests/test_pipeline_smoke.py` for a fast end-to-end check. - Regenerate the lightweight corpus via `generate_pdfs.py` and commit the updated PDFs + manifest together. diff --git a/dependency_setup/deepseek_gpu_smoke.py b/dependency_setup/deepseek_gpu_smoke.py index e85d202..ddfb314 100644 --- a/dependency_setup/deepseek_gpu_smoke.py +++ b/dependency_setup/deepseek_gpu_smoke.py @@ -3,9 +3,9 @@ Minimal DeepSeek OCR integration smoke test. This script runs the GlossAPI DeepSeek backend on a tiny sample PDF and -verifies that real Markdown output is produced. 
It requires the DeepSeek-OCR -weights to be available under ``../deepseek-ocr/DeepSeek-OCR`` relative to -the repository root (override via ``DEEPSEEK_MODEL_DIR``). +verifies that real Markdown output is produced. It requires the DeepSeek-OCR-2 +weights to be available under ``deepseek-ocr-2-model/DeepSeek-OCR-2`` relative to the +repository root (override via ``DEEPSEEK_MODEL_DIR`` or ``GLOSSAPI_DEEPSEEK_MODEL_DIR``). """ from __future__ import annotations @@ -20,15 +20,16 @@ REPO_ROOT = Path(__file__).resolve().parents[1] SAMPLES_DIR = REPO_ROOT / "samples" / "lightweight_pdf_corpus" / "pdfs" -DEFAULT_MODEL_ROOT = (REPO_ROOT / ".." / "deepseek-ocr").resolve() +DEFAULT_MODEL_ROOT = (REPO_ROOT / "deepseek-ocr-2-model").resolve() def ensure_model_available(model_root: Path) -> None: - expected = model_root / "DeepSeek-OCR" / "model-00001-of-000001.safetensors" + direct_root = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2") + expected = direct_root / "model-00001-of-000001.safetensors" if not expected.exists() or expected.stat().st_size < 1_000_000: raise FileNotFoundError( - f"Expected DeepSeek-OCR weights at {expected}. " - "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR) " + f"Expected DeepSeek-OCR-2 weights at {expected}. " + "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR-2) " "or set DEEPSEEK_MODEL_DIR to the directory that contains them." 
) @@ -37,7 +38,8 @@ def run_smoke(model_root: Path) -> None: from glossapi import Corpus ensure_model_available(model_root) - sample_pdf = SAMPLES_DIR / "sample01_plain.pdf" + model_dir = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2") + sample_pdf = SAMPLES_DIR / "alpha.pdf" if not sample_pdf.exists(): raise FileNotFoundError(f"Sample PDF not found: {sample_pdf}") @@ -67,22 +69,17 @@ def run_smoke(model_root: Path) -> None: parquet_path = dl_dir / "download_results.parquet" df.to_parquet(parquet_path, index=False) + os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") os.environ.setdefault( - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", - str(model_root / "run_pdf_ocr_vllm.py"), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + str(REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py"), ) os.environ.setdefault( "GLOSSAPI_DEEPSEEK_PYTHON", sys.executable, ) - ld_extra = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") or str( - model_root / "libjpeg-turbo" / "lib" - ) - os.environ["GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH"] = ld_extra - os.environ["LD_LIBRARY_PATH"] = ( - f"{ld_extra}:{os.environ.get('LD_LIBRARY_PATH','')}".rstrip(":") - ) + os.environ.setdefault("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir)) corpus = Corpus(input_dir=input_dir, output_dir=output_dir) corpus.ocr( @@ -100,7 +97,7 @@ def run_smoke(model_root: Path) -> None: def main() -> None: - model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") + model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR") if model_dir_env: model_root = Path(model_dir_env).expanduser().resolve() else: diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml new file mode 100644 index 0000000..6f7ffe0 --- /dev/null +++ b/dependency_setup/deepseek_uv/pyproject.toml @@ -0,0 +1,28 @@ +[project] +name = 
"glossapi-deepseek-runtime" +version = "0.1.0" +description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution" +requires-python = ">=3.11,<3.13" +dependencies = [ + "glossapi[deepseek]", + "torch==2.10.0", + "torchvision==0.25.0", + "torchaudio==2.10.0", +] + +[dependency-groups] +test = [ + "pytest", + "fpdf2", +] + +[tool.uv.sources] +glossapi = { path = "../..", editable = true } +torch = { index = "pytorch-cu130" } +torchvision = { index = "pytorch-cu130" } +torchaudio = { index = "pytorch-cu130" } + +[[tool.uv.index]] +name = "pytorch-cu130" +url = "https://download.pytorch.org/whl/cu130" +explicit = true diff --git a/dependency_setup/deepseek_uv/uv.lock b/dependency_setup/deepseek_uv/uv.lock new file mode 100644 index 0000000..a136794 --- /dev/null +++ b/dependency_setup/deepseek_uv/uv.lock @@ -0,0 +1,1771 @@ +version = 1 +revision = 3 +requires-python = ">=3.11, <3.13" +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] + +[[package]] +name = "accelerate" +version = "1.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ca/14/787e5498cd062640f0f3d92ef4ae4063174f76f9afd29d13fc52a319daae/accelerate-1.13.0.tar.gz", hash = "sha256:d631b4e0f5b3de4aff2d7e9e6857d164810dfc3237d54d017f075122d057b236", size = 402835, upload-time = "2026-03-04T19:34:12.359Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/46/02ac5e262d4af18054b3e922b2baedbb2a03289ee792162de60a865defc5/accelerate-1.13.0-py3-none-any.whl", hash = "sha256:cf1a3efb96c18f7b152eb0fa7490f3710b19c3f395699358f08decca2b8b62e0", size = 383744, upload-time = "2026-03-04T19:34:10.313Z" }, +] + +[[package]] +name = "addict" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/ef/fd7649da8af11d93979831e8f1f8097e85e82d5bfeabc8c68b39175d8e75/addict-2.4.0.tar.gz", hash = "sha256:b3b2210e0e067a281f5646c8c5db92e99b7231ea8b0eb5f74dbdf9e259d4e494", size = 9186, upload-time = "2020-11-21T16:21:31.416Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl", hash = "sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc", size = 3832, upload-time = "2020-11-21T16:21:29.588Z" }, +] + +[[package]] +name = "aiofiles" +version = "25.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/c3/534eac40372d8ee36ef40df62ec129bee4fdb5ad9706e58a29be53b2c970/aiofiles-25.1.0.tar.gz", hash = "sha256:a8d728f0a29de45dc521f18f07297428d56992a742f0cd2701ba86e44d23d5b2", size = 46354, upload-time = "2025-10-09T20:51:04.358Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/8a/340a1555ae33d7354dbca4faa54948d76d89a27ceef032c8c3bc661d003e/aiofiles-25.1.0-py3-none-any.whl", hash = "sha256:abe311e527c862958650f9438e859c1fa7568a141b22abcd015e120e86a85695", size = 14668, upload-time = "2025-10-09T20:51:03.174Z" }, +] + +[[package]] +name 
= "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.13.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/50/42/32cf8e7704ceb4481406eb87161349abb46a57fee3f008ba9cb610968646/aiohttp-3.13.3.tar.gz", hash = "sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88", size = 7844556, upload-time = "2026-01-03T17:33:05.204Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/4c/a164164834f03924d9a29dc3acd9e7ee58f95857e0b467f6d04298594ebb/aiohttp-3.13.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5b6073099fb654e0a068ae678b10feff95c5cae95bbfcbfa7af669d361a8aa6b", size = 746051, upload-time = "2026-01-03T17:29:43.287Z" }, + { url = "https://files.pythonhosted.org/packages/82/71/d5c31390d18d4f58115037c432b7e0348c60f6f53b727cad33172144a112/aiohttp-3.13.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cb93e166e6c28716c8c6aeb5f99dfb6d5ccf482d29fe9bf9a794110e6d0ab64", size = 499234, upload-time = "2026-01-03T17:29:44.822Z" }, + { url = 
"https://files.pythonhosted.org/packages/0e/c9/741f8ac91e14b1d2e7100690425a5b2b919a87a5075406582991fb7de920/aiohttp-3.13.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28e027cf2f6b641693a09f631759b4d9ce9165099d2b5d92af9bd4e197690eea", size = 494979, upload-time = "2026-01-03T17:29:46.405Z" }, + { url = "https://files.pythonhosted.org/packages/75/b5/31d4d2e802dfd59f74ed47eba48869c1c21552c586d5e81a9d0d5c2ad640/aiohttp-3.13.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b61b7169ababd7802f9568ed96142616a9118dd2be0d1866e920e77ec8fa92a", size = 1748297, upload-time = "2026-01-03T17:29:48.083Z" }, + { url = "https://files.pythonhosted.org/packages/1a/3e/eefad0ad42959f226bb79664826883f2687d602a9ae2941a18e0484a74d3/aiohttp-3.13.3-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:80dd4c21b0f6237676449c6baaa1039abae86b91636b6c91a7f8e61c87f89540", size = 1707172, upload-time = "2026-01-03T17:29:49.648Z" }, + { url = "https://files.pythonhosted.org/packages/c5/3a/54a64299fac2891c346cdcf2aa6803f994a2e4beeaf2e5a09dcc54acc842/aiohttp-3.13.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:65d2ccb7eabee90ce0503c17716fc77226be026dcc3e65cce859a30db715025b", size = 1805405, upload-time = "2026-01-03T17:29:51.244Z" }, + { url = "https://files.pythonhosted.org/packages/6c/70/ddc1b7169cf64075e864f64595a14b147a895a868394a48f6a8031979038/aiohttp-3.13.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5b179331a481cb5529fca8b432d8d3c7001cb217513c94cd72d668d1248688a3", size = 1899449, upload-time = "2026-01-03T17:29:53.938Z" }, + { url = "https://files.pythonhosted.org/packages/a1/7e/6815aab7d3a56610891c76ef79095677b8b5be6646aaf00f69b221765021/aiohttp-3.13.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:9d4c940f02f49483b18b079d1c27ab948721852b281f8b015c058100e9421dd1", size = 1748444, upload-time = "2026-01-03T17:29:55.484Z" }, + { url = "https://files.pythonhosted.org/packages/6b/f2/073b145c4100da5511f457dc0f7558e99b2987cf72600d42b559db856fbc/aiohttp-3.13.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f9444f105664c4ce47a2a7171a2418bce5b7bae45fb610f4e2c36045d85911d3", size = 1606038, upload-time = "2026-01-03T17:29:57.179Z" }, + { url = "https://files.pythonhosted.org/packages/0a/c1/778d011920cae03ae01424ec202c513dc69243cf2db303965615b81deeea/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:694976222c711d1d00ba131904beb60534f93966562f64440d0c9d41b8cdb440", size = 1724156, upload-time = "2026-01-03T17:29:58.914Z" }, + { url = "https://files.pythonhosted.org/packages/0e/cb/3419eabf4ec1e9ec6f242c32b689248365a1cf621891f6f0386632525494/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f33ed1a2bf1997a36661874b017f5c4b760f41266341af36febaf271d179f6d7", size = 1722340, upload-time = "2026-01-03T17:30:01.962Z" }, + { url = "https://files.pythonhosted.org/packages/7a/e5/76cf77bdbc435bf233c1f114edad39ed4177ccbfab7c329482b179cff4f4/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e636b3c5f61da31a92bf0d91da83e58fdfa96f178ba682f11d24f31944cdd28c", size = 1783041, upload-time = "2026-01-03T17:30:03.609Z" }, + { url = "https://files.pythonhosted.org/packages/9d/d4/dd1ca234c794fd29c057ce8c0566b8ef7fd6a51069de5f06fa84b9a1971c/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5d2d94f1f5fcbe40838ac51a6ab5704a6f9ea42e72ceda48de5e6b898521da51", size = 1596024, upload-time = "2026-01-03T17:30:05.132Z" }, + { url = "https://files.pythonhosted.org/packages/55/58/4345b5f26661a6180afa686c473620c30a66afdf120ed3dd545bbc809e85/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2be0e9ccf23e8a94f6f0650ce06042cefc6ac703d0d7ab6c7a917289f2539ad4", size = 
1804590, upload-time = "2026-01-03T17:30:07.135Z" }, + { url = "https://files.pythonhosted.org/packages/7b/06/05950619af6c2df7e0a431d889ba2813c9f0129cec76f663e547a5ad56f2/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9af5e68ee47d6534d36791bbe9b646d2a7c7deb6fc24d7943628edfbb3581f29", size = 1740355, upload-time = "2026-01-03T17:30:09.083Z" }, + { url = "https://files.pythonhosted.org/packages/3e/80/958f16de79ba0422d7c1e284b2abd0c84bc03394fbe631d0a39ffa10e1eb/aiohttp-3.13.3-cp311-cp311-win32.whl", hash = "sha256:a2212ad43c0833a873d0fb3c63fa1bacedd4cf6af2fee62bf4b739ceec3ab239", size = 433701, upload-time = "2026-01-03T17:30:10.869Z" }, + { url = "https://files.pythonhosted.org/packages/dc/f2/27cdf04c9851712d6c1b99df6821a6623c3c9e55956d4b1e318c337b5a48/aiohttp-3.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:642f752c3eb117b105acbd87e2c143de710987e09860d674e068c4c2c441034f", size = 457678, upload-time = "2026-01-03T17:30:12.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/be/4fc11f202955a69e0db803a12a062b8379c970c7c84f4882b6da17337cc1/aiohttp-3.13.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c", size = 739732, upload-time = "2026-01-03T17:30:14.23Z" }, + { url = "https://files.pythonhosted.org/packages/97/2c/621d5b851f94fa0bb7430d6089b3aa970a9d9b75196bc93bb624b0db237a/aiohttp-3.13.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168", size = 494293, upload-time = "2026-01-03T17:30:15.96Z" }, + { url = "https://files.pythonhosted.org/packages/5d/43/4be01406b78e1be8320bb8316dc9c42dbab553d281c40364e0f862d5661c/aiohttp-3.13.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d", size = 493533, upload-time = "2026-01-03T17:30:17.431Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/a8/5a35dc56a06a2c90d4742cbf35294396907027f80eea696637945a106f25/aiohttp-3.13.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29", size = 1737839, upload-time = "2026-01-03T17:30:19.422Z" }, + { url = "https://files.pythonhosted.org/packages/bf/62/4b9eeb331da56530bf2e198a297e5303e1c1ebdceeb00fe9b568a65c5a0c/aiohttp-3.13.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3", size = 1703932, upload-time = "2026-01-03T17:30:21.756Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f6/af16887b5d419e6a367095994c0b1332d154f647e7dc2bd50e61876e8e3d/aiohttp-3.13.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d", size = 1771906, upload-time = "2026-01-03T17:30:23.932Z" }, + { url = "https://files.pythonhosted.org/packages/ce/83/397c634b1bcc24292fa1e0c7822800f9f6569e32934bdeef09dae7992dfb/aiohttp-3.13.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463", size = 1871020, upload-time = "2026-01-03T17:30:26Z" }, + { url = "https://files.pythonhosted.org/packages/86/f6/a62cbbf13f0ac80a70f71b1672feba90fdb21fd7abd8dbf25c0105fb6fa3/aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc", size = 1755181, upload-time = "2026-01-03T17:30:27.554Z" }, + { url = "https://files.pythonhosted.org/packages/0a/87/20a35ad487efdd3fba93d5843efdfaa62d2f1479eaafa7453398a44faf13/aiohttp-3.13.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf", size = 1561794, upload-time = "2026-01-03T17:30:29.254Z" }, + { url = "https://files.pythonhosted.org/packages/de/95/8fd69a66682012f6716e1bc09ef8a1a2a91922c5725cb904689f112309c4/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033", size = 1697900, upload-time = "2026-01-03T17:30:31.033Z" }, + { url = "https://files.pythonhosted.org/packages/e5/66/7b94b3b5ba70e955ff597672dad1691333080e37f50280178967aff68657/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f", size = 1728239, upload-time = "2026-01-03T17:30:32.703Z" }, + { url = "https://files.pythonhosted.org/packages/47/71/6f72f77f9f7d74719692ab65a2a0252584bf8d5f301e2ecb4c0da734530a/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679", size = 1740527, upload-time = "2026-01-03T17:30:34.695Z" }, + { url = "https://files.pythonhosted.org/packages/fa/b4/75ec16cbbd5c01bdaf4a05b19e103e78d7ce1ef7c80867eb0ace42ff4488/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423", size = 1554489, upload-time = "2026-01-03T17:30:36.864Z" }, + { url = "https://files.pythonhosted.org/packages/52/8f/bc518c0eea29f8406dcf7ed1f96c9b48e3bc3995a96159b3fc11f9e08321/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce", size = 1767852, upload-time = "2026-01-03T17:30:39.433Z" }, + { url = "https://files.pythonhosted.org/packages/9d/f2/a07a75173124f31f11ea6f863dc44e6f09afe2bca45dd4e64979490deab1/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a", size = 1722379, upload-time = 
"2026-01-03T17:30:41.081Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4a/1a3fee7c21350cac78e5c5cef711bac1b94feca07399f3d406972e2d8fcd/aiohttp-3.13.3-cp312-cp312-win32.whl", hash = "sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046", size = 428253, upload-time = "2026-01-03T17:30:42.644Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b7/76175c7cb4eb73d91ad63c34e29fc4f77c9386bba4a65b53ba8e05ee3c39/aiohttp-3.13.3-cp312-cp312-win_amd64.whl", hash = "sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57", size = 455407, upload-time = "2026-01-03T17:30:44.195Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, +] + +[[package]] +name = "attrs" +version = "25.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6b/5c/685e6633917e101e5dcb62b9dd76946cbb57c26e133bae9e0cd36033c0a9/attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11", size = 934251, upload-time = "2025-10-06T13:54:44.725Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = 
"sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, +] + +[[package]] +name = "certifi" +version = "2026.2.25" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/35/02daf95b9cd686320bb622eb148792655c9412dbb9b67abb5694e5910a24/charset_normalizer-3.4.5.tar.gz", hash = "sha256:95adae7b6c42a6c5b5b559b1a99149f090a57128155daeea91732c8d970d8644", size = 134804, upload-time = "2026-03-06T06:03:19.46Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/9e/bcec3b22c64ecec47d39bf5167c2613efd41898c019dccd4183f6aa5d6a7/charset_normalizer-3.4.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:610f72c0ee565dfb8ae1241b666119582fdbfe7c0975c175be719f940e110694", size = 279531, upload-time = "2026-03-06T06:00:52.252Z" }, + { url = "https://files.pythonhosted.org/packages/58/12/81fd25f7e7078ab5d1eedbb0fac44be4904ae3370a3bf4533c8f2d159acd/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60d68e820af339df4ae8358c7a2e7596badeb61e544438e489035f9fbf3246a5", size = 188006, upload-time = "2026-03-06T06:00:53.8Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/6e/f2d30e8c27c1b0736a6520311982cf5286cfc7f6cac77d7bc1325e3a23f2/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:10b473fc8dca1c3ad8559985794815f06ca3fc71942c969129070f2c3cdf7281", size = 205085, upload-time = "2026-03-06T06:00:55.311Z" }, + { url = "https://files.pythonhosted.org/packages/d0/90/d12cefcb53b5931e2cf792a33718d7126efb116a320eaa0742c7059a95e4/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d4eb8ac7469b2a5d64b5b8c04f84d8bf3ad340f4514b98523805cbf46e3b3923", size = 200545, upload-time = "2026-03-06T06:00:56.532Z" }, + { url = "https://files.pythonhosted.org/packages/03/f4/44d3b830a20e89ff82a3134912d9a1cf6084d64f3b95dcad40f74449a654/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bcb3227c3d9aaf73eaaab1db7ccd80a8995c509ee9941e2aae060ca6e4e5d81", size = 193863, upload-time = "2026-03-06T06:00:57.823Z" }, + { url = "https://files.pythonhosted.org/packages/25/4b/f212119c18a6320a9d4a730d1b4057875cdeabf21b3614f76549042ef8a8/charset_normalizer-3.4.5-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:75ee9c1cce2911581a70a3c0919d8bccf5b1cbc9b0e5171400ec736b4b569497", size = 181827, upload-time = "2026-03-06T06:00:59.323Z" }, + { url = "https://files.pythonhosted.org/packages/74/00/b26158e48b425a202a92965f8069e8a63d9af1481dfa206825d7f74d2a3c/charset_normalizer-3.4.5-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1d1401945cb77787dbd3af2446ff2d75912327c4c3a1526ab7955ecf8600687c", size = 191085, upload-time = "2026-03-06T06:01:00.546Z" }, + { url = "https://files.pythonhosted.org/packages/c4/c2/1c1737bf6fd40335fe53d28fe49afd99ee4143cc57a845e99635ce0b9b6d/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:0a45e504f5e1be0bd385935a8e1507c442349ca36f511a47057a71c9d1d6ea9e", size = 190688, upload-time = "2026-03-06T06:01:02.479Z" }, + { url = "https://files.pythonhosted.org/packages/5a/3d/abb5c22dc2ef493cd56522f811246a63c5427c08f3e3e50ab663de27fcf4/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:e09f671a54ce70b79a1fc1dc6da3072b7ef7251fadb894ed92d9aa8218465a5f", size = 183077, upload-time = "2026-03-06T06:01:04.231Z" }, + { url = "https://files.pythonhosted.org/packages/44/33/5298ad4d419a58e25b3508e87f2758d1442ff00c2471f8e0403dab8edad5/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d01de5e768328646e6a3fa9e562706f8f6641708c115c62588aef2b941a4f88e", size = 206706, upload-time = "2026-03-06T06:01:05.773Z" }, + { url = "https://files.pythonhosted.org/packages/7b/17/51e7895ac0f87c3b91d276a449ef09f5532a7529818f59646d7a55089432/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:131716d6786ad5e3dc542f5cc6f397ba3339dc0fb87f87ac30e550e8987756af", size = 191665, upload-time = "2026-03-06T06:01:07.473Z" }, + { url = "https://files.pythonhosted.org/packages/90/8f/cce9adf1883e98906dbae380d769b4852bb0fa0004bc7d7a2243418d3ea8/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a374cc0b88aa710e8865dc1bd6edb3743c59f27830f0293ab101e4cf3ce9f85", size = 201950, upload-time = "2026-03-06T06:01:08.973Z" }, + { url = "https://files.pythonhosted.org/packages/08/ca/bce99cd5c397a52919e2769d126723f27a4c037130374c051c00470bcd38/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d31f0d1671e1534e395f9eb84a68e0fb670e1edb1fe819a9d7f564ae3bc4e53f", size = 195830, upload-time = "2026-03-06T06:01:10.155Z" }, + { url = "https://files.pythonhosted.org/packages/87/4f/2e3d023a06911f1281f97b8f036edc9872167036ca6f55cc874a0be6c12c/charset_normalizer-3.4.5-cp311-cp311-win32.whl", hash = "sha256:cace89841c0599d736d3d74a27bc5821288bb47c5441923277afc6059d7fbcb4", 
size = 132029, upload-time = "2026-03-06T06:01:11.706Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1f/a853b73d386521fd44b7f67ded6b17b7b2367067d9106a5c4b44f9a34274/charset_normalizer-3.4.5-cp311-cp311-win_amd64.whl", hash = "sha256:f8102ae93c0bc863b1d41ea0f4499c20a83229f52ed870850892df555187154a", size = 142404, upload-time = "2026-03-06T06:01:12.865Z" }, + { url = "https://files.pythonhosted.org/packages/b4/10/dba36f76b71c38e9d391abe0fd8a5b818790e053c431adecfc98c35cd2a9/charset_normalizer-3.4.5-cp311-cp311-win_arm64.whl", hash = "sha256:ed98364e1c262cf5f9363c3eca8c2df37024f52a8fa1180a3610014f26eac51c", size = 132796, upload-time = "2026-03-06T06:01:14.106Z" }, + { url = "https://files.pythonhosted.org/packages/9c/b6/9ee9c1a608916ca5feae81a344dffbaa53b26b90be58cc2159e3332d44ec/charset_normalizer-3.4.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ed97c282ee4f994ef814042423a529df9497e3c666dca19be1d4cd1129dc7ade", size = 280976, upload-time = "2026-03-06T06:01:15.276Z" }, + { url = "https://files.pythonhosted.org/packages/f8/d8/a54f7c0b96f1df3563e9190f04daf981e365a9b397eedfdfb5dbef7e5c6c/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0294916d6ccf2d069727d65973c3a1ca477d68708db25fd758dd28b0827cff54", size = 189356, upload-time = "2026-03-06T06:01:16.511Z" }, + { url = "https://files.pythonhosted.org/packages/42/69/2bf7f76ce1446759a5787cb87d38f6a61eb47dbbdf035cfebf6347292a65/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:dc57a0baa3eeedd99fafaef7511b5a6ef4581494e8168ee086031744e2679467", size = 206369, upload-time = "2026-03-06T06:01:17.853Z" }, + { url = "https://files.pythonhosted.org/packages/10/9c/949d1a46dab56b959d9a87272482195f1840b515a3380e39986989a893ae/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:ed1a9a204f317ef879b32f9af507d47e49cd5e7f8e8d5d96358c98373314fc60", size = 203285, upload-time = "2026-03-06T06:01:19.473Z" }, + { url = "https://files.pythonhosted.org/packages/67/5c/ae30362a88b4da237d71ea214a8c7eb915db3eec941adda511729ac25fa2/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ad83b8f9379176c841f8865884f3514d905bcd2a9a3b210eaa446e7d2223e4d", size = 196274, upload-time = "2026-03-06T06:01:20.728Z" }, + { url = "https://files.pythonhosted.org/packages/b2/07/c9f2cb0e46cb6d64fdcc4f95953747b843bb2181bda678dc4e699b8f0f9a/charset_normalizer-3.4.5-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:a118e2e0b5ae6b0120d5efa5f866e58f2bb826067a646431da4d6a2bdae7950e", size = 184715, upload-time = "2026-03-06T06:01:22.194Z" }, + { url = "https://files.pythonhosted.org/packages/36/64/6b0ca95c44fddf692cd06d642b28f63009d0ce325fad6e9b2b4d0ef86a52/charset_normalizer-3.4.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:754f96058e61a5e22e91483f823e07df16416ce76afa4ebf306f8e1d1296d43f", size = 193426, upload-time = "2026-03-06T06:01:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/50/bc/a730690d726403743795ca3f5bb2baf67838c5fea78236098f324b965e40/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0c300cefd9b0970381a46394902cd18eaf2aa00163f999590ace991989dcd0fc", size = 191780, upload-time = "2026-03-06T06:01:25.053Z" }, + { url = "https://files.pythonhosted.org/packages/97/4f/6c0bc9af68222b22951552d73df4532b5be6447cee32d58e7e8c74ecbb7b/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c108f8619e504140569ee7de3f97d234f0fbae338a7f9f360455071ef9855a95", size = 185805, upload-time = "2026-03-06T06:01:26.294Z" }, + { url = 
"https://files.pythonhosted.org/packages/dd/b9/a523fb9b0ee90814b503452b2600e4cbc118cd68714d57041564886e7325/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d1028de43596a315e2720a9849ee79007ab742c06ad8b45a50db8cdb7ed4a82a", size = 208342, upload-time = "2026-03-06T06:01:27.55Z" }, + { url = "https://files.pythonhosted.org/packages/4d/61/c59e761dee4464050713e50e27b58266cc8e209e518c0b378c1580c959ba/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:19092dde50335accf365cce21998a1c6dd8eafd42c7b226eb54b2747cdce2fac", size = 193661, upload-time = "2026-03-06T06:01:29.051Z" }, + { url = "https://files.pythonhosted.org/packages/1c/43/729fa30aad69783f755c5ad8649da17ee095311ca42024742701e202dc59/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4354e401eb6dab9aed3c7b4030514328a6c748d05e1c3e19175008ca7de84fb1", size = 204819, upload-time = "2026-03-06T06:01:30.298Z" }, + { url = "https://files.pythonhosted.org/packages/87/33/d9b442ce5a91b96fc0840455a9e49a611bbadae6122778d0a6a79683dd31/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a68766a3c58fde7f9aaa22b3786276f62ab2f594efb02d0a1421b6282e852e98", size = 198080, upload-time = "2026-03-06T06:01:31.478Z" }, + { url = "https://files.pythonhosted.org/packages/56/5a/b8b5a23134978ee9885cee2d6995f4c27cc41f9baded0a9685eabc5338f0/charset_normalizer-3.4.5-cp312-cp312-win32.whl", hash = "sha256:1827734a5b308b65ac54e86a618de66f935a4f63a8a462ff1e19a6788d6c2262", size = 132630, upload-time = "2026-03-06T06:01:33.056Z" }, + { url = "https://files.pythonhosted.org/packages/70/53/e44a4c07e8904500aec95865dc3f6464dc3586a039ef0df606eb3ac38e35/charset_normalizer-3.4.5-cp312-cp312-win_amd64.whl", hash = "sha256:728c6a963dfab66ef865f49286e45239384249672cd598576765acc2a640a636", size = 142856, upload-time = "2026-03-06T06:01:34.489Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/aa/c5628f7cad591b1cf45790b7a61483c3e36cf41349c98af7813c483fd6e8/charset_normalizer-3.4.5-cp312-cp312-win_arm64.whl", hash = "sha256:75dfd1afe0b1647449e852f4fb428195a7ed0588947218f7ba929f6538487f02", size = 132982, upload-time = "2026-03-06T06:01:35.641Z" }, + { url = "https://files.pythonhosted.org/packages/c5/60/3a621758945513adfd4db86827a5bafcc615f913dbd0b4c2ed64a65731be/charset_normalizer-3.4.5-py3-none-any.whl", hash = "sha256:9db5e3fcdcee89a78c04dffb3fe33c79f77bd741a624946db2591c81b2fc85b0", size = 55455, upload-time = "2026-03-06T06:03:17.827Z" }, +] + +[[package]] +name = "click" +version = "8.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, +] + +[[package]] +name = "cloudpickle" +version = "3.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = 
"sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "dask" +version = "2026.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "cloudpickle" }, + { name = "fsspec" }, + { name = "importlib-metadata", marker = "python_full_version < '3.12'" }, + { name = "packaging" }, + { name = "partd" }, + { name = "pyyaml" }, + { name = "toolz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bd/52/b0f9172b22778def907db1ff173249e4eb41f054b46a9c83b1528aaf811f/dask-2026.1.2.tar.gz", hash = "sha256:1136683de2750d98ea792670f7434e6c1cfce90cab2cc2f2495a9e60fd25a4fc", size = 10997838, upload-time = "2026-01-30T21:04:20.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/23/d39ccc4ed76222db31530b0a7d38876fdb7673e23f838e8d8f0ed4651a4f/dask-2026.1.2-py3-none-any.whl", hash = "sha256:46a0cf3b8d87f78a3d2e6b145aea4418a6d6d606fe6a16c79bd8ca2bb862bc91", size = 1482084, upload-time = "2026-01-30T21:04:18.363Z" }, +] + +[[package]] +name = "defusedxml" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" }, +] + +[[package]] +name = "deprecated" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, +] + +[[package]] +name = "easydict" +version = "1.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/9f/d18d6b5e19244788a6d09c14a8406376b4f4bfcc008e6d17a4f4c15362e8/easydict-1.13.tar.gz", hash = "sha256:b1135dedbc41c8010e2bc1f77ec9744c7faa42bce1a1c87416791449d6c87780", size = 6809, upload-time = "2024-03-04T12:04:41.251Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/ec/fa6963f1198172c2b75c9ab6ecefb3045991f92f75f5eb41b6621b198123/easydict-1.13-py3-none-any.whl", hash = "sha256:6b787daf4dcaf6377b4ad9403a5cee5a86adbc0ca9a5bcf5410e9902002aeac2", size = 6804, upload-time = 
"2024-03-04T12:04:39.508Z" }, +] + +[[package]] +name = "einops" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, +] + +[[package]] +name = "filelock" +version = "3.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/77/18/a1fd2231c679dcb9726204645721b12498aeac28e1ad0601038f94b42556/filelock-3.25.0.tar.gz", hash = "sha256:8f00faf3abf9dc730a1ffe9c354ae5c04e079ab7d3a683b7c32da5dd05f26af3", size = 40158, upload-time = "2026-03-01T15:08:45.916Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, +] + +[[package]] +name = "fonttools" +version = "4.61.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/ca/cf17b88a8df95691275a3d77dc0a5ad9907f328ae53acbe6795da1b2f5ed/fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69", size = 3565756, upload-time = "2025-12-12T17:31:24.246Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/69/12/bf9f4eaa2fad039356cc627587e30ed008c03f1cebd3034376b5ee8d1d44/fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09", size = 2852213, upload-time = "2025-12-12T17:29:46.675Z" }, + { url = "https://files.pythonhosted.org/packages/ac/49/4138d1acb6261499bedde1c07f8c2605d1d8f9d77a151e5507fd3ef084b6/fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37", size = 2401689, upload-time = "2025-12-12T17:29:48.769Z" }, + { url = "https://files.pythonhosted.org/packages/e5/fe/e6ce0fe20a40e03aef906af60aa87668696f9e4802fa283627d0b5ed777f/fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb", size = 5058809, upload-time = "2025-12-12T17:29:51.701Z" }, + { url = "https://files.pythonhosted.org/packages/79/61/1ca198af22f7dd22c17ab86e9024ed3c06299cfdb08170640e9996d501a0/fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9", size = 5036039, upload-time = "2025-12-12T17:29:53.659Z" }, + { url = "https://files.pythonhosted.org/packages/99/cc/fa1801e408586b5fce4da9f5455af8d770f4fc57391cd5da7256bb364d38/fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87", size = 5034714, upload-time = "2025-12-12T17:29:55.592Z" }, + { url = "https://files.pythonhosted.org/packages/bf/aa/b7aeafe65adb1b0a925f8f25725e09f078c635bc22754f3fecb7456955b0/fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56", size = 5158648, upload-time = "2025-12-12T17:29:57.861Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/f9/08ea7a38663328881384c6e7777bbefc46fd7d282adfd87a7d2b84ec9d50/fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a", size = 2280681, upload-time = "2025-12-12T17:29:59.943Z" }, + { url = "https://files.pythonhosted.org/packages/07/ad/37dd1ae5fa6e01612a1fbb954f0927681f282925a86e86198ccd7b15d515/fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7", size = 2331951, upload-time = "2025-12-12T17:30:02.254Z" }, + { url = "https://files.pythonhosted.org/packages/6f/16/7decaa24a1bd3a70c607b2e29f0adc6159f36a7e40eaba59846414765fd4/fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e", size = 2851593, upload-time = "2025-12-12T17:30:04.225Z" }, + { url = "https://files.pythonhosted.org/packages/94/98/3c4cb97c64713a8cf499b3245c3bf9a2b8fd16a3e375feff2aed78f96259/fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2", size = 2400231, upload-time = "2025-12-12T17:30:06.47Z" }, + { url = "https://files.pythonhosted.org/packages/b7/37/82dbef0f6342eb01f54bca073ac1498433d6ce71e50c3c3282b655733b31/fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796", size = 4954103, upload-time = "2025-12-12T17:30:08.432Z" }, + { url = "https://files.pythonhosted.org/packages/6c/44/f3aeac0fa98e7ad527f479e161aca6c3a1e47bb6996b053d45226fe37bf2/fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d", size = 5004295, upload-time = "2025-12-12T17:30:10.56Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/e8/7424ced75473983b964d09f6747fa09f054a6d656f60e9ac9324cf40c743/fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8", size = 4944109, upload-time = "2025-12-12T17:30:12.874Z" }, + { url = "https://files.pythonhosted.org/packages/c8/8b/6391b257fa3d0b553d73e778f953a2f0154292a7a7a085e2374b111e5410/fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0", size = 5093598, upload-time = "2025-12-12T17:30:15.79Z" }, + { url = "https://files.pythonhosted.org/packages/d9/71/fd2ea96cdc512d92da5678a1c98c267ddd4d8c5130b76d0f7a80f9a9fde8/fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261", size = 2269060, upload-time = "2025-12-12T17:30:18.058Z" }, + { url = "https://files.pythonhosted.org/packages/80/3b/a3e81b71aed5a688e89dfe0e2694b26b78c7d7f39a5ffd8a7d75f54a12a8/fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9", size = 2319078, upload-time = "2025-12-12T17:30:22.862Z" }, + { url = "https://files.pythonhosted.org/packages/c7/4e/ce75a57ff3aebf6fc1f4e9d508b8e5810618a33d900ad6c19eb30b290b97/fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371", size = 1148996, upload-time = "2025-12-12T17:31:21.03Z" }, +] + +[[package]] +name = "fpdf2" +version = "2.8.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "defusedxml" }, + { name = "fonttools" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/f2/72feae0b2827ed38013e4307b14f95bf0b3d124adfef4d38a7d57533f7be/fpdf2-2.8.7.tar.gz", hash = "sha256:7060ccee5a9c7ab0a271fb765a36a23639f83ef8996c34e3d46af0a17ede57f9", size = 362351, upload-time = 
"2026-02-28T05:39:16.456Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/66/0a/cf50ecffa1e3747ed9380a3adfc829259f1f86b3fdbd9e505af789003141/fpdf2-2.8.7-py3-none-any.whl", hash = "sha256:d391fc508a3ce02fc43a577c830cda4fe6f37646f2d143d489839940932fbc19", size = 327056, upload-time = "2026-02-28T05:39:14.619Z" }, +] + +[[package]] +name = "frozenlist" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/03/077f869d540370db12165c0aa51640a873fb661d8b315d1d4d67b284d7ac/frozenlist-1.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:09474e9831bc2b2199fad6da3c14c7b0fbdd377cce9d3d77131be28906cb7d84", size = 86912, upload-time = "2025-10-06T05:35:45.98Z" }, + { url = "https://files.pythonhosted.org/packages/df/b5/7610b6bd13e4ae77b96ba85abea1c8cb249683217ef09ac9e0ae93f25a91/frozenlist-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:17c883ab0ab67200b5f964d2b9ed6b00971917d5d8a92df149dc2c9779208ee9", size = 50046, upload-time = "2025-10-06T05:35:47.009Z" }, + { url = "https://files.pythonhosted.org/packages/6e/ef/0e8f1fe32f8a53dd26bdd1f9347efe0778b0fddf62789ea683f4cc7d787d/frozenlist-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fa47e444b8ba08fffd1c18e8cdb9a75db1b6a27f17507522834ad13ed5922b93", size = 50119, upload-time = "2025-10-06T05:35:48.38Z" }, + { url = "https://files.pythonhosted.org/packages/11/b1/71a477adc7c36e5fb628245dfbdea2166feae310757dea848d02bd0689fd/frozenlist-1.8.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2552f44204b744fba866e573be4c1f9048d6a324dfe14475103fd51613eb1d1f", size = 231067, 
upload-time = "2025-10-06T05:35:49.97Z" }, + { url = "https://files.pythonhosted.org/packages/45/7e/afe40eca3a2dc19b9904c0f5d7edfe82b5304cb831391edec0ac04af94c2/frozenlist-1.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:957e7c38f250991e48a9a73e6423db1bb9dd14e722a10f6b8bb8e16a0f55f695", size = 233160, upload-time = "2025-10-06T05:35:51.729Z" }, + { url = "https://files.pythonhosted.org/packages/a6/aa/7416eac95603ce428679d273255ffc7c998d4132cfae200103f164b108aa/frozenlist-1.8.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8585e3bb2cdea02fc88ffa245069c36555557ad3609e83be0ec71f54fd4abb52", size = 228544, upload-time = "2025-10-06T05:35:53.246Z" }, + { url = "https://files.pythonhosted.org/packages/8b/3d/2a2d1f683d55ac7e3875e4263d28410063e738384d3adc294f5ff3d7105e/frozenlist-1.8.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:edee74874ce20a373d62dc28b0b18b93f645633c2943fd90ee9d898550770581", size = 243797, upload-time = "2025-10-06T05:35:54.497Z" }, + { url = "https://files.pythonhosted.org/packages/78/1e/2d5565b589e580c296d3bb54da08d206e797d941a83a6fdea42af23be79c/frozenlist-1.8.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c9a63152fe95756b85f31186bddf42e4c02c6321207fd6601a1c89ebac4fe567", size = 247923, upload-time = "2025-10-06T05:35:55.861Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c3/65872fcf1d326a7f101ad4d86285c403c87be7d832b7470b77f6d2ed5ddc/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6db2185db9be0a04fecf2f241c70b63b1a242e2805be291855078f2b404dd6b", size = 230886, upload-time = "2025-10-06T05:35:57.399Z" }, + { url = "https://files.pythonhosted.org/packages/a0/76/ac9ced601d62f6956f03cc794f9e04c81719509f85255abf96e2510f4265/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = 
"sha256:f4be2e3d8bc8aabd566f8d5b8ba7ecc09249d74ba3c9ed52e54dc23a293f0b92", size = 245731, upload-time = "2025-10-06T05:35:58.563Z" }, + { url = "https://files.pythonhosted.org/packages/b9/49/ecccb5f2598daf0b4a1415497eba4c33c1e8ce07495eb07d2860c731b8d5/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c8d1634419f39ea6f5c427ea2f90ca85126b54b50837f31497f3bf38266e853d", size = 241544, upload-time = "2025-10-06T05:35:59.719Z" }, + { url = "https://files.pythonhosted.org/packages/53/4b/ddf24113323c0bbcc54cb38c8b8916f1da7165e07b8e24a717b4a12cbf10/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a7fa382a4a223773ed64242dbe1c9c326ec09457e6b8428efb4118c685c3dfd", size = 241806, upload-time = "2025-10-06T05:36:00.959Z" }, + { url = "https://files.pythonhosted.org/packages/a7/fb/9b9a084d73c67175484ba2789a59f8eebebd0827d186a8102005ce41e1ba/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:11847b53d722050808926e785df837353bd4d75f1d494377e59b23594d834967", size = 229382, upload-time = "2025-10-06T05:36:02.22Z" }, + { url = "https://files.pythonhosted.org/packages/95/a3/c8fb25aac55bf5e12dae5c5aa6a98f85d436c1dc658f21c3ac73f9fa95e5/frozenlist-1.8.0-cp311-cp311-win32.whl", hash = "sha256:27c6e8077956cf73eadd514be8fb04d77fc946a7fe9f7fe167648b0b9085cc25", size = 39647, upload-time = "2025-10-06T05:36:03.409Z" }, + { url = "https://files.pythonhosted.org/packages/0a/f5/603d0d6a02cfd4c8f2a095a54672b3cf967ad688a60fb9faf04fc4887f65/frozenlist-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac913f8403b36a2c8610bbfd25b8013488533e71e62b4b4adce9c86c8cea905b", size = 44064, upload-time = "2025-10-06T05:36:04.368Z" }, + { url = "https://files.pythonhosted.org/packages/5d/16/c2c9ab44e181f043a86f9a8f84d5124b62dbcb3a02c0977ec72b9ac1d3e0/frozenlist-1.8.0-cp311-cp311-win_arm64.whl", hash = "sha256:d4d3214a0f8394edfa3e303136d0575eece0745ff2b47bd2cb2e66dd92d4351a", size = 39937, upload-time = "2025-10-06T05:36:05.669Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782, upload-time = "2025-10-06T05:36:06.649Z" }, + { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594, upload-time = "2025-10-06T05:36:07.69Z" }, + { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448, upload-time = "2025-10-06T05:36:08.78Z" }, + { url = "https://files.pythonhosted.org/packages/6a/bd/d91c5e39f490a49df14320f4e8c80161cfcce09f1e2cde1edd16a551abb3/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383", size = 242411, upload-time = "2025-10-06T05:36:09.801Z" }, + { url = "https://files.pythonhosted.org/packages/8f/83/f61505a05109ef3293dfb1ff594d13d64a2324ac3482be2cedc2be818256/frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4", size = 243014, upload-time = "2025-10-06T05:36:11.394Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cb/cb6c7b0f7d4023ddda30cf56b8b17494eb3a79e3fda666bf735f63118b35/frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8", size = 234909, upload-time = 
"2025-10-06T05:36:12.598Z" }, + { url = "https://files.pythonhosted.org/packages/31/c5/cd7a1f3b8b34af009fb17d4123c5a778b44ae2804e3ad6b86204255f9ec5/frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b", size = 250049, upload-time = "2025-10-06T05:36:14.065Z" }, + { url = "https://files.pythonhosted.org/packages/c0/01/2f95d3b416c584a1e7f0e1d6d31998c4a795f7544069ee2e0962a4b60740/frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52", size = 256485, upload-time = "2025-10-06T05:36:15.39Z" }, + { url = "https://files.pythonhosted.org/packages/ce/03/024bf7720b3abaebcff6d0793d73c154237b85bdf67b7ed55e5e9596dc9a/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29", size = 237619, upload-time = "2025-10-06T05:36:16.558Z" }, + { url = "https://files.pythonhosted.org/packages/69/fa/f8abdfe7d76b731f5d8bd217827cf6764d4f1d9763407e42717b4bed50a0/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3", size = 250320, upload-time = "2025-10-06T05:36:17.821Z" }, + { url = "https://files.pythonhosted.org/packages/f5/3c/b051329f718b463b22613e269ad72138cc256c540f78a6de89452803a47d/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143", size = 246820, upload-time = "2025-10-06T05:36:19.046Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ae/58282e8f98e444b3f4dd42448ff36fa38bef29e40d40f330b22e7108f565/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608", size = 250518, upload-time = 
"2025-10-06T05:36:20.763Z" }, + { url = "https://files.pythonhosted.org/packages/8f/96/007e5944694d66123183845a106547a15944fbbb7154788cbf7272789536/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa", size = 239096, upload-time = "2025-10-06T05:36:22.129Z" }, + { url = "https://files.pythonhosted.org/packages/66/bb/852b9d6db2fa40be96f29c0d1205c306288f0684df8fd26ca1951d461a56/frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf", size = 39985, upload-time = "2025-10-06T05:36:23.661Z" }, + { url = "https://files.pythonhosted.org/packages/b8/af/38e51a553dd66eb064cdf193841f16f077585d4d28394c2fa6235cb41765/frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746", size = 44591, upload-time = "2025-10-06T05:36:24.958Z" }, + { url = "https://files.pythonhosted.org/packages/a7/06/1dc65480ab147339fecc70797e9c2f69d9cea9cf38934ce08df070fdb9cb/frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd", size = 40102, upload-time = "2025-10-06T05:36:26.333Z" }, + { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409, upload-time = "2025-10-06T05:38:16.721Z" }, +] + +[[package]] +name = "fsspec" +version = "2026.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, +] + +[[package]] +name = "ftfy" +version = "6.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a5/d3/8650919bc3c7c6e90ee3fa7fd618bf373cbbe55dff043bd67353dbb20cd8/ftfy-6.3.1.tar.gz", hash = "sha256:9b3c3d90f84fb267fe64d375a07b7f8912d817cf86009ae134aa03e1819506ec", size = 308927, upload-time = "2024-10-26T00:50:35.149Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl", hash = "sha256:7c70eb532015cd2f9adb53f101fb6c7945988d023a085d127d1573dc49dd0083", size = 44821, upload-time = "2024-10-26T00:50:33.425Z" }, +] + +[[package]] +name = "glossapi" +version = "0.1.3" +source = { editable = "../../" } +dependencies = [ + { name = "aiofiles" }, + { name = "aiohttp" }, + { name = "dask" }, + { name = "ftfy" }, + { name = "joblib" }, + { name = "numpy" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pypdfium2" }, + { name = "pyyaml" }, + { name = "scikit-learn" }, + { name = "tenacity" }, + { name = "tqdm" }, + { name = "zstandard" }, +] + +[package.optional-dependencies] +deepseek = [ + { name = "accelerate" }, + { name = "addict" }, + { name = "easydict" }, + { name = "einops" }, + { name = "img2pdf" }, + { name = "pillow" }, + { name = "pymupdf" }, + { name = "tokenizers" }, + { name = "transformers" }, +] + +[package.metadata] +requires-dist = [ + { name = "accelerate", marker = "extra == 'deepseek'", specifier = ">=1.2.1,<2" }, + { name = "addict", marker = "extra == 'deepseek'" }, + { name = "aiofiles", specifier = ">=23.0.0" }, + { name = "aiohttp", specifier 
= ">=3.8.0" }, + { name = "dask", specifier = ">=2022.1.0" }, + { name = "docling", marker = "extra == 'docling'", specifier = "==2.81.0" }, + { name = "easydict", marker = "extra == 'deepseek'" }, + { name = "einops", marker = "extra == 'deepseek'" }, + { name = "ftfy", specifier = ">=6.0.0" }, + { name = "img2pdf", marker = "extra == 'deepseek'", specifier = ">=0.5.1" }, + { name = "joblib", specifier = ">=1.0.0" }, + { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.5" }, + { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5" }, + { name = "numpy", specifier = ">=1.26,<3" }, + { name = "pandas", specifier = ">=1.3.0" }, + { name = "pillow", marker = "extra == 'deepseek'", specifier = "==10.4.0" }, + { name = "playwright", marker = "extra == 'browser'", specifier = ">=1.52,<2" }, + { name = "pyarrow", specifier = ">=7.0.0" }, + { name = "pymupdf", marker = "extra == 'deepseek'", specifier = "==1.24.10" }, + { name = "pypdfium2", specifier = ">=4.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, + { name = "scikit-learn", specifier = "==1.6.1" }, + { name = "tenacity", specifier = ">=8.0.0" }, + { name = "tokenizers", marker = "extra == 'deepseek'", specifier = "==0.20.3" }, + { name = "torch", marker = "extra == 'cuda'", specifier = "==2.5.1" }, + { name = "torchvision", marker = "extra == 'cuda'", specifier = "==0.20.1" }, + { name = "tqdm", specifier = ">=4.67.0" }, + { name = "transformers", marker = "extra == 'deepseek'", specifier = "==4.46.3" }, + { name = "zstandard", specifier = ">=0.22.0" }, +] +provides-extras = ["browser", "docling", "cuda", "deepseek", "docs"] + +[[package]] +name = "glossapi-deepseek-runtime" +version = "0.1.0" +source = { virtual = "." 
} +dependencies = [ + { name = "glossapi", extra = ["deepseek"] }, + { name = "torch" }, + { name = "torchaudio", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchaudio", version = "2.9.1+cu130", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "torchvision", version = "0.24.1", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.24.1+cu130", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, +] + +[package.dev-dependencies] +test = [ + { name = "fpdf2" }, + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [ + { name = "glossapi", extras = ["deepseek"], editable = "../../" }, + { name = "torch", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu130" }, + { name = "torchaudio", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu130" }, + { name = "torchvision", specifier = "==0.24.1", index = "https://download.pytorch.org/whl/cu130" }, +] + +[package.metadata.requires-dev] +test = [ + { name = "fpdf2" }, + { name = "pytest" }, +] + +[[package]] +name = "hf-xet" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/cb/9bb543bd987ffa1ee48202cc96a756951b734b79a542335c566148ade36c/hf_xet-1.3.2.tar.gz", hash = "sha256:e130ee08984783d12717444e538587fa2119385e5bd8fc2bb9f930419b73a7af", size = 643646, upload-time = "2026-02-27T17:26:08.051Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/28/dbb024e2e3907f6f3052847ca7d1a2f7a3972fafcd53ff79018977fcb3e4/hf_xet-1.3.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = 
"sha256:f93b7595f1d8fefddfede775c18b5c9256757824f7f6832930b49858483cd56f", size = 3763961, upload-time = "2026-02-27T17:25:52.537Z" }, + { url = "https://files.pythonhosted.org/packages/e4/71/b99aed3823c9d1795e4865cf437d651097356a3f38c7d5877e4ac544b8e4/hf_xet-1.3.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:a85d3d43743174393afe27835bde0cd146e652b5fcfdbcd624602daef2ef3259", size = 3526171, upload-time = "2026-02-27T17:25:50.968Z" }, + { url = "https://files.pythonhosted.org/packages/9d/ca/907890ce6ef5598b5920514f255ed0a65f558f820515b18db75a51b2f878/hf_xet-1.3.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7c2a054a97c44e136b1f7f5a78f12b3efffdf2eed3abc6746fc5ea4b39511633", size = 4180750, upload-time = "2026-02-27T17:25:43.125Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ad/bc7f41f87173d51d0bce497b171c4ee0cbde1eed2d7b4216db5d0ada9f50/hf_xet-1.3.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:06b724a361f670ae557836e57801b82c75b534812e351a87a2c739f77d1e0635", size = 3961035, upload-time = "2026-02-27T17:25:41.837Z" }, + { url = "https://files.pythonhosted.org/packages/73/38/600f4dda40c4a33133404d9fe644f1d35ff2d9babb4d0435c646c63dd107/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:305f5489d7241a47e0458ef49334be02411d1d0f480846363c1c8084ed9916f7", size = 4161378, upload-time = "2026-02-27T17:26:00.365Z" }, + { url = "https://files.pythonhosted.org/packages/00/b3/7bc1ff91d1ac18420b7ad1e169b618b27c00001b96310a89f8a9294fe509/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:06cdbde243c85f39a63b28e9034321399c507bcd5e7befdd17ed2ccc06dfe14e", size = 4398020, upload-time = "2026-02-27T17:26:03.977Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0b/99bfd948a3ed3620ab709276df3ad3710dcea61976918cce8706502927af/hf_xet-1.3.2-cp37-abi3-win_amd64.whl", hash = "sha256:9298b47cce6037b7045ae41482e703c471ce36b52e73e49f71226d2e8e5685a1", size = 3641624, upload-time = 
"2026-02-27T17:26:13.542Z" }, + { url = "https://files.pythonhosted.org/packages/cc/02/9a6e4ca1f3f73a164c0cd48e41b3cc56585dcc37e809250de443d673266f/hf_xet-1.3.2-cp37-abi3-win_arm64.whl", hash = "sha256:83d8ec273136171431833a6957e8f3af496bee227a0fe47c7b8b39c106d1749a", size = 3503976, upload-time = "2026-02-27T17:26:12.123Z" }, +] + +[[package]] +name = "huggingface-hub" +version = "0.36.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, +] + +[[package]] +name = "idna" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = 
"sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, +] + +[[package]] +name = "img2pdf" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pikepdf" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8e/97/ca44c467131b93fda82d2a2f21b738c8bcf63b5259e3b8250e928b8dd52a/img2pdf-0.6.3.tar.gz", hash = "sha256:219518020f5bd242bdc46493941ea3f756f664c2e86f2454721e74353f58cd95", size = 120350, upload-time = "2025-11-05T20:51:57.558Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/dc/91e3a4a11c25ae183bd5a71b84ecb298db76405ff70013f76b10877bdfe3/img2pdf-0.6.3-py3-none-any.whl", hash = "sha256:44d12d235752edd17c43c04ff39952cdc5dd4c6aba90569c4902bd445085266b", size = 49701, upload-time = "2025-11-05T20:51:55.469Z" }, +] + +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp", marker = "python_full_version < '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = 
"sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "joblib" +version = "1.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, +] + +[[package]] +name = "locket" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/2f/83/97b29fe05cb6ae28d2dbd30b81e2e402a3eed5f460c26e9eaa5895ceacf5/locket-1.0.0.tar.gz", hash = "sha256:5c0d4c052a8bbbf750e056a8e65ccd309086f4f0f18a2eac306a8dfa4112a632", size = 4350, upload-time = "2022-04-20T22:04:44.312Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/bc/83e112abc66cd466c6b83f99118035867cecd41802f8d044638aa78a106e/locket-1.0.0-py2.py3-none-any.whl", hash = "sha256:b6c819a722f7b6bd955b80781788e4a66a55628b858d347536b7e81325a3a5e3", size = 4398, upload-time = "2022-04-20T22:04:42.23Z" }, +] + +[[package]] +name = "lxml" +version = "5.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/76/3d/14e82fc7c8fb1b7761f7e748fd47e2ec8276d137b6acfe5a4bb73853e08f/lxml-5.4.0.tar.gz", hash = "sha256:d12832e1dbea4be280b22fd0ea7c9b87f0d8fc51ba06e92dc62d52f804f78ebd", size = 3679479, upload-time = "2025-04-23T01:50:29.322Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/2d/67693cc8a605a12e5975380d7ff83020dcc759351b5a066e1cced04f797b/lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:98a3912194c079ef37e716ed228ae0dcb960992100461b704aea4e93af6b0bb9", size = 8083240, upload-time = "2025-04-23T01:45:18.566Z" }, + { url = "https://files.pythonhosted.org/packages/73/53/b5a05ab300a808b72e848efd152fe9c022c0181b0a70b8bca1199f1bed26/lxml-5.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ea0252b51d296a75f6118ed0d8696888e7403408ad42345d7dfd0d1e93309a7", size = 4387685, upload-time = "2025-04-23T01:45:21.387Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cb/1a3879c5f512bdcd32995c301886fe082b2edd83c87d41b6d42d89b4ea4d/lxml-5.4.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b92b69441d1bd39f4940f9eadfa417a25862242ca2c396b406f9272ef09cdcaa", size = 4991164, upload-time = "2025-04-23T01:45:23.849Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/94/bbc66e42559f9d04857071e3b3d0c9abd88579367fd2588a4042f641f57e/lxml-5.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20e16c08254b9b6466526bc1828d9370ee6c0d60a4b64836bc3ac2917d1e16df", size = 4746206, upload-time = "2025-04-23T01:45:26.361Z" }, + { url = "https://files.pythonhosted.org/packages/66/95/34b0679bee435da2d7cae895731700e519a8dfcab499c21662ebe671603e/lxml-5.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7605c1c32c3d6e8c990dd28a0970a3cbbf1429d5b92279e37fda05fb0c92190e", size = 5342144, upload-time = "2025-04-23T01:45:28.939Z" }, + { url = "https://files.pythonhosted.org/packages/e0/5d/abfcc6ab2fa0be72b2ba938abdae1f7cad4c632f8d552683ea295d55adfb/lxml-5.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ecf4c4b83f1ab3d5a7ace10bafcb6f11df6156857a3c418244cef41ca9fa3e44", size = 4825124, upload-time = "2025-04-23T01:45:31.361Z" }, + { url = "https://files.pythonhosted.org/packages/5a/78/6bd33186c8863b36e084f294fc0a5e5eefe77af95f0663ef33809cc1c8aa/lxml-5.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cef4feae82709eed352cd7e97ae062ef6ae9c7b5dbe3663f104cd2c0e8d94ba", size = 4876520, upload-time = "2025-04-23T01:45:34.191Z" }, + { url = "https://files.pythonhosted.org/packages/3b/74/4d7ad4839bd0fc64e3d12da74fc9a193febb0fae0ba6ebd5149d4c23176a/lxml-5.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:df53330a3bff250f10472ce96a9af28628ff1f4efc51ccba351a8820bca2a8ba", size = 4765016, upload-time = "2025-04-23T01:45:36.7Z" }, + { url = "https://files.pythonhosted.org/packages/24/0d/0a98ed1f2471911dadfc541003ac6dd6879fc87b15e1143743ca20f3e973/lxml-5.4.0-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:aefe1a7cb852fa61150fcb21a8c8fcea7b58c4cb11fbe59c97a0a4b31cae3c8c", size = 5362884, upload-time = "2025-04-23T01:45:39.291Z" }, + { url = 
"https://files.pythonhosted.org/packages/48/de/d4f7e4c39740a6610f0f6959052b547478107967362e8424e1163ec37ae8/lxml-5.4.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ef5a7178fcc73b7d8c07229e89f8eb45b2908a9238eb90dcfc46571ccf0383b8", size = 4902690, upload-time = "2025-04-23T01:45:42.386Z" }, + { url = "https://files.pythonhosted.org/packages/07/8c/61763abd242af84f355ca4ef1ee096d3c1b7514819564cce70fd18c22e9a/lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d2ed1b3cb9ff1c10e6e8b00941bb2e5bb568b307bfc6b17dffbbe8be5eecba86", size = 4944418, upload-time = "2025-04-23T01:45:46.051Z" }, + { url = "https://files.pythonhosted.org/packages/f9/c5/6d7e3b63e7e282619193961a570c0a4c8a57fe820f07ca3fe2f6bd86608a/lxml-5.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:72ac9762a9f8ce74c9eed4a4e74306f2f18613a6b71fa065495a67ac227b3056", size = 4827092, upload-time = "2025-04-23T01:45:48.943Z" }, + { url = "https://files.pythonhosted.org/packages/71/4a/e60a306df54680b103348545706a98a7514a42c8b4fbfdcaa608567bb065/lxml-5.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f5cb182f6396706dc6cc1896dd02b1c889d644c081b0cdec38747573db88a7d7", size = 5418231, upload-time = "2025-04-23T01:45:51.481Z" }, + { url = "https://files.pythonhosted.org/packages/27/f2/9754aacd6016c930875854f08ac4b192a47fe19565f776a64004aa167521/lxml-5.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:3a3178b4873df8ef9457a4875703488eb1622632a9cee6d76464b60e90adbfcd", size = 5261798, upload-time = "2025-04-23T01:45:54.146Z" }, + { url = "https://files.pythonhosted.org/packages/38/a2/0c49ec6941428b1bd4f280650d7b11a0f91ace9db7de32eb7aa23bcb39ff/lxml-5.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e094ec83694b59d263802ed03a8384594fcce477ce484b0cbcd0008a211ca751", size = 4988195, upload-time = "2025-04-23T01:45:56.685Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/75/87a3963a08eafc46a86c1131c6e28a4de103ba30b5ae903114177352a3d7/lxml-5.4.0-cp311-cp311-win32.whl", hash = "sha256:4329422de653cdb2b72afa39b0aa04252fca9071550044904b2e7036d9d97fe4", size = 3474243, upload-time = "2025-04-23T01:45:58.863Z" }, + { url = "https://files.pythonhosted.org/packages/fa/f9/1f0964c4f6c2be861c50db380c554fb8befbea98c6404744ce243a3c87ef/lxml-5.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:fd3be6481ef54b8cfd0e1e953323b7aa9d9789b94842d0e5b142ef4bb7999539", size = 3815197, upload-time = "2025-04-23T01:46:01.096Z" }, + { url = "https://files.pythonhosted.org/packages/f8/4c/d101ace719ca6a4ec043eb516fcfcb1b396a9fccc4fcd9ef593df34ba0d5/lxml-5.4.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b5aff6f3e818e6bdbbb38e5967520f174b18f539c2b9de867b1e7fde6f8d95a4", size = 8127392, upload-time = "2025-04-23T01:46:04.09Z" }, + { url = "https://files.pythonhosted.org/packages/11/84/beddae0cec4dd9ddf46abf156f0af451c13019a0fa25d7445b655ba5ccb7/lxml-5.4.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942a5d73f739ad7c452bf739a62a0f83e2578afd6b8e5406308731f4ce78b16d", size = 4415103, upload-time = "2025-04-23T01:46:07.227Z" }, + { url = "https://files.pythonhosted.org/packages/d0/25/d0d93a4e763f0462cccd2b8a665bf1e4343dd788c76dcfefa289d46a38a9/lxml-5.4.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:460508a4b07364d6abf53acaa0a90b6d370fafde5693ef37602566613a9b0779", size = 5024224, upload-time = "2025-04-23T01:46:10.237Z" }, + { url = "https://files.pythonhosted.org/packages/31/ce/1df18fb8f7946e7f3388af378b1f34fcf253b94b9feedb2cec5969da8012/lxml-5.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:529024ab3a505fed78fe3cc5ddc079464e709f6c892733e3f5842007cec8ac6e", size = 4769913, upload-time = "2025-04-23T01:46:12.757Z" }, + { url = 
"https://files.pythonhosted.org/packages/4e/62/f4a6c60ae7c40d43657f552f3045df05118636be1165b906d3423790447f/lxml-5.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ca56ebc2c474e8f3d5761debfd9283b8b18c76c4fc0967b74aeafba1f5647f9", size = 5290441, upload-time = "2025-04-23T01:46:16.037Z" }, + { url = "https://files.pythonhosted.org/packages/9e/aa/04f00009e1e3a77838c7fc948f161b5d2d5de1136b2b81c712a263829ea4/lxml-5.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a81e1196f0a5b4167a8dafe3a66aa67c4addac1b22dc47947abd5d5c7a3f24b5", size = 4820165, upload-time = "2025-04-23T01:46:19.137Z" }, + { url = "https://files.pythonhosted.org/packages/c9/1f/e0b2f61fa2404bf0f1fdf1898377e5bd1b74cc9b2cf2c6ba8509b8f27990/lxml-5.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00b8686694423ddae324cf614e1b9659c2edb754de617703c3d29ff568448df5", size = 4932580, upload-time = "2025-04-23T01:46:21.963Z" }, + { url = "https://files.pythonhosted.org/packages/24/a2/8263f351b4ffe0ed3e32ea7b7830f845c795349034f912f490180d88a877/lxml-5.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:c5681160758d3f6ac5b4fea370495c48aac0989d6a0f01bb9a72ad8ef5ab75c4", size = 4759493, upload-time = "2025-04-23T01:46:24.316Z" }, + { url = "https://files.pythonhosted.org/packages/05/00/41db052f279995c0e35c79d0f0fc9f8122d5b5e9630139c592a0b58c71b4/lxml-5.4.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:2dc191e60425ad70e75a68c9fd90ab284df64d9cd410ba8d2b641c0c45bc006e", size = 5324679, upload-time = "2025-04-23T01:46:27.097Z" }, + { url = "https://files.pythonhosted.org/packages/1d/be/ee99e6314cdef4587617d3b3b745f9356d9b7dd12a9663c5f3b5734b64ba/lxml-5.4.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:67f779374c6b9753ae0a0195a892a1c234ce8416e4448fe1e9f34746482070a7", size = 4890691, upload-time = "2025-04-23T01:46:30.009Z" }, + { url = 
"https://files.pythonhosted.org/packages/ad/36/239820114bf1d71f38f12208b9c58dec033cbcf80101cde006b9bde5cffd/lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:79d5bfa9c1b455336f52343130b2067164040604e41f6dc4d8313867ed540079", size = 4955075, upload-time = "2025-04-23T01:46:32.33Z" }, + { url = "https://files.pythonhosted.org/packages/d4/e1/1b795cc0b174efc9e13dbd078a9ff79a58728a033142bc6d70a1ee8fc34d/lxml-5.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3d3c30ba1c9b48c68489dc1829a6eede9873f52edca1dda900066542528d6b20", size = 4838680, upload-time = "2025-04-23T01:46:34.852Z" }, + { url = "https://files.pythonhosted.org/packages/72/48/3c198455ca108cec5ae3662ae8acd7fd99476812fd712bb17f1b39a0b589/lxml-5.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1af80c6316ae68aded77e91cd9d80648f7dd40406cef73df841aa3c36f6907c8", size = 5391253, upload-time = "2025-04-23T01:46:37.608Z" }, + { url = "https://files.pythonhosted.org/packages/d6/10/5bf51858971c51ec96cfc13e800a9951f3fd501686f4c18d7d84fe2d6352/lxml-5.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4d885698f5019abe0de3d352caf9466d5de2baded00a06ef3f1216c1a58ae78f", size = 5261651, upload-time = "2025-04-23T01:46:40.183Z" }, + { url = "https://files.pythonhosted.org/packages/2b/11/06710dd809205377da380546f91d2ac94bad9ff735a72b64ec029f706c85/lxml-5.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea53d51859b6c64e7c51d522c03cc2c48b9b5d6172126854cc7f01aa11f52bc", size = 5024315, upload-time = "2025-04-23T01:46:43.333Z" }, + { url = "https://files.pythonhosted.org/packages/f5/b0/15b6217834b5e3a59ebf7f53125e08e318030e8cc0d7310355e6edac98ef/lxml-5.4.0-cp312-cp312-win32.whl", hash = "sha256:d90b729fd2732df28130c064aac9bb8aff14ba20baa4aee7bd0795ff1187545f", size = 3486149, upload-time = "2025-04-23T01:46:45.684Z" }, + { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = 
"sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095, upload-time = "2025-04-23T01:46:48.521Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" }, + { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" }, + { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" }, + { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" }, + { url = 
"https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" }, + { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" }, + { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" }, + { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" }, + { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" }, + { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = "2025-09-27T18:36:29.025Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" }, + { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" }, + { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" }, + { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" }, + { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" }, + { url = 
"https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" }, + { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" }, + { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" }, + { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" }, + { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" }, + { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + +[[package]] +name = "multidict" +version = "6.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/f1/a90635c4f88fb913fbf4ce660b83b7445b7a02615bda034b2f8eb38fd597/multidict-6.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7ff981b266af91d7b4b3793ca3382e53229088d193a85dfad6f5f4c27fc73e5d", size = 76626, upload-time = "2026-01-26T02:43:26.485Z" }, + { url = "https://files.pythonhosted.org/packages/a6/9b/267e64eaf6fc637a15b35f5de31a566634a2740f97d8d094a69d34f524a4/multidict-6.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:844c5bca0b5444adb44a623fb0a1310c2f4cd41f402126bb269cd44c9b3f3e1e", size = 44706, upload-time = "2026-01-26T02:43:27.607Z" }, + { url = "https://files.pythonhosted.org/packages/dd/a4/d45caf2b97b035c57267791ecfaafbd59c68212004b3842830954bb4b02e/multidict-6.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f2a0a924d4c2e9afcd7ec64f9de35fcd96915149b2216e1cb2c10a56df483855", size = 44356, upload-time = "2026-01-26T02:43:28.661Z" }, + { url = 
"https://files.pythonhosted.org/packages/fd/d2/0a36c8473f0cbaeadd5db6c8b72d15bbceeec275807772bfcd059bef487d/multidict-6.7.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8be1802715a8e892c784c0197c2ace276ea52702a0ede98b6310c8f255a5afb3", size = 244355, upload-time = "2026-01-26T02:43:31.165Z" }, + { url = "https://files.pythonhosted.org/packages/5d/16/8c65be997fd7dd311b7d39c7b6e71a0cb449bad093761481eccbbe4b42a2/multidict-6.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2d2ed645ea29f31c4c7ea1552fcfd7cb7ba656e1eafd4134a6620c9f5fdd9e", size = 246433, upload-time = "2026-01-26T02:43:32.581Z" }, + { url = "https://files.pythonhosted.org/packages/01/fb/4dbd7e848d2799c6a026ec88ad39cf2b8416aa167fcc903baa55ecaa045c/multidict-6.7.1-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:95922cee9a778659e91db6497596435777bd25ed116701a4c034f8e46544955a", size = 225376, upload-time = "2026-01-26T02:43:34.417Z" }, + { url = "https://files.pythonhosted.org/packages/b6/8a/4a3a6341eac3830f6053062f8fbc9a9e54407c80755b3f05bc427295c2d0/multidict-6.7.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6b83cabdc375ffaaa15edd97eb7c0c672ad788e2687004990074d7d6c9b140c8", size = 257365, upload-time = "2026-01-26T02:43:35.741Z" }, + { url = "https://files.pythonhosted.org/packages/f7/a2/dd575a69c1aa206e12d27d0770cdf9b92434b48a9ef0cd0d1afdecaa93c4/multidict-6.7.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:38fb49540705369bab8484db0689d86c0a33a0a9f2c1b197f506b71b4b6c19b0", size = 254747, upload-time = "2026-01-26T02:43:36.976Z" }, + { url = "https://files.pythonhosted.org/packages/5a/56/21b27c560c13822ed93133f08aa6372c53a8e067f11fbed37b4adcdac922/multidict-6.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:439cbebd499f92e9aa6793016a8acaa161dfa749ae86d20960189f5398a19144", size = 246293, upload-time = "2026-01-26T02:43:38.258Z" }, + { url = "https://files.pythonhosted.org/packages/5a/a4/23466059dc3854763423d0ad6c0f3683a379d97673b1b89ec33826e46728/multidict-6.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6d3bc717b6fe763b8be3f2bee2701d3c8eb1b2a8ae9f60910f1b2860c82b6c49", size = 242962, upload-time = "2026-01-26T02:43:40.034Z" }, + { url = "https://files.pythonhosted.org/packages/1f/67/51dd754a3524d685958001e8fa20a0f5f90a6a856e0a9dcabff69be3dbb7/multidict-6.7.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:619e5a1ac57986dbfec9f0b301d865dddf763696435e2962f6d9cf2fdff2bb71", size = 237360, upload-time = "2026-01-26T02:43:41.752Z" }, + { url = "https://files.pythonhosted.org/packages/64/3f/036dfc8c174934d4b55d86ff4f978e558b0e585cef70cfc1ad01adc6bf18/multidict-6.7.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0b38ebffd9be37c1170d33bc0f36f4f262e0a09bc1aac1c34c7aa51a7293f0b3", size = 245940, upload-time = "2026-01-26T02:43:43.042Z" }, + { url = "https://files.pythonhosted.org/packages/3d/20/6214d3c105928ebc353a1c644a6ef1408bc5794fcb4f170bb524a3c16311/multidict-6.7.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:10ae39c9cfe6adedcdb764f5e8411d4a92b055e35573a2eaa88d3323289ef93c", size = 253502, upload-time = "2026-01-26T02:43:44.371Z" }, + { url = "https://files.pythonhosted.org/packages/b1/e2/c653bc4ae1be70a0f836b82172d643fcf1dade042ba2676ab08ec08bff0f/multidict-6.7.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:25167cc263257660290fba06b9318d2026e3c910be240a146e1f66dd114af2b0", size = 247065, upload-time = "2026-01-26T02:43:45.745Z" }, + { url = "https://files.pythonhosted.org/packages/c8/11/a854b4154cd3bd8b1fd375e8a8ca9d73be37610c361543d56f764109509b/multidict-6.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:128441d052254f42989ef98b7b6a6ecb1e6f708aa962c7984235316db59f50fa", size = 241870, upload-time = 
"2026-01-26T02:43:47.054Z" }, + { url = "https://files.pythonhosted.org/packages/13/bf/9676c0392309b5fdae322333d22a829715b570edb9baa8016a517b55b558/multidict-6.7.1-cp311-cp311-win32.whl", hash = "sha256:d62b7f64ffde3b99d06b707a280db04fb3855b55f5a06df387236051d0668f4a", size = 41302, upload-time = "2026-01-26T02:43:48.753Z" }, + { url = "https://files.pythonhosted.org/packages/c9/68/f16a3a8ba6f7b6dc92a1f19669c0810bd2c43fc5a02da13b1cbf8e253845/multidict-6.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:bdbf9f3b332abd0cdb306e7c2113818ab1e922dc84b8f8fd06ec89ed2a19ab8b", size = 45981, upload-time = "2026-01-26T02:43:49.921Z" }, + { url = "https://files.pythonhosted.org/packages/ac/ad/9dd5305253fa00cd3c7555dbef69d5bf4133debc53b87ab8d6a44d411665/multidict-6.7.1-cp311-cp311-win_arm64.whl", hash = "sha256:b8c990b037d2fff2f4e33d3f21b9b531c5745b33a49a7d6dbe7a177266af44f6", size = 43159, upload-time = "2026-01-26T02:43:51.635Z" }, + { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" }, + { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" }, + { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018, upload-time = "2026-01-26T02:43:56.198Z" }, + { url = "https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883, upload-time = "2026-01-26T02:43:57.499Z" }, + { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413, upload-time = "2026-01-26T02:43:58.755Z" }, + { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404, upload-time = "2026-01-26T02:44:00.216Z" }, + { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456, upload-time = "2026-01-26T02:44:02.202Z" }, + { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322, upload-time = "2026-01-26T02:44:03.56Z" }, + { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955, upload-time = "2026-01-26T02:44:04.845Z" }, + { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254, upload-time = "2026-01-26T02:44:06.133Z" }, + { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059, upload-time = "2026-01-26T02:44:07.518Z" }, + { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588, upload-time = "2026-01-26T02:44:09.382Z" }, + { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642, upload-time = "2026-01-26T02:44:10.73Z" }, + { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377, upload-time = 
"2026-01-26T02:44:12.042Z" }, + { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887, upload-time = "2026-01-26T02:44:14.245Z" }, + { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053, upload-time = "2026-01-26T02:44:15.371Z" }, + { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307, upload-time = "2026-01-26T02:44:16.852Z" }, + { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, +] + +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, +] + +[[package]] +name = "numpy" +version = "1.26.4" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" }, + { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" }, + { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" }, + { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" }, + { url = 
"https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" }, + { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload-time = "2024-02-05T23:55:32.801Z" }, + { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload-time = "2024-02-05T23:55:56.28Z" }, + { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload-time = "2024-02-05T23:56:20.368Z" }, + { url = 
"https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload-time = "2024-02-05T23:56:56.054Z" }, + { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload-time = "2024-02-05T23:57:21.56Z" }, + { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" }, + { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" }, + { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" }, +] + +[[package]] +name = "nvidia-cublas" +version = "13.0.0.19" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/99/8447b9ee9f070522ee66604ee819d632ab4568c68b3134cebd3837a015cd/nvidia_cublas-13.0.0.19-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:381b1a0ca636fdcb6920a871e8fc89dbfd1f6157f421ed0a6f2673e14cffd3bd", size = 539001158, upload-time = "2025-08-04T10:19:50.761Z" 
}, + { url = "https://files.pythonhosted.org/packages/5a/99/210e113dde53955e97042bd76dc4ad927eca04c5b4645ec157cc59f4f3ae/nvidia_cublas-13.0.0.19-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:f6723af2e8e2600a11dc384037d90d9bf93070e346c24ef2e8f9001658c99896", size = 419392356, upload-time = "2025-08-04T10:20:19.449Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti" +version = "13.0.48" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/63/e9c12c3ae07c1f3a0821536bc188d7bf76e1b633b3bcd2bd393b00bb3426/nvidia_cuda_cupti-13.0.48-py3-none-manylinux_2_25_aarch64.whl", hash = "sha256:67c22627ef436afcf080b48e4ad17b3f83d9e7c0d990ad0c6c0627b01fb92ccc", size = 10171189, upload-time = "2025-08-04T10:16:24.39Z" }, + { url = "https://files.pythonhosted.org/packages/ba/28/e37d62ff27b4462953fdd5713d8a78760578dfa12685c30b71b55fab57b1/nvidia_cuda_cupti-13.0.48-py3-none-manylinux_2_25_x86_64.whl", hash = "sha256:417699e216b23d81bc0bbcb7032352f81b9c5372ef73c097a01abb83125a3d09", size = 10718148, upload-time = "2025-08-04T10:16:33.605Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc" +version = "13.0.48" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/5b/f7636b3d66caefade6a0a0dc5b705c259a2062c20ad18b432b3129d348e0/nvidia_cuda_nvrtc-13.0.48-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:87e13d186905a35e7c04ad553a2abded0fba22f93b43d02e5da6f6cf73fb4d0a", size = 90214268, upload-time = "2025-08-04T10:18:09.305Z" }, + { url = "https://files.pythonhosted.org/packages/c0/bd/eb18593b43dae42312612ffbac24b8e68149e590102c3b6cc2e3d3792069/nvidia_cuda_nvrtc-13.0.48-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6ccf1ef1b90a0763ac7536f3c17046659d89869d76b98ac358efc2e09b348365", size = 43013627, upload-time = "2025-08-04T10:17:57.338Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime" +version = "13.0.48" 
+source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/3b/c5e5d8aafd355e2ff9922472ba71251331af6cc866e5b04a3b1dc8f58977/nvidia_cuda_runtime-13.0.48-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b807c0bb925a307bfa667a24f24d253aef8eda3ac4be66b333f2c9d357557008", size = 2260687, upload-time = "2025-08-04T10:15:41.292Z" }, + { url = "https://files.pythonhosted.org/packages/cc/78/edb119083ca2ff0f09ab0cd597e97775ac3f575b8aa0caf10d68ed49e032/nvidia_cuda_runtime-13.0.48-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b54d12087a1abff81a4cbfa6556876e3afea1fc60da2e0816da374619810c89", size = 2242632, upload-time = "2025-08-04T10:15:49.339Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu13" +version = "9.13.0.50" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/9c/9e99c00dc23db324244ec257d1e84d79539202ee2f185dee2c1fa97c9549/nvidia_cudnn_cu13-9.13.0.50-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:33f0aa0b64230101b348648fd0693342188071d3f8a137c0cf50051c24b3584b", size = 412337597, upload-time = "2025-09-04T20:22:31.535Z" }, + { url = "https://files.pythonhosted.org/packages/cf/68/2712854561170b2a81bea7b6b35cc1ae264d9794c0c218986e5c685d45f7/nvidia_cudnn_cu13-9.13.0.50-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:2150b4850725d30653ec3e365f0732e3e2e3eb8633cf3bd2d3117628dea8b4f9", size = 348571624, upload-time = "2025-09-04T20:23:26.544Z" }, +] + +[[package]] +name = "nvidia-cufft" +version = "12.0.0.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/9b/e9/4e49b1baf6899e42eeec324a49d7aa2219fec42076327c4e468000dd375a/nvidia_cufft-12.0.0.15-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1885731254835797572ff075f3daf43a2a0a2801210dea26971940dae7e1a367", size = 214053580, upload-time = "2025-08-04T10:20:45.781Z" }, + { url = "https://files.pythonhosted.org/packages/9b/9f/e298b66e584ad25bd78ad4a45b061fe7bb57a1ec011128089404ce3fcc7d/nvidia_cufft-12.0.0.15-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9f160b1f018e80bcb0d7c0fa50564b042fa26b13edc1b1ff14b6375a9edd2812", size = 214085489, upload-time = "2025-08-04T10:21:02.975Z" }, +] + +[[package]] +name = "nvidia-cufile" +version = "1.15.0.42" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/0a/4adf0c9bb1241cd1314fc923fde00f3749c7fc785b1e3b3f4a104cd3090c/nvidia_cufile-1.15.0.42-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8f9813eff24d61586699c615e39817e2b4e4f642cace32733c2ab6f663a7eab", size = 1223104, upload-time = "2025-08-04T10:21:31.131Z" }, + { url = "https://files.pythonhosted.org/packages/bf/a5/636baa43399ea10d22b63e7454f22a92ace4a7eaa3c45b94607250857e2d/nvidia_cufile-1.15.0.42-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:bced4036b5a8dbf57e4d78cd4fafefec58ad754b784a9eaa272b011896754c62", size = 1136527, upload-time = "2025-08-04T10:21:22.441Z" }, +] + +[[package]] +name = "nvidia-curand" +version = "10.4.0.35" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/72/7c2ae24fb6b63a32e6ae5d241cc65263ea18d08802aaae087d9f013335a2/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:133df5a7509c3e292aaa2b477afd0194f06ce4ea24d714d616ff36439cee349a", size = 61962106, upload-time = "2025-08-04T10:21:41.128Z" }, + { url = 
"https://files.pythonhosted.org/packages/a5/9f/be0a41ca4a4917abf5cb9ae0daff1a6060cc5de950aec0396de9f3b52bc5/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:1aee33a5da6e1db083fe2b90082def8915f30f3248d5896bcec36a579d941bfc", size = 59544258, upload-time = "2025-08-04T10:22:03.992Z" }, +] + +[[package]] +name = "nvidia-cusolver" +version = "12.0.3.29" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-cusparse", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/bb/2e60de9bb1f0c3395eabd91ccad00f4ba3ef736dc9190a158a9d268419f5/nvidia_cusolver-12.0.3.29-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:3bb6e65ce0beaeafdd069b320246e8f17c1cd30ddb27a0539143a3706733a4d8", size = 193104180, upload-time = "2025-08-04T10:22:19.821Z" }, + { url = "https://files.pythonhosted.org/packages/a5/87/e3c9ee227b750e5b61572e7509f586cc8d494a4f7874b5163e734ed852c2/nvidia_cusolver-12.0.3.29-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:6f54c2eed5edab54c224dd1852dde80ba76b2b78e6d3ce7344fef5dfc66d16ab", size = 193474165, upload-time = "2025-08-04T10:22:47.976Z" }, +] + +[[package]] +name = "nvidia-cusparse" +version = "12.6.2.49" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/30/f32023427f2ef4ec27e8293dfddb5068de566912cd0a45eccfd400017a62/nvidia_cusparse-12.6.2.49-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5d3269c19283a0057fb5ebfb003ae2a10c97a28a6958f4238354826b055827c7", size = 155888587, upload-time = "2025-08-04T10:23:04.091Z" }, + { url = 
"https://files.pythonhosted.org/packages/ba/e8/b3f7a87cc719dca926c7baee92f2544de8909573a4126c85a9f1625431e8/nvidia_cusparse-12.6.2.49-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efcf0b01e3a0827c144feff5391456b8a06e9ce63dcd51c0943e32e605251952", size = 140247612, upload-time = "2025-08-04T10:23:29.844Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu13" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/10/8dcd1175260706a2fc92a16a52e306b71d4c1ea0b0cc4a9484183399818a/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:400c6ed1cf6780fc6efedd64ec9f1345871767e6a1a0a552a1ea0578117ea77c", size = 220791277, upload-time = "2025-08-13T19:22:40.982Z" }, + { url = "https://files.pythonhosted.org/packages/fd/53/43b0d71f4e702fa9733f8b4571fdca50a8813f1e450b656c239beff12315/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:25e30a8a7323935d4ad0340b95a0b69926eee755767e8e0b1cf8dd85b197d3fd", size = 169884119, upload-time = "2025-08-13T19:23:41.967Z" }, +] + +[[package]] +name = "nvidia-nccl-cu13" +version = "2.27.7" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/61/2c7762da6febee96341ea17d1f7309ac7559ac3cab00f3f7e1e7bd0e5d00/nvidia_nccl_cu13-2.27.7-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5e3cc863e52bf9dd1e3ab1941bddb414098f489ae7342f6b3a274602303da123", size = 194014855, upload-time = "2025-09-23T16:30:27.56Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3a/dabb10684e60edfaf1a1c9984d12a668bc1091582099d4e03ac5b9983b51/nvidia_nccl_cu13-2.27.7-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b28a524abd8389b76a4a3f133c76a7aaa7005e47fcaa9d9603b90103927a3f93", size = 193901479, upload-time = "2025-09-23T16:30:41.165Z" }, +] + +[[package]] +name = "nvidia-nvjitlink" +version = 
"13.0.39" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/39/726edebeb76f3efc25c79f885429fa1227c9d200e20ea219bf724b382e19/nvidia_nvjitlink-13.0.39-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:bc3179be558329ef9687884c6faa27cdc0659bdbc642432ec8cc6cc00d182627", size = 40709605, upload-time = "2025-08-04T10:25:04.129Z" }, + { url = "https://files.pythonhosted.org/packages/bc/7a/0fb4c4413b3b14519f8934edd4dcd9f411c4e14e2a2c0ae58709e4dda255/nvidia_nvjitlink-13.0.39-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ce0d63fa5ebedf542056e7491c49feed2297c900980aa6269b6a55f478056ad7", size = 38767126, upload-time = "2025-08-04T10:24:53.05Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu13" +version = "3.3.24" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/7e/b8797780e442eabd9046cd6eb54100b8d0cb047ebc2f70931710cb03bcfe/nvidia_nvshmem_cu13-3.3.24-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:28ae82a4d14b322b93409535de62df6b7b83f4f7672ca97fc89107c2d40ce2c2", size = 60168129, upload-time = "2025-08-22T19:56:28.818Z" }, + { url = "https://files.pythonhosted.org/packages/6f/e9/8530afb8ed38d16bbc89cec80a4dd6a52dbf59bc93e546c3658cfa8b1f9b/nvidia_nvshmem_cu13-3.3.24-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c14d09571697d2e57cb079c8daec88ab1c68cb3586532bfbd4886125a08339b7", size = 60390470, upload-time = "2025-08-22T19:56:49.848Z" }, +] + +[[package]] +name = "nvidia-nvtx" +version = "13.0.39" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/37/0d103c84e7884382a79a569b720965141f83dd1c5df9e3e00cbc02d7099c/nvidia_nvtx-13.0.39-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cc113127785c96db8a0fe715df92db9788777b4b3d1bd713d42f75969201b5ce", size = 147197, 
upload-time = "2025-08-04T10:18:39.829Z" }, + { url = "https://files.pythonhosted.org/packages/86/91/8b486ba85f71a2859dd705a4ec6aab38c37a389b8b7f94343db027732999/nvidia_nvtx-13.0.39-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cddd2e08b35144f1000631c3880c9ebbcb8a2863d762e76f92d47d30ecaf87cc", size = 148037, upload-time = "2025-08-04T10:18:31.763Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "pandas" +version = "2.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" }, + { url = 
"https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" }, + { url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" }, + { url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" }, + { url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" }, + { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" }, + { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" }, + { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" }, + { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" }, + { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" }, +] + +[[package]] +name = "partd" +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "locket" }, + { name = "toolz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/3a/3f06f34820a31257ddcabdfafc2672c5816be79c7e353b02c1f318daa7d4/partd-1.4.2.tar.gz", hash = "sha256:d022c33afbdc8405c226621b015e8067888173d85f7f5ecebb3cafed9a20f02c", size = 21029, upload-time = "2024-05-06T19:51:41.945Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl", hash = "sha256:978e4ac767ec4ba5b86c6eaa52e5a2a3bc748a2ca839e8cc798f1cc6ce6efb0f", size = 18905, upload-time = "2024-05-06T19:51:39.271Z" }, +] + +[[package]] +name = "pikepdf" +version = "10.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "lxml" }, + { name = "packaging" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/ba/7635a5f4259a2a91ed4f094e358dec3068ecedc891d70b8e76a02904ca0c/pikepdf-10.3.0.tar.gz", hash = "sha256:e2a64a5f1ebf8c411193126b9eeff7faf5739a40bce7441e579531422469fbb1", size = 4575749, upload-time = "2026-01-30T07:33:53.317Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/a9/0d2107a3c796ab2fa7d379ee801190c95c4132f0bb5cfc1fd8d2e3ac74af/pikepdf-10.3.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:99fb21d20dc02f9828d477d2c549ee3f6e191801f84a2a2505d21baacb731745", size = 4753016, upload-time = "2026-01-30T07:32:51.999Z" }, + { url = 
"https://files.pythonhosted.org/packages/a9/2b/f634a0956aa15074db6c62309ec3d08bd158ddbdea8bd2081cea8b6eb3ed/pikepdf-10.3.0-cp311-cp311-macosx_15_0_x86_64.whl", hash = "sha256:c8a4b6862d7e0e69dd3f57efd362826966d1f341e0d052f7f23f0fe3a2375a36", size = 5063869, upload-time = "2026-01-30T07:32:54.418Z" }, + { url = "https://files.pythonhosted.org/packages/25/8e/d5ba1febacde805e7ec75a3df0888e53212f8e5f82fa1fc09c0fa981c7f9/pikepdf-10.3.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b86d42e66004ffaf5284aae0d9814bb3d19f048a45943479db5ca3d02d46bfb", size = 2445530, upload-time = "2026-01-30T07:32:56.117Z" }, + { url = "https://files.pythonhosted.org/packages/b9/ba/196351a049a7a9d255140a414f586779b3ad77f0d09091e639d9f85c4131/pikepdf-10.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7021b31eddd5aa611f6941a2c171b7ce321c7763263ff658368f5f40bda1d4", size = 2673622, upload-time = "2026-01-30T07:32:57.85Z" }, + { url = "https://files.pythonhosted.org/packages/7c/cf/1315759de9dc66f769f84067da2127046e46489100f6e2be614fcb6c8394/pikepdf-10.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b653b1d0c5f17efb080ef68b65d3fcc8909f22128b75e0479775a35cd8d9fe6e", size = 3644910, upload-time = "2026-01-30T07:33:00.182Z" }, + { url = "https://files.pythonhosted.org/packages/80/6f/578ee7b53d06267f6c489fb7734792f6fa670a3a7d0b55db20b084e0957d/pikepdf-10.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:fa3e4b32a2c1d15bb57e91ee3896c19b3c8145d46c26fbac8747efe7cb5ce3bd", size = 3835871, upload-time = "2026-01-30T07:33:02.804Z" }, + { url = "https://files.pythonhosted.org/packages/d7/0f/980dbfb5ab9231d30e44d9285e8a7509f0871fc6fe438559e1eed16e683d/pikepdf-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:3233da668d665d301a4a4fd1481867e688336fdb410e9bc9d4e5b0cd62e334eb", size = 3756976, upload-time = "2026-01-30T07:33:05.596Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/22/d6ca7f6066d7f3b61b56bffeca1069c0ded635ba316aa1df54fcc0e2104f/pikepdf-10.3.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:d1a6646def3fc47f763eab0dcb11341a7205cef1b7dc5c62f1dee435a89472b9", size = 4762039, upload-time = "2026-01-30T07:33:08.626Z" }, + { url = "https://files.pythonhosted.org/packages/9c/dc/d0db713a34a493eedf4eded566668762aee5acfad958bdf374a450df931c/pikepdf-10.3.0-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:e968e4e81d6c05d8e4b24594b27a64cb9be3c7a4371bf0635f6b669559171e6b", size = 5078640, upload-time = "2026-01-30T07:33:10.478Z" }, + { url = "https://files.pythonhosted.org/packages/21/c0/e0a1f1afb99ecac5f7f21313b47c174178f85df0f1ec7080e0d431324099/pikepdf-10.3.0-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dfad0e4e6bc268ca041d639b232d76c25c9ad7023b7189d14869ef4446cabda2", size = 2450284, upload-time = "2026-01-30T07:33:12.215Z" }, + { url = "https://files.pythonhosted.org/packages/db/3a/2f0e8bd70cf57896a85b1d7f7ca3ce79d91a17222e1b23b607860ea52a5d/pikepdf-10.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7cf7ab25f1e9063de320d2edecb2cd2960329cc25bac645c7938390f6538d9bf", size = 2699411, upload-time = "2026-01-30T07:33:13.878Z" }, + { url = "https://files.pythonhosted.org/packages/fd/10/da5f244aa14b845cd835f34b6a7a217493952f2532d2e00957ed3bd79aea/pikepdf-10.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3904353137e5b0cb2a316d84057e1e5301a65e6b1810d4763348ae8919ba20f4", size = 3649524, upload-time = "2026-01-30T07:33:15.641Z" }, + { url = "https://files.pythonhosted.org/packages/c1/ef/3efb78a16d9c702dfd64fdeaee6a1ac6af95c41d4ec60b784e9171f20753/pikepdf-10.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4335ec70a659b5be1dfc7094a67db7f9c017c9c1cf9049b56d0e35ad24a46ff0", size = 3861320, upload-time = "2026-01-30T07:33:17.466Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/63/b0243fe62cf5d4d9da49010a15e0177b9629b8183092b3bd804f59a1529a/pikepdf-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:ac5befc1e991e28b16be104c219bdb1f6cf62a8371f4019ce7bab64ec5ec5745", size = 3763570, upload-time = "2026-01-30T07:33:19.863Z" }, +] + +[[package]] +name = "pillow" +version = "10.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/74/ad3d526f3bf7b6d3f408b73fde271ec69dfac8b81341a318ce825f2b3812/pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06", size = 46555059, upload-time = "2024-07-01T09:48:43.583Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/62/c9449f9c3043c37f73e7487ec4ef0c03eb9c9afc91a92b977a67b3c0bbc5/pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c", size = 3509265, upload-time = "2024-07-01T09:45:49.812Z" }, + { url = "https://files.pythonhosted.org/packages/f4/5f/491dafc7bbf5a3cc1845dc0430872e8096eb9e2b6f8161509d124594ec2d/pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be", size = 3375655, upload-time = "2024-07-01T09:45:52.462Z" }, + { url = "https://files.pythonhosted.org/packages/73/d5/c4011a76f4207a3c151134cd22a1415741e42fa5ddecec7c0182887deb3d/pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3", size = 4340304, upload-time = "2024-07-01T09:45:55.006Z" }, + { url = "https://files.pythonhosted.org/packages/ac/10/c67e20445a707f7a610699bba4fe050583b688d8cd2d202572b257f46600/pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6", size = 4452804, upload-time = "2024-07-01T09:45:58.437Z" }, + { 
url = "https://files.pythonhosted.org/packages/a9/83/6523837906d1da2b269dee787e31df3b0acb12e3d08f024965a3e7f64665/pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe", size = 4365126, upload-time = "2024-07-01T09:46:00.713Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e5/8c68ff608a4203085158cff5cc2a3c534ec384536d9438c405ed6370d080/pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319", size = 4533541, upload-time = "2024-07-01T09:46:03.235Z" }, + { url = "https://files.pythonhosted.org/packages/f4/7c/01b8dbdca5bc6785573f4cee96e2358b0918b7b2c7b60d8b6f3abf87a070/pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d", size = 4471616, upload-time = "2024-07-01T09:46:05.356Z" }, + { url = "https://files.pythonhosted.org/packages/c8/57/2899b82394a35a0fbfd352e290945440e3b3785655a03365c0ca8279f351/pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696", size = 4600802, upload-time = "2024-07-01T09:46:08.145Z" }, + { url = "https://files.pythonhosted.org/packages/4d/d7/a44f193d4c26e58ee5d2d9db3d4854b2cfb5b5e08d360a5e03fe987c0086/pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496", size = 2235213, upload-time = "2024-07-01T09:46:10.211Z" }, + { url = "https://files.pythonhosted.org/packages/c1/d0/5866318eec2b801cdb8c82abf190c8343d8a1cd8bf5a0c17444a6f268291/pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91", size = 2554498, upload-time = "2024-07-01T09:46:12.685Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/c8/310ac16ac2b97e902d9eb438688de0d961660a87703ad1561fd3dfbd2aa0/pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22", size = 2243219, upload-time = "2024-07-01T09:46:14.83Z" }, + { url = "https://files.pythonhosted.org/packages/05/cb/0353013dc30c02a8be34eb91d25e4e4cf594b59e5a55ea1128fde1e5f8ea/pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94", size = 3509350, upload-time = "2024-07-01T09:46:17.177Z" }, + { url = "https://files.pythonhosted.org/packages/e7/cf/5c558a0f247e0bf9cec92bff9b46ae6474dd736f6d906315e60e4075f737/pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597", size = 3374980, upload-time = "2024-07-01T09:46:19.169Z" }, + { url = "https://files.pythonhosted.org/packages/84/48/6e394b86369a4eb68b8a1382c78dc092245af517385c086c5094e3b34428/pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80", size = 4343799, upload-time = "2024-07-01T09:46:21.883Z" }, + { url = "https://files.pythonhosted.org/packages/3b/f3/a8c6c11fa84b59b9df0cd5694492da8c039a24cd159f0f6918690105c3be/pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca", size = 4459973, upload-time = "2024-07-01T09:46:24.321Z" }, + { url = "https://files.pythonhosted.org/packages/7d/1b/c14b4197b80150fb64453585247e6fb2e1d93761fa0fa9cf63b102fde822/pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef", size = 4370054, upload-time = "2024-07-01T09:46:26.825Z" }, + { url = 
"https://files.pythonhosted.org/packages/55/77/40daddf677897a923d5d33329acd52a2144d54a9644f2a5422c028c6bf2d/pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a", size = 4539484, upload-time = "2024-07-01T09:46:29.355Z" }, + { url = "https://files.pythonhosted.org/packages/40/54/90de3e4256b1207300fb2b1d7168dd912a2fb4b2401e439ba23c2b2cabde/pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b", size = 4477375, upload-time = "2024-07-01T09:46:31.756Z" }, + { url = "https://files.pythonhosted.org/packages/13/24/1bfba52f44193860918ff7c93d03d95e3f8748ca1de3ceaf11157a14cf16/pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9", size = 4608773, upload-time = "2024-07-01T09:46:33.73Z" }, + { url = "https://files.pythonhosted.org/packages/55/04/5e6de6e6120451ec0c24516c41dbaf80cce1b6451f96561235ef2429da2e/pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42", size = 2235690, upload-time = "2024-07-01T09:46:36.587Z" }, + { url = "https://files.pythonhosted.org/packages/74/0a/d4ce3c44bca8635bd29a2eab5aa181b654a734a29b263ca8efe013beea98/pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a", size = 2554951, upload-time = "2024-07-01T09:46:38.777Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ca/184349ee40f2e92439be9b3502ae6cfc43ac4b50bc4fc6b3de7957563894/pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9", size = 2243427, upload-time = "2024-07-01T09:46:43.15Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "propcache" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = "sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/d4/4e2c9aaf7ac2242b9358f98dccd8f90f2605402f5afeff6c578682c2c491/propcache-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:60a8fda9644b7dfd5dece8c61d8a85e271cb958075bfc4e01083c148b61a7caf", size = 80208, upload-time = "2025-10-08T19:46:24.597Z" }, + { url = "https://files.pythonhosted.org/packages/c2/21/d7b68e911f9c8e18e4ae43bdbc1e1e9bbd971f8866eb81608947b6f585ff/propcache-0.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c30b53e7e6bda1d547cabb47c825f3843a0a1a42b0496087bb58d8fedf9f41b5", size = 45777, upload-time = "2025-10-08T19:46:25.733Z" }, + { url = "https://files.pythonhosted.org/packages/d3/1d/11605e99ac8ea9435651ee71ab4cb4bf03f0949586246476a25aadfec54a/propcache-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6918ecbd897443087a3b7cd978d56546a812517dcaaca51b49526720571fa93e", size = 47647, upload-time = "2025-10-08T19:46:27.304Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/1a/3c62c127a8466c9c843bccb503d40a273e5cc69838805f322e2826509e0d/propcache-0.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d902a36df4e5989763425a8ab9e98cd8ad5c52c823b34ee7ef307fd50582566", size = 214929, upload-time = "2025-10-08T19:46:28.62Z" }, + { url = "https://files.pythonhosted.org/packages/56/b9/8fa98f850960b367c4b8fe0592e7fc341daa7a9462e925228f10a60cf74f/propcache-0.4.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a9695397f85973bb40427dedddf70d8dc4a44b22f1650dd4af9eedf443d45165", size = 221778, upload-time = "2025-10-08T19:46:30.358Z" }, + { url = "https://files.pythonhosted.org/packages/46/a6/0ab4f660eb59649d14b3d3d65c439421cf2f87fe5dd68591cbe3c1e78a89/propcache-0.4.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2bb07ffd7eaad486576430c89f9b215f9e4be68c4866a96e97db9e97fead85dc", size = 228144, upload-time = "2025-10-08T19:46:32.607Z" }, + { url = "https://files.pythonhosted.org/packages/52/6a/57f43e054fb3d3a56ac9fc532bc684fc6169a26c75c353e65425b3e56eef/propcache-0.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd6f30fdcf9ae2a70abd34da54f18da086160e4d7d9251f81f3da0ff84fc5a48", size = 210030, upload-time = "2025-10-08T19:46:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/40/e2/27e6feebb5f6b8408fa29f5efbb765cd54c153ac77314d27e457a3e993b7/propcache-0.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fc38cba02d1acba4e2869eef1a57a43dfbd3d49a59bf90dda7444ec2be6a5570", size = 208252, upload-time = "2025-10-08T19:46:35.309Z" }, + { url = "https://files.pythonhosted.org/packages/9e/f8/91c27b22ccda1dbc7967f921c42825564fa5336a01ecd72eb78a9f4f53c2/propcache-0.4.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:67fad6162281e80e882fb3ec355398cf72864a54069d060321f6cd0ade95fe85", size 
= 202064, upload-time = "2025-10-08T19:46:36.993Z" }, + { url = "https://files.pythonhosted.org/packages/f2/26/7f00bd6bd1adba5aafe5f4a66390f243acab58eab24ff1a08bebb2ef9d40/propcache-0.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f10207adf04d08bec185bae14d9606a1444715bc99180f9331c9c02093e1959e", size = 212429, upload-time = "2025-10-08T19:46:38.398Z" }, + { url = "https://files.pythonhosted.org/packages/84/89/fd108ba7815c1117ddca79c228f3f8a15fc82a73bca8b142eb5de13b2785/propcache-0.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e9b0d8d0845bbc4cfcdcbcdbf5086886bc8157aa963c31c777ceff7846c77757", size = 216727, upload-time = "2025-10-08T19:46:39.732Z" }, + { url = "https://files.pythonhosted.org/packages/79/37/3ec3f7e3173e73f1d600495d8b545b53802cbf35506e5732dd8578db3724/propcache-0.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:981333cb2f4c1896a12f4ab92a9cc8f09ea664e9b7dbdc4eff74627af3a11c0f", size = 205097, upload-time = "2025-10-08T19:46:41.025Z" }, + { url = "https://files.pythonhosted.org/packages/61/b0/b2631c19793f869d35f47d5a3a56fb19e9160d3c119f15ac7344fc3ccae7/propcache-0.4.1-cp311-cp311-win32.whl", hash = "sha256:f1d2f90aeec838a52f1c1a32fe9a619fefd5e411721a9117fbf82aea638fe8a1", size = 38084, upload-time = "2025-10-08T19:46:42.693Z" }, + { url = "https://files.pythonhosted.org/packages/f4/78/6cce448e2098e9f3bfc91bb877f06aa24b6ccace872e39c53b2f707c4648/propcache-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:364426a62660f3f699949ac8c621aad6977be7126c5807ce48c0aeb8e7333ea6", size = 41637, upload-time = "2025-10-08T19:46:43.778Z" }, + { url = "https://files.pythonhosted.org/packages/9c/e9/754f180cccd7f51a39913782c74717c581b9cc8177ad0e949f4d51812383/propcache-0.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:e53f3a38d3510c11953f3e6a33f205c6d1b001129f972805ca9b42fc308bc239", size = 38064, upload-time = "2025-10-08T19:46:44.872Z" }, + { url = 
"https://files.pythonhosted.org/packages/a2/0f/f17b1b2b221d5ca28b4b876e8bb046ac40466513960646bda8e1853cdfa2/propcache-0.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2", size = 80061, upload-time = "2025-10-08T19:46:46.075Z" }, + { url = "https://files.pythonhosted.org/packages/76/47/8ccf75935f51448ba9a16a71b783eb7ef6b9ee60f5d14c7f8a8a79fbeed7/propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403", size = 46037, upload-time = "2025-10-08T19:46:47.23Z" }, + { url = "https://files.pythonhosted.org/packages/0a/b6/5c9a0e42df4d00bfb4a3cbbe5cf9f54260300c88a0e9af1f47ca5ce17ac0/propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207", size = 47324, upload-time = "2025-10-08T19:46:48.384Z" }, + { url = "https://files.pythonhosted.org/packages/9e/d3/6c7ee328b39a81ee877c962469f1e795f9db87f925251efeb0545e0020d0/propcache-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec17c65562a827bba85e3872ead335f95405ea1674860d96483a02f5c698fa72", size = 225505, upload-time = "2025-10-08T19:46:50.055Z" }, + { url = "https://files.pythonhosted.org/packages/01/5d/1c53f4563490b1d06a684742cc6076ef944bc6457df6051b7d1a877c057b/propcache-0.4.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:405aac25c6394ef275dee4c709be43745d36674b223ba4eb7144bf4d691b7367", size = 230242, upload-time = "2025-10-08T19:46:51.815Z" }, + { url = "https://files.pythonhosted.org/packages/20/e1/ce4620633b0e2422207c3cb774a0ee61cac13abc6217763a7b9e2e3f4a12/propcache-0.4.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0013cb6f8dde4b2a2f66903b8ba740bdfe378c943c4377a200551ceb27f379e4", size = 238474, upload-time = 
"2025-10-08T19:46:53.208Z" }, + { url = "https://files.pythonhosted.org/packages/46/4b/3aae6835b8e5f44ea6a68348ad90f78134047b503765087be2f9912140ea/propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15932ab57837c3368b024473a525e25d316d8353016e7cc0e5ba9eb343fbb1cf", size = 221575, upload-time = "2025-10-08T19:46:54.511Z" }, + { url = "https://files.pythonhosted.org/packages/6e/a5/8a5e8678bcc9d3a1a15b9a29165640d64762d424a16af543f00629c87338/propcache-0.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:031dce78b9dc099f4c29785d9cf5577a3faf9ebf74ecbd3c856a7b92768c3df3", size = 216736, upload-time = "2025-10-08T19:46:56.212Z" }, + { url = "https://files.pythonhosted.org/packages/f1/63/b7b215eddeac83ca1c6b934f89d09a625aa9ee4ba158338854c87210cc36/propcache-0.4.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ab08df6c9a035bee56e31af99be621526bd237bea9f32def431c656b29e41778", size = 213019, upload-time = "2025-10-08T19:46:57.595Z" }, + { url = "https://files.pythonhosted.org/packages/57/74/f580099a58c8af587cac7ba19ee7cb418506342fbbe2d4a4401661cca886/propcache-0.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4d7af63f9f93fe593afbf104c21b3b15868efb2c21d07d8732c0c4287e66b6a6", size = 220376, upload-time = "2025-10-08T19:46:59.067Z" }, + { url = "https://files.pythonhosted.org/packages/c4/ee/542f1313aff7eaf19c2bb758c5d0560d2683dac001a1c96d0774af799843/propcache-0.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:cfc27c945f422e8b5071b6e93169679e4eb5bf73bbcbf1ba3ae3a83d2f78ebd9", size = 226988, upload-time = "2025-10-08T19:47:00.544Z" }, + { url = "https://files.pythonhosted.org/packages/8f/18/9c6b015dd9c6930f6ce2229e1f02fb35298b847f2087ea2b436a5bfa7287/propcache-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:35c3277624a080cc6ec6f847cbbbb5b49affa3598c4535a0a4682a697aaa5c75", size = 215615, upload-time = "2025-10-08T19:47:01.968Z" }, + { url = 
"https://files.pythonhosted.org/packages/80/9e/e7b85720b98c45a45e1fca6a177024934dc9bc5f4d5dd04207f216fc33ed/propcache-0.4.1-cp312-cp312-win32.whl", hash = "sha256:671538c2262dadb5ba6395e26c1731e1d52534bfe9ae56d0b5573ce539266aa8", size = 38066, upload-time = "2025-10-08T19:47:03.503Z" }, + { url = "https://files.pythonhosted.org/packages/54/09/d19cff2a5aaac632ec8fc03737b223597b1e347416934c1b3a7df079784c/propcache-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:cb2d222e72399fcf5890d1d5cc1060857b9b236adff2792ff48ca2dfd46c81db", size = 41655, upload-time = "2025-10-08T19:47:04.973Z" }, + { url = "https://files.pythonhosted.org/packages/68/ab/6b5c191bb5de08036a8c697b265d4ca76148efb10fa162f14af14fb5f076/propcache-0.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:204483131fb222bdaaeeea9f9e6c6ed0cac32731f75dfc1d4a567fc1926477c1", size = 37789, upload-time = "2025-10-08T19:47:06.077Z" }, + { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" }, +] + +[[package]] +name = "psutil" +version = "7.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" }, + { url = 
"https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" }, + { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" }, + { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" }, + { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" }, + { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, +] + +[[package]] +name = "pyarrow" +version = "23.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" }, + { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" }, + { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" }, + { url = 
"https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" }, + { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" }, + { url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" }, + { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, + { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, + { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, + { url = 
"https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" }, + { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, + { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pymupdf" +version = 
"1.24.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pymupdfb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/57/da06ca4886afc71a624e4b463d05f45c8a822596ede939957295e229eb4e/PyMuPDF-1.24.10.tar.gz", hash = "sha256:bd3ebd6d3fb8a845582098362f885bfb0a31ae4272587efc2c55c5e29fe7327a", size = 46988085, upload-time = "2024-09-02T16:28:45.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/35/6af0bb4bafe9d54893a04d9639f73b1b754efe0235997052d75fb6b7edc1/PyMuPDF-1.24.10-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:5fbd67cce759fc0126902137409cf9da6313b776c4d5ff0d5200f336350f86a3", size = 3194012, upload-time = "2024-09-02T16:27:14.019Z" }, + { url = "https://files.pythonhosted.org/packages/bf/2b/c254cf49dfcf2469a674407a680f5b2b174b866e84d322f5767baf4d3ad3/PyMuPDF-1.24.10-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:2b14dbdf7c415bb0fa849527abbe7b4f1f55ae23b9355d132951f634438c59ac", size = 2974781, upload-time = "2024-09-02T16:27:17.213Z" }, + { url = "https://files.pythonhosted.org/packages/1c/77/78800d3a711f92060f8e338a5df9330ffb5950f4fb3beeba01e15c03c4c6/PyMuPDF-1.24.10-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:1a87440a6cbc0d5ad513425baa0f4747841898fca6e37350ca3e6b29e5f40c01", size = 3210393, upload-time = "2024-09-02T22:17:05.788Z" }, + { url = "https://files.pythonhosted.org/packages/c5/39/3aaa1e8822c55c71bb37911b5b1c3157ef38d731581224b29a682d80a17b/PyMuPDF-1.24.10-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:c0d1ccdc062ea9961063790831e838bc43fcf9a8436a8b9f55898addf97c0f86", size = 3482650, upload-time = "2024-09-02T16:27:21.101Z" }, + { url = "https://files.pythonhosted.org/packages/5b/73/6b5c2dc59539b79cb9430ff946d7dff308af146f7c8bc7b96c963e12970d/PyMuPDF-1.24.10-cp311-none-musllinux_1_2_x86_64.whl", hash = "sha256:f68671363be5a2ba104ab7d3bad821d2994cbe3f3408538bbc27d32e6dc9f923", size = 3600588, upload-time = "2024-09-02T16:27:25.022Z" }, + { url = 
"https://files.pythonhosted.org/packages/71/e9/d3bf062325b4821726a2f9ce9d75b63f594ae24bc38c31f55b4285f1f5e1/PyMuPDF-1.24.10-cp311-none-win32.whl", hash = "sha256:49f83556cd1a7d05b36a54ccc01fce324da8a4e6854e36cc5cd94d321e428565", size = 2694768, upload-time = "2024-09-02T16:27:33.318Z" }, + { url = "https://files.pythonhosted.org/packages/30/3f/356a70c105d4410c29529f1ca8c53b5d176b448a4409238b4dcd133507a4/PyMuPDF-1.24.10-cp311-none-win_amd64.whl", hash = "sha256:05b8d360766b87f4abd186eba16a56b92bae513b2361b13f633fe6256329292e", size = 3214889, upload-time = "2024-09-02T16:27:28.174Z" }, + { url = "https://files.pythonhosted.org/packages/75/84/7231344d98355a40fb57c4025391dfb4116e2c3e9d98d5cc83f80c5ea942/PyMuPDF-1.24.10-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:f323aa7bb55e0214e632bfe24fa140bd5dcfeac2d3977bdce46e760385140513", size = 3230169, upload-time = "2024-09-02T16:27:37.842Z" }, + { url = "https://files.pythonhosted.org/packages/b2/bc/975b4fe4400b00c912dad1874c43d31486150e6f39d7dae758751c27e2dd/PyMuPDF-1.24.10-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:50d2972558d25ce46a8634b58787b28dbeff9b3fe4299530fc9c8c9921061e83", size = 2980118, upload-time = "2024-09-02T16:27:41.534Z" }, + { url = "https://files.pythonhosted.org/packages/5b/dc/0f22c77ac4f8e6b8316072519513d5f0111fffe96d357051db0ddf043032/PyMuPDF-1.24.10-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:0e3969c2fdff682b3b2c6a2b463adde068d6d8e20e2133ef6c8503469259646a", size = 3216830, upload-time = "2024-09-02T22:17:09.193Z" }, + { url = "https://files.pythonhosted.org/packages/a3/1b/1b41b27aab571b835f8d983492b80ed64548e3b5c4d169e23c639727d43b/PyMuPDF-1.24.10-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:cd78ee1ebefdfe72bc36fd4b731cc8c694eb8ef5337d8ea956b0e94cd88751fc", size = 3491118, upload-time = "2024-09-02T16:27:50.098Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/3c/f1ffbc6e13ab37900c2aa71e434bbba922770091242e2b059acdb14f779e/PyMuPDF-1.24.10-cp312-none-musllinux_1_2_x86_64.whl", hash = "sha256:696eed91d2ee44e76277dfeb6bd904c84ae005378588949df6ed9be9e03b9817", size = 3612589, upload-time = "2024-09-02T16:27:54.185Z" }, + { url = "https://files.pythonhosted.org/packages/53/fb/158909af75c84968ea7e6659a75fd67bd462103c599033b23ffd6bc173be/PyMuPDF-1.24.10-cp312-none-win32.whl", hash = "sha256:1e5413e1aeab2f18e1ca1b3ff17057a4a7c5cbf4ff14abc93203da88fc1a1dd8", size = 2701190, upload-time = "2024-09-02T16:27:57.74Z" }, + { url = "https://files.pythonhosted.org/packages/91/4a/4a54d3f6a779ac5eed92e82fe3c1bb426bc40f9ea57c8656839198944a82/PyMuPDF-1.24.10-cp312-none-win_amd64.whl", hash = "sha256:227a4473fce8fa32b9268da68781048795503b67dc045867fc201e1334204bf1", size = 3228084, upload-time = "2024-09-02T16:27:45.749Z" }, +] + +[[package]] +name = "pymupdfb" +version = "1.24.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c9/ff/ecfcb41414b51976974d74c8e35fef0a0e5b47c7046a11c860553f5dccf0/PyMuPDFb-1.24.10.tar.gz", hash = "sha256:007b91fa9b528c5c0eecea2e49c486ac02e878274f9e31522bdd948adc5f8327", size = 37502, upload-time = "2024-09-02T16:28:48.343Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/94/b217dc987b4ac0e3793984427112d6032563b741e27763f7761c2231d022/PyMuPDFb-1.24.10-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:cd6b24630d90dce9ab3e59d06c5e616686f8d7ec626be1311721fcb062aa0078", size = 15536229, upload-time = "2024-09-02T16:25:19.4Z" }, + { url = "https://files.pythonhosted.org/packages/16/7a/f634c76d8331cb8dedcfaced17424cc469ee20b7f53cf29c9ef17a01b461/PyMuPDFb-1.24.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fda2c34b206f724b1b5685b67188e2a57bcaa5c99bc40a0a5bc62057514c5cdf", size = 15149482, upload-time = "2024-09-02T16:25:34.352Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/97/67b5da2edd034e66dadd0ec530e277afb14fe866a3b3b01d9fad154bc6f8/PyMuPDFb-1.24.10-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4f50a7472f9bb10cbc7a1cd589ee4626ca030b8a4a02749f9a29eb6f00c0e0db", size = 15711338, upload-time = "2024-09-02T22:17:01.592Z" }, + { url = "https://files.pythonhosted.org/packages/62/b9/ad3f076e86328880797fe7e98c43b2879df56cf6cb75ac3058da06d6e6cb/PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:409f1270ef2e70d845e80149ff3db9cfed578274042316cba55cc3e3882421ea", size = 15921939, upload-time = "2024-09-02T16:26:00.118Z" }, + { url = "https://files.pythonhosted.org/packages/15/e7/02160ea905a7ba16d6e1ca51759ae1c1045785ebebae57ba30e82617f934/PyMuPDFb-1.24.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:aca96b6e9ee3096a26810592f4d899f4d3cf3cf0c902ae7e8cca09bce4d946c4", size = 17076991, upload-time = "2024-09-02T16:25:46.703Z" }, + { url = "https://files.pythonhosted.org/packages/d3/c0/e1ed840440131f71b068cdb3b620a69ec27543b1012a6bd855d8d05f1629/PyMuPDFb-1.24.10-py3-none-win32.whl", hash = "sha256:2d231b42fe3bf79837df235e7fbdf7ff8b46bf4ca1346d0f0124fb1cdd343ce8", size = 11731706, upload-time = "2024-09-02T16:26:19.131Z" }, + { url = "https://files.pythonhosted.org/packages/70/cb/8459d6c179befd7c6eee555334f054e9a6dcdd9f8671891e1da19e0ce526/PyMuPDFb-1.24.10-py3-none-win_amd64.whl", hash = "sha256:27ea65c701608b6b7632703339ca33ea6d513843b26dbe9bdefb2f56f7b9b196", size = 13186168, upload-time = "2024-09-02T16:26:10.503Z" }, +] + +[[package]] +name = "pypdfium2" +version = "4.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239, upload-time = "2024-05-09T18:33:17.552Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254, upload-time = "2024-05-09T18:32:48.653Z" }, + { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624, upload-time = "2024-05-09T18:32:51.458Z" }, + { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126, upload-time = "2024-05-09T18:32:53.581Z" }, + { url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077, upload-time = "2024-05-09T18:32:55.99Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431, upload-time = "2024-05-09T18:32:57.911Z" }, + { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008, upload-time = "2024-05-09T18:32:59.886Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543, upload-time = "2024-05-09T18:33:02.597Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911, upload-time = "2024-05-09T18:33:05.376Z" }, + { url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430, upload-time = "2024-05-09T18:33:08.067Z" }, + { url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951, upload-time = "2024-05-09T18:33:10.567Z" }, + { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098, upload-time = "2024-05-09T18:33:13.107Z" }, + { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118, upload-time = "2024-05-09T18:33:15.489Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform 
== 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + +[[package]] +name = "pytz" +version = "2026.1.post1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/db/b8721d71d945e6a8ac63c0fc900b2067181dbb50805958d4d4661cf7d277/pytz-2026.1.post1.tar.gz", hash = "sha256:3378dde6a0c3d26719182142c56e60c7f9af7e968076f31aae569d72a0358ee1", size = 321088, upload-time = "2026-03-03T07:47:50.683Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/10/99/781fe0c827be2742bcc775efefccb3b048a3a9c6ce9aec0cbf4a101677e5/pytz-2026.1.post1-py2.py3-none-any.whl", hash = "sha256:f2fd16142fda348286a75e1a524be810bb05d444e5a081f37f7affc635035f7a", size = 510489, upload-time = "2026-03-03T07:47:49.167Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" }, + { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" }, + { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" }, + { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = 
"2025-09-25T21:32:03.376Z" }, + { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" }, + { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" }, + { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" }, + { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" }, + { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" }, + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, +] + +[[package]] +name = "regex" +version = "2026.2.28" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/71/41455aa99a5a5ac1eaf311f5d8efd9ce6433c03ac1e0962de163350d0d97/regex-2026.2.28.tar.gz", hash = "sha256:a729e47d418ea11d03469f321aaf67cdee8954cde3ff2cf8403ab87951ad10f2", size = 415184, upload-time = "2026-02-28T02:19:42.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/db/8cbfd0ba3f302f2d09dd0019a9fcab74b63fee77a76c937d0e33161fb8c1/regex-2026.2.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e621fb7c8dc147419b28e1702f58a0177ff8308a76fa295c71f3e7827849f5d9", size = 488462, upload-time = "2026-02-28T02:16:22.616Z" }, + { url = "https://files.pythonhosted.org/packages/5d/10/ccc22c52802223f2368731964ddd117799e1390ffc39dbb31634a83022ee/regex-2026.2.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0d5bef2031cbf38757a0b0bc4298bb4824b6332d28edc16b39247228fbdbad97", size = 290774, upload-time = "2026-02-28T02:16:23.993Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/b9/6796b3bf3101e64117201aaa3a5a030ec677ecf34b3cd6141b5d5c6c67d5/regex-2026.2.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bcb399ed84eabf4282587ba151f2732ad8168e66f1d3f85b1d038868fe547703", size = 288724, upload-time = "2026-02-28T02:16:25.403Z" }, + { url = "https://files.pythonhosted.org/packages/9c/02/291c0ae3f3a10cea941d0f5366da1843d8d1fa8a25b0671e20a0e454bb38/regex-2026.2.28-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7c1b34dfa72f826f535b20712afa9bb3ba580020e834f3c69866c5bddbf10098", size = 791924, upload-time = "2026-02-28T02:16:26.863Z" }, + { url = "https://files.pythonhosted.org/packages/0f/57/f0235cc520d9672742196c5c15098f8f703f2758d48d5a7465a56333e496/regex-2026.2.28-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:851fa70df44325e1e4cdb79c5e676e91a78147b1b543db2aec8734d2add30ec2", size = 860095, upload-time = "2026-02-28T02:16:28.772Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7c/393c94cbedda79a0f5f2435ebd01644aba0b338d327eb24b4aa5b8d6c07f/regex-2026.2.28-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:516604edd17b1c2c3e579cf4e9b25a53bf8fa6e7cedddf1127804d3e0140ca64", size = 906583, upload-time = "2026-02-28T02:16:30.977Z" }, + { url = "https://files.pythonhosted.org/packages/2c/73/a72820f47ca5abf2b5d911d0407ba5178fc52cf9780191ed3a54f5f419a2/regex-2026.2.28-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e7ce83654d1ab701cb619285a18a8e5a889c1216d746ddc710c914ca5fd71022", size = 800234, upload-time = "2026-02-28T02:16:32.55Z" }, + { url = "https://files.pythonhosted.org/packages/34/b3/6e6a4b7b31fa998c4cf159a12cbeaf356386fbd1a8be743b1e80a3da51e4/regex-2026.2.28-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:f2791948f7c70bb9335a9102df45e93d428f4b8128020d85920223925d73b9e1", size = 772803, upload-time = "2026-02-28T02:16:34.029Z" }, + { url = "https://files.pythonhosted.org/packages/10/e7/5da0280c765d5a92af5e1cd324b3fe8464303189cbaa449de9a71910e273/regex-2026.2.28-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:03a83cc26aa2acda6b8b9dfe748cf9e84cbd390c424a1de34fdcef58961a297a", size = 781117, upload-time = "2026-02-28T02:16:36.253Z" }, + { url = "https://files.pythonhosted.org/packages/76/39/0b8d7efb256ae34e1b8157acc1afd8758048a1cf0196e1aec2e71fd99f4b/regex-2026.2.28-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ec6f5674c5dc836994f50f1186dd1fafde4be0666aae201ae2fcc3d29d8adf27", size = 854224, upload-time = "2026-02-28T02:16:38.119Z" }, + { url = "https://files.pythonhosted.org/packages/21/ff/a96d483ebe8fe6d1c67907729202313895d8de8495569ec319c6f29d0438/regex-2026.2.28-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:50c2fc924749543e0eacc93ada6aeeb3ea5f6715825624baa0dccaec771668ae", size = 761898, upload-time = "2026-02-28T02:16:40.333Z" }, + { url = "https://files.pythonhosted.org/packages/89/bd/d4f2e75cb4a54b484e796017e37c0d09d8a0a837de43d17e238adf163f4e/regex-2026.2.28-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ba55c50f408fb5c346a3a02d2ce0ebc839784e24f7c9684fde328ff063c3cdea", size = 844832, upload-time = "2026-02-28T02:16:41.875Z" }, + { url = "https://files.pythonhosted.org/packages/8a/a7/428a135cf5e15e4e11d1e696eb2bf968362f8ea8a5f237122e96bc2ae950/regex-2026.2.28-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:edb1b1b3a5576c56f08ac46f108c40333f222ebfd5cf63afdfa3aab0791ebe5b", size = 788347, upload-time = "2026-02-28T02:16:43.472Z" }, + { url = "https://files.pythonhosted.org/packages/a9/59/68691428851cf9c9c3707217ab1d9b47cfeec9d153a49919e6c368b9e926/regex-2026.2.28-cp311-cp311-win32.whl", hash = "sha256:948c12ef30ecedb128903c2c2678b339746eb7c689c5c21957c4a23950c96d15", size = 266033, upload-time = 
"2026-02-28T02:16:45.094Z" }, + { url = "https://files.pythonhosted.org/packages/42/8b/1483de1c57024e89296cbcceb9cccb3f625d416ddb46e570be185c9b05a9/regex-2026.2.28-cp311-cp311-win_amd64.whl", hash = "sha256:fd63453f10d29097cc3dc62d070746523973fb5aa1c66d25f8558bebd47fed61", size = 277978, upload-time = "2026-02-28T02:16:46.75Z" }, + { url = "https://files.pythonhosted.org/packages/a4/36/abec45dc6e7252e3dbc797120496e43bb5730a7abf0d9cb69340696a2f2d/regex-2026.2.28-cp311-cp311-win_arm64.whl", hash = "sha256:00f2b8d9615aa165fdff0a13f1a92049bfad555ee91e20d246a51aa0b556c60a", size = 270340, upload-time = "2026-02-28T02:16:48.626Z" }, + { url = "https://files.pythonhosted.org/packages/07/42/9061b03cf0fc4b5fa2c3984cbbaed54324377e440a5c5a29d29a72518d62/regex-2026.2.28-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fcf26c3c6d0da98fada8ae4ef0aa1c3405a431c0a77eb17306d38a89b02adcd7", size = 489574, upload-time = "2026-02-28T02:16:50.455Z" }, + { url = "https://files.pythonhosted.org/packages/77/83/0c8a5623a233015595e3da499c5a1c13720ac63c107897a6037bb97af248/regex-2026.2.28-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:02473c954af35dd2defeb07e44182f5705b30ea3f351a7cbffa9177beb14da5d", size = 291426, upload-time = "2026-02-28T02:16:52.52Z" }, + { url = "https://files.pythonhosted.org/packages/9e/06/3ef1ac6910dc3295ebd71b1f9bfa737e82cfead211a18b319d45f85ddd09/regex-2026.2.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9b65d33a17101569f86d9c5966a8b1d7fbf8afdda5a8aa219301b0a80f58cf7d", size = 289200, upload-time = "2026-02-28T02:16:54.08Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c9/8cc8d850b35ab5650ff6756a1cb85286e2000b66c97520b29c1587455344/regex-2026.2.28-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e71dcecaa113eebcc96622c17692672c2d104b1d71ddf7adeda90da7ddeb26fc", size = 796765, upload-time = "2026-02-28T02:16:55.905Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/5d/57702597627fc23278ebf36fbb497ac91c0ce7fec89ac6c81e420ca3e38c/regex-2026.2.28-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:481df4623fa4969c8b11f3433ed7d5e3dc9cec0f008356c3212b3933fb77e3d8", size = 863093, upload-time = "2026-02-28T02:16:58.094Z" }, + { url = "https://files.pythonhosted.org/packages/02/6d/f3ecad537ca2811b4d26b54ca848cf70e04fcfc138667c146a9f3157779c/regex-2026.2.28-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:64e7c6ad614573e0640f271e811a408d79a9e1fe62a46adb602f598df42a818d", size = 909455, upload-time = "2026-02-28T02:17:00.918Z" }, + { url = "https://files.pythonhosted.org/packages/9e/40/bb226f203caa22c1043c1ca79b36340156eca0f6a6742b46c3bb222a3a57/regex-2026.2.28-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6b08a06976ff4fb0d83077022fde3eca06c55432bb997d8c0495b9a4e9872f4", size = 802037, upload-time = "2026-02-28T02:17:02.842Z" }, + { url = "https://files.pythonhosted.org/packages/44/7c/c6d91d8911ac6803b45ca968e8e500c46934e58c0903cbc6d760ee817a0a/regex-2026.2.28-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:864cdd1a2ef5716b0ab468af40139e62ede1b3a53386b375ec0786bb6783fc05", size = 775113, upload-time = "2026-02-28T02:17:04.506Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8d/4a9368d168d47abd4158580b8c848709667b1cd293ff0c0c277279543bd0/regex-2026.2.28-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:511f7419f7afab475fd4d639d4aedfc54205bcb0800066753ef68a59f0f330b5", size = 784194, upload-time = "2026-02-28T02:17:06.888Z" }, + { url = "https://files.pythonhosted.org/packages/cc/bf/2c72ab5d8b7be462cb1651b5cc333da1d0068740342f350fcca3bca31947/regex-2026.2.28-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:b42f7466e32bf15a961cf09f35fa6323cc72e64d3d2c990b10de1274a5da0a59", size = 856846, 
upload-time = "2026-02-28T02:17:09.11Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f4/6b65c979bb6d09f51bb2d2a7bc85de73c01ec73335d7ddd202dcb8cd1c8f/regex-2026.2.28-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8710d61737b0c0ce6836b1da7109f20d495e49b3809f30e27e9560be67a257bf", size = 763516, upload-time = "2026-02-28T02:17:11.004Z" }, + { url = "https://files.pythonhosted.org/packages/8e/32/29ea5e27400ee86d2cc2b4e80aa059df04eaf78b4f0c18576ae077aeff68/regex-2026.2.28-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4390c365fd2d45278f45afd4673cb90f7285f5701607e3ad4274df08e36140ae", size = 849278, upload-time = "2026-02-28T02:17:12.693Z" }, + { url = "https://files.pythonhosted.org/packages/1d/91/3233d03b5f865111cd517e1c95ee8b43e8b428d61fa73764a80c9bb6f537/regex-2026.2.28-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cb3b1db8ff6c7b8bf838ab05583ea15230cb2f678e569ab0e3a24d1e8320940b", size = 790068, upload-time = "2026-02-28T02:17:14.9Z" }, + { url = "https://files.pythonhosted.org/packages/76/92/abc706c1fb03b4580a09645b206a3fc032f5a9f457bc1a8038ac555658ab/regex-2026.2.28-cp312-cp312-win32.whl", hash = "sha256:f8ed9a5d4612df9d4de15878f0bc6aa7a268afbe5af21a3fdd97fa19516e978c", size = 266416, upload-time = "2026-02-28T02:17:17.15Z" }, + { url = "https://files.pythonhosted.org/packages/fa/06/2a6f7dff190e5fa9df9fb4acf2fdf17a1aa0f7f54596cba8de608db56b3a/regex-2026.2.28-cp312-cp312-win_amd64.whl", hash = "sha256:01d65fd24206c8e1e97e2e31b286c59009636c022eb5d003f52760b0f42155d4", size = 277297, upload-time = "2026-02-28T02:17:18.723Z" }, + { url = "https://files.pythonhosted.org/packages/b7/f0/58a2484851fadf284458fdbd728f580d55c1abac059ae9f048c63b92f427/regex-2026.2.28-cp312-cp312-win_arm64.whl", hash = "sha256:c0b5ccbb8ffb433939d248707d4a8b31993cb76ab1a0187ca886bf50e96df952", size = 270408, upload-time = "2026-02-28T02:17:20.328Z" }, +] + +[[package]] +name = "requests" +version = "2.32.5" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, +] + +[[package]] +name = "safetensors" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/29/9c/6e74567782559a63bd040a236edca26fd71bc7ba88de2ef35d75df3bca5e/safetensors-0.7.0.tar.gz", hash = "sha256:07663963b67e8bd9f0b8ad15bb9163606cd27cc5a1b96235a50d8369803b96b0", size = 200878, upload-time = "2025-11-19T15:18:43.199Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/47/aef6c06649039accf914afef490268e1067ed82be62bcfa5b7e886ad15e8/safetensors-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c82f4d474cf725255d9e6acf17252991c3c8aac038d6ef363a4bf8be2f6db517", size = 467781, upload-time = "2025-11-19T15:18:35.84Z" }, + { url = "https://files.pythonhosted.org/packages/e8/00/374c0c068e30cd31f1e1b46b4b5738168ec79e7689ca82ee93ddfea05109/safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94fd4858284736bb67a897a41608b5b0c2496c9bdb3bf2af1fa3409127f20d57", size = 447058, upload-time = "2025-11-19T15:18:34.416Z" }, + { url = "https://files.pythonhosted.org/packages/f1/06/578ffed52c2296f93d7fd2d844cabfa92be51a587c38c8afbb8ae449ca89/safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e07d91d0c92a31200f25351f4acb2bc6aff7f48094e13ebb1d0fb995b54b6542", size = 491748, upload-time = "2025-11-19T15:18:09.79Z" }, + { url = "https://files.pythonhosted.org/packages/ae/33/1debbbb70e4791dde185edb9413d1fe01619255abb64b300157d7f15dddd/safetensors-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8469155f4cb518bafb4acf4865e8bb9d6804110d2d9bdcaa78564b9fd841e104", size = 503881, upload-time = "2025-11-19T15:18:16.145Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1c/40c2ca924d60792c3be509833df711b553c60effbd91da6f5284a83f7122/safetensors-0.7.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54bef08bf00a2bff599982f6b08e8770e09cc012d7bba00783fc7ea38f1fb37d", size = 623463, upload-time = "2025-11-19T15:18:21.11Z" }, + { url = "https://files.pythonhosted.org/packages/9b/3a/13784a9364bd43b0d61eef4bea2845039bc2030458b16594a1bd787ae26e/safetensors-0.7.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42cb091236206bb2016d245c377ed383aa7f78691748f3bb6ee1bfa51ae2ce6a", size = 532855, upload-time = "2025-11-19T15:18:25.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/60/429e9b1cb3fc651937727befe258ea24122d9663e4d5709a48c9cbfceecb/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac7252938f0696ddea46f5e855dd3138444e82236e3be475f54929f0c510d48", size = 507152, upload-time = "2025-11-19T15:18:33.023Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a8/4b45e4e059270d17af60359713ffd83f97900d45a6afa73aaa0d737d48b6/safetensors-0.7.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d060c70284127fa805085d8f10fbd0962792aed71879d00864acda69dbab981", size = 541856, upload-time = "2025-11-19T15:18:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/06/87/d26d8407c44175d8ae164a95b5a62707fcc445f3c0c56108e37d98070a3d/safetensors-0.7.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = 
"sha256:cdab83a366799fa730f90a4ebb563e494f28e9e92c4819e556152ad55e43591b", size = 674060, upload-time = "2025-11-19T15:18:37.211Z" }, + { url = "https://files.pythonhosted.org/packages/11/f5/57644a2ff08dc6325816ba7217e5095f17269dada2554b658442c66aed51/safetensors-0.7.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:672132907fcad9f2aedcb705b2d7b3b93354a2aec1b2f706c4db852abe338f85", size = 771715, upload-time = "2025-11-19T15:18:38.689Z" }, + { url = "https://files.pythonhosted.org/packages/86/31/17883e13a814bd278ae6e266b13282a01049b0c81341da7fd0e3e71a80a3/safetensors-0.7.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5d72abdb8a4d56d4020713724ba81dac065fedb7f3667151c4a637f1d3fb26c0", size = 714377, upload-time = "2025-11-19T15:18:40.162Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, +] + +[[package]] +name = "scikit-learn" +version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz", hash = "sha256:b4fc2525eca2c69a59260f583c56a7557c6ccdf8deafdba6e060f94c1c59738e", size = 7068312, upload-time = "2025-01-10T08:07:55.348Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/2a/e291c29670795406a824567d1dfc91db7b699799a002fdaa452bceea8f6e/scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:72abc587c75234935e97d09aa4913a82f7b03ee0b74111dcc2881cba3c5a7b33", size = 12102620, upload-time = "2025-01-10T08:06:16.675Z" }, + { url = "https://files.pythonhosted.org/packages/25/92/ee1d7a00bb6b8c55755d4984fd82608603a3cc59959245068ce32e7fb808/scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b3b00cdc8f1317b5f33191df1386c0befd16625f49d979fe77a8d44cae82410d", size = 11116234, upload-time = "2025-01-10T08:06:21.83Z" }, + { url = "https://files.pythonhosted.org/packages/30/cd/ed4399485ef364bb25f388ab438e3724e60dc218c547a407b6e90ccccaef/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc4765af3386811c3ca21638f63b9cf5ecf66261cc4815c1db3f1e7dc7b79db2", size = 12592155, upload-time = "2025-01-10T08:06:27.309Z" }, + { url = "https://files.pythonhosted.org/packages/a8/f3/62fc9a5a659bb58a03cdd7e258956a5824bdc9b4bb3c5d932f55880be569/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25fc636bdaf1cc2f4a124a116312d837148b5e10872147bdaf4887926b8c03d8", size = 13497069, upload-time = "2025-01-10T08:06:32.515Z" }, + { url = "https://files.pythonhosted.org/packages/a1/a6/c5b78606743a1f28eae8f11973de6613a5ee87366796583fb74c67d54939/scikit_learn-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:fa909b1a36e000a03c382aade0bd2063fd5680ff8b8e501660c0f59f021a6415", size = 11139809, upload-time = "2025-01-10T08:06:35.514Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/18/c797c9b8c10380d05616db3bfb48e2a3358c767affd0857d56c2eb501caa/scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:926f207c804104677af4857b2c609940b743d04c4c35ce0ddc8ff4f053cddc1b", size = 12104516, upload-time = "2025-01-10T08:06:40.009Z" }, + { url = "https://files.pythonhosted.org/packages/c4/b7/2e35f8e289ab70108f8cbb2e7a2208f0575dc704749721286519dcf35f6f/scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c2cae262064e6a9b77eee1c8e768fc46aa0b8338c6a8297b9b6759720ec0ff2", size = 11167837, upload-time = "2025-01-10T08:06:43.305Z" }, + { url = "https://files.pythonhosted.org/packages/a4/f6/ff7beaeb644bcad72bcfd5a03ff36d32ee4e53a8b29a639f11bcb65d06cd/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1061b7c028a8663fb9a1a1baf9317b64a257fcb036dae5c8752b2abef31d136f", size = 12253728, upload-time = "2025-01-10T08:06:47.618Z" }, + { url = "https://files.pythonhosted.org/packages/29/7a/8bce8968883e9465de20be15542f4c7e221952441727c4dad24d534c6d99/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e69fab4ebfc9c9b580a7a80111b43d214ab06250f8a7ef590a4edf72464dd86", size = 13147700, upload-time = "2025-01-10T08:06:50.888Z" }, + { url = "https://files.pythonhosted.org/packages/62/27/585859e72e117fe861c2079bcba35591a84f801e21bc1ab85bce6ce60305/scikit_learn-1.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:70b1d7e85b1c96383f872a519b3375f92f14731e279a7b4c6cfd650cf5dffc52", size = 11110613, upload-time = "2025-01-10T08:06:54.115Z" }, +] + +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 
30573822, upload-time = "2026-02-23T00:26:24.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/75/b4ce781849931fef6fd529afa6b63711d5a733065722d0c3e2724af9e40a/scipy-1.17.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:1f95b894f13729334fb990162e911c9e5dc1ab390c58aa6cbecb389c5b5e28ec", size = 31613675, upload-time = "2026-02-23T00:16:00.13Z" }, + { url = "https://files.pythonhosted.org/packages/f7/58/bccc2861b305abdd1b8663d6130c0b3d7cc22e8d86663edbc8401bfd40d4/scipy-1.17.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:e18f12c6b0bc5a592ed23d3f7b891f68fd7f8241d69b7883769eb5d5dfb52696", size = 28162057, upload-time = "2026-02-23T00:16:09.456Z" }, + { url = "https://files.pythonhosted.org/packages/6d/ee/18146b7757ed4976276b9c9819108adbc73c5aad636e5353e20746b73069/scipy-1.17.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a3472cfbca0a54177d0faa68f697d8ba4c80bbdc19908c3465556d9f7efce9ee", size = 20334032, upload-time = "2026-02-23T00:16:17.358Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e6/cef1cf3557f0c54954198554a10016b6a03b2ec9e22a4e1df734936bd99c/scipy-1.17.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:766e0dc5a616d026a3a1cffa379af959671729083882f50307e18175797b3dfd", size = 22709533, upload-time = "2026-02-23T00:16:25.791Z" }, + { url = "https://files.pythonhosted.org/packages/4d/60/8804678875fc59362b0fb759ab3ecce1f09c10a735680318ac30da8cd76b/scipy-1.17.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:744b2bf3640d907b79f3fd7874efe432d1cf171ee721243e350f55234b4cec4c", size = 33062057, upload-time = "2026-02-23T00:16:36.931Z" }, + { url = "https://files.pythonhosted.org/packages/09/7d/af933f0f6e0767995b4e2d705a0665e454d1c19402aa7e895de3951ebb04/scipy-1.17.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43af8d1f3bea642559019edfe64e9b11192a8978efbd1539d7bc2aaa23d92de4", size = 35349300, upload-time = "2026-02-23T00:16:49.108Z" }, + { url = 
"https://files.pythonhosted.org/packages/b4/3d/7ccbbdcbb54c8fdc20d3b6930137c782a163fa626f0aef920349873421ba/scipy-1.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd96a1898c0a47be4520327e01f874acfd61fb48a9420f8aa9f6483412ffa444", size = 35127333, upload-time = "2026-02-23T00:17:01.293Z" }, + { url = "https://files.pythonhosted.org/packages/e8/19/f926cb11c42b15ba08e3a71e376d816ac08614f769b4f47e06c3580c836a/scipy-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4eb6c25dd62ee8d5edf68a8e1c171dd71c292fdae95d8aeb3dd7d7de4c364082", size = 37741314, upload-time = "2026-02-23T00:17:12.576Z" }, + { url = "https://files.pythonhosted.org/packages/95/da/0d1df507cf574b3f224ccc3d45244c9a1d732c81dcb26b1e8a766ae271a8/scipy-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:d30e57c72013c2a4fe441c2fcb8e77b14e152ad48b5464858e07e2ad9fbfceff", size = 36607512, upload-time = "2026-02-23T00:17:23.424Z" }, + { url = "https://files.pythonhosted.org/packages/68/7f/bdd79ceaad24b671543ffe0ef61ed8e659440eb683b66f033454dcee90eb/scipy-1.17.1-cp311-cp311-win_arm64.whl", hash = "sha256:9ecb4efb1cd6e8c4afea0daa91a87fbddbce1b99d2895d151596716c0b2e859d", size = 24599248, upload-time = "2026-02-23T00:17:34.561Z" }, + { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954, upload-time = "2026-02-23T00:17:49.855Z" }, + { url = "https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662, upload-time = "2026-02-23T00:18:01.64Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366, upload-time = "2026-02-23T00:18:12.015Z" }, + { url = "https://files.pythonhosted.org/packages/35/f5/906eda513271c8deb5af284e5ef0206d17a96239af79f9fa0aebfe0e36b4/scipy-1.17.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c80be5ede8f3f8eded4eff73cc99a25c388ce98e555b17d31da05287015ffa5b", size = 22704017, upload-time = "2026-02-23T00:18:21.502Z" }, + { url = "https://files.pythonhosted.org/packages/da/34/16f10e3042d2f1d6b66e0428308ab52224b6a23049cb2f5c1756f713815f/scipy-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e19ebea31758fac5893a2ac360fedd00116cbb7628e650842a6691ba7ca28a21", size = 32927842, upload-time = "2026-02-23T00:18:35.367Z" }, + { url = "https://files.pythonhosted.org/packages/01/8e/1e35281b8ab6d5d72ebe9911edcdffa3f36b04ed9d51dec6dd140396e220/scipy-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02ae3b274fde71c5e92ac4d54bc06c42d80e399fec704383dcd99b301df37458", size = 35235890, upload-time = "2026-02-23T00:18:49.188Z" }, + { url = "https://files.pythonhosted.org/packages/c5/5c/9d7f4c88bea6e0d5a4f1bc0506a53a00e9fcb198de372bfe4d3652cef482/scipy-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8a604bae87c6195d8b1045eddece0514d041604b14f2727bbc2b3020172045eb", size = 35003557, upload-time = "2026-02-23T00:18:54.74Z" }, + { url = "https://files.pythonhosted.org/packages/65/94/7698add8f276dbab7a9de9fb6b0e02fc13ee61d51c7c3f85ac28b65e1239/scipy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f590cd684941912d10becc07325a3eeb77886fe981415660d9265c4c418d0bea", size = 37625856, upload-time = "2026-02-23T00:19:00.307Z" }, + { url = 
"https://files.pythonhosted.org/packages/a2/84/dc08d77fbf3d87d3ee27f6a0c6dcce1de5829a64f2eae85a0ecc1f0daa73/scipy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:41b71f4a3a4cab9d366cd9065b288efc4d4f3c0b37a91a8e0947fb5bd7f31d87", size = 36549682, upload-time = "2026-02-23T00:19:07.67Z" }, + { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340, upload-time = "2026-02-23T00:19:12.024Z" }, +] + +[[package]] +name = "setuptools" +version = "82.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/82/f3/748f4d6f65d1756b9ae577f329c951cda23fb900e4de9f70900ced962085/setuptools-82.0.0.tar.gz", hash = "sha256:22e0a2d69474c6ae4feb01951cb69d515ed23728cf96d05513d36e42b62b37cb", size = 1144893, upload-time = "2026-02-08T15:08:40.206Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/c6/76dc613121b793286a3f91621d7b75a2b493e0390ddca50f11993eadf192/setuptools-82.0.0-py3-none-any.whl", hash = "sha256:70b18734b607bd1da571d097d236cfcfacaf01de45717d59e6e04b96877532e0", size = 1003468, upload-time = "2026-02-08T15:08:38.723Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] 
+ +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, +] + +[[package]] +name = "tenacity" +version = "9.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, +] + +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + +[[package]] +name = "tokenizers" +version = "0.20.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/25/b1681c1c30ea3ea6e584ae3fffd552430b12faa599b558c4c4783f56d7ff/tokenizers-0.20.3.tar.gz", hash = "sha256:2278b34c5d0dd78e087e1ca7f9b1dcbf129d80211afa645f214bd6e051037539", size = 340513, upload-time = "2024-11-05T17:34:10.403Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/93/6742ef9206409d5ce1fdf44d5ca1687cdc3847ba0485424e2c731e6bcf67/tokenizers-0.20.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:585b51e06ca1f4839ce7759941e66766d7b060dccfdc57c4ca1e5b9a33013a90", size = 2674224, upload-time = "2024-11-05T17:30:49.972Z" }, + { url = "https://files.pythonhosted.org/packages/aa/14/e75ece72e99f6ef9ae07777ca9fdd78608f69466a5cecf636e9bd2f25d5c/tokenizers-0.20.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:61cbf11954f3b481d08723ebd048ba4b11e582986f9be74d2c3bdd9293a4538d", size = 2558991, upload-time = "2024-11-05T17:30:51.666Z" }, + { url = "https://files.pythonhosted.org/packages/46/54/033b5b2ba0c3ae01e026c6f7ced147d41a2fa1c573d00a66cb97f6d7f9b3/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef820880d5e4e8484e2fa54ff8d297bb32519eaa7815694dc835ace9130a3eea", size = 2892476, upload-time = "2024-11-05T17:30:53.505Z" }, + { url = "https://files.pythonhosted.org/packages/e6/b0/cc369fb3297d61f3311cab523d16d48c869dc2f0ba32985dbf03ff811041/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:67ef4dcb8841a4988cd00dd288fb95dfc8e22ed021f01f37348fd51c2b055ba9", size = 2802775, upload-time = "2024-11-05T17:30:55.229Z" }, + { url = "https://files.pythonhosted.org/packages/1a/74/62ad983e8ea6a63e04ed9c5be0b605056bf8aac2f0125f9b5e0b3e2b89fa/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff1ef8bd47a02b0dc191688ccb4da53600df5d4c9a05a4b68e1e3de4823e78eb", size = 3086138, upload-time = "2024-11-05T17:30:57.332Z" }, + { url = "https://files.pythonhosted.org/packages/6b/ac/4637ba619db25094998523f9e6f5b456e1db1f8faa770a3d925d436db0c3/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:444d188186eab3148baf0615b522461b41b1f0cd58cd57b862ec94b6ac9780f1", size = 3098076, upload-time = "2024-11-05T17:30:59.455Z" }, + { url = "https://files.pythonhosted.org/packages/58/ce/9793f2dc2ce529369807c9c74e42722b05034af411d60f5730b720388c7d/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37c04c032c1442740b2c2d925f1857885c07619224a533123ac7ea71ca5713da", size = 3379650, upload-time = "2024-11-05T17:31:01.264Z" }, + { url = "https://files.pythonhosted.org/packages/50/f6/2841de926bc4118af996eaf0bdf0ea5b012245044766ffc0347e6c968e63/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453c7769d22231960ee0e883d1005c93c68015025a5e4ae56275406d94a3c907", size = 2994005, upload-time = "2024-11-05T17:31:02.985Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b2/00915c4fed08e9505d37cf6eaab45b12b4bff8f6719d459abcb9ead86a4b/tokenizers-0.20.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4bb31f7b2847e439766aaa9cc7bccf7ac7088052deccdb2275c952d96f691c6a", size = 8977488, upload-time = "2024-11-05T17:31:04.424Z" }, + { url = "https://files.pythonhosted.org/packages/e9/ac/1c069e7808181ff57bcf2d39e9b6fbee9133a55410e6ebdaa89f67c32e83/tokenizers-0.20.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:843729bf0f991b29655a069a2ff58a4c24375a553c70955e15e37a90dd4e045c", size = 9294935, upload-time = "2024-11-05T17:31:06.882Z" }, + { url = "https://files.pythonhosted.org/packages/50/47/722feb70ee68d1c4412b12d0ea4acc2713179fd63f054913990f9e259492/tokenizers-0.20.3-cp311-none-win32.whl", hash = "sha256:efcce3a927b1e20ca694ba13f7a68c59b0bd859ef71e441db68ee42cf20c2442", size = 2197175, upload-time = "2024-11-05T17:31:09.385Z" }, + { url = "https://files.pythonhosted.org/packages/75/68/1b4f928b15a36ed278332ac75d66d7eb65d865bf344d049c452c18447bf9/tokenizers-0.20.3-cp311-none-win_amd64.whl", hash = "sha256:88301aa0801f225725b6df5dea3d77c80365ff2362ca7e252583f2b4809c4cc0", size = 2381616, upload-time = "2024-11-05T17:31:10.685Z" }, + { url = "https://files.pythonhosted.org/packages/07/00/92a08af2a6b0c88c50f1ab47d7189e695722ad9714b0ee78ea5e1e2e1def/tokenizers-0.20.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:49d12a32e190fad0e79e5bdb788d05da2f20d8e006b13a70859ac47fecf6ab2f", size = 2667951, upload-time = "2024-11-05T17:31:12.356Z" }, + { url = "https://files.pythonhosted.org/packages/ec/9a/e17a352f0bffbf415cf7d73756f5c73a3219225fc5957bc2f39d52c61684/tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:282848cacfb9c06d5e51489f38ec5aa0b3cd1e247a023061945f71f41d949d73", size = 2555167, upload-time = "2024-11-05T17:31:13.839Z" }, + { url = "https://files.pythonhosted.org/packages/27/37/d108df55daf4f0fcf1f58554692ff71687c273d870a34693066f0847be96/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abe4e08c7d0cd6154c795deb5bf81d2122f36daf075e0c12a8b050d824ef0a64", size = 2898389, upload-time = "2024-11-05T17:31:15.12Z" }, + { url = "https://files.pythonhosted.org/packages/b2/27/32f29da16d28f59472fa7fb38e7782069748c7e9ab9854522db20341624c/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca94fc1b73b3883c98f0c88c77700b13d55b49f1071dfd57df2b06f3ff7afd64", size = 
2795866, upload-time = "2024-11-05T17:31:16.857Z" }, + { url = "https://files.pythonhosted.org/packages/29/4e/8a9a3c89e128c4a40f247b501c10279d2d7ade685953407c4d94c8c0f7a7/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef279c7e239f95c8bdd6ff319d9870f30f0d24915b04895f55b1adcf96d6c60d", size = 3085446, upload-time = "2024-11-05T17:31:18.392Z" }, + { url = "https://files.pythonhosted.org/packages/b4/3b/a2a7962c496ebcd95860ca99e423254f760f382cd4bd376f8895783afaf5/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16384073973f6ccbde9852157a4fdfe632bb65208139c9d0c0bd0176a71fd67f", size = 3094378, upload-time = "2024-11-05T17:31:20.329Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f4/a8a33f0192a1629a3bd0afcad17d4d221bbf9276da4b95d226364208d5eb/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:312d522caeb8a1a42ebdec87118d99b22667782b67898a76c963c058a7e41d4f", size = 3385755, upload-time = "2024-11-05T17:31:21.778Z" }, + { url = "https://files.pythonhosted.org/packages/9e/65/c83cb3545a65a9eaa2e13b22c93d5e00bd7624b354a44adbdc93d5d9bd91/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2b7cb962564785a83dafbba0144ecb7f579f1d57d8c406cdaa7f32fe32f18ad", size = 2997679, upload-time = "2024-11-05T17:31:23.134Z" }, + { url = "https://files.pythonhosted.org/packages/55/e9/a80d4e592307688a67c7c59ab77e03687b6a8bd92eb5db763a2c80f93f57/tokenizers-0.20.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:124c5882ebb88dadae1fc788a582299fcd3a8bd84fc3e260b9918cf28b8751f5", size = 8989296, upload-time = "2024-11-05T17:31:24.953Z" }, + { url = "https://files.pythonhosted.org/packages/90/af/60c957af8d2244321124e893828f1a4817cde1a2d08d09d423b73f19bd2f/tokenizers-0.20.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2b6e54e71f84c4202111a489879005cb14b92616a87417f6c102c833af961ea2", size = 9303621, 
upload-time = "2024-11-05T17:31:27.341Z" }, + { url = "https://files.pythonhosted.org/packages/be/a9/96172310ee141009646d63a1ca267c099c462d747fe5ef7e33f74e27a683/tokenizers-0.20.3-cp312-none-win32.whl", hash = "sha256:83d9bfbe9af86f2d9df4833c22e94d94750f1d0cd9bfb22a7bb90a86f61cdb1c", size = 2188979, upload-time = "2024-11-05T17:31:29.483Z" }, + { url = "https://files.pythonhosted.org/packages/bd/68/61d85ae7ae96dde7d0974ff3538db75d5cdc29be2e4329cd7fc51a283e22/tokenizers-0.20.3-cp312-none-win_amd64.whl", hash = "sha256:44def74cee574d609a36e17c8914311d1b5dbcfe37c55fd29369d42591b91cf2", size = 2380725, upload-time = "2024-11-05T17:31:31.315Z" }, +] + +[[package]] +name = "toolz" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/d6/114b492226588d6ff54579d95847662fc69196bdeec318eb45393b24c192/toolz-1.1.0.tar.gz", hash = "sha256:27a5c770d068c110d9ed9323f24f1543e83b2f300a687b7891c1a6d56b697b5b", size = 52613, upload-time = "2025-10-17T04:03:21.661Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/12/5911ae3eeec47800503a238d971e51722ccea5feb8569b735184d5fcdbc0/toolz-1.1.0-py3-none-any.whl", hash = "sha256:15ccc861ac51c53696de0a5d6d4607f99c210739caf987b5d2054f3efed429d8", size = 58093, upload-time = "2025-10-17T04:03:20.435Z" }, +] + +[[package]] +name = "torch" +version = "2.9.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx" }, + { name = "nvidia-cublas", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufft", marker = "sys_platform == 'linux'" }, + { 
name = "nvidia-cufile", marker = "sys_platform == 'linux'" }, + { name = "nvidia-curand", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusolver", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvtx", marker = "sys_platform == 'linux'" }, + { name = "setuptools", marker = "python_full_version >= '3.12'" }, + { name = "sympy" }, + { name = "triton", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fd6c7d297e21758a7fa07624f2b5bb15607ee3b1dcc52519e8e796c6d4fcf960" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f40778951ca1533dc634b3842392641fa0b641181ff2f71d62728ef33cc36a5c" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:8db2814e63f2b365bda88526587ca75a6083a0b957a24b2b0d45ddc5ee350176" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e7f84cb10c7e7d9f862c318f056d64840544ab4f0bcbf8cf7ed6047fe04051f" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e70e1b18881e6b3c1ce402d0a989da39f956a3a057526e03c354df23d704ce9b" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:cd3232a562ad2a2699d48130255e1b24c07dfe694a40dcd24fad683c752de121" }, +] + +[[package]] +name = "torchaudio" 
+version = "2.9.1" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:493421d061375074ce84840ca619605f625892e16dead63ec97181ef02da3357" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b3c75f87e325946276c952864dbce2c8fabc88a00d86730c3d5bc0999ebf7789" }, +] + +[[package]] +name = "torchaudio" +version = "2.9.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "torch", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1023bb6598fa6312e1990fdc78660f4b4ef128d8942a1f10c5827aea23d6bd7e" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:817e2660d35a3c9a2638dd80d63c7a488cbbe87446ddbb564a5cf88b9de632f7" }, 
+ { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a6c58d5e846da5a90d50bd425e2c24368747cd04297d95c6dd51d3f7f85fea26" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:7533a17bed21e5b86b8c49fd79656779779f2c991aef2804af6f318d2022ea6a" }, +] + +[[package]] +name = "torchvision" +version = "0.24.1" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d4ba2532440a93c23a99c41423a765a0cdd47556afa3acf7c318dd1d3d6793e9" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:89743dcee13e943f58b37c7647aff14b5bb24c11c84826376d457acf97586fec" }, +] + +[[package]] +name = "torchvision" +version = "0.24.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform 
== 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "numpy", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "pillow", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "torch", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:b0cc84c57c1fd54644698a70a74d1ea1eddfa44ee2df3354b7bb2c619a5d2923" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:f564b9fdbc336ac187780931331fb4253f8511deae914dde12dca5bf17b3045f" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6939dd403cc28ab0a46f53e6c86e2e852cf65771c1b0ddd09c44c541a1cdbad9" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:d31ceaded0d9b737471fa680ccd9e1acb6d5f0f70f03ef3a8d786a99c79da7cf" }, +] + +[[package]] +name = "tqdm" +version = "4.67.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + +[[package]] 
+name = "transformers" +version = "4.46.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/37/5a/58f96c83e566f907ae39f16d4401bbefd8bb85c60bd1e6a95c419752ab90/transformers-4.46.3.tar.gz", hash = "sha256:8ee4b3ae943fe33e82afff8e837f4b052058b07ca9be3cb5b729ed31295f72cc", size = 8627944, upload-time = "2024-11-18T22:13:01.012Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl", hash = "sha256:a12ef6f52841fd190a3e5602145b542d03507222f2c64ebb7ee92e8788093aef", size = 10034536, upload-time = "2024-11-18T22:12:57.024Z" }, +] + +[[package]] +name = "triton" +version = "3.5.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/dc/6ce44d055f2fc2403c4ec6b3cfd3a9b25f57b7d95efadccdea91497f8e81/triton-3.5.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da47169e30a779bade679ce78df4810fca6d78a955843d2ddb11f226adc517dc", size = 159928005, upload-time = "2025-11-11T17:51:50.008Z" }, + { url = "https://files.pythonhosted.org/packages/b0/72/ec90c3519eaf168f22cb1757ad412f3a2add4782ad3a92861c9ad135d886/triton-3.5.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61413522a48add32302353fdbaaf92daaaab06f6b5e3229940d21b5207f47579", size = 170425802, upload-time = "2025-11-11T17:40:53.209Z" }, + { url = "https://files.pythonhosted.org/packages/db/53/2bcc46879910991f09c063eea07627baef2bc62fe725302ba8f46a2c1ae5/triton-3.5.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:275a045b6ed670dd1bd005c3e6c2d61846c74c66f4512d6f33cc027b11de8fd4", size = 159940689, upload-time = "2025-11-11T17:51:55.938Z" }, + { url = "https://files.pythonhosted.org/packages/f2/50/9a8358d3ef58162c0a415d173cfb45b67de60176e1024f71fbc4d24c0b6d/triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d2c6b915a03888ab931a9fd3e55ba36785e1fe70cbea0b40c6ef93b20fc85232", size = 170470207, upload-time = "2025-11-11T17:41:00.253Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, +] + +[[package]] +name = "urllib3" +version = "2.6.3" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = "2026-02-06T19:19:40.919Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, +] + +[[package]] +name = "wrapt" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678, upload-time = "2026-03-06T02:53:25.134Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/81/60c4471fce95afa5922ca09b88a25f03c93343f759aae0f31fb4412a85c7/wrapt-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:96159a0ee2b0277d44201c3b5be479a9979cf154e8c82fa5df49586a8e7679bb", size = 60666, upload-time = 
"2026-03-06T02:52:58.934Z" }, + { url = "https://files.pythonhosted.org/packages/6b/be/80e80e39e7cb90b006a0eaf11c73ac3a62bbfb3068469aec15cc0bc795de/wrapt-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98ba61833a77b747901e9012072f038795de7fc77849f1faa965464f3f87ff2d", size = 61601, upload-time = "2026-03-06T02:53:00.487Z" }, + { url = "https://files.pythonhosted.org/packages/b0/be/d7c88cd9293c859fc74b232abdc65a229bb953997995d6912fc85af18323/wrapt-2.1.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:767c0dbbe76cae2a60dd2b235ac0c87c9cccf4898aef8062e57bead46b5f6894", size = 114057, upload-time = "2026-03-06T02:52:44.08Z" }, + { url = "https://files.pythonhosted.org/packages/ea/25/36c04602831a4d685d45a93b3abea61eca7fe35dab6c842d6f5d570ef94a/wrapt-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c691a6bc752c0cc4711cc0c00896fcd0f116abc253609ef64ef930032821842", size = 116099, upload-time = "2026-03-06T02:54:56.74Z" }, + { url = "https://files.pythonhosted.org/packages/5c/4e/98a6eb417ef551dc277bec1253d5246b25003cf36fdf3913b65cb7657a56/wrapt-2.1.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f3b7d73012ea75aee5844de58c88f44cf62d0d62711e39da5a82824a7c4626a8", size = 112457, upload-time = "2026-03-06T02:53:52.842Z" }, + { url = "https://files.pythonhosted.org/packages/cb/a6/a6f7186a5297cad8ec53fd7578533b28f795fdf5372368c74bd7e6e9841c/wrapt-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:577dff354e7acd9d411eaf4bfe76b724c89c89c8fc9b7e127ee28c5f7bcb25b6", size = 115351, upload-time = "2026-03-06T02:53:32.684Z" }, + { url = "https://files.pythonhosted.org/packages/97/6f/06e66189e721dbebd5cf20e138acc4d1150288ce118462f2fcbff92d38db/wrapt-2.1.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d7b6fd105f8b24e5bd23ccf41cb1d1099796524bcc6f7fbb8fe576c44befbc9", size = 111748, upload-time = "2026-03-06T02:53:08.455Z" }, + 
{ url = "https://files.pythonhosted.org/packages/ef/43/4808b86f499a51370fbdbdfa6cb91e9b9169e762716456471b619fca7a70/wrapt-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:866abdbf4612e0b34764922ef8b1c5668867610a718d3053d59e24a5e5fcfc15", size = 113783, upload-time = "2026-03-06T02:53:02.02Z" }, + { url = "https://files.pythonhosted.org/packages/91/2c/a3f28b8fa7ac2cefa01cfcaca3471f9b0460608d012b693998cd61ef43df/wrapt-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5a0a0a3a882393095573344075189eb2d566e0fd205a2b6414e9997b1b800a8b", size = 57977, upload-time = "2026-03-06T02:53:27.844Z" }, + { url = "https://files.pythonhosted.org/packages/3f/c3/2b1c7bd07a27b1db885a2fab469b707bdd35bddf30a113b4917a7e2139d2/wrapt-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:64a07a71d2730ba56f11d1a4b91f7817dc79bc134c11516b75d1921a7c6fcda1", size = 60336, upload-time = "2026-03-06T02:54:28.104Z" }, + { url = "https://files.pythonhosted.org/packages/ec/5c/76ece7b401b088daa6503d6264dd80f9a727df3e6042802de9a223084ea2/wrapt-2.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:b89f095fe98bc12107f82a9f7d570dc83a0870291aeb6b1d7a7d35575f55d98a", size = 58756, upload-time = "2026-03-06T02:53:16.319Z" }, + { url = "https://files.pythonhosted.org/packages/4c/b6/1db817582c49c7fcbb7df6809d0f515af29d7c2fbf57eb44c36e98fb1492/wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9", size = 61255, upload-time = "2026-03-06T02:52:45.663Z" }, + { url = "https://files.pythonhosted.org/packages/a2/16/9b02a6b99c09227c93cd4b73acc3678114154ec38da53043c0ddc1fba0dc/wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748", size = 61848, upload-time = "2026-03-06T02:53:48.728Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/aa/ead46a88f9ec3a432a4832dfedb84092fc35af2d0ba40cd04aea3889f247/wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e", size = 121433, upload-time = "2026-03-06T02:54:40.328Z" }, + { url = "https://files.pythonhosted.org/packages/3a/9f/742c7c7cdf58b59085a1ee4b6c37b013f66ac33673a7ef4aaed5e992bc33/wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8", size = 123013, upload-time = "2026-03-06T02:53:26.58Z" }, + { url = "https://files.pythonhosted.org/packages/e8/44/2c3dd45d53236b7ed7c646fcf212251dc19e48e599debd3926b52310fafb/wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c", size = 117326, upload-time = "2026-03-06T02:53:11.547Z" }, + { url = "https://files.pythonhosted.org/packages/74/e2/b17d66abc26bd96f89dec0ecd0ef03da4a1286e6ff793839ec431b9fae57/wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c", size = 121444, upload-time = "2026-03-06T02:54:09.5Z" }, + { url = "https://files.pythonhosted.org/packages/3c/62/e2977843fdf9f03daf1586a0ff49060b1b2fc7ff85a7ea82b6217c1ae36e/wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1", size = 116237, upload-time = "2026-03-06T02:54:03.884Z" }, + { url = "https://files.pythonhosted.org/packages/88/dd/27fc67914e68d740bce512f11734aec08696e6b17641fef8867c00c949fc/wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2", size = 120563, upload-time = "2026-03-06T02:53:20.412Z" }, + { url = 
"https://files.pythonhosted.org/packages/ec/9f/b750b3692ed2ef4705cb305bd68858e73010492b80e43d2a4faa5573cbe7/wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0", size = 58198, upload-time = "2026-03-06T02:53:37.732Z" }, + { url = "https://files.pythonhosted.org/packages/8e/b2/feecfe29f28483d888d76a48f03c4c4d8afea944dbee2b0cd3380f9df032/wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63", size = 60441, upload-time = "2026-03-06T02:52:47.138Z" }, + { url = "https://files.pythonhosted.org/packages/44/e1/e328f605d6e208547ea9fd120804fcdec68536ac748987a68c47c606eea8/wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf", size = 58836, upload-time = "2026-03-06T02:53:22.053Z" }, + { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993, upload-time = "2026-03-06T02:53:12.905Z" }, +] + +[[package]] +name = "yarl" +version = "1.23.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676, upload-time = "2026-03-01T22:07:53.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/aa/60da938b8f0997ba3a911263c40d82b6f645a67902a490b46f3355e10fae/yarl-1.23.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b35d13d549077713e4414f927cdc388d62e543987c572baee613bf82f11a4b99", size = 123641, upload-time = "2026-03-01T22:04:42.841Z" }, + { url = 
"https://files.pythonhosted.org/packages/24/84/e237607faf4e099dbb8a4f511cfd5efcb5f75918baad200ff7380635631b/yarl-1.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cbb0fef01f0c6b38cb0f39b1f78fc90b807e0e3c86a7ff3ce74ad77ce5c7880c", size = 86248, upload-time = "2026-03-01T22:04:44.757Z" }, + { url = "https://files.pythonhosted.org/packages/b2/0d/71ceabc14c146ba8ee3804ca7b3d42b1664c8440439de5214d366fec7d3a/yarl-1.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc52310451fc7c629e13c4e061cbe2dd01684d91f2f8ee2821b083c58bd72432", size = 85988, upload-time = "2026-03-01T22:04:46.365Z" }, + { url = "https://files.pythonhosted.org/packages/8c/6c/4a90d59c572e46b270ca132aca66954f1175abd691f74c1ef4c6711828e2/yarl-1.23.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b2c6b50c7b0464165472b56b42d4c76a7b864597007d9c085e8b63e185cf4a7a", size = 100566, upload-time = "2026-03-01T22:04:47.639Z" }, + { url = "https://files.pythonhosted.org/packages/49/fb/c438fb5108047e629f6282a371e6e91cf3f97ee087c4fb748a1f32ceef55/yarl-1.23.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:aafe5dcfda86c8af00386d7781d4c2181b5011b7be3f2add5e99899ea925df05", size = 92079, upload-time = "2026-03-01T22:04:48.925Z" }, + { url = "https://files.pythonhosted.org/packages/d9/13/d269aa1aed3e4f50a5a103f96327210cc5fa5dd2d50882778f13c7a14606/yarl-1.23.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9ee33b875f0b390564c1fb7bc528abf18c8ee6073b201c6ae8524aca778e2d83", size = 108741, upload-time = "2026-03-01T22:04:50.838Z" }, + { url = "https://files.pythonhosted.org/packages/85/fb/115b16f22c37ea4437d323e472945bea97301c8ec6089868fa560abab590/yarl-1.23.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4c41e021bc6d7affb3364dc1e1e5fa9582b470f283748784bd6ea0558f87f42c", size = 108099, upload-time = 
"2026-03-01T22:04:52.499Z" }, + { url = "https://files.pythonhosted.org/packages/9a/64/c53487d9f4968045b8afa51aed7ca44f58b2589e772f32745f3744476c82/yarl-1.23.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:99c8a9ed30f4164bc4c14b37a90208836cbf50d4ce2a57c71d0f52c7fb4f7598", size = 102678, upload-time = "2026-03-01T22:04:55.176Z" }, + { url = "https://files.pythonhosted.org/packages/85/59/cd98e556fbb2bf8fab29c1a722f67ad45c5f3447cac798ab85620d1e70af/yarl-1.23.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2af5c81a1f124609d5f33507082fc3f739959d4719b56877ab1ee7e7b3d602b", size = 100803, upload-time = "2026-03-01T22:04:56.588Z" }, + { url = "https://files.pythonhosted.org/packages/9e/c0/b39770b56d4a9f0bb5f77e2f1763cd2d75cc2f6c0131e3b4c360348fcd65/yarl-1.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6b41389c19b07c760c7e427a3462e8ab83c4bb087d127f0e854c706ce1b9215c", size = 100163, upload-time = "2026-03-01T22:04:58.492Z" }, + { url = "https://files.pythonhosted.org/packages/e7/64/6980f99ab00e1f0ff67cb84766c93d595b067eed07439cfccfc8fb28c1a6/yarl-1.23.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:1dc702e42d0684f42d6519c8d581e49c96cefaaab16691f03566d30658ee8788", size = 93859, upload-time = "2026-03-01T22:05:00.268Z" }, + { url = "https://files.pythonhosted.org/packages/38/69/912e6c5e146793e5d4b5fe39ff5b00f4d22463dfd5a162bec565ac757673/yarl-1.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0e40111274f340d32ebcc0a5668d54d2b552a6cca84c9475859d364b380e3222", size = 108202, upload-time = "2026-03-01T22:05:02.273Z" }, + { url = "https://files.pythonhosted.org/packages/59/97/35ca6767524687ad64e5f5c31ad54bc76d585585a9fcb40f649e7e82ffed/yarl-1.23.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:4764a6a7588561a9aef92f65bda2c4fb58fe7c675c0883862e6df97559de0bfb", size = 99866, upload-time = "2026-03-01T22:05:03.597Z" }, + { url = 
"https://files.pythonhosted.org/packages/d3/1c/1a3387ee6d73589f6f2a220ae06f2984f6c20b40c734989b0a44f5987308/yarl-1.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:03214408cfa590df47728b84c679ae4ef00be2428e11630277be0727eba2d7cc", size = 107852, upload-time = "2026-03-01T22:05:04.986Z" }, + { url = "https://files.pythonhosted.org/packages/a4/b8/35c0750fcd5a3f781058bfd954515dd4b1eab45e218cbb85cf11132215f1/yarl-1.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:170e26584b060879e29fac213e4228ef063f39128723807a312e5c7fec28eff2", size = 102919, upload-time = "2026-03-01T22:05:06.397Z" }, + { url = "https://files.pythonhosted.org/packages/e5/1c/9a1979aec4a81896d597bcb2177827f2dbee3f5b7cc48b2d0dadb644b41d/yarl-1.23.0-cp311-cp311-win32.whl", hash = "sha256:51430653db848d258336cfa0244427b17d12db63d42603a55f0d4546f50f25b5", size = 82602, upload-time = "2026-03-01T22:05:08.444Z" }, + { url = "https://files.pythonhosted.org/packages/93/22/b85eca6fa2ad9491af48c973e4c8cf6b103a73dbb271fe3346949449fca0/yarl-1.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:bf49a3ae946a87083ef3a34c8f677ae4243f5b824bfc4c69672e72b3d6719d46", size = 87461, upload-time = "2026-03-01T22:05:10.145Z" }, + { url = "https://files.pythonhosted.org/packages/93/95/07e3553fe6f113e6864a20bdc53a78113cda3b9ced8784ee52a52c9f80d8/yarl-1.23.0-cp311-cp311-win_arm64.whl", hash = "sha256:b39cb32a6582750b6cc77bfb3c49c0f8760dc18dc96ec9fb55fbb0f04e08b928", size = 82336, upload-time = "2026-03-01T22:05:11.554Z" }, + { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737, upload-time = "2026-03-01T22:05:12.897Z" }, + { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029, upload-time = "2026-03-01T22:05:14.376Z" }, + { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310, upload-time = "2026-03-01T22:05:15.71Z" }, + { url = "https://files.pythonhosted.org/packages/99/30/58260ed98e6ff7f90ba84442c1ddd758c9170d70327394a6227b310cd60f/yarl-1.23.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cbf44c5cb4a7633d078788e1b56387e3d3cf2b8139a3be38040b22d6c3221c8", size = 97587, upload-time = "2026-03-01T22:05:17.384Z" }, + { url = "https://files.pythonhosted.org/packages/76/0a/8b08aac08b50682e65759f7f8dde98ae8168f72487e7357a5d684c581ef9/yarl-1.23.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53ad387048f6f09a8969631e4de3f1bf70c50e93545d64af4f751b2498755072", size = 92528, upload-time = "2026-03-01T22:05:18.804Z" }, + { url = "https://files.pythonhosted.org/packages/52/07/0b7179101fe5f8385ec6c6bb5d0cb9f76bd9fb4a769591ab6fb5cdbfc69a/yarl-1.23.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4a59ba56f340334766f3a4442e0efd0af895fae9e2b204741ef885c446b3a1a8", size = 105339, upload-time = "2026-03-01T22:05:20.235Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8a/36d82869ab5ec829ca8574dfcb92b51286fcfb1e9c7a73659616362dc880/yarl-1.23.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:803a3c3ce4acc62eaf01eaca1208dcf0783025ef27572c3336502b9c232005e7", size = 105061, upload-time = "2026-03-01T22:05:22.268Z" }, + { url = 
"https://files.pythonhosted.org/packages/66/3e/868e5c3364b6cee19ff3e1a122194fa4ce51def02c61023970442162859e/yarl-1.23.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3d2bff8f37f8d0f96c7ec554d16945050d54462d6e95414babaa18bfafc7f51", size = 100132, upload-time = "2026-03-01T22:05:23.638Z" }, + { url = "https://files.pythonhosted.org/packages/cf/26/9c89acf82f08a52cb52d6d39454f8d18af15f9d386a23795389d1d423823/yarl-1.23.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c75eb09e8d55bceb4367e83496ff8ef2bc7ea6960efb38e978e8073ea59ecb67", size = 99289, upload-time = "2026-03-01T22:05:25.749Z" }, + { url = "https://files.pythonhosted.org/packages/6f/54/5b0db00d2cb056922356104468019c0a132e89c8d3ab67d8ede9f4483d2a/yarl-1.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877b0738624280e34c55680d6054a307aa94f7d52fa0e3034a9cc6e790871da7", size = 96950, upload-time = "2026-03-01T22:05:27.318Z" }, + { url = "https://files.pythonhosted.org/packages/f6/40/10fa93811fd439341fad7e0718a86aca0de9548023bbb403668d6555acab/yarl-1.23.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b5405bb8f0e783a988172993cfc627e4d9d00432d6bbac65a923041edacf997d", size = 93960, upload-time = "2026-03-01T22:05:28.738Z" }, + { url = "https://files.pythonhosted.org/packages/bc/d2/8ae2e6cd77d0805f4526e30ec43b6f9a3dfc542d401ac4990d178e4bf0cf/yarl-1.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c3a3598a832590c5a3ce56ab5576361b5688c12cb1d39429cf5dba30b510760", size = 104703, upload-time = "2026-03-01T22:05:30.438Z" }, + { url = "https://files.pythonhosted.org/packages/2f/0c/b3ceacf82c3fe21183ce35fa2acf5320af003d52bc1fcf5915077681142e/yarl-1.23.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8419ebd326430d1cbb7efb5292330a2cf39114e82df5cc3d83c9a0d5ebeaf2f2", size = 98325, upload-time = "2026-03-01T22:05:31.835Z" }, + { url = 
"https://files.pythonhosted.org/packages/9d/e0/12900edd28bdab91a69bd2554b85ad7b151f64e8b521fe16f9ad2f56477a/yarl-1.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:be61f6fff406ca40e3b1d84716fde398fc08bc63dd96d15f3a14230a0973ed86", size = 105067, upload-time = "2026-03-01T22:05:33.358Z" }, + { url = "https://files.pythonhosted.org/packages/15/61/74bb1182cf79c9bbe4eb6b1f14a57a22d7a0be5e9cedf8e2d5c2086474c3/yarl-1.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ceb13c5c858d01321b5d9bb65e4cf37a92169ea470b70fec6f236b2c9dd7e34", size = 100285, upload-time = "2026-03-01T22:05:35.4Z" }, + { url = "https://files.pythonhosted.org/packages/69/7f/cd5ef733f2550de6241bd8bd8c3febc78158b9d75f197d9c7baa113436af/yarl-1.23.0-cp312-cp312-win32.whl", hash = "sha256:fffc45637bcd6538de8b85f51e3df3223e4ad89bccbfca0481c08c7fc8b7ed7d", size = 82359, upload-time = "2026-03-01T22:05:36.811Z" }, + { url = "https://files.pythonhosted.org/packages/f5/be/25216a49daeeb7af2bec0db22d5e7df08ed1d7c9f65d78b14f3b74fd72fc/yarl-1.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:f69f57305656a4852f2a7203efc661d8c042e6cc67f7acd97d8667fb448a426e", size = 87674, upload-time = "2026-03-01T22:05:38.171Z" }, + { url = "https://files.pythonhosted.org/packages/d2/35/aeab955d6c425b227d5b7247eafb24f2653fedc32f95373a001af5dfeb9e/yarl-1.23.0-cp312-cp312-win_arm64.whl", hash = "sha256:6e87a6e8735b44816e7db0b2fbc9686932df473c826b0d9743148432e10bb9b9", size = 81879, upload-time = "2026-03-01T22:05:40.006Z" }, + { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" }, +] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] + +[[package]] +name = "zstandard" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/83/c3ca27c363d104980f1c9cee1101cc8ba724ac8c28a033ede6aab89585b1/zstandard-0.25.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:933b65d7680ea337180733cf9e87293cc5500cc0eb3fc8769f4d3c88d724ec5c", size = 795254, upload-time = "2025-09-14T22:16:26.137Z" }, + { url = "https://files.pythonhosted.org/packages/ac/4d/e66465c5411a7cf4866aeadc7d108081d8ceba9bc7abe6b14aa21c671ec3/zstandard-0.25.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3f79487c687b1fc69f19e487cd949bf3aae653d181dfb5fde3bf6d18894706f", size = 640559, upload-time = "2025-09-14T22:16:27.973Z" }, + { url = "https://files.pythonhosted.org/packages/12/56/354fe655905f290d3b147b33fe946b0f27e791e4b50a5f004c802cb3eb7b/zstandard-0.25.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:0bbc9a0c65ce0eea3c34a691e3c4b6889f5f3909ba4822ab385fab9057099431", size = 5348020, upload-time = "2025-09-14T22:16:29.523Z" }, + { url = 
"https://files.pythonhosted.org/packages/3b/13/2b7ed68bd85e69a2069bcc72141d378f22cae5a0f3b353a2c8f50ef30c1b/zstandard-0.25.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01582723b3ccd6939ab7b3a78622c573799d5d8737b534b86d0e06ac18dbde4a", size = 5058126, upload-time = "2025-09-14T22:16:31.811Z" }, + { url = "https://files.pythonhosted.org/packages/c9/dd/fdaf0674f4b10d92cb120ccff58bbb6626bf8368f00ebfd2a41ba4a0dc99/zstandard-0.25.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5f1ad7bf88535edcf30038f6919abe087f606f62c00a87d7e33e7fc57cb69fcc", size = 5405390, upload-time = "2025-09-14T22:16:33.486Z" }, + { url = "https://files.pythonhosted.org/packages/0f/67/354d1555575bc2490435f90d67ca4dd65238ff2f119f30f72d5cde09c2ad/zstandard-0.25.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:06acb75eebeedb77b69048031282737717a63e71e4ae3f77cc0c3b9508320df6", size = 5452914, upload-time = "2025-09-14T22:16:35.277Z" }, + { url = "https://files.pythonhosted.org/packages/bb/1f/e9cfd801a3f9190bf3e759c422bbfd2247db9d7f3d54a56ecde70137791a/zstandard-0.25.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9300d02ea7c6506f00e627e287e0492a5eb0371ec1670ae852fefffa6164b072", size = 5559635, upload-time = "2025-09-14T22:16:37.141Z" }, + { url = "https://files.pythonhosted.org/packages/21/88/5ba550f797ca953a52d708c8e4f380959e7e3280af029e38fbf47b55916e/zstandard-0.25.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bfd06b1c5584b657a2892a6014c2f4c20e0db0208c159148fa78c65f7e0b0277", size = 5048277, upload-time = "2025-09-14T22:16:38.807Z" }, + { url = "https://files.pythonhosted.org/packages/46/c0/ca3e533b4fa03112facbe7fbe7779cb1ebec215688e5df576fe5429172e0/zstandard-0.25.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f373da2c1757bb7f1acaf09369cdc1d51d84131e50d5fa9863982fd626466313", size = 5574377, upload-time = "2025-09-14T22:16:40.523Z" }, + { url = 
"https://files.pythonhosted.org/packages/12/9b/3fb626390113f272abd0799fd677ea33d5fc3ec185e62e6be534493c4b60/zstandard-0.25.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6c0e5a65158a7946e7a7affa6418878ef97ab66636f13353b8502d7ea03c8097", size = 4961493, upload-time = "2025-09-14T22:16:43.3Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d3/23094a6b6a4b1343b27ae68249daa17ae0651fcfec9ed4de09d14b940285/zstandard-0.25.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c8e167d5adf59476fa3e37bee730890e389410c354771a62e3c076c86f9f7778", size = 5269018, upload-time = "2025-09-14T22:16:45.292Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a7/bb5a0c1c0f3f4b5e9d5b55198e39de91e04ba7c205cc46fcb0f95f0383c1/zstandard-0.25.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:98750a309eb2f020da61e727de7d7ba3c57c97cf6213f6f6277bb7fb42a8e065", size = 5443672, upload-time = "2025-09-14T22:16:47.076Z" }, + { url = "https://files.pythonhosted.org/packages/27/22/503347aa08d073993f25109c36c8d9f029c7d5949198050962cb568dfa5e/zstandard-0.25.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:22a086cff1b6ceca18a8dd6096ec631e430e93a8e70a9ca5efa7561a00f826fa", size = 5822753, upload-time = "2025-09-14T22:16:49.316Z" }, + { url = "https://files.pythonhosted.org/packages/e2/be/94267dc6ee64f0f8ba2b2ae7c7a2df934a816baaa7291db9e1aa77394c3c/zstandard-0.25.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:72d35d7aa0bba323965da807a462b0966c91608ef3a48ba761678cb20ce5d8b7", size = 5366047, upload-time = "2025-09-14T22:16:51.328Z" }, + { url = "https://files.pythonhosted.org/packages/7b/a3/732893eab0a3a7aecff8b99052fecf9f605cf0fb5fb6d0290e36beee47a4/zstandard-0.25.0-cp311-cp311-win32.whl", hash = "sha256:f5aeea11ded7320a84dcdd62a3d95b5186834224a9e55b92ccae35d21a8b63d4", size = 436484, upload-time = "2025-09-14T22:16:55.005Z" }, + { url = 
"https://files.pythonhosted.org/packages/43/a3/c6155f5c1cce691cb80dfd38627046e50af3ee9ddc5d0b45b9b063bfb8c9/zstandard-0.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:daab68faadb847063d0c56f361a289c4f268706b598afbf9ad113cbe5c38b6b2", size = 506183, upload-time = "2025-09-14T22:16:52.753Z" }, + { url = "https://files.pythonhosted.org/packages/8c/3e/8945ab86a0820cc0e0cdbf38086a92868a9172020fdab8a03ac19662b0e5/zstandard-0.25.0-cp311-cp311-win_arm64.whl", hash = "sha256:22a06c5df3751bb7dc67406f5374734ccee8ed37fc5981bf1ad7041831fa1137", size = 462533, upload-time = "2025-09-14T22:16:53.878Z" }, + { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738, upload-time = "2025-09-14T22:16:56.237Z" }, + { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436, upload-time = "2025-09-14T22:16:57.774Z" }, + { url = "https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019, upload-time = "2025-09-14T22:16:59.302Z" }, + { url = "https://files.pythonhosted.org/packages/1e/15/efef5a2f204a64bdb5571e6161d49f7ef0fffdbca953a615efbec045f60f/zstandard-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dffecc361d079bb48d7caef5d673c88c8988d3d33fb74ab95b7ee6da42652ea", size = 5063012, upload-time = "2025-09-14T22:17:01.156Z" }, + { url = 
"https://files.pythonhosted.org/packages/b7/37/a6ce629ffdb43959e92e87ebdaeebb5ac81c944b6a75c9c47e300f85abdf/zstandard-0.25.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7149623bba7fdf7e7f24312953bcf73cae103db8cae49f8154dd1eadc8a29ecb", size = 5394148, upload-time = "2025-09-14T22:17:03.091Z" }, + { url = "https://files.pythonhosted.org/packages/e3/79/2bf870b3abeb5c070fe2d670a5a8d1057a8270f125ef7676d29ea900f496/zstandard-0.25.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:6a573a35693e03cf1d67799fd01b50ff578515a8aeadd4595d2a7fa9f3ec002a", size = 5451652, upload-time = "2025-09-14T22:17:04.979Z" }, + { url = "https://files.pythonhosted.org/packages/53/60/7be26e610767316c028a2cbedb9a3beabdbe33e2182c373f71a1c0b88f36/zstandard-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5a56ba0db2d244117ed744dfa8f6f5b366e14148e00de44723413b2f3938a902", size = 5546993, upload-time = "2025-09-14T22:17:06.781Z" }, + { url = "https://files.pythonhosted.org/packages/85/c7/3483ad9ff0662623f3648479b0380d2de5510abf00990468c286c6b04017/zstandard-0.25.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:10ef2a79ab8e2974e2075fb984e5b9806c64134810fac21576f0668e7ea19f8f", size = 5046806, upload-time = "2025-09-14T22:17:08.415Z" }, + { url = "https://files.pythonhosted.org/packages/08/b3/206883dd25b8d1591a1caa44b54c2aad84badccf2f1de9e2d60a446f9a25/zstandard-0.25.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aaf21ba8fb76d102b696781bddaa0954b782536446083ae3fdaa6f16b25a1c4b", size = 5576659, upload-time = "2025-09-14T22:17:10.164Z" }, + { url = "https://files.pythonhosted.org/packages/9d/31/76c0779101453e6c117b0ff22565865c54f48f8bd807df2b00c2c404b8e0/zstandard-0.25.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1869da9571d5e94a85a5e8d57e4e8807b175c9e4a6294e3b66fa4efb074d90f6", size = 4953933, upload-time = "2025-09-14T22:17:11.857Z" }, + { url = 
"https://files.pythonhosted.org/packages/18/e1/97680c664a1bf9a247a280a053d98e251424af51f1b196c6d52f117c9720/zstandard-0.25.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:809c5bcb2c67cd0ed81e9229d227d4ca28f82d0f778fc5fea624a9def3963f91", size = 5268008, upload-time = "2025-09-14T22:17:13.627Z" }, + { url = "https://files.pythonhosted.org/packages/1e/73/316e4010de585ac798e154e88fd81bb16afc5c5cb1a72eeb16dd37e8024a/zstandard-0.25.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f27662e4f7dbf9f9c12391cb37b4c4c3cb90ffbd3b1fb9284dadbbb8935fa708", size = 5433517, upload-time = "2025-09-14T22:17:16.103Z" }, + { url = "https://files.pythonhosted.org/packages/5b/60/dd0f8cfa8129c5a0ce3ea6b7f70be5b33d2618013a161e1ff26c2b39787c/zstandard-0.25.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99c0c846e6e61718715a3c9437ccc625de26593fea60189567f0118dc9db7512", size = 5814292, upload-time = "2025-09-14T22:17:17.827Z" }, + { url = "https://files.pythonhosted.org/packages/fc/5f/75aafd4b9d11b5407b641b8e41a57864097663699f23e9ad4dbb91dc6bfe/zstandard-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:474d2596a2dbc241a556e965fb76002c1ce655445e4e3bf38e5477d413165ffa", size = 5360237, upload-time = "2025-09-14T22:17:19.954Z" }, + { url = "https://files.pythonhosted.org/packages/ff/8d/0309daffea4fcac7981021dbf21cdb2e3427a9e76bafbcdbdf5392ff99a4/zstandard-0.25.0-cp312-cp312-win32.whl", hash = "sha256:23ebc8f17a03133b4426bcc04aabd68f8236eb78c3760f12783385171b0fd8bd", size = 436922, upload-time = "2025-09-14T22:17:24.398Z" }, + { url = "https://files.pythonhosted.org/packages/79/3b/fa54d9015f945330510cb5d0b0501e8253c127cca7ebe8ba46a965df18c5/zstandard-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffef5a74088f1e09947aecf91011136665152e0b4b359c42be3373897fb39b01", size = 506276, upload-time = "2025-09-14T22:17:21.429Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/6b/8b51697e5319b1f9ac71087b0af9a40d8a6288ff8025c36486e0c12abcc4/zstandard-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:181eb40e0b6a29b3cd2849f825e0fa34397f649170673d385f3598ae17cca2e9", size = 462679, upload-time = "2025-09-14T22:17:23.147Z" }, +] diff --git a/dependency_setup/dependency_notes.md b/dependency_setup/dependency_notes.md index b85460e..e0f7707 100644 --- a/dependency_setup/dependency_notes.md +++ b/dependency_setup/dependency_notes.md @@ -1,67 +1,57 @@ # GlossAPI Dependency Profiles & Test Notes ## Environment Profiles -- **Vanilla** – core GlossAPI pipeline without GPU OCR add-ons. Uses `requirements-glossapi-vanilla.txt`. -- **RapidOCR** – Docling + RapidOCR GPU stack. Builds on vanilla requirements and adds ONNX runtime (`requirements-glossapi-rapidocr.txt`). -- **DeepSeek** – GPU OCR via DeepSeek/vLLM. Extends vanilla requirements with torch/cu128, nightly vLLM and supporting CUDA libs (`requirements-glossapi-deepseek.txt`). `xformers` was dropped because the published wheels still pin Torch 2.8; the rest of the stack now installs cleanly on Torch 2.9. +- **Docling** – main GlossAPI environment for extraction, cleaning, sectioning, annotation, and math/code enrichment. Uses `requirements-glossapi-docling.txt`. +- **DeepSeek** – dedicated OCR runtime managed with `uv`. Pins the tested Torch/Transformers stack in `dependency_setup/deepseek_uv/pyproject.toml` and intentionally excludes the Docling layout stack. 
-Each profile is installed through `dependency_setup/setup_glossapi.sh`: +Recommended installation commands: ```bash -# Examples (venv path optional) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests -./dependency_setup/setup_glossapi.sh --mode deepseek --venv dependency_setup/.venvs/deepseek --run-tests +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests +./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --run-tests ``` Key flags: -- `--download-deepseek` optionally fetches DeepSeek weights (skipped by default; set `--weights-dir` if they live elsewhere). +- `--download-model` optionally fetches DeepSeek weights (set `--model-root` if they live elsewhere). - `--smoke-test` (DeepSeek only) runs `dependency_setup/deepseek_gpu_smoke.py`. ## Test Segmentation Pytest markers were added so suites can be run per profile: -- `rapidocr` – GPU Docling/RapidOCR integration tests. - `deepseek` – DeepSeek execution paths. -- Unmarked tests cover the vanilla footprint. +- Unmarked tests cover the Docling/core footprint. -`setup_glossapi.sh` now chooses marker expressions automatically: +Suggested commands: -| Mode | Command run by script | -|-----------|---------------------------------------------------------| -| vanilla | `pytest -q -m "not rapidocr and not deepseek" tests` | -| rapidocr | `pytest -q -m "not deepseek" tests` | -| deepseek | `pytest -q -m "not rapidocr" tests` | +| Profile | Command | +|-----------|---------| +| Docling | `pytest -q -m "not deepseek" tests` | +| DeepSeek | `pytest -q -m "deepseek" tests` | -Heavy GPU tests in `tests/test_pipeline_smoke.py` were guarded with `pytest.importorskip("onnxruntime")` so vanilla installs skip them cleanly. 
Helper PDFs now embed DejaVuSans with Unicode support and insert spacing to keep OCR-friendly glyphs. +## Validation Runs (2026-03-08) +- `./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests` +- `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --run-tests` +- `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --smoke-test` -## Validation Runs (2025-10-30) -- `./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests` -- `./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests` -- `./dependency_setup/setup_glossapi.sh --mode deepseek --venv dependency_setup/.venvs/deepseek --run-tests` - -All three completed successfully after the following adjustments: -1. **Rust extensions** – switched to `pip install -e rust/glossapi_rs_{cleaner,noise}` because `maturin develop` left the wheel unregistered. -2. **Parquet locking** – `_parquet_lock` now creates parent directories before attempting the file lock (fixes `FileNotFoundError` in concurrent metadata tests). -3. **RapidOCR pipeline** – fixed `GlossExtract.create_extractor()` to build the Docling converter regardless of import path and added UTF-8 PDF generation improvements; smoke tests now pass on CUDA. -4. **DeepSeek stack** – updated nightly vLLM pin (`0.11.1rc5.dev58+g60f76baa6.cu129`) and removed `xformers` to resolve Torch 2.9 dependency conflicts. +These completed successfully after the following adjustments: +1. **Rust extensions** – use editable installs for `rust/glossapi_rs_{cleaner,noise}` so local changes are picked up immediately. +2. **DeepSeek stack** – moved to a uv-managed runtime pinned to the `transformers`-based OCR-2 path. +3. **Attention fallback** – the DeepSeek runner falls back to `eager` attention if `flash-attn` is unavailable. 
## Known Follow-ups -- **DeepSeek weights** – installer warns if weights are absent. Set `--download-deepseek` or populate `${DEEPSEEK_ROOT}/DeepSeek-OCR` before running the real CLI tests (`GLOSSAPI_RUN_DEEPSEEK_CLI=1`). -- **xformers kernels** – removed pending compatible Torch 2.9 wheels. Reintroduce once upstream publishes matching builds. +- **DeepSeek weights** – installer warns if weights are absent. Pass `--download-model` or populate `${MODEL_ROOT}/DeepSeek-OCR-2` (where `MODEL_ROOT` is the directory given via `--model-root` or the `DEEPSEEK_ROOT` environment variable) before running the real CLI tests (`GLOSSAPI_RUN_DEEPSEEK_CLI=1`). +- **flash-attn** – optional. Reintroduce into the pinned flow once wheel availability is stable across target hosts. - **Patchelf warnings** – maturin emits rpath hints if `patchelf` is missing; they are benign but install `patchelf` if cleaner logs are desired. -- **Deprecation noise** – Docling emits future warnings (Pydantic) and RapidOCR font deprecation notices; currently harmless but worth tracking for future upgrades. +- **Deprecation noise** – Docling and Transformers emit some warnings on current pins; currently harmless but worth tracking for future upgrades. 
## Quick Reference -- Activate an environment: `source dependency_setup/.venvs/<mode>/bin/activate` +- Activate an environment: `source dependency_setup/.venvs/<profile>/bin/activate` - Re-run tests manually: - - Vanilla: `pytest -m "not rapidocr and not deepseek" tests` - - RapidOCR: `pytest -m "not deepseek" tests` - - DeepSeek: `pytest -m "not rapidocr" tests` + - Docling: `pytest -m "not deepseek" tests` + - DeepSeek: `pytest -m "deepseek" tests` - DeepSeek runtime exports: ```bash export GLOSSAPI_DEEPSEEK_PYTHON="dependency_setup/.venvs/deepseek/bin/python" - export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT="/mnt/data/glossAPI/deepseek-ocr/run_pdf_ocr_vllm.py" - export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH="/mnt/data/glossAPI/deepseek-ocr/libjpeg-turbo/lib" - export LD_LIBRARY_PATH="$GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH:${LD_LIBRARY_PATH:-}" + export GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT="/mnt/data/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py" + export GLOSSAPI_DEEPSEEK_MODEL_DIR="/mnt/data/glossAPI/deepseek-ocr-2-model/DeepSeek-OCR-2" ``` These notes capture the current dependency state, the rationale behind constraint changes, and the validation steps used to exercise each profile. 
diff --git a/dependency_setup/requirements-glossapi-deepseek.txt b/dependency_setup/requirements-glossapi-deepseek.txt index 5cc685a..8185d9c 100644 --- a/dependency_setup/requirements-glossapi-deepseek.txt +++ b/dependency_setup/requirements-glossapi-deepseek.txt @@ -1,16 +1,13 @@ ---extra-index-url https://download.pytorch.org/whl/cu128 ---extra-index-url https://wheels.vllm.ai/nightly --r requirements-glossapi-vanilla.txt -# CUDA Torch stack aligned with NVIDIA L4 (CUDA 12.8 wheels) -torch==2.9.0+cu128 -torchvision==0.24.0+cu128 -torchaudio==2.9.0+cu128 -# DeepSeek via nightly vLLM -vllm==0.11.1rc5.dev58+g60f76baa6.cu129 -flashinfer-python==0.4.1 -compressed-tensors==0.12.2 -depyf==0.20.0 -# Auxiliary CUDA libs -nvidia-nvshmem-cu12==3.3.20 -nvidia-nccl-cu12==2.27.5 -triton==3.5.0 +--extra-index-url https://download.pytorch.org/whl/cu118 +-r requirements-glossapi-docling.txt +torch==2.6.0 +torchvision==0.21.0 +torchaudio==2.6.0 +transformers==4.46.3 +tokenizers==0.20.3 +accelerate>=1.2.1,<2 +pymupdf==1.24.10 +Pillow==10.4.0 +img2pdf>=0.5.1 +easydict +addict diff --git a/dependency_setup/requirements-glossapi-docling.txt b/dependency_setup/requirements-glossapi-docling.txt new file mode 100644 index 0000000..73cb17f --- /dev/null +++ b/dependency_setup/requirements-glossapi-docling.txt @@ -0,0 +1,38 @@ +# Core GlossAPI runtime (Docling extraction/layout) +maturin>=1.5,<2.0 +numpy>=1.26,<3 +pandas>=1.3.0 +python-dateutil>=2.8.2 +pytz>=2021.1 +scikit-learn==1.6.1 +joblib>=1.0.0 +dask>=2022.1.0 +pyarrow>=7.0.0 +aiohttp>=3.8.0 +aiofiles>=23.0.0 +ftfy>=6.0.0 +tenacity>=8.0.0 +tqdm>=4.67.0 +pyyaml>=6.0 +pypdfium2>=4.0.0 +zstandard>=0.22.0 +docling==2.81.0 +docling-core==2.70.2 +docling-parse==5.6.0 +docling-ibm-models==3.12.0 +msgspec>=0.18.6 +fpdf2>=2.7.0 +cachetools +cbor2 +einops +tiktoken +diskcache==5.6.3 +lark==1.2.2 +numba==0.61.2 +# Tooling / tests +pytest>=8.0 +pytest-mock>=3.14 +psutil>=5.9 +rich>=14.0 +safetensors>=0.4 +huggingface-hub>=0.22 diff --git 
a/dependency_setup/requirements-glossapi-rapidocr.txt b/dependency_setup/requirements-glossapi-rapidocr.txt deleted file mode 100644 index f5c5839..0000000 --- a/dependency_setup/requirements-glossapi-rapidocr.txt +++ /dev/null @@ -1,4 +0,0 @@ --r requirements-glossapi-vanilla.txt -rapidocr>=3.3.0 -opencv-python-headless>=4.8.0 -onnxruntime-gpu==1.18.1 diff --git a/dependency_setup/requirements-glossapi-vanilla.txt b/dependency_setup/requirements-glossapi-vanilla.txt index b13df49..eca76ba 100644 --- a/dependency_setup/requirements-glossapi-vanilla.txt +++ b/dependency_setup/requirements-glossapi-vanilla.txt @@ -1,6 +1,6 @@ # Core GlossAPI runtime (Docling without GPU OCR extras) maturin>=1.5,<2.0 -numpy<2 +numpy>=1.26,<3 pandas>=1.3.0 python-dateutil>=2.8.2 pytz>=2021.1 @@ -16,10 +16,10 @@ tqdm>=4.67.0 pyyaml>=6.0 pypdfium2>=4.0.0 zstandard>=0.22.0 -docling==2.48.0 -docling-core==2.47.0 -docling-parse==4.4.0 -docling-ibm-models==3.9.1 +docling==2.81.0 +docling-core==2.70.2 +docling-parse==5.6.0 +docling-ibm-models==3.12.0 msgspec>=0.18.6 fpdf2>=2.7.0 cachetools diff --git a/dependency_setup/setup_deepseek_uv.sh b/dependency_setup/setup_deepseek_uv.sh new file mode 100755 index 0000000..04a21ba --- /dev/null +++ b/dependency_setup/setup_deepseek_uv.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" +PROJECT_DIR="${SCRIPT_DIR}/deepseek_uv" + +PYTHON_BIN="${PYTHON:-python3}" +VENV_PATH="${GLOSSAPI_DEEPSEEK_VENV:-${REPO_ROOT}/dependency_setup/.venvs/deepseek}" +MODEL_ROOT="${DEEPSEEK_ROOT:-${REPO_ROOT}/deepseek-ocr-2-model}" +DOWNLOAD_MODEL=0 +RUN_SMOKE=0 +RUN_TESTS=0 + +info() { printf "\033[1;32m==>\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } +error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } + +SYNC_ARGS=(--no-dev) + +usage() { + cat <<'EOF' +Usage: setup_deepseek_uv.sh [options] + +Options: + --venv PATH Target virtual environment path + --python PATH Python executable to use for uv venv + --model-root PATH Destination root for the DeepSeek-OCR-2 model + --download-model Download DeepSeek-OCR-2 via huggingface_hub + --run-tests Run the DeepSeek pytest subset after installation + --smoke-test Run dependency_setup/deepseek_gpu_smoke.py + --help Show this help message +EOF +} + +while (( "$#" )); do + case "$1" in + --venv) + shift || { echo "--venv requires a path" >&2; exit 1; } + VENV_PATH="${1:-}" + ;; + --python) + shift || { echo "--python requires a path" >&2; exit 1; } + PYTHON_BIN="${1:-}" + ;; + --model-root|--weights-dir) + shift || { echo "--model-root requires a path" >&2; exit 1; } + MODEL_ROOT="${1:-}" + ;; + --download-model|--download-deepseek) + DOWNLOAD_MODEL=1 + ;; + --run-tests) + RUN_TESTS=1 + ;; + --smoke-test) + RUN_SMOKE=1 + ;; + --help|-h) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage >&2 + exit 1 + ;; + esac + shift || true +done + +command -v uv >/dev/null 2>&1 || error "uv is required. Install it first, e.g. 'python3 -m pip install --user uv'." 
+ +MODEL_DIR="${MODEL_ROOT}/DeepSeek-OCR-2" + +if [[ -x "${VENV_PATH}/bin/python" ]]; then + info "Reusing uv environment at ${VENV_PATH}" +else + info "Creating uv environment at ${VENV_PATH}" + uv venv --python "${PYTHON_BIN}" "${VENV_PATH}" +fi + +if [[ "${RUN_TESTS}" -eq 1 ]]; then + SYNC_ARGS+=(--group test) +fi + +info "Syncing DeepSeek runtime from ${PROJECT_DIR}" +UV_PROJECT_ENVIRONMENT="${VENV_PATH}" uv sync --project "${PROJECT_DIR}" --python "${VENV_PATH}/bin/python" "${SYNC_ARGS[@]}" + +info "Installing Rust extensions in editable mode" +uv pip install --python "${VENV_PATH}/bin/python" -e "${REPO_ROOT}/rust/glossapi_rs_cleaner" +uv pip install --python "${VENV_PATH}/bin/python" -e "${REPO_ROOT}/rust/glossapi_rs_noise" + +if [[ "${DOWNLOAD_MODEL}" -eq 1 ]]; then + info "Downloading DeepSeek-OCR-2 model to ${MODEL_DIR}" + HUGGINGFACE_HUB_TOKEN="${HUGGINGFACE_HUB_TOKEN:-${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-${HUGGINGFACE_TOKEN:-}}}}" \ + "${VENV_PATH}/bin/python" - <\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } +error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } + usage() { cat <<'EOF' Usage: setup_glossapi.sh [options] Options: - --mode MODE Environment profile: vanilla, rapidocr, deepseek (default: vanilla) + --mode MODE Environment profile: docling or deepseek (default: docling) --venv PATH Target virtual environment path --python PATH Python executable to use when creating the venv - --download-deepseek Fetch DeepSeek-OCR weights (only meaningful for --mode deepseek) - --weights-dir PATH Destination directory for DeepSeek weights (default: $REPO_ROOT/deepseek-ocr) + --download-deepseek Fetch DeepSeek-OCR-2 weights (DeepSeek mode only) + --weights-dir PATH Destination directory root for DeepSeek weights (default: $REPO_ROOT/deepseek-ocr-2-model) --run-tests Run pytest -q after installation --smoke-test Run dependency_setup/deepseek_gpu_smoke.py (deepseek mode only) --help Show this help message @@ -69,13 
+73,27 @@ while (( "$#" )); do done case "${MODE}" in - vanilla|rapidocr|deepseek) ;; + vanilla) + warn "Mode 'vanilla' is deprecated; using 'docling' instead." + MODE="docling" + ;; + docling|deepseek) ;; *) - echo "Invalid mode '${MODE}'. Expected vanilla, rapidocr, or deepseek." >&2 + echo "Invalid mode '${MODE}'. Expected docling or deepseek." >&2 exit 1 ;; esac +if [[ "${MODE}" == "deepseek" ]]; then + exec "${SCRIPT_DIR}/setup_deepseek_uv.sh" \ + --python "${PYTHON_BIN}" \ + --venv "${VENV_PATH:-${REPO_ROOT}/dependency_setup/.venvs/deepseek}" \ + --model-root "${DEEPSEEK_ROOT}" \ + $([[ "${DOWNLOAD_DEEPSEEK}" -eq 1 ]] && printf '%s' "--download-model") \ + $([[ "${RUN_TESTS}" -eq 1 ]] && printf '%s' "--run-tests") \ + $([[ "${RUN_SMOKE}" -eq 1 ]] && printf '%s' "--smoke-test") +fi + if [[ -z "${VENV_PATH}" ]]; then VENV_PATH="${REPO_ROOT}/.venv_glossapi_${MODE}" fi @@ -86,10 +104,6 @@ if [[ ! -f "${REQUIREMENTS_FILE}" ]]; then exit 1 fi -info() { printf "\033[1;32m==>\033[0m %s\n" "$*"; } -warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } -error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } - ensure_venv() { if [[ ! -d "${VENV_PATH}" ]]; then info "Creating virtual environment at ${VENV_PATH}" @@ -107,44 +121,6 @@ python_run() { "${VENV_PATH}/bin/python" "$@" } -download_deepseek_weights() { - local root="$1" - local target="${root}/DeepSeek-OCR" - - if [[ -d "${target}" ]]; then - info "DeepSeek-OCR weights already present at ${target}" - return 0 - fi - - mkdir -p "${root}" - if command -v huggingface-cli >/dev/null 2>&1; then - info "Downloading DeepSeek weights with huggingface-cli (this may take a while)" - huggingface-cli download deepseek-ai/DeepSeek-OCR \ - --repo-type model \ - --include "DeepSeek-OCR/*" \ - --local-dir "${target}" \ - --local-dir-use-symlinks False || warn "huggingface-cli download failed; falling back to git-lfs" - fi - - if [[ ! -d "${target}" ]]; then - if command -v git >/dev/null 2>&1; then - if ! 
command -v git-lfs >/dev/null 2>&1; then - warn "git-lfs not available; install git-lfs to clone DeepSeek weights via git." - else - info "Cloning DeepSeek weights via git-lfs" - git lfs install --skip-repo >/dev/null 2>&1 || true - git clone https://huggingface.co/deepseek-ai/DeepSeek-OCR "${target}" - fi - else - warn "Neither huggingface-cli nor git found; skipping DeepSeek weight download." - fi - fi - - if [[ ! -d "${target}" ]]; then - warn "DeepSeek weights were not downloaded. Set DEEPSEEK_ROOT manually once acquired." - fi -} - ensure_venv info "Upgrading pip tooling" pip_run install --upgrade pip wheel setuptools @@ -159,43 +135,18 @@ info "Building Rust extensions via editable installs" pip_run install -e "${REPO_ROOT}/rust/glossapi_rs_cleaner" pip_run install -e "${REPO_ROOT}/rust/glossapi_rs_noise" -if [[ "${MODE}" == "deepseek" ]]; then - export GLOSSAPI_DEEPSEEK_PYTHON="${VENV_PATH}/bin/python" - export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT="${DEEPSEEK_ROOT}/run_pdf_ocr_vllm.py" - export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH="${DEEPSEEK_ROOT}/libjpeg-turbo/lib" - export GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 - export LD_LIBRARY_PATH="${GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH:-}" - - if [[ "${DOWNLOAD_DEEPSEEK}" -eq 1 ]]; then - download_deepseek_weights "${DEEPSEEK_ROOT}" - else - warn "DeepSeek weights not downloaded (use --download-deepseek to fetch automatically)." 
- fi -fi - if [[ "${RUN_TESTS}" -eq 1 ]]; then pytest_args=("-q") case "${MODE}" in - vanilla) - pytest_args+=("-m" "not rapidocr and not deepseek") - ;; - rapidocr) + docling) pytest_args+=("-m" "not deepseek") ;; - deepseek) - pytest_args+=("-m" "not rapidocr") - ;; esac info "Running pytest ${pytest_args[*]} tests" python_run -m pytest "${pytest_args[@]}" tests fi -if [[ "${MODE}" == "deepseek" && "${RUN_SMOKE}" -eq 1 ]]; then - info "Running DeepSeek smoke test" - python_run "${SCRIPT_DIR}/deepseek_gpu_smoke.py" -fi - cat < None +``` + +- Purpose: Phase‑1 extraction from source files into markdown plus optional JSON intermediates. +- Typical inputs: + - files already present in `downloads/` + - or explicit `file_paths` +- Important parameters: + - `phase1_backend='safe'|'docling'|'auto'`: PyPDFium for stability vs Docling for native layout extraction + - `force_ocr`: deprecated no-op kept for compatibility; OCR remediation now lives in `Corpus.ocr(backend='deepseek')` + - `use_gpus='multi'`: use all visible GPUs through a shared work queue + - `workers_per_device`: fan out more than one extraction worker onto a single visible GPU when measuring throughput + - `export_doc_json=True`: write `json/.docling.json(.zst)` + - `emit_formula_index=True`: also write `json/.formula_index.jsonl` +- Main outputs: + - `markdown/.md` + - `json/.docling.json(.zst)` when enabled + - `json/metrics/.metrics.json` + - `json/metrics/.per_page.metrics.json` + +## clean() + +```python +clean( + input_dir: str | Path | None = None, + threshold: float = 0.10, + num_threads: int | None = None, + drop_bad: bool = True, +) -> None +``` + +- Purpose: run the Rust cleaner/noise pipeline and decide which documents are safe for downstream processing. 
+- Typical inputs: + - `markdown/*.md` + - metadata parquet if present +- Important parameters: + - `threshold`: badness threshold + - `drop_bad`: whether to remove bad files from downstream selection + - `empty_char_threshold`, `empty_min_pages`: heuristics for OCR rerun recommendation +- Main outputs: + - `clean_markdown/<stem>.md` + - cleaner report parquet + - updated parquet columns such as `filter`, `needs_ocr`, and metrics fields +- Operational note: this stage is the quality gate that drives `section()` and `ocr()`. + +## ocr() + +```python +ocr( + *, + fix_bad: bool = True, + mode: str | None = None, + device: str | None = None, + model_dir: str | Path | None = None, + max_pages: int | None = None, + persist_engine: bool = True, + limit: int | None = None, + dpi: int | None = None, + precision: str | None = None, + math_enhance: bool = True, + math_targets: dict[str, list[tuple[int,int]]] | None = None, + math_batch_size: int = 8, + math_dpi_base: int = 220, + use_gpus: str = 'single', + devices: list[int] | None = None, + force: bool | None = None, +) -> None +``` + +- Purpose: selective OCR retry and optional Phase‑2 math/code enrichment. +- Mode selection: + - `ocr_bad`: rerun OCR only for cleaner-flagged docs + - `math_only`: run enrichment from existing Docling JSON + - `ocr_bad_then_math`: OCR flagged docs, then enrich them +- Important parameters: + - `mode`, `fix_bad`, `math_enhance` + - `use_gpus`, `devices` + - `math_targets` to restrict enrichment to specific items +- Main outputs: + - refreshed `markdown/<stem>.md` + - refreshed cleaner/parquet metadata after OCR reruns + - `json/<stem>.latex_map.jsonl` when enrichment runs + +## formula_enrich_from_json() + +```python +formula_enrich_from_json( + files: list[str] | None = None, + *, + device: str = 'cuda', + batch_size: int = 8, + dpi_base: int = 220, + targets_by_stem: dict[str, list[tuple[int,int]]] | None = None, +) -> None +``` + +- Purpose: Phase‑2 GPU enrichment from previously exported Docling JSON. 
+- Typical inputs: + - `json/<stem>.docling.json(.zst)` + - optional formula/code index data +- Important parameters: + - `files`: restrict to specific stems + - `device`, `batch_size`, `dpi_base` + - `targets_by_stem`: target specific `(page_no, item_index)` tuples +- Main outputs: + - enriched markdown back into `markdown/<stem>.md` + - `json/<stem>.latex_map.jsonl` + +## section(), annotate() + +```python +section() -> None +annotate(annotation_type: str = 'text', fully_annotate: bool = True) -> None +``` + +- `section()`: + - purpose: convert markdown into one row per section with structural flags + - inputs: markdown selected by cleaner/parquet metadata + - outputs: `sections/sections_for_annotation.parquet` +- `annotate()`: + - purpose: classify sections and optionally expand them into full document structure + - important parameters: `annotation_type='text'|'chapter'|'auto'`, `fully_annotate` + - outputs: `classified_sections.parquet` and `fully_annotated_sections.parquet` + +## download() + +```python +download( + input_parquet: str | Path, + *, + links_column: str | None = None, + parallelize_by: str | None = None, + verbose: bool | None = None, + **kwargs, +) -> pd.DataFrame +``` + +- Purpose: fetch source files described in a parquet dataset. +- Typical inputs: + - an explicit `input_parquet` + - or the first parquet file found in `input_dir` +- Important parameters: + - `links_column`: override URL column name + - `parallelize_by`: choose grouping for the scheduler + - `download_mode`: one of `standard`, `auto`, or `browser` + - `browser_mode=True`: alias for `download_mode="browser"` + - `download_policy_file`: route specific domains/URL patterns to `standard`, `auto`, or `browser` + - downloader kwargs via `**kwargs` for concurrency, SSL, cookies, retries, checkpoints, etc. 
+- Main outputs: + - downloaded files in `downloads/` + - partial/final results in `download_results/` + - returned `pd.DataFrame` with download status and metadata + +Browser-capable download mode is intended for browser-gated file endpoints where a real file still exists behind session/bootstrap checks. It is not a general viewer extractor. Viewer-only sources should still fail cleanly with a recorded error and no local file artifact. + +Example: + +```python +corpus.download( + input_parquet="input_urls.parquet", + download_mode="browser", +) +``` + +Policy-routed example: + +```python +corpus.download( + input_parquet="input_urls.parquet", + download_policy_file="download_policy.yml", +) +``` + +## triage_math() + +- Purpose: summarize per-page metrics and recommend Phase‑2 for math-dense docs. +- Inputs: `json/metrics/.per_page.metrics.json` +- Outputs: updated `download_results` parquet with routing fields such as formula totals and phase recommendation + +## Suggested Reading Order + +1. `download()` if you start from URLs. +2. `extract()` for Phase‑1 layout/markdown. +3. `clean()` to decide what needs OCR. +4. `ocr()` if you need OCR retry or Phase‑2 enrichment. +5. `section()` and `annotate()` for structured downstream outputs. + +--- + +See also: +- Code map: ../code_map.md +- Pipeline overview and artifacts: ../pipeline.md +- Configuration and environment variables: ../configuration.md +- OCR and math enrichment details: ../ocr_and_math_enhancement.md diff --git a/docs/api_corpus_tmp.md b/docs/api_corpus_tmp.md index 4181094..e584308 100644 --- a/docs/api_corpus_tmp.md +++ b/docs/api_corpus_tmp.md @@ -44,7 +44,7 @@ extract( ) -> None ``` -- Phase‑1 extraction; set `force_ocr=True` for OCR. +- Phase‑1 extraction; `force_ocr` is deprecated and ignored. - Docling layout JSON now writes by default (`json/.docling.json(.zst)`); set `emit_formula_index=True` to also produce `json/.formula_index.jsonl`. 
- Set `use_gpus='multi'` to use all visible GPUs (shared queue). @@ -85,7 +85,7 @@ ocr( ) -> None ``` -- Convenience shim that re‑runs `extract(force_ocr=True)` on cleaner-flagged documents and, by default, performs math/code enrichment unless `math_enhance=False`. +- Convenience shim that re-runs OCR on cleaner-flagged documents and, by default, performs math/code enrichment unless `math_enhance=False`. ## formula_enrich_from_json() diff --git a/docs/architecture/index.md b/docs/architecture/index.md index a8d8621..f6e1c85 100644 --- a/docs/architecture/index.md +++ b/docs/architecture/index.md @@ -103,7 +103,7 @@ Purpose: Important characteristics: -- can use RapidOCR via Docling or DeepSeek OCR +- uses DeepSeek OCR for remediation while keeping Docling in the surrounding extraction/layout flow - reads metadata to find OCR candidates - skiplist-aware - designed as a corrective stage, not the default for every document diff --git a/docs/code_map.md b/docs/code_map.md new file mode 100644 index 0000000..8616def --- /dev/null +++ b/docs/code_map.md @@ -0,0 +1,60 @@ +# Code Map + +This page maps the main documentation ideas to the code that implements them. It is +meant to help you move from "what does GlossAPI do?" to "where do I change it?" +without reading the entire repo. + +## Top-Level Entry Points + +| Area | Main code | Responsibility | +| --- | --- | --- | +| Public package entry | `src/glossapi/__init__.py` | Lazy-exports `Corpus`, `GlossSectionClassifier`, `GlossDownloader`, and related classes without pulling heavy runtime dependencies at import time. | +| High-level orchestration | `src/glossapi/corpus/corpus_orchestrator.py` | Coordinates the end-to-end pipeline and owns the main folder/artifact conventions. | +| Phase-1 extraction engine | `src/glossapi/gloss_extract.py` | Builds/reuses Docling converters, handles safe vs Docling backend selection, batching, timeouts, resumption, and artifact export. 
| + +## Pipeline Stages + +| Stage | Main methods/classes | Notes | +| --- | --- | --- | +| Download | `Corpus.download()`, `GlossDownloader.download_files()` | Supports URL expansion, deduplication, checkpoints, per-domain scheduling, and resume. | +| Extract | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.ensure_extractor()`, `GlossExtract.extract_path()` | Handles backend choice, GPU preflight, and single- vs multi-GPU dispatch. | +| Clean / quality gate | `Corpus.clean()` | Runs the Rust cleaner and merges quality metrics back into parquet metadata. | +| OCR retry / math follow-up | `Corpus.ocr()`, `Corpus.formula_enrich_from_json()` | Re-runs OCR only for flagged documents and optionally performs Phase-2 math/code enrichment from JSON. | +| Sectioning | `Corpus.section()`, `GlossSection.to_parquet()` | Converts markdown documents into section rows for later classification. | +| Classification / annotation | `Corpus.annotate()`, `GlossSectionClassifier.classify_sections()`, `GlossSectionClassifier.fully_annotate()` | Runs the SVM classifier and post-processes section labels into final document structure. | +| Export / triage | `Corpus.jsonl()`, `Corpus.triage_math()` | Produces training/export JSONL and computes routing hints for math-dense documents. | + +## Backend and Runtime Helpers + +| File | Responsibility | +| --- | --- | +| `src/glossapi/ocr/docling/pipeline.py` | Canonical builder for the layout-only Docling Phase-1 pipeline, including runtime tuning knobs for the current Docling API. | +| `src/glossapi/ocr/docling_pipeline.py` | Compatibility re-export for the canonical Docling pipeline builder. | +| `src/glossapi/ocr/deepseek/runner.py` | Launches the DeepSeek OCR remediation path from `Corpus.ocr()`. | +| `src/glossapi/ocr/utils/json_io.py` | Writes and reads compressed Docling JSON artifacts. | +| `src/glossapi/corpus/phase_ocr_math.py` | Runs DeepSeek OCR remediation, math/code enrichment, and parquet status updates. 
| +| `src/glossapi/metrics.py` | Computes per-page parse/OCR/formula metrics from Docling conversions. | + +## Rust Extensions + +| Crate | Path | Purpose | +| --- | --- | --- | +| Cleaner | `rust/glossapi_rs_cleaner` | Markdown cleaning, script/noise filtering, and report generation used by `Corpus.clean()`. | +| Noise metrics | `rust/glossapi_rs_noise` | Fast quality metrics used by the broader pipeline and package build configuration. | + +## Tests To Read First + +| Test | Why it matters | +| --- | --- | +| `tests/test_pipeline_smoke.py` | Best high-level example of the intended artifact flow through extract -> clean -> OCR -> section. | +| `tests/test_corpus_guards.py` | Shows the contract around backend selection and GPU preflight. | +| `tests/test_jsonl_export.py` | Shows how final JSONL export merges cleaned markdown, parquet metadata, and math metrics. | +| `tests/test_ocr_dispatch_backends.py` | Covers the DeepSeek-only OCR dispatch contract and backend validation. | + +## If You Need To Change... + +- Download scheduling or resume behavior: start in `src/glossapi/gloss_downloader.py`. +- Phase-1 parsing, worker fanout, or artifact generation: start in `src/glossapi/corpus/phase_extract.py`, `src/glossapi/corpus/corpus_orchestrator.py`, and `src/glossapi/gloss_extract.py`. +- Docling pipeline wiring or runtime tuning: start in `src/glossapi/ocr/docling/pipeline.py` and `src/glossapi/gloss_extract.py`. +- Section labels or section-annotation rules: start in `src/glossapi/gloss_section_classifier.py`. +- Output folder contracts or stage sequencing: start in `src/glossapi/corpus/corpus_orchestrator.py`. diff --git a/docs/configuration.md b/docs/configuration.md index 659d65c..af8737a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -18,30 +18,47 @@ GlossAPI exposes two Phase‑1 profiles. 
Use `Corpus.extract(..., phase1_backend Regardless of backend, the extractor clamps OMP/OpenBLAS/MKL pools to one thread per worker so multi‑GPU runs do not explode thread counts. -### DeepSeek optional dependencies +### Docling Runtime Tuning -Install DeepSeek backend extras to enable the DeepSeek OCR path (imports remain lazy, so the package is optional). Use the CUDA 12.1 wheels for both vLLM and Torch: +These optional knobs map directly to current Docling `PdfPipelineOptions` fields and are mainly useful for benchmarking on strong GPUs: -```bash -pip install '.[deepseek]' +- `GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE`: override Docling `layout_batch_size`. +- `GLOSSAPI_DOCLING_TABLE_BATCH_SIZE`: override Docling `table_batch_size`. +- `GLOSSAPI_DOCLING_OCR_BATCH_SIZE`: override Docling `ocr_batch_size` even though Phase‑1 OCR stays disabled. +- `GLOSSAPI_DOCLING_QUEUE_MAX_SIZE`: override Docling `queue_max_size`. +- `GLOSSAPI_DOCLING_DOCUMENT_TIMEOUT`: override Docling `document_timeout`. +- `GLOSSAPI_DOCLING_BATCH_POLL_INTERVAL`: override Docling `batch_polling_interval_seconds`. -# Install Torch CUDA 12.1 wheels (required by the DeepSeek script) -pip install --extra-index-url https://download.pytorch.org/whl/cu121 \ - 'torch==2.5.1+cu121' 'torchvision==0.20.1+cu121' +### DeepSeek optional dependencies -# Alternatively, use the requirements file (edit to uncomment torch lines): -pip install -r deepseek-ocr/requirements-deepseek.txt +Install DeepSeek backend extras to enable the DeepSeek OCR path. The recommended path is the dedicated `uv` environment: + +```bash +./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek ``` When using `backend='deepseek'`, equations are included inline in the OCR output; Phase‑2 math flags are accepted but skipped. +The dedicated uv profile is OCR-only and does not install the Docling extraction stack. 
### DeepSeek runtime controls -- `GLOSSAPI_DEEPSEEK_ALLOW_STUB` (`1` by default): allow the builtin stub runner for tests and lightweight environments. -- `GLOSSAPI_DEEPSEEK_ALLOW_CLI` (`0` by default): flip to `1` to force the real vLLM CLI even when the stub is allowed. -- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs `run_pdf_ocr_vllm.py` (defaults to the current interpreter). -- `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT`: override path to the DeepSeek CLI script (defaults to `deepseek-ocr/run_pdf_ocr_vllm.py` under the repo). -- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths (e.g., for `libjpeg-turbo`) when launching the CLI. +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB`: must remain `0`; stub execution is rejected. +- `GLOSSAPI_DEEPSEEK_ALLOW_CLI`: keep at `1` to require the real runtime. +- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs the DeepSeek OCR runner. +- `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT`: override path to the OCR runner script (defaults to `src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py`). +- `GLOSSAPI_DEEPSEEK_MODEL_DIR`: path to the downloaded `DeepSeek-OCR-2` snapshot. +- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths when launching the OCR runner. + +Standard OCR defaults: + +- `runtime_backend='vllm'` +- `ocr_profile='markdown_grounded'` +- `max_new_tokens=2048` +- `repair_mode='auto'` +- `scheduler='auto'` +- `target_batch_pages=160` + +The DeepSeek runners now default to `max_new_tokens=2048`. Do not leave the token cap implicit in one environment and explicit in another when comparing benchmarks. ## Math Enrichment (Phase‑2) @@ -71,10 +88,6 @@ All LaTeX policy knobs are loaded via `glossapi.text_sanitize.load_latex_policy( - `GLOSSAPI_WORKER_LOG_DIR`: override the directory used for per-worker logs and `gpu.current` markers (defaults to `logs/ocr_workers/` or `logs/math_workers/` under the output directory). 
- `GLOSSAPI_WORKER_LOG_VERBOSE` = `1|0` (default `1`): emit (or suppress) the GPU binding banner each worker prints on startup. -## RapidOCR Model Paths - -- `GLOSSAPI_RAPIDOCR_ONNX_DIR`: directory containing `det/rec/cls` ONNX models and keys. - ## Triage & Parquet - Triage always writes both: diff --git a/docs/getting_started.md b/docs/getting_started.md index f6bf4ce..d1557d3 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -4,46 +4,47 @@ This guide gets a new GlossAPI contributor from clone → first extraction with ## Checklist -- Python 3.8+ (3.10 recommended) +- Python 3.10+ (`3.12` recommended for the DeepSeek runtime) - Recent `pip` (or `uv`) and a C/C++ toolchain for Rust wheels -- Optional: NVIDIA GPU with CUDA 12.x drivers for Docling/RapidOCR acceleration +- Optional: NVIDIA GPU with CUDA drivers for Docling/DeepSeek acceleration ## Install GlossAPI -### Recommended — mode-aware setup script +### Recommended setup -Use `dependency_setup/setup_glossapi.sh` to build an isolated virtualenv with the correct dependency set for vanilla, RapidOCR, or DeepSeek runs. Examples: +Use `dependency_setup/setup_glossapi.sh` for the main Docling environment and `dependency_setup/setup_deepseek_uv.sh` for the OCR runtime. 
Examples: ```bash -# Vanilla pipeline (CPU-only OCR) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests +# Main GlossAPI environment +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests -# RapidOCR GPU stack -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests - -# DeepSeek OCR on GPU (expects weights under /path/to/deepseek-ocr/DeepSeek-OCR) -./dependency_setup/setup_glossapi.sh \ - --mode deepseek \ +# DeepSeek OCR on GPU (uv-managed, downloads DeepSeek-OCR-2 if requested) +./dependency_setup/setup_deepseek_uv.sh \ --venv dependency_setup/.venvs/deepseek \ - --weights-dir /path/to/deepseek-ocr \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ --run-tests --smoke-test ``` -Add `--download-deepseek` if you need the script to fetch weights via Hugging Face; otherwise it searches `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Inspect `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation runs. The script installs GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. +`setup_glossapi.sh --mode deepseek` delegates to the same uv-based installer. Inspect `dependency_setup/dependency_notes.md` for the current pins and validation runs. Both setup paths install GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. +The dedicated DeepSeek uv environment is intentionally OCR-only: it installs `glossapi[deepseek]` and leaves Docling in the main environment. **DeepSeek runtime checklist** -- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the CLI can run (env vars, model dir, flashinfer, cc1plus, libjpeg). 
-- Force the real CLI and avoid stub fallback by setting: +- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the real runtime is reachable. +- Force the real runtime and avoid stub fallback by setting: - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - - `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py` - - `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python` - - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR` - - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib` -- Install a CUDA toolkit with `nvcc` and set `CUDA_HOME` / prepend `$CUDA_HOME/bin` to `PATH` (FlashInfer/vLLM JIT expects it). -- If FlashInfer is unstable on your stack, disable it with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`. -- Avoid FP8 KV cache issues by exporting `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1`; tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`. -- Keep `LD_LIBRARY_PATH` pointing at the toolkit lib64 (e.g. `LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH`). + - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` + - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- Standard OCR defaults after setup: + - `runtime_backend='vllm'` + - `ocr_profile='markdown_grounded'` + - `max_new_tokens=2048` + - `repair_mode='auto'` + - `scheduler='auto'` + - `target_batch_pages=160` +- `flash-attn` is optional. The runner uses it when available and otherwise falls back to the Transformers `eager` attention implementation. +- Do not benchmark against an ad hoc DeepSeek venv and compare it to the validated `dependency_setup/.venvs/deepseek` results as if they were the same stack. 
### Option 1 — pip (evaluate quickly) @@ -74,30 +75,19 @@ chmod +x scripts/setup_conda.sh conda activate glossapi ``` -The helper script provisions Python 3.10, installs Rust + `maturin`, performs an editable install, and applies the Docling RapidOCR patch automatically. +The helper script provisions Python 3.10, installs Rust + `maturin`, and performs an editable install. ## GPU prerequisites (optional but recommended) -`setup_glossapi.sh` pulls the right CUDA/Torch/ONNX wheels for the RapidOCR and DeepSeek profiles. If you are curating dependencies manually, make sure you: +`setup_glossapi.sh` and `setup_deepseek_uv.sh` pull the required Torch wheels for the supported Docling and DeepSeek flows. If you are curating dependencies manually, make sure you: -- Install the GPU build of ONNX Runtime (`onnxruntime-gpu`) and uninstall the CPU wheel. -- Select the PyTorch build that matches your driver/toolkit (the repository currently targets CUDA 12.8 for DeepSeek). +- Select the PyTorch build that matches your driver/toolkit. - Verify the providers with: ```bash - python -c "import onnxruntime as ort; print(ort.get_available_providers())" python -c "import torch; print(torch.cuda.is_available())" ``` -## RapidOCR models & keys - -GlossAPI ships the required ONNX models and Greek keys under `glossapi/models/rapidocr/{onnx,keys}`. To override them, set `GLOSSAPI_RAPIDOCR_ONNX_DIR` to a directory containing: - -- `det/inference.onnx` -- `rec/inference.onnx` -- `cls/ch_ppocr_mobile_v2.0_cls_infer.onnx` -- `greek_ppocrv5_keys.txt` - ## First run (lightweight corpus) ```bash diff --git a/docs/index.md b/docs/index.md index d696c8d..997d2d8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,21 +7,11 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Quickstart Recipes](quickstart.md) — common extraction/OCR flows in copy-paste form. 
- [Lightweight PDF Corpus](lightweight_corpus.md) — 20 one-page PDFs for smoke testing without Docling or GPUs. -## Understand the architecture -- [Architecture Overview](architecture/index.md) — the end-to-end staged model and why it exists. -- [Core Design Principles](architecture/core_design_principles.md) — the design constraints that shape the pipeline. -- [Docling Throughput and Batching](architecture/docling_throughput_and_batching.md) — how throughput and stability trade off. -- [Failure Recovery and Skiplist](architecture/docling_failure_recovery_and_skiplist.md) — how the pipeline survives problematic PDFs. -- [Greek Text Validation](architecture/greek_text_validation.md) — why extraction success is not enough for Greek corpora. -- [Metadata, Artifacts, and Run Diagnostics](architecture/metadata_artifacts_and_run_diagnostics.md) — how provenance and operational state are retained. -- [Artifact Layout and Stage Handoffs](architecture/artifact_layout_and_stage_handoffs.md) — how folders, filenames, and metadata glue the stages together. -- [Resumability, Recovery, and Retention](architecture/resumability_recovery_and_retention.md) — how the current design supports reruns and where storage pressure appears. - ## Learn the pipeline +- [Code Map](code_map.md) links the main documentation ideas to the classes and files that implement them. - [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. -- [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers Docling + RapidOCR usage. +- [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. - [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. -- [Stage Reference](stages/index.md) breaks down each pipeline stage as a contract. ## Configure and debug - [Configuration](configuration.md) lists all environment knobs. 
@@ -29,5 +19,5 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [AWS Job Distribution](aws_job_distribution.md) describes large-scale scheduling. ## Reference -- [Corpus API](api/corpus.md) details public methods and parameters. -- `docs/divio/` contains placeholder pages for the upcoming Divio restructuring—feel free to open PRs fleshing them out. +- [Corpus API](api/corpus.md) gives the compact contract view of the main public methods. +- [Legacy Corpus API Notes](api_corpus_tmp.md) remains available while the docs are being consolidated. diff --git a/docs/math_enrichment_runtime.md b/docs/math_enrichment_runtime.md index 21d8617..096209c 100644 --- a/docs/math_enrichment_runtime.md +++ b/docs/math_enrichment_runtime.md @@ -68,9 +68,8 @@ c.ocr(math_targets=targets, math_batch_size=4) ## OCR/Model Constraints (recap) -- ORT GPU only: uninstall `onnxruntime` CPU; use `onnxruntime-gpu`. -- RapidOCR keys: Docling 2.48.0 needs `Rec.rec_keys_path` patch (see README). -- Model discovery: set `GLOSSAPI_RAPIDOCR_ONNX_DIR` or package models under `glossapi/models/rapidocr/`. +- DeepSeek OCR runs in its own pinned runtime; set `GLOSSAPI_DEEPSEEK_PYTHON`, `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT`, and `GLOSSAPI_DEEPSEEK_MODEL_DIR`. +- Keep `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` and `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1`. - Optional Torch CUDA: needed for GPU layout/enrichment; see README for the CUDA wheels. ## Multi‑GPU diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index b1b8956..feb3283 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -8,10 +8,11 @@ file paths**, so no worker rescans directories. ```python from glossapi import Corpus c = Corpus('IN', 'OUT') -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', workers_per_device=2) ``` - Workers are bound using `CUDA_VISIBLE_DEVICES=` and run Docling on `cuda:0` relative to each worker. 
+- `workers_per_device` defaults to `1`; raise it only when benchmarking a strong GPU such as an A100. - Threads auto‑tune when `num_threads=None` (roughly `min(cpu_count, 2 * #GPUs)`). Override explicitly if needed. - The controller persists extraction progress in `download_results/download_results.parquet` after each reported batch, so interrupted runs can resume cleanly without ad-hoc checkpoint files. diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index 197bb0a..b013dd3 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -1,15 +1,14 @@ # GPU OCR and Math Enrichment -This document summarizes how GlossAPI uses the GPU for OCR and formula/code enrichment, how to run each phase efficiently, and where artifacts are written. +This document summarizes how GlossAPI uses the GPU for OCR remediation and formula/code enrichment, how to run each phase efficiently, and where artifacts are written. ## Overview -- Phase‑1 (Extract): PDF → Markdown via Docling; optional GPU OCR via RapidOCR (ONNXRuntime). Optionally emit JSON + formula index for Phase‑2. +- Phase‑1 (Extract): PDF → Markdown via Docling or the safe backend. Optionally emit JSON + formula index for Phase‑2. - Phase‑2 (Enrich): From Docling JSON, decode math/code on the GPU (CodeFormula) and re‑emit enriched Markdown. Backends -- `backend='rapidocr'` (default): Docling + RapidOCR; Phase‑2 math runs from Docling JSON. -- `backend='deepseek'`: DeepSeek‑OCR; equations are included inline in OCR output, so Phase‑2 math is not required and is treated as a no‑op. +- `backend='deepseek'`: DeepSeek-OCR-2; equations are included inline in OCR output, so Phase‑2 math is not required and is treated as a no‑op. Policy: never OCR and math on the same file - If a file needs OCR, GlossAPI runs OCR only (no Phase‑2 on that file in the same pass). 
@@ -18,24 +17,43 @@ Policy: never OCR and math on the same file ### Python API layout - DeepSeek entry point: `glossapi.ocr.deepseek.runner.run_for_files(...)` -- RapidOCR dispatcher: `glossapi.ocr.rapidocr.dispatch.run_via_extract(...)` - Math enrichment: `glossapi.ocr.math.enrich.enrich_from_docling_json(...)` - Utility helpers (Docling JSON / cleaning): `glossapi.ocr.utils.*` ## Prerequisites -- RapidOCR/Docling stack: `pip install '.[rapidocr]'` -- DeepSeek CLI stack (in a dedicated venv recommended): `pip install '.[deepseek]'` -- ONNXRuntime GPU installed (no CPU ORT): `onnxruntime-gpu==1.18.1` -- Torch CUDA installed: e.g., `torch==2.5.1+cu121` -- Packaged RapidOCR models/keys found under `glossapi/models/rapidocr/{onnx,keys}` or via `GLOSSAPI_RAPIDOCR_ONNX_DIR`. +- Main GlossAPI stack: `./dependency_setup/setup_glossapi.sh --mode docling` +- DeepSeek runtime: `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek` +- Torch CUDA installed in the DeepSeek env (the uv setup pins the tested stack). - Optional helpers for Phase‑2 JSON: `pypdfium2`, `zstandard`. +### Standard DeepSeek venv + +Use a dedicated OCR runtime and treat it as the source of truth for DeepSeek runs: + +```bash +./dependency_setup/setup_deepseek_uv.sh \ + --venv dependency_setup/.venvs/deepseek \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ + --run-tests --smoke-test +``` + +Recommended environment variables after setup: + +```bash +export GLOSSAPI_DEEPSEEK_ALLOW_CLI=1 +export GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 +export GLOSSAPI_DEEPSEEK_PYTHON="$PWD/dependency_setup/.venvs/deepseek/bin/python" +export GLOSSAPI_DEEPSEEK_MODEL_DIR="/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2" +``` + +The OCR runtime should not silently drift between ad hoc virtual environments during benchmarking. If a benchmark uses a different DeepSeek venv, treat the result as a different runtime stack. 
+ Verify GPU readiness before forcing OCR or math: ```bash python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())" # expects True, >=1 -python -c "import onnxruntime as ort; print(ort.get_available_providers())" # must include CUDAExecutionProvider ``` ## Running Phase‑1 (Extract) @@ -44,17 +62,14 @@ python -c "import onnxruntime as ort; print(ort.get_available_providers())" from glossapi import Corpus c = Corpus('IN','OUT') -# GPU OCR on PDFs; emit JSON + formula index for Phase‑2 +# Emit JSON + formula index for Phase‑2 c.extract( input_format='pdf', - accel_type='CUDA', # or use_gpus='multi' for multi‑GPU - force_ocr=True, # OCR always on for PDFs + accel_type='CUDA', emit_formula_index=True, # request json/<stem>.formula_index.jsonl alongside the default JSON ) ``` -When `force_ocr=True` (or when math/code enrichment is enabled), GlossAPI automatically switches to the Docling backend and aborts if CUDA‑enabled torch/ONNXRuntime providers are not available. - Outputs: - `markdown/<stem>.md` - `json/<stem>.docling.json(.zst)` and `json/<stem>.formula_index.jsonl` @@ -88,20 +103,64 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma # → runs OCR only for bad files; equations are included inline; Phase‑2 is skipped ``` -If you need Phase‑2 math on files that do not require OCR, use RapidOCR/Docling and math‑only (expects Docling JSON from Phase‑1): +If you need Phase‑2 math on files that do not require OCR, run `Corpus.ocr(mode='math_only', ...)` after a Docling extraction that emitted Docling JSON.
+ +### DeepSeek fast path + +The current recommended high-throughput DeepSeek configuration is: + +- `runtime_backend='vllm'` +- `ocr_profile='markdown_grounded'` +- `max_new_tokens=2048` as the standard default ceiling +- `repair_mode='auto'` to keep markdown as the primary output while selectively rerunning suspicious pages +- `scheduler='auto'` so multi-GPU vLLM runs resolve to exact-fill page-range batching +- `target_batch_pages=160` +- large `vllm_batch_size` chosen to keep `sec/page/GPU` at or below the best validated floor for the target hardware + +Example: ```python -c.ocr(backend='rapidocr', fix_bad=False, math_enhance=True, mode='math_only') -# → runs Phase‑2 on non‑OCR files only (requires Docling JSON) +c.ocr( + backend='deepseek', + fix_bad=True, + math_enhance=False, + runtime_backend='vllm', + ocr_profile='markdown_grounded', + max_new_tokens=2048, + vllm_batch_size=160, + gpu_memory_utilization=0.9, + repair_mode='auto', + scheduler='auto', + target_batch_pages=160, + use_gpus='multi', +) ``` +`repair_mode='auto'` runs the pipeline in distinct phases inside the vLLM runner: + +1. markdown first pass over all rendered pages +2. cheap per-page triage using output quality plus simple image density statistics +3. plain-text rerun bucket for garbage markdown pages +4. tiled markdown rerun bucket for short coverage failures + +This keeps the fast path batched while avoiding per-page sequential fallback overhead. 
+ +### What is now implemented + +- Empty-page skipping before OCR dispatch +- Streaming garbage early-stop during markdown generation +- Plain-text retry for pages that hit the garbage early-stop +- Multi-GPU exact-fill page-range scheduling for the DeepSeek runner +- Benchmark harness support for `whole_doc`, `fixed_shard`, and `exact_fill` +- Corpus API forwarding for the scheduler controls + ## Multi‑GPU Phase‑1 (extract): ```python -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', workers_per_device=2) ``` -Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. OCR uses ORT GPU under the same process. +Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. Phase‑2 (enrich): ```python @@ -118,9 +177,73 @@ Spawns math workers; each binds to its GPU using `CUDA_VISIBLE_DEVICES` and runs ## Performance & Tuning +### Validated benchmark floor + +The current non-regression metric is `sec/page/GPU`. 
+ +Validated on 2026-03-30: + +- Host: AWS `g7e.48xlarge` +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Render DPI: `144` +- GPU memory utilization: `0.9` +- Best large-batch single-GPU floor observed: `0.3109 sec/page/GPU` + +Production markdown+repair benchmark on the same host: + +- Corpus: `43` OA PDFs, `7,624` pages +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Repair mode: `auto` +- Max new tokens: `2048` +- GPUs: `8` +- Static sharding (`1` shard/GPU), validated rerun after classifier hardening: `558.88s` wall, `0.0733 sec/page` overall, `0.4912` to `0.5475 sec/page/GPU` +- Streaming admission (`stream_batch_pages=160`): `928.81s` wall, `0.1218 sec/page` overall, `0.5469` to `0.6856 sec/page/GPU` +- Peak VRAM in both runs stayed at about `88,953 MiB` per active GPU +- Static active-lane GPU utilization averaged about `65%` to `75%`; streaming active-lane utilization stayed similar while whole-run occupancy got worse because more lanes sat idle between batches + +Validated on 2026-03-31 after standardizing the DeepSeek runtime ceiling back to `2048` and restoring the persistent one-process-per-lane architecture: + +- Corpus: `43` OA PDFs, `7,624` pages +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Repair mode: `auto` +- Scheduler: `whole_doc` +- Max new tokens: `2048` +- GPUs: `8` +- Clean rebuilt whole-document rerun: about `541s` wall, `0.0710 sec/page` overall, and `0.3927` to `0.5000 sec/page/GPU` + +Interpretation: + +- The rebuilt stack is back near the validated March 30 throughput once the silent `8192` ceiling regression is removed. +- The remaining performance problem is not raw inference speed; it is whole-document tail imbalance, where one oversized PDF can keep a single GPU busy after the other lanes finish. +- Multi-GPU `exact_fill` must therefore be benchmarked only on the persistent lane-worker architecture. 
The earlier exact-fill regression was caused by spawning a fresh OCR CLI per batch, not by the scheduling idea itself. + +Decision: + +- Keep static sharding as the default large-run pipeline shape for now +- Do not enable streaming admission by default yet; on this benchmark it regressed badly versus static sharding +- Treat the earlier `0.3109 sec/page/GPU` result as the raw floor, and the static repaired-markdown result above as the current production-like baseline on this hardware +- Treat the 2026-03-31 clean whole-document rerun as the restored benchmark sanity check for the standardized `2048` ceiling on the rebuilt runtime + +Attention/runtime note: + +- The production fast path is `vllm`; logs on this stack show `flashinfer` autotuning plus CUDA graph capture +- Transformers remain the fallback path; prefer `flash_attention_2` there and do not optimize around `sdpa` + +That number is the floor to preserve or beat when tuning the full markdown pipeline. Faster raw runs that change the effective output mode or bypass repair logic do not replace it as the production baseline. + +Default policy note: + +- The standard DeepSeek OCR default is now `max_new_tokens=2048` for both the Transformers and vLLM runners. +- Leaving the flag unset must not silently expand to a larger ceiling such as `8192`. +- When comparing benchmark runs, treat a different token ceiling or a different DeepSeek venv as a different runtime/configuration. + - Batch sizes - - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula docling side throughput. + - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula throughput. - Phase‑2: `batch_size` / `math_batch_size` parameter (typ. 8–16) balances VRAM and speed. + - DeepSeek vLLM: push `vllm_batch_size` as high as the hardware allows while tracking `sec/page/GPU`; on the validated `g7e.48xlarge` path, larger batches continued improving throughput through `batch_size=160`. 
- Images scale for OCR: `GLOSSAPI_IMAGES_SCALE` (~1.1–1.25) can improve detection on thin glyphs. - CPU threads: cap `OMP_NUM_THREADS` / `MKL_NUM_THREADS` to avoid CPU oversubscription on multi‑GPU nodes. @@ -159,11 +282,7 @@ OUT/ ## Troubleshooting -- Missing CUDAExecutionProvider - - Ensure `onnxruntime-gpu` is installed and `onnxruntime` CPU is uninstalled. - Torch reports no CUDA - Check `nvidia-smi` and match Torch CUDA build to your driver. -- OCR is slow or falls back to CPU - - Confirm ORT providers include CUDAExecutionProvider and that `accel_type='CUDA'` is used. - Out of memory - Lower `batch_size` for Phase‑2, reduce `GLOSSAPI_IMAGES_SCALE`, or split inputs. diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md new file mode 100644 index 0000000..590d18b --- /dev/null +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -0,0 +1,496 @@ +# OpenArchives OCR Rollout Plan + +This document records the concrete execution plan for running DeepSeek OCR over the OpenArchives subset with `needs_ocr=True`, including how to recover or regenerate the routing state, how to shard work across AWS nodes, and how to merge results back into the canonical GlossAPI corpus. 
+ +## Implemented tooling + +The rollout is backed by concrete scripts in `src/glossapi/scripts/`: + +- `openarchives_ocr_enrich.py` + - reads the canonical OpenArchives parquet + - scans raw HF JSONL shards for the target docs + - extracts `page_count_source`, `pages_total_source`, and `pdf_url` + - writes a shard-ready enriched parquet for OCR deployment +- `openarchives_ocr_shards.py` + - reads the canonical parquet + - filters `needs_ocr=True` + - balances documents across `N` nodes by page count + - writes one shard manifest parquet per node + - writes a JSON summary with page totals and ETA +- `openarchives_ocr_merge.py` + - merges shard-level OCR metadata back into the canonical parquet by `filename` + +These scripts are intentionally document-level rather than page-fragment-level so merge stays simple and GlossAPI-compatible. + +## Executed result on 2026-03-31 + +The CPU fallback path has now been executed successfully on AWS: + +- CPU cleaner node: + - instance: `c7i.8xlarge` + - instance id: `i-0ccf5ab1a510b31d8` +- Full OA reevaluation fill: + - input rows: `179,845` + - missing `greek_badness_score` rows materialized and cleaned: `89,892` + - unique raw JSONL shards needed for the fill subset: `108` +- Filled routing result: + - `greek_badness_score` coverage: `179,845 / 179,845` + - `needs_ocr == true`: `45,547` +- Enriched OCR target manifest: + - OCR-target docs: `45,547` + - OCR-target pages: `3,292,392` + - raw JSONL shards needed for the full OCR target set: `218` +- Balanced 4-node shard result: + - `4` shard manifests + - `823,098` pages per node + - `11,386` or `11,387` docs per node +- ETA from validated `g7e.48xlarge` throughput: + - one node: `64.94h` + - four nodes: `16.23h` + +Published artifacts on Hugging Face dataset `glossAPI/openarchives.gr`: + +- `data/openarchives_ocr_completion/20260331/summary.json` +- `data/openarchives_ocr_completion/20260331/filled_document_level.parquet` +- 
`data/openarchives_ocr_completion/20260331/filled_document_quality.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/needs_ocr_enriched.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_00.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_01.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_02.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_03.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_summary.json` + +## Node runner contract + +Each OCR node should materialize one shard into its own GlossAPI corpus root and +run DeepSeek OCR through the standard `Corpus.ocr(...)` API, not through a +standalone benchmark wrapper. + +Stored runners: + +- `python -m glossapi.scripts.openarchives_ocr_run_node` +- `python -m glossapi.scripts.openarchives_download_freeze` + +The OCR node runner (`openarchives_ocr_run_node`) does four things in order: + +1. reads one shard parquet +2. downloads the shard PDFs into `downloads/` using their OA filenames +3. writes the shard metadata as canonical `download_results/download_results.parquet` +4. runs `Corpus.ocr(...)` with the validated DeepSeek settings + +The download-freeze runner is the matching download-only entrypoint: + +1. reads one OA manifest parquet +2. downloads the PDFs into `downloads/` using their OA filenames +3. writes canonical `download_results/download_results.parquet` +4. stops there, without starting OCR + +Download policy note: + +- OpenArchives download should be host-first, not collection-first.
+- GlossAPI now supports host-specific download policy overrides in the normal downloader path for: + - `downloader` + - `request_timeout` + - `ssl_verify` + - `ssl_cafile` + - `request_method` + - `sleep` + - `per_domain_concurrency` + - `domain_concurrency_floor` + - `domain_concurrency_ceiling` + - `skip_failed_after` + - `domain_cookies` +- That means the OA freeze-download phase can stay inside `Corpus.download(...)`; we do not need a separate downloader implementation. +- Stored OA policy sample: + - `samples/openarchives_download_policy.yml` +- Stored OA probe runner: + - `python -m glossapi.scripts.openarchives_download_probe` +- OA download runs should use `scheduler_mode=per_domain` together with `parallelize_by=base_domain`, + otherwise the host-level concurrency policy is mostly inert. +- Probe result on the CPU box: + - `dspace.lib.ntua.gr` succeeds cleanly once OA downloads use `scheduler_mode=per_domain` + and the host is throttled to a single in-flight request + - `ktisis.cut.ac.cy` succeeds with `ssl_verify=false` + - `repository.academyofathens.gr`, `repository.ihu.gr`, `pergamos.lib.uoa.gr`, + and `dione.lib.unipi.gr` behaved like standard hosts in the probe + - `ikee.lib.auth.gr` is not just a pre-ping false negative; direct PDF requests hit + real connection timeouts + - `olympias.lib.uoi.gr` is not just a pre-ping false negative either; direct PDF + requests reach the host but stall on response reads +- Operational recommendation: + - bulk-freeze the good hosts first + - keep `ikee.lib.auth.gr` and `olympias.lib.uoi.gr` in a dedicated slow-path download phase + so they do not dominate the main corpus freeze run + +Standard node command: + +```bash +PYTHONPATH=src /home/ubuntu/venvs/deepseek/bin/python -m glossapi.scripts.openarchives_ocr_run_node \ + --shard-parquet /data/openarchives/shards/openarchives_ocr_shard_node_00.parquet \ + --work-root /data/openarchives/node_00 \ + --heartbeat-path /data/openarchives/heartbeats/node_00.json \ + 
--instance-id "$INSTANCE_ID" \ + --node-id node-00 \ + --scheduler whole_doc \ + --runtime-backend vllm \ + --ocr-profile markdown_grounded \ + --render-dpi 144 \ + --max-new-tokens 2048 \ + --repair-mode auto \ + --gpu-memory-utilization 0.9 +``` + +Current rollout note: + +- use `scheduler=whole_doc` for the first production OA pass because that is the + last large-run configuration validated cleanly on the standardized stack +- keep `exact_fill` as the next benchmarking target, but do not silently switch + the production rollout to it until the same stack shows a non-regression or + improvement + +## Current validated baseline + +- Validated OCR node type: `g7e.48xlarge` +- Validated AMI: `ami-052266c3e21dff7db` +- AMI name: `Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 24.04) 20260320` +- Validated runtime stack on the OCR node: + - `torch 2.10.0+cu130` + - `vllm 0.18.0` + - `transformers 4.57.6` +- Standard DeepSeek settings: + - `runtime_backend='vllm'` + - `ocr_profile='markdown_grounded'` + - `max_new_tokens=2048` + - `repair_mode='auto'` + - `render_dpi=144` + - `gpu_memory_utilization=0.9` +- Restored clean benchmark on the stopped OCR box: + - `7,624` pages in about `541s` + - about `0.0710 sec/page` overall on one `8`-GPU node + - about `0.3927` to `0.5000 sec/page/GPU` +- Derived per-node throughput: + - about `14.08 pages/sec` + - about `50,700 pages/hour` + +## Current AWS capacity + +`us-east-1` service quotas currently allow: + +- `Running On-Demand G and VT instances = 768` +- `Running On-Demand Standard instances = 640` + +For the validated OCR node: + +- `g7e.48xlarge = 192 vCPU, 8 GPUs` + +So the current maximum concurrent validated OCR fleet is: + +- `floor(768 / 192) = 4` nodes +- total rollout capacity: `32 GPUs` + +## Phase 1: Recover or regenerate the canonical OCR routing state + +Goal: + +- produce one canonical `download_results/download_results.parquet` for the OpenArchives corpus root +- ensure it contains, at minimum: + - 
`filename` + - `needs_ocr` + - `greek_badness_score` + - `mojibake_badness_score` + - `ocr_success` + - `page_count` or `pages_total` + +Decision order: + +1. Check the stopped GPU OCR instance first. +2. If the full corpus parquet is not there, run a dedicated CPU cleaning pass. + +### 1A. Check the stopped OCR instance first + +Reason: + +- the NVMe volume persists across stop/start +- if the full OpenArchives cleaning pass was already run there, this is the fastest path + +Concrete steps: + +1. Start instance `i-0504a326a1fee541f`. +2. SSH in and search for the full OpenArchives corpus root and canonical parquet: + - `find /opt /data /home -name download_results.parquet` + - verify row count is the full OpenArchives set, not the `43`-document benchmark subset +3. Validate that the parquet has the required OCR routing columns listed above. +4. If found: + - copy the canonical parquet and any supporting cleaner outputs back to stable storage + - stage a copy on `home` + - upload the parquet artifact to the Hugging Face dataset repo as routing metadata + +Acceptance check: + +- row count matches the full OpenArchives working set +- `needs_ocr=True` count is available directly from the parquet +- page totals are available + +Current state on 2026-03-31: + +- checked OCR instance `i-0504a326a1fee541f` +- no `download_results.parquet` was found under `/opt`, `/data`, or `/home` +- therefore this path did not recover the canonical OpenArchives routing parquet +- the rollout should proceed with the CPU cleaning-pass fallback below + +### 1B. 
Fallback: regenerate the routing state on a CPU instance + +If the OCR box does not contain the full canonical parquet: + +- launch a dedicated CPU node for the cleaner pass +- recommended instance family: `c7i` or `r7i` +- recommended first choice: `c7i.8xlarge` with sufficient gp3 storage for the OpenArchives markdown/output root + +Reason: + +- `Corpus.clean()` is CPU-bound and does not need GPUs +- we only need one clean, reproducible routing pass + +Concrete steps: + +1. Launch one Ubuntu 24.04 CPU instance. +2. Clone `glossapi-development` at `development`. +3. Bootstrap the standard GlossAPI environment. +4. Mount or sync the full OpenArchives corpus root. +5. Run `Corpus.clean()` over the full markdown corpus. +6. Verify that `download_results/download_results.parquet` now exists and includes the required OCR routing columns. +7. Store the resulting parquet: + - on the corpus root + - on `home` + - in the Hugging Face dataset repo as routing metadata + +## Phase 2: Quantify the actual OCR workload + +Once the canonical parquet exists: + +1. Filter `needs_ocr == True` +2. Count: + - total documents + - total pages from `pages_total` or `page_count` +3. Also record: + - `greek_badness_score > 60` + - `mojibake_badness_score > 0.1` + - overlap between those conditions and `needs_ocr` + +This step defines the real production workload and the true ETA. + +## Phase 3: Shard across nodes + +Shard across nodes by document, not by page range. + +Reason: + +- cross-node merge stays trivial +- node-local GPU scheduling already exists in GlossAPI +- splitting one document across nodes adds complexity without clear benefit + +### Coordinator manifest + +Build one coordinator manifest from the canonical parquet with: + +- `filename` +- stable OpenArchives document id or canonical filename +- `pages_total` +- `needs_ocr` + +Then: + +1. keep only `needs_ocr=True` +2. greedily bin-pack documents across `N=4` nodes by page count +3. 
write one shard manifest parquet per node + +Each shard manifest should contain: + +- `filename` +- `pages_total` +- `node_id` +- `shard_id` +- original metadata keys needed for rejoin + +### Node-local execution + +Each node: + +1. loads only its shard manifest +2. runs GlossAPI OCR over that subset +3. keeps standard GlossAPI outputs only: + - `markdown/.md` + - `json/metrics/*.json` + - shard-local `download_results.parquet` + +Inside each node: + +- use the existing GlossAPI DeepSeek path +- let node-local scheduling handle GPU balance +- do not invent a separate OCR metadata format + +## Phase 4: Merge back into the canonical corpus + +Merge rules: + +1. Markdown: + - copy updated `markdown/.md` into the canonical corpus root +2. Metrics: + - copy `json/metrics/*.json` into the canonical corpus root +3. Metadata parquet: + - concatenate shard metadata + - upsert by canonical document id / filename into the master parquet + - preserve the standard GlossAPI contract: + - `needs_ocr` + - `ocr_success` + - `processing_stage` + - page and quality fields + +Recommended additional execution metadata: + +- `ocr_node_id` +- `ocr_shard_id` +- `ocr_started_at` +- `ocr_finished_at` +- `ocr_attempt_count` + +These fields are operational and should not replace the existing GlossAPI routing fields. + +## Phase 5: Standardize all OCR nodes + +All OCR nodes should use the exact same: + +- AMI +- bootstrap script +- DeepSeek venv setup +- model path +- runtime defaults + +Standard production recipe: + +- AMI: `ami-052266c3e21dff7db` +- instance type: `g7e.48xlarge` +- DeepSeek venv created by `dependency_setup/setup_deepseek_uv.sh` +- defaults: + - `runtime_backend='vllm'` + - `ocr_profile='markdown_grounded'` + - `max_new_tokens=2048` + - `repair_mode='auto'` + - `render_dpi=144` + - `gpu_memory_utilization=0.9` + +Do not allow per-node env drift during the rollout. 
+ +Cleaner/fallback venv decision: + +- CPU cleaning pass should use the standard GlossAPI environment from `development` +- OCR nodes should use the dedicated DeepSeek venv only +- do not mix the cleaner runtime and the OCR runtime on the same benchmark measurement path + +## Instance options + +Primary OCR choice: + +- `g7e.48xlarge` + - validated benchmarked path + - `192 vCPU` + - `8` RTX PRO Server 6000 GPUs + - current recommended production OCR node + +Secondary OCR options, only if we intentionally rebenchmark: + +- `g6e.48xlarge` + - `192 vCPU` + - `8` L40S GPUs +- `g5.48xlarge` + - `192 vCPU` + - `8` A10G GPUs +- `p5.48xlarge` + - technically available, but not the cost/default target for this rollout + +Cleaner node options: + +- first choice: `c7i.8xlarge` + - `32 vCPU` + - good CPU-bound cleaner candidate +- alternative: `r7i.8xlarge` + - `32 vCPU` + - use if the cleaner pass needs more memory headroom + +## Phase 6: ETA + +Validated throughput on one node: + +- about `50,700 pages/hour` + +With `4` nodes: + +- about `202,800 pages/hour` + +Exact ETA formula: + +- `ETA_hours = total_needs_ocr_pages / 202800` + +Reference scenarios: + +- `400,000` pages: about `1.97h` +- `600,000` pages: about `2.96h` +- `800,000` pages: about `3.95h` +- `1,000,000` pages: about `4.93h` + +Equivalent document scenarios for `40,000` documents: + +- average `10` pages/doc: about `1.97h` +- average `15` pages/doc: about `2.96h` +- average `20` pages/doc: about `3.95h` +- average `25` pages/doc: about `4.93h` + +The exact ETA should be recalculated once the canonical parquet gives the real total page count for `needs_ocr=True`. + +## Phase 7: Deployment and monitoring + +### Deployment + +1. Produce canonical parquet +2. Compute shard manifests +3. Stage manifests and source data +4. Launch `4` OCR nodes +5. Bootstrap the same OCR environment on all nodes +6. Run one shard per node +7. Collect outputs +8. 
Merge back into the canonical corpus + +### Monitoring + +Each node should write a heartbeat JSON at a fixed interval with: + +- `node_id` +- `docs_done` +- `pages_done` +- current file +- GPU utilization snapshot +- VRAM usage snapshot +- last successful write time +- error count + +The coordinator should watch: + +- stale heartbeat +- zero progress +- failed OCR process +- low GPU utilization for a sustained period + +### Recovery + +- rerun only failed shard manifests +- keep shard manifests immutable +- merge is idempotent by canonical document id / filename + +## Immediate next actions + +1. Start the stopped OCR instance and search for the full OpenArchives canonical parquet. +2. If found, validate and upload the routing parquet to stable storage and Hugging Face. +3. If not found, launch one CPU instance and run the full `Corpus.clean()` pass. +4. Compute exact `needs_ocr` doc/page totals from the canonical parquet. +5. Generate the `4` node shard manifests. +6. Launch the `4` OCR nodes and execute the distributed run. diff --git a/docs/pipeline.md b/docs/pipeline.md index cb11662..2c00354 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -6,44 +6,150 @@ GlossAPI is a staged pipeline. You can enter at any stage and use the same folde The `Corpus` class is the stable surface of the project. New functionality should plug into the existing phase mixins so callers can stick to the small set of entrypoints (`download()`, `extract()`, `clean()`, `ocr()`, `section()`, `annotate()`, `export/jsonl*()`). The expected usage pattern is a short script that chains these calls; avoid ad-hoc monkeypatches or bypassing the orchestrator when adding features so downstream users retain resumability and consistent artifacts. 
-## Stages - -- Download (optional): fetch source files from URLs → `downloads/` -- Extract (Phase‑1): parse PDFs to Markdown; optional GPU OCR → `markdown/.md` -- Clean: compute quality metrics and filter low‑quality items; decide which to OCR -- OCR (compat shim): re‑run extract on filtered items with `force_ocr=True` -- JSON + index (optional): emit `json/.docling.json(.zst)` and `json/.formula_index.jsonl` for Phase‑2 -- Enrich (Phase‑2): decode FORMULA/CODE from JSON on GPU → overwrite `markdown/.md`, write `json/.latex_map.jsonl` -- Section: produce `sections/sections_for_annotation.parquet` -- Annotate: classify sections; produce `classified_sections.parquet` and `fully_annotated_sections.parquet` +## Stage Map + +| Stage | Main code | Typical inputs | Important parameters | Main outputs | +| --- | --- | --- | --- | --- | +| Download | `Corpus.download()`, `GlossDownloader.download_files()` | metadata parquet with a URL column | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Extract (Phase‑1) | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.extract_path()` | files in `downloads/` or explicit paths | `input_format`, `phase1_backend`, `use_gpus`, `devices`, `workers_per_device`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | +| Clean | `Corpus.clean()` | `markdown/*.md` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, cleaner report parquet, parquet flags such as `filter` and `needs_ocr` | +| OCR retry | `Corpus.ocr(mode='ocr_bad'...)` | parquet rows flagged by cleaner | `mode`, `fix_bad`, `use_gpus`, `devices` | refreshed `markdown/.md`, refreshed cleaner/parquet metadata | +| Phase‑2 enrich | `Corpus.ocr(mode='math_only'...)`, `Corpus.formula_enrich_from_json()` | `json/.docling.json(.zst)` and optional formula index | `math_enhance`, `math_batch_size`, 
`math_dpi_base`, `targets_by_stem` | updated `markdown/.md`, `json/.latex_map.jsonl` | +| Section | `Corpus.section()`, `GlossSection.to_parquet()` | markdown selected by cleaner/parquet | no major public knobs | `sections/sections_for_annotation.parquet` | +| Annotate | `Corpus.annotate()`, `GlossSectionClassifier.classify_sections()`, `GlossSectionClassifier.fully_annotate()` | section parquet and classifier model | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` | +| Triage / export | `Corpus.triage_math()`, `Corpus.jsonl()` | metrics, parquet metadata, cleaned markdown | output path for JSONL | parquet routing hints, JSONL export | + +## Stage Contracts + +### 1. Download + +- Main code: `Corpus.download()` -> `GlossDownloader.download_files()` +- Purpose: read a metadata parquet, expand list/JSON URL cells, deduplicate URLs, download supported file types, and checkpoint progress. +- Typical inputs: + - a parquet file in `input_dir` or an explicit `input_parquet` + - a URL column such as `url` or `links_column` +- Main outputs: + - downloaded files in `downloads/` + - partial/final results in `download_results/` +- Read this next if you want the scheduler details: `gloss_downloader.py` + +### 2. Extract (Phase‑1) + +- Main code: `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.ensure_extractor()`, `GlossExtract.extract_path()` +- Purpose: convert source files to markdown and optional intermediate JSON artifacts. 
+- Typical inputs: + - files already present in `downloads/` + - or explicit `file_paths` +- Important parameters: + - `phase1_backend='safe'|'docling'|'auto'` + - `use_gpus='single'|'multi'` + - `workers_per_device` to fan out more than one extraction worker onto each GPU + - `export_doc_json` and `emit_formula_index` for later Phase‑2 work +- Operational note: + - `force_ocr` is deprecated and ignored in Phase‑1; use `Corpus.ocr(backend='deepseek')` after `clean()` for OCR remediation +- Main outputs: + - canonical markdown in `markdown/<stem>.md` + - optional Docling JSON and index artifacts in `json/` + - per-document and per-page metrics in `json/metrics/` + +### 3. Clean + +- Main code: `Corpus.clean()` +- Purpose: run the Rust cleaner, remove low-quality or noisy markdown, + and mark documents that may need OCR retry before moving on. +- Typical inputs: + - `markdown/*.md` + - metadata parquet, if available +- Important parameters: + - `threshold` and `drop_bad` + - `empty_char_threshold` and `empty_min_pages` for OCR fallback decisions +- Main outputs: + - cleaned markdown in `clean_markdown/` + - updated parquet metadata with quality and OCR-related flags +- Runtime/debug artifacts: + - `.processing_state.pkl` keeps track of progress so interrupted runs can resume + - `problematic_files/` keeps files that could not be cleaned successfully + - `timeout_files/` keeps files that exceeded the cleaning time limit + +### 4. OCR Retry and Phase‑2 Enrichment + +- Main code: `Corpus.ocr()` and `Corpus.formula_enrich_from_json()` +- Purpose: + - rerun OCR only for documents marked bad by the cleaner + - optionally decode formula/code regions from Docling JSON into markdown +- Modes: + - `ocr_bad` + - `math_only` + - `ocr_bad_then_math` +- Main outputs: + - refreshed `markdown/<stem>.md` + - `json/<stem>.latex_map.jsonl` when math/code enrichment runs + +### 5.
Section and Annotate + +- Main code: `Corpus.section()`, `GlossSection.to_parquet()`, `Corpus.annotate()`, `GlossSectionClassifier.*` +- Purpose: + - split markdown into sections suitable for classification + - classify sections and optionally expand coarse labels into full document structure +- Main outputs: + - `sections/sections_for_annotation.parquet` + - `classified_sections.parquet` + - `fully_annotated_sections.parquet` ## Artifact Layout -``` +The tree below shows the main folders and files GlossAPI can create under +the output directory. + +To make the layout easier to follow, artifacts are grouped by the role they +play in the pipeline: + +- canonical — the main outputs a stage is expected to produce, and the + files later stages usually depend on +- runtime — state files used to resume work safely if a run is interrupted +- debug — extra files kept around when something fails or needs a closer look + OUT/ -├── downloads/ -│ └── problematic_math/ -├── download_results/ -├── markdown/ +├── downloads/ (canonical) +│ └── problematic_math/ (debug) +├── download_results/ (canonical) +├── markdown/ (canonical) +│ └── <stem>.md +├── clean_markdown/ (canonical) │ └── <stem>.md -├── json/ +├── json/ (canonical) │ ├── <stem>.docling.json(.zst) │ ├── <stem>.formula_index.jsonl │ ├── <stem>.latex_map.jsonl │ ├── metrics/ -│ ├── <stem>.metrics.json -│ └── <stem>.per_page.metrics.json -│ └── problematic_math/ -├── sections/ +│ │ ├── <stem>.metrics.json +│ │ └── <stem>.per_page.metrics.json +│ └── problematic_math/ (debug) +├── sections/ (canonical) │ └── sections_for_annotation.parquet -├── classified_sections.parquet -└── fully_annotated_sections.parquet -``` +├── classified_sections.parquet (canonical) +├── fully_annotated_sections.parquet (canonical) +├── .processing_state.pkl (runtime) +├── problematic_files/ (debug) +└── timeout_files/ (debug) Notes: - Enriched Markdown replaces the plain Markdown (single canonical location). - Metrics lived under `markdown/` in earlier versions; they now live under `json/metrics/`. 
- When math enrichment cannot recover after the configured number of respawns, the corresponding PDFs and Docling artifacts are copied into the `problematic_math/` folders above and the stems are added to the fatal skip-list for later review. +- The same folder can act as both `input_dir` and `output_dir`; the pipeline creates its own subdirectories under that root. + +## Readability Shortcut + +If you only need the shortest path through the system: + +1. `Corpus.download()` if you start from URLs. +2. `Corpus.extract()` for Phase‑1 markdown. +3. `Corpus.clean()` to decide what needs OCR. +4. `Corpus.ocr()` for selective OCR and optional math/code enrichment. +5. `Corpus.section()` and `Corpus.annotate()` for structured outputs. + +If you need to jump from these ideas to the source files, see `code_map.md`. ## Exporting corpora diff --git a/docs/quickstart.md b/docs/quickstart.md index 4b10685..a498725 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -38,14 +38,13 @@ Workers report per-batch summaries and extraction progress is persisted into `download_results/download_results.parquet`, so you can restart multi-GPU runs without losing progress (no extra checkpoint files required). -## GPU OCR (opt-in) +## OCR remediation (opt-in) ```python from glossapi import Corpus c = Corpus('IN', 'OUT') -c.extract(input_format='pdf', accel_type='CUDA', force_ocr=True) -# or reuse multi-GPU batching -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.clean() +c.ocr(backend='deepseek', fix_bad=True, math_enhance=False) ``` ## Phase‑2 Math Enrichment (from JSON) @@ -76,7 +75,7 @@ c.section() # to parquet c.annotate() # classify/annotate sections ``` -See ocr_and_math_enhancement.md for GPU details, batch sizes, and artifact locations. +See `ocr_and_math_enhancement.md` for OCR runtime details, batch sizes, and artifact locations. 
### DeepSeek OCR @@ -89,12 +88,11 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma # → OCR only for bad files; math is included inline in the Markdown ``` -To avoid stub output, set `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` and `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`, and ensure the CLI bits are reachable: +To avoid stub output, set `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` and `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`, and ensure the runtime is reachable: ```bash -export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py -export GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek-venv/bin/python -export GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR -export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib +export GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek-venv/bin/python +export GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +export GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2 python -m glossapi.ocr.deepseek.preflight # optional: validates env without running OCR ``` diff --git a/docs/stages/download.md b/docs/stages/download.md index 99bc4f8..c70c551 100644 --- a/docs/stages/download.md +++ b/docs/stages/download.md @@ -8,6 +8,7 @@ The download stage acquires source documents from parquet-based URL metadata and - read URL-bearing parquet input - download files concurrently +- route known browser-gated sources through browser-assisted acquisition when configured - retain source metadata context - avoid refetching previously successful downloads - assign stable-enough local filenames for downstream processing @@ -42,10 +43,34 @@ Typical issues include: - transient network failures - rate limiting +- browser-gated file endpoints that return HTML challenge/interstitial pages +- viewer-only sources that should fail cleanly instead of being recorded as successful downloads - duplicate URLs - filename collisions - partially completed corpus 
fetches +## Browser-gated sources + +The downloader now distinguishes between: + +- direct file endpoints +- browser-gated file endpoints +- viewer-only/document-reader sources + +For browser-gated file endpoints: + +- `download_mode="auto"` probes with direct HTTP and escalates to a browser session when it detects a recoverable interstitial +- `download_mode="browser"` goes directly to the browser-assisted path +- `download_policy_file=...` can route known domains or URL patterns to the correct path without probing every file + +Browser-assisted mode is designed for retrievable file endpoints, not for sources that only expose page images, tiles, HTML/SVG re-rendering, or DRM-wrapped readers. + +## Session reuse + +Browser-assisted mode reuses cached browser session state per domain so multiple files from the same protected source do not need a fresh browser bootstrap every time. + +This keeps the browser as a session-bootstrap resource rather than the main downloader. + ## Contributor note Any change to filename assignment or result parquet structure can have downstream impact on: diff --git a/docs/stages/ocr.md b/docs/stages/ocr.md index 3bf8815..3a7e57c 100644 --- a/docs/stages/ocr.md +++ b/docs/stages/ocr.md @@ -25,12 +25,9 @@ The OCR stage repairs documents whose extracted text is considered unreliable, a ## Backend choices -The pipeline supports at least two OCR-oriented modes: - -- RapidOCR through the Docling path -- DeepSeek OCR for environments configured for that backend - -These are operationally different and should not be treated as interchangeable implementation details. +The supported OCR remediation backend is DeepSeek OCR. Docling remains part of +the surrounding extraction and layout flow, but OCR reruns themselves are now +expected to use the DeepSeek runtime. 
## Selection model diff --git a/docs/testing/compatibility_matrix.md b/docs/testing/compatibility_matrix.md new file mode 100644 index 0000000..29a5e15 --- /dev/null +++ b/docs/testing/compatibility_matrix.md @@ -0,0 +1,276 @@ +# Compatibility And Regression Matrix + +This document defines the release-validation matrix for the DeepSeek-only migration and subsequent Docling upgrades. + +It is not a generic unit-test list. It is a contract-based validation plan tied to the documented pipeline behavior. + +## Scope + +This matrix applies to changes in: + +- DeepSeek-only OCR migration +- no-stub enforcement +- installation simplification +- Docling dependency upgrades +- page-level reevaluation experiments + +## Validation policy + +Release validation for this migration must use: + +- real PDFs +- real Docling +- real DeepSeek +- real GPUs where the code path requires them +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` + +Developer-only tests may still use mocks or lightweight stubs for fast iteration, but those do not satisfy release gates for this migration. 
+ +## Test levels + +### L0: Install and import sanity + +Purpose: + +- prove the supported environments install cleanly and that removed components are truly gone + +Typical inputs: + +- fresh venv +- supported Python version + +### L1: Lightweight smoke corpus + +Purpose: + +- prove the baseline end-to-end flow still works on the small repo corpus + +Typical inputs: + +- `samples/lightweight_pdf_corpus/` + +### L2: Real-PDF contract validation + +Purpose: + +- prove the documented artifacts and metadata contracts still hold on real documents + +Typical inputs: + +- real PDFs from a representative sample + +### L3: Multi-GPU and operational recovery + +Purpose: + +- prove the runtime behavior remains correct under parallel execution and rerun conditions + +Typical inputs: + +- multiple real PDFs +- at least two visible GPUs + +### L4: Comparative corpus evaluation + +Purpose: + +- compare baseline and changed behavior on a real evaluation slice + +Typical inputs: + +- real corpus slice such as the Pergamos sample + +## Mandatory invariants + +The following must remain true unless a change explicitly revises the contract and updates the docs: + +- canonical Markdown is written to `markdown/.md` +- Docling JSON artifacts are emitted when requested +- cleaner output still drives `needs_ocr` +- OCR remains selective rather than defaulting to all documents +- metadata parquet remains the durable operational record +- reruns skip completed work unless forced +- skiplist semantics remain explicit and stable +- no production path silently falls back to stub OCR + +## Release-gate matrix + +| ID | Level | Contract | Input | Run | Pass criteria | Negative assertions | +| --- | --- | --- | --- | --- | --- | --- | +| `ENV-001` | L0 | Python and packaging | Fresh environment | install supported profile(s) | install completes on supported Python floor | no reference to removed legacy OCR install modes | +| `ENV-002` | L0 | Dependency simplification | Fresh environment | import 
`glossapi`, `glossapi.ocr.deepseek`, extract-path modules | imports succeed | no dead imports from removed OCR integrations | +| `EXT-001` | L1 | Safe Phase-1 extraction | lightweight corpus | `Corpus.extract(input_format="pdf")` | canonical Markdown produced | extraction must not depend on OCR extras | +| `EXT-002` | L2 | Docling Phase-1 extraction | real PDFs | `Corpus.extract(..., phase1_backend="docling", export_doc_json=True)` | Markdown, Docling JSON, metrics written to documented locations | artifact layout must not drift | +| `CLN-001` | L1/L2 | Cleaner metadata contract | extracted docs | `clean(drop_bad=False)` | metadata parquet updated with routing-relevant fields | no collapse of `needs_ocr` behavior | +| `OCR-001` | L2 | DeepSeek-only remediation | docs with `needs_ocr=True` | `ocr(backend="deepseek", fix_bad=True)` | recovered docs updated, metadata marks `ocr_success=True` | no stub output, no silent success | +| `OCR-002` | L2 | No-stub enforcement | broken/missing DeepSeek runtime | run OCR with `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` | run fails explicitly | failure must not produce placeholder success artifacts | +| `MTH-001` | L2 | Formula/code enrichment compatibility | math-heavy real PDF | Docling extract plus Phase-2 enrichment | enriched outputs and metadata remain coherent | no schema drift breaking enrichment | +| `SEC-001` | L2 | Sectioning contract | usable real docs | `section()` | `sections/sections_for_annotation.parquet` produced | no empty-output regression caused by upstream changes | +| `ANN-001` | L2 | Annotation contract | section parquet | `annotate()` | classified outputs produced | model integration must not break on changed upstream text/layout | +| `EXP-001` | L2 | Export contract | processed docs | `jsonl()` / `jsonl_sharded()` | JSONL and metadata outputs match documented layout | no dropped metadata fields without explicit design change | +| `RES-001` | L3 | Resumability | interrupted or partial run | rerun with defaults | 
completed items skipped correctly | no duplicate reprocessing by default | +| `RES-002` | L3 | Force/reprocess semantics | prior successful run | rerun with force/reprocess flag | selected items are reprocessed | no stale completion flags blocking intended rerun | +| `SKP-001` | L3 | Skiplist semantics | run with known problematic items | extract/OCR rerun | skiplist excludes intended stems only | no hidden filtering of healthy items | +| `GPU-001` | L3 | Multi-GPU OCR | real PDF slice on 2 GPUs | DeepSeek OCR in parallel | work is distributed and completes per GPU | no worker success masking failures | +| `CMP-001` | L4 | Baseline quality comparison | Pergamos sample slice | compare pre/post change outputs | no material regression in artifact completeness and downstream usability | runtime improvement alone does not justify quality loss | +| `CMP-002` | L4 | Whole-text vs page-level experiment | long PDFs | compare baseline branch vs page-level branch | quality/runtime tradeoff explicitly measured | experimental branch does not replace baseline without evidence | + +## Detailed test groups + +### Install and runtime compatibility + +What to prove: + +- supported environment installs cleanly +- unsupported/removed OCR components are not required +- Python floor matches actual upstream dependencies + +Critical checks: + +- packaging metadata uses a supported Python minimum +- setup docs expose only supported install paths +- removal of the old OCR integration does not leave dead GlossAPI imports or entrypoints + +## Extraction contract + +What to prove: + +- Phase-1 still produces canonical Markdown +- Docling extraction still produces JSON artifacts when requested +- metrics continue to be written where downstream stages expect them + +Artifacts to check: + +- `markdown/.md` +- `json/.docling.json(.zst)` +- `json/.formula_index.jsonl` when requested +- `json/metrics/.metrics.json` +- `json/metrics/.per_page.metrics.json` + +## Cleaning and Greek-quality routing + 
+What to prove: + +- cleaner still computes routing decisions required for selective OCR +- Greek-text validation remains first-class rather than incidental cleanup + +Fields to check in metadata parquet: + +- `needs_ocr` +- `filter` +- Greek-quality and badness-related fields currently emitted by the cleaner + +## DeepSeek OCR contract + +What to prove: + +- DeepSeek is the only OCR remediation backend +- no-stub enforcement is real +- recovered documents update metadata correctly + +Required environment behavior: + +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` +- real model weights present +- real CLI/runtime path present + +Negative checks: + +- no markdown contains placeholder stub markers +- no OCR pass succeeds after a DeepSeek CLI failure unless real output exists +- no removed OCR backend is referenced during final validation + +## Formula and code enrichment + +What to prove: + +- if retained, enrichment still works with the upgraded Docling stack +- if later removed, the removal is justified by evaluation rather than convenience + +Checks: + +- enriched Markdown is generated where expected +- `json/.latex_map.jsonl` remains coherent when enrichment is enabled +- metadata updates for math enrichment still work + +## Section, annotate, and export contracts + +What to prove: + +- downstream stages still consume the extraction outputs +- output layout and metadata structure remain compatible with the documented pipeline + +Artifacts to check: + +- `sections/sections_for_annotation.parquet` +- `classified_sections.parquet` +- `fully_annotated_sections.parquet` +- exported JSONL shards and related metadata + +## Resumability and operational recovery + +What to prove: + +- reruns still honor completion state +- skiplist semantics remain intact +- multi-worker failures remain visible and recoverable + +Checks: + +- default rerun skips completed items +- explicit force/reprocess reruns the intended items +- problematic stems are persisted and not silently lost + +## 
Comparative evaluation set + +Suggested real-world slice: + +- lightweight corpus for smoke validation +- representative real PDFs spanning: + - short documents + - medium documents + - long documents + - structure-rich documents + - math-heavy documents where applicable + +For current local evaluation work, a Pergamos sample manifest has been prepared outside the repo and can be used as the L3/L4 real-PDF slice. + +## Suggested release sequence + +For the planned migration, run gates in this order: + +1. `ENV-*` +2. `EXT-*` +3. `CLN-*` +4. `OCR-*` +5. `MTH-*` +6. `SEC-*`, `ANN-*`, `EXP-*` +7. `RES-*`, `SKP-*`, `GPU-*` +8. `CMP-*` + +This keeps low-level compatibility failures from being confused with downstream quality regressions. + +## Exit criteria per stage + +### Stage 1 exit criteria + +- DeepSeek-only OCR path works on real PDFs +- no-stub enforcement verified +- no supported GlossAPI OCR backend remains besides DeepSeek + +### Stage 2 exit criteria + +- install paths reduced to supported environments +- packaging/docs no longer reference removed OCR components + +### Stage 3 exit criteria + +- upgraded Docling passes `EXT-*`, `MTH-*`, `SEC-*`, `ANN-*`, and `EXP-*` + +### Stage 4 exit criteria + +- retained or removed Docling capabilities are justified by evaluation evidence + +### Stage 5 exit criteria + +- page-level branch is compared against the stabilized baseline before any adoption decision diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 6691407..24cc470 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -2,19 +2,15 @@ ## OCR runs on CPU -- Verify ONNXRuntime GPU: `python -c "import onnxruntime as ort; print(ort.get_available_providers())"` — must include `CUDAExecutionProvider`. -- Ensure CPU ORT wheel is not installed: `pip uninstall -y onnxruntime`. -- Make sure you pass `accel_type='CUDA'` (or `use_gpus='multi'`). 
+- Verify Torch CUDA: `python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"`. +- Make sure the DeepSeek runtime is the one configured in `GLOSSAPI_DEEPSEEK_PYTHON`. +- Run `python -m glossapi.ocr.deepseek.preflight` in the DeepSeek env before large OCR jobs. ## Torch doesn’t see the GPU - Check `nvidia-smi` and driver installation. - Match Torch CUDA build to your driver; see getting_started.md for the recommended wheel. -## RapidOCR font download failure - -- The first OCR call might download a visualization font. Ensure egress is allowed; the file is cached afterwards. - ## Out of memory - Lower Phase‑2 `batch_size` (e.g., 8) and reduce inline `GLOSSAPI_FORMULA_BATCH`. diff --git a/install_glossapi.py b/install_glossapi.py new file mode 100644 index 0000000..ef7a7c9 --- /dev/null +++ b/install_glossapi.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import sys +from pathlib import Path + + +def _bootstrap_repo_src() -> None: + repo_root = Path(__file__).resolve().parent + src_dir = repo_root / "src" + src_str = str(src_dir) + if src_str not in sys.path: + sys.path.insert(0, src_str) + + +def main() -> int: + _bootstrap_repo_src() + from glossapi.scripts.install_glossapi import main as _main + + return int(_main()) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/mkdocs.yml b/mkdocs.yml index ba13512..43b70fa 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ site_name: GlossAPI -site_description: Academic document processing pipeline (Docling + RapidOCR + Rust) +site_description: Academic document processing pipeline (Docling + DeepSeek + Rust) repo_url: https://github.com/eellak/glossAPI theme: name: material @@ -22,6 +22,7 @@ nav: - Metadata, Artifacts, and Run Diagnostics: architecture/metadata_artifacts_and_run_diagnostics.md - Artifact Layout and Stage Handoffs: architecture/artifact_layout_and_stage_handoffs.md - Resumability, Recovery, and Retention: 
architecture/resumability_recovery_and_retention.md + - DeepSeek-Only Upgrade Roadmap: architecture/deepseek_only_upgrade_roadmap.md - Pipeline: - Pipeline Overview: pipeline.md - OCR & Math Enrichment: ocr_and_math_enhancement.md @@ -39,15 +40,12 @@ nav: - Configuration: configuration.md - AWS Job Distribution: aws_job_distribution.md - Troubleshooting: troubleshooting.md + - Compatibility And Regression Matrix: testing/compatibility_matrix.md - Reference: + - Code Map: code_map.md - Corpus API: api/corpus.md + - Legacy Corpus API Notes: api_corpus_tmp.md - Math Enrichment Runtime: math_enrichment_runtime.md - - Divio Skeleton: - - Overview: divio/overview.md - - Tutorials: divio/tutorials.md - - How-to Guides: divio/how_to_guides.md - - Reference: divio/reference.md - - Explanation: divio/explanation.md docs_dir: docs markdown_extensions: - admonition diff --git a/pyproject.toml b/pyproject.toml index 3d0d5fa..a296c9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,11 +10,11 @@ authors = [ {name = "GlossAPI Team", email = "glossapi.team@eellak.gr"} ] readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.10" dependencies = [ # Core pipeline deps "pandas>=1.3.0", - "numpy<2", # ORT+RapidOCR best compatibility + "numpy>=1.26,<3", "scikit-learn==1.6.1", "joblib>=1.0.0", "dask>=2022.1.0", @@ -37,28 +37,31 @@ classifiers = [ ] [project.optional-dependencies] -# Docling + RapidOCR ONNX stack (kept optional to preserve import-light installs) -rapidocr = [ - "docling==2.48.0", - # Use RapidOCR core package; avoid rapidocr_onnxruntime to prevent pip - # from auto-installing the CPU-only 'onnxruntime' wheel. 
- "rapidocr>=3.3.0", - "onnxruntime-gpu==1.18.1", +# Browser automation fallback for browser-gated file endpoints +browser = [ + "playwright>=1.52,<2", +] +# Docling extraction/layout stack +docling = [ + "docling==2.81.0", ] # Optional CUDA layout acceleration (Docling) cuda = [ "torch==2.5.1", "torchvision==0.20.1", ] -# DeepSeek OCR backend extras (CUDA 12.1 build of vLLM). Torch is not pinned here -# because users should install the CUDA wheel from the PyTorch index -# (see docs: installing torch==2.5.1+cu121 via extra index URL). +# DeepSeek OCR backend extras (Torch should be installed from the PyTorch index). deepseek = [ - "vllm>=0.11.0", - "transformers>=4.45,<5", + "vllm==0.18.0", + "transformers==4.57.6", + "tokenizers==0.22.2", "accelerate>=1.2.1,<2", "pymupdf==1.24.10", - "Pillow==10.4.0", + "Pillow==12.1.1", + "img2pdf>=0.5.1", + "einops", + "easydict", + "addict", ] docs = [ "mkdocs>=1.5", @@ -78,6 +81,5 @@ glossapi = ["models/**/*"] [tool.pytest.ini_options] markers = [ - "rapidocr: requires the RapidOCR/Docling execution stack", "deepseek: exercises the DeepSeek OCR pipeline", ] diff --git a/requirements.txt b/requirements.txt index 95f4678..32b555c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -### GlossAPI runtime requirements (aligned with repro_rapidocr_onnx) +### GlossAPI runtime requirements # Core pipeline deps pandas>=1.3.0 -numpy<2 +numpy>=1.26,<3 python-dateutil>=2.8.2 pytz>=2021.1 scikit-learn==1.6.1 @@ -15,17 +15,12 @@ ftfy>=6.0.0 tenacity>=8.0.0 tqdm>=4.67.0 -# Docling + RapidOCR ONNX stack -docling==2.48.0 -# Prefer RapidOCR core package; it works with the GPU ORT wheel without pulling -# the CPU-only 'onnxruntime' dependency. 
-rapidocr>=3.3.0 -onnxruntime-gpu==1.18.1 +# Docling extraction/layout stack +docling==2.81.0 pyyaml>=6.0 # Enrichment & JSON compression (required for Phase-2 math/code and JSON zstd) pypdfium2>=4.0.0 zstandard>=0.22.0 -# Optional: install Torch CUDA for GPU layout (not required for OCR) -# pip install --index-url https://download.pytorch.org/whl/cu121 torch==2.5.1 torchvision==0.20.1 +# Optional: install Torch CUDA for GPU-backed Docling layout / enrichment diff --git a/samples/openarchives_download_policy.yml b/samples/openarchives_download_policy.yml new file mode 100644 index 0000000..8e1091e --- /dev/null +++ b/samples/openarchives_download_policy.yml @@ -0,0 +1,92 @@ +default: + downloader: standard + request_timeout: 60 + ssl_verify: true + per_domain_concurrency: 8 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 12 + skip_failed_after: 3 + sleep: 0.25 + +rules: + - match: + domains: [ikee.lib.auth.gr] + downloader: standard + request_timeout: 180 + per_domain_concurrency: 1 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 1 + skip_failed_after: 5 + sleep: 1.5 + + - match: + domains: [dspace.lib.ntua.gr] + downloader: standard + request_timeout: 120 + per_domain_concurrency: 1 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 1 + skip_failed_after: 4 + sleep: 1.0 + + - match: + domains: [olympias.lib.uoi.gr] + downloader: standard + request_timeout: 180 + ssl_verify: false + per_domain_concurrency: 1 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 1 + skip_failed_after: 4 + sleep: 1.0 + + - match: + domains: [ktisis.cut.ac.cy] + downloader: standard + request_timeout: 90 + ssl_verify: false + per_domain_concurrency: 2 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 2 + skip_failed_after: 4 + sleep: 0.5 + + - match: + domains: [repository.academyofathens.gr] + downloader: standard + request_timeout: 45 + per_domain_concurrency: 16 + domain_concurrency_floor: 2 + domain_concurrency_ceiling: 16 + 
skip_failed_after: 3 + sleep: 0.1 + + - match: + domains: + - dione.lib.unipi.gr + - pergamos.lib.uoa.gr + - hellanicus.lib.aegean.gr + - dias.library.tuc.gr + downloader: standard + request_timeout: 60 + per_domain_concurrency: 12 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 12 + skip_failed_after: 3 + sleep: 0.2 + + - match: + domains: + - repository.ihu.gr + - dlib.statistics.gr + - apothesis.eap.gr + - repository.edulll.gr + - dspace.lib.uom.gr + - dspace.aua.gr + downloader: standard + request_timeout: 75 + per_domain_concurrency: 6 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 8 + skip_failed_after: 4 + sleep: 0.25 diff --git a/src/glossapi/__init__.py b/src/glossapi/__init__.py index 4539ead..14f0c31 100644 --- a/src/glossapi/__init__.py +++ b/src/glossapi/__init__.py @@ -1,54 +1,7 @@ -""" -GlossAPI Library - -A library for processing academic texts in Greek and other languages: -- Extracting content from PDFs and other formats with Docling -- Robust batch processing with error isolation and automatic resumption -- Clustering documents based on extraction quality -- Extracting and cleaning academic sections -- Classifying sections using machine learning - -This is an open source project that provides tools for linguistic annotations -and text processing, with a special focus on the Greek language. -""" +"""GlossAPI library.""" from __future__ import annotations -import os - -# Keep Docling/RapidOCR bootstrap optional and import‑light by default. -# If the environment requests skipping (common in tests or minimal envs), -# or if Docling is not installed, we avoid importing heavy dependencies here. -_SKIP_DOCLING_BOOT = os.environ.get("GLOSSAPI_SKIP_DOCLING_BOOT") == "1" - -def _attempt_patch_docling() -> bool: - if _SKIP_DOCLING_BOOT: - return False - try: - # Import inside the function to avoid pulling Docling when unused or missing. 
- from .ocr.rapidocr.safe import patch_docling_rapidocr # type: ignore - - try: - return bool(patch_docling_rapidocr()) - except Exception: - # Swallow any runtime error to keep top‑level import light/safe. - return False - except Exception: - # Docling (or its transitive deps) not available – keep going. - return False - - -def patch_docling_rapidocr() -> bool: - """Best‑effort registration of the SafeRapidOcrModel. - - Returns True when the patch was applied; False when unavailable or skipped. - Safe to call multiple times. - """ - return _attempt_patch_docling() - -# Attempt the patch once at import time, but never fail import if it does not apply. -_ = _attempt_patch_docling() - __all__ = [ 'GlossSection', 'GlossSectionClassifier', @@ -56,7 +9,7 @@ def patch_docling_rapidocr() -> bool: 'Sampler', 'Section', 'GlossDownloader', - 'patch_docling_rapidocr', + 'BrowserGlossDownloader', ] def __getattr__(name: str): @@ -79,9 +32,11 @@ def __getattr__(name: str): if name == 'GlossDownloader': from .gloss_downloader import GlossDownloader # type: ignore return GlossDownloader + if name == 'BrowserGlossDownloader': + from .gloss_browser_downloader import BrowserGlossDownloader # type: ignore + return BrowserGlossDownloader raise AttributeError(name) -# Derive version dynamically from installed package metadata if possible try: from importlib.metadata import version as _pkg_version __version__: str = _pkg_version(__name__) diff --git a/src/glossapi/_pipeline.py b/src/glossapi/_pipeline.py index 73e5ecc..1909b60 100644 --- a/src/glossapi/_pipeline.py +++ b/src/glossapi/_pipeline.py @@ -1,7 +1,7 @@ """Backward-compatible adapter. -Docling pipeline builders moved to `glossapi.ocr.rapidocr.pipeline`. +Docling pipeline builders moved to `glossapi.ocr.docling.pipeline`. This module re-exports the public API to preserve legacy imports. 
""" -from .ocr.rapidocr.pipeline import * # noqa: F401,F403 +from .ocr.docling.pipeline import * # noqa: F401,F403 diff --git a/src/glossapi/corpus/corpus_orchestrator.py b/src/glossapi/corpus/corpus_orchestrator.py index dd2fad6..7f254f1 100644 --- a/src/glossapi/corpus/corpus_orchestrator.py +++ b/src/glossapi/corpus/corpus_orchestrator.py @@ -350,6 +350,8 @@ def _load_metadata(self) -> None: # Top-level worker function for multi-GPU extraction (picklable by multiprocessing) def gpu_extract_worker_queue( device_id: int, + worker_slot: int, + worker_key: str, in_dir: str, out_dir: str, work_q, # multiprocessing Queue of filename strings @@ -392,12 +394,13 @@ def _ensure_thread_caps(): _ensure_thread_caps() _status_proxy = status_map - _marker_path = _Path(marker_dir).expanduser() / f"gpu{device_id}.current" if marker_dir else None + _worker_label = worker_key or f"gpu{device_id}-w{worker_slot}" + _marker_path = _Path(marker_dir).expanduser() / f"{_worker_label}.current" if marker_dir else None def _update_current(batch_items: List[str]) -> None: if _status_proxy is not None: try: - _status_proxy[device_id] = list(batch_items) + _status_proxy[_worker_label] = list(batch_items) except Exception: pass if _marker_path is not None: @@ -409,7 +412,7 @@ def _update_current(batch_items: List[str]) -> None: def _clear_current() -> None: if _status_proxy is not None: try: - _status_proxy.pop(device_id, None) + _status_proxy.pop(_worker_label, None) except Exception: pass if _marker_path is not None: @@ -423,7 +426,7 @@ def _clear_current() -> None: if _log_dir: _log_path = _Path(_log_dir).expanduser() _log_path.mkdir(parents=True, exist_ok=True) - _worker_log_file = _log_path / f"gpu{device_id}_{_os.getpid()}.log" + _worker_log_file = _log_path / f"{_worker_label}_{_os.getpid()}.log" _worker_log_handle = open(_worker_log_file, "a", encoding="utf-8", buffering=1) _sys.stdout = _worker_log_handle _sys.stderr = _worker_log_handle @@ -458,9 +461,13 @@ def _clear_current() -> 
None: except Exception: _phys = "" try: - print(f"[GPU{device_id}] bound: CUDA_VISIBLE_DEVICES={_os.environ.get('CUDA_VISIBLE_DEVICES','')} pid={_os.getpid()} torch={_torch_name} ORT={_ort_prov}") + print( + f"[GPU{device_id}/W{worker_slot}] bound: " + f"CUDA_VISIBLE_DEVICES={_os.environ.get('CUDA_VISIBLE_DEVICES','')} " + f"pid={_os.getpid()} torch={_torch_name} ORT={_ort_prov}" + ) if _phys: - print(f"[GPU{device_id}] physical: {_phys}") + print(f"[GPU{device_id}/W{worker_slot}] physical: {_phys}") except Exception: pass except Exception: @@ -475,13 +482,15 @@ def _clear_current() -> None: _ensure_thread_caps() from glossapi import Corpus as _Corpus # type: ignore except Exception as _e: - print(f"[GPU{device_id}] Cannot import glossapi in worker: {_e}") + print(f"[{_worker_label}] Cannot import glossapi in worker: {_e}") if result_q is not None: try: result_q.put( { "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": 1, "pid": _os.getpid(), "error": str(_e), @@ -507,14 +516,16 @@ def _clear_current() -> None: phase1_backend=backend, ) except Exception as _e: - msg = f"[GPU{device_id}] Prime failed: {_e}" + msg = f"[{_worker_label}] Prime failed: {_e}" print(msg) if result_q is not None: try: result_q.put( { "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": 1, "pid": _os.getpid(), "error": str(_e), @@ -534,7 +545,9 @@ def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [str(x) for x in ok_list], "problematic": [str(x) for x in bad_list], "pid": _os.getpid(), @@ -553,15 +566,12 @@ def _report_batch(ok_list, bad_list): _batch_env = int(str(_os.environ.get("GLOSSAPI_GPU_BATCH_SIZE", "")).strip() or 0) except Exception: _batch_env = 0 - default_batch = 5 if not 
force else 1 + default_batch = 5 try: extractor = getattr(c, "extractor", None) if extractor is not None: configured = int(getattr(extractor, "max_batch_files", default_batch)) - if force: - default_batch = 1 - else: - default_batch = max(1, configured) + default_batch = max(1, configured) except Exception: pass BATCH_SIZE = max(1, _batch_env) if _batch_env else max(1, default_batch) @@ -605,7 +615,9 @@ def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [], "problematic": list(batch), "pid": _os.getpid(), @@ -653,7 +665,9 @@ def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [], "problematic": list(batch), "pid": _os.getpid(), @@ -667,7 +681,7 @@ def _report_batch(ok_list, bad_list): # Occasional heartbeat if _time.time() - last_progress > 30: try: - print(f"[GPU{device_id}] processed ~{processed} files…") + print(f"[{_worker_label}] processed ~{processed} files...") except Exception: pass last_progress = _time.time() @@ -692,7 +706,9 @@ def _report_batch(ok_list, bad_list): try: result_q.put({ "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": exit_code, "pid": _os.getpid(), }) diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index abdaa5e..e5a4329 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -346,6 +346,8 @@ def finalize(self) -> None: stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, + encoding="utf-8", + errors="replace", bufsize=1, ) try: diff --git a/src/glossapi/corpus/phase_download.py b/src/glossapi/corpus/phase_download.py index 38179fd..c543076 100644 --- a/src/glossapi/corpus/phase_download.py +++ 
b/src/glossapi/corpus/phase_download.py @@ -19,6 +19,7 @@ import pandas as pd from .._naming import canonical_stem +from ..gloss_browser_downloader import BrowserGlossDownloader from ..gloss_downloader import GlossDownloader # Avoid importing section/classifier here; download phase does not use them. from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path @@ -212,6 +213,22 @@ def _looks_like_list(s: str) -> bool: # Initialize downloader configuration (kwargs take precedence) dl_cfg = dict(self.downloader_config) dl_cfg.update(kwargs) + browser_mode = dl_cfg.pop('browser_mode', None) + if browser_mode is not None and 'download_mode' not in dl_cfg: + dl_cfg['download_mode'] = 'browser' if browser_mode else 'standard' + download_mode = str(dl_cfg.pop('download_mode', 'standard')).strip().lower() + policy_requested = bool(dl_cfg.get('download_policy_file') or dl_cfg.get('download_policy')) + if download_mode in {'standard', 'default', 'http'} and not policy_requested: + downloader_cls = GlossDownloader + default_download_route = 'standard' + elif download_mode in {'browser', 'browser_protected'} or policy_requested: + downloader_cls = BrowserGlossDownloader + default_download_route = 'browser' if download_mode in {'browser', 'browser_protected'} else 'standard' + elif download_mode in {'auto', 'browser_fallback'}: + downloader_cls = BrowserGlossDownloader + default_download_route = 'auto' + else: + raise ValueError(f"Unsupported download_mode: {download_mode}") # Allow caller to override which column holds links if links_column: url_column = links_column @@ -232,14 +249,18 @@ def _looks_like_list(s: str) -> bool: except Exception: pass - downloader = GlossDownloader( - url_column=url_column, - output_dir=str(self.output_dir), - log_level=self.logger.level, - verbose=verbose if verbose is not None else self.verbose, + downloader_kwargs = { + "url_column": url_column, + "output_dir": str(self.output_dir), + "log_level": self.logger.level, + "verbose": 
verbose if verbose is not None else self.verbose, **{k: v for k, v in dl_cfg.items() if k not in {'input_parquet'}}, - _used_filename_bases=used_bases - ) + "_used_filename_bases": used_bases, + } + if downloader_cls is BrowserGlossDownloader: + downloader_kwargs["default_download_route"] = default_download_route + + downloader = downloader_cls(**downloader_kwargs) # Download files self.logger.info(f"Downloading files from URLs in {input_parquet}...") diff --git a/src/glossapi/corpus/phase_export.py b/src/glossapi/corpus/phase_export.py index 26a6a82..4bcc6a8 100644 --- a/src/glossapi/corpus/phase_export.py +++ b/src/glossapi/corpus/phase_export.py @@ -471,8 +471,6 @@ def _normalize_value(value: Any) -> Any: chunk_paths: List[Path] = entry.get("chunk_paths", []) or [] base_path: Optional[Path] = entry.get("base_path") representative_path: Optional[Path] = base_path - if representative_path is None and chunk_paths: - representative_path = sorted(chunk_paths, key=_chunk_sort_key)[0] base_metadata = metadata_by_stem.get(stem) chunk_metadata = metadata_chunks_by_stem.get(stem, []) if base_metadata is None and not chunk_metadata: @@ -480,17 +478,11 @@ def _normalize_value(value: Any) -> Any: metadata = _aggregate_metadata(stem, base_metadata, chunk_metadata) metadata = {k: _normalize_value(v) for k, v in metadata.items()} original_filename_value = metadata.get("filename") - if chunk_paths: - ordered_chunks = sorted(chunk_paths, key=_chunk_sort_key) - parts: List[str] = [] - for path in ordered_chunks: - parts.append(path.read_text(encoding="utf-8")) - document_text = "\n".join(parts) - elif representative_path is not None: - document_text = representative_path.read_text(encoding="utf-8") - else: + if base_path is None or not base_path.exists(): continue + document_text = base_path.read_text(encoding="utf-8") + filetype = metadata.get("filetype") or metadata.get("file_ext") if not filetype: filename_candidate = original_filename_value or metadata.get("filename") diff 
--git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py index a584eaf..296429a 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -96,10 +96,16 @@ def prime_extractor( except Exception: images_scale_env = "1.25" + if force_ocr: + self.logger.warning( + "Corpus.extract(force_ocr=True) is deprecated and no longer executes OCR. " + "Use Corpus.ocr(backend='deepseek') for OCR remediation." + ) + # Hard GPU preflight before we attempt to build OCR/enrichment pipelines self._gpu_preflight( accel_type=accel_type, - require_ocr=bool(force_ocr), + require_ocr=False, require_math=bool(formula_enrichment or code_enrichment), require_backend_gpu=(backend_choice == "docling"), ) @@ -113,8 +119,8 @@ def prime_extractor( # Ensure converter exists (reuse when unchanged) self.extractor.ensure_extractor( - enable_ocr=bool(force_ocr), - force_full_page_ocr=bool(force_ocr), + enable_ocr=False, + force_full_page_ocr=False, formula_enrichment=bool(formula_enrichment), code_enrichment=bool(code_enrichment), images_scale=float(images_scale_env), @@ -136,12 +142,12 @@ def _resolve_phase1_backend( raise ValueError( f"Invalid phase1_backend='{requested}'. Expected one of: 'auto', 'safe', 'docling'." ) - needs_gpu = bool(force_ocr or formula_enrichment or code_enrichment) + needs_gpu = bool(formula_enrichment or code_enrichment) if choice == "auto": choice = "docling" if needs_gpu else "safe" if choice == "safe" and needs_gpu: self.logger.info( - "Phase-1 backend 'safe' overridden to 'docling' because OCR/math enrichment was requested." + "Phase-1 backend 'safe' overridden to 'docling' because math/code enrichment was requested." 
) choice = "docling" return choice @@ -154,12 +160,12 @@ def _gpu_preflight( require_math: bool, require_backend_gpu: bool = False, ) -> None: - """Abort early when GPU OCR/math is requested but CUDA is unavailable.""" + """Abort early when GPU-backed Docling work is requested but CUDA is unavailable.""" if not (require_ocr or require_math or require_backend_gpu): return instructions = ( - "GPU OCR and math enrichment require CUDA-enabled torch and onnxruntime-gpu. " + "GPU-backed Docling extraction and math enrichment require CUDA-enabled torch. " "Install the CUDA wheels and ensure NVIDIA drivers expose the desired devices." ) @@ -167,30 +173,15 @@ def _gpu_preflight( accel_lower = str(accel_type or "").strip().lower() if accel_lower.startswith("cpu"): raise RuntimeError( - "GPU OCR was requested (force_ocr/math) but accel_type='CPU'. " + "GPU-backed Docling extraction was requested but accel_type='CPU'. " f"{instructions}" ) - try: - import onnxruntime as _ort # type: ignore - providers = _ort.get_available_providers() - except Exception as exc: - raise RuntimeError( - "onnxruntime not available while attempting GPU OCR. " - "Install onnxruntime-gpu and rerun." - ) from exc - - if "CUDAExecutionProvider" not in providers: - raise RuntimeError( - "CUDAExecutionProvider missing from onnxruntime providers. " - f"Detected providers={providers}. {instructions}" - ) - torch_mod = _maybe_import_torch(force=True) if torch_mod is None or not getattr(torch_mod, "cuda", None) or not torch_mod.cuda.is_available(): raise RuntimeError( - "Torch CUDA is not available but GPU OCR/math was requested. " - "Install the CUDA wheel (e.g. torch==2.5.1+cu121) and ensure CUDA drivers/devices are visible." + "Torch CUDA is not available but GPU-backed Docling extraction/math was requested. " + "Install the CUDA wheel and ensure CUDA drivers/devices are visible." 
) device_count = torch_mod.cuda.device_count() @@ -208,13 +199,12 @@ def _gpu_preflight( if not self._gpu_banner_logged: self.logger.info( - "GPU preflight: using torch + onnxruntime GPU backends; ensure CUDA drivers are available." + "GPU preflight: using torch-backed Docling extraction; ensure CUDA drivers are available." ) self._gpu_banner_logged = True self.logger.info( - "GPU preflight OK: providers=%s torch_devices=%s", - ",".join(providers), + "GPU preflight OK: torch_devices=%s", ", ".join(device_names) or "", ) @@ -237,6 +227,7 @@ def extract( export_doc_json: bool = True, emit_formula_index: bool = False, phase1_backend: str = "auto", + workers_per_device: int = 1, _prepared: bool = False, ) -> None: """ @@ -250,8 +241,9 @@ def extract( export_doc_json: When True (default), writes Docling layout JSON to `json/.docling.json(.zst)` emit_formula_index: Also emit `json/.formula_index.jsonl` (default: False) phase1_backend: Selects the Phase-1 backend. ``"auto"`` (default) keeps the safe backend unless - OCR/math is requested, ``"safe"`` forces the PyPDFium backend, and ``"docling"`` forces the - Docling backend. + math/code enrichment is requested, ``"safe"`` forces the PyPDFium backend, and ``"docling"`` + forces the Docling backend. + workers_per_device: Number of extraction workers to bind to each visible GPU when ``use_gpus='multi'``. 
""" if not file_paths: @@ -425,12 +417,14 @@ def extract( except Exception: threads_effective = int(num_threads) if isinstance(num_threads, int) else max(2, 2 * max(1, len(devs))) - batch_hint = 5 if backend_choice == "docling" and not force_ocr else 1 + workers_per_device = max(1, int(workers_per_device or 1)) + batch_hint = 1 self.logger.info( - "Phase-1 config: backend=%s batch_size=%s threads=%s skip_existing=%s benchmark=%s", + "Phase-1 config: backend=%s batch_size=%s threads=%s workers_per_device=%s skip_existing=%s benchmark=%s", backend_choice, batch_hint, threads_effective, + workers_per_device, bool(skip_existing), bool(benchmark_mode), ) @@ -464,6 +458,7 @@ def extract( return # Dynamic work queue across GPUs + from .corpus_orchestrator import gpu_extract_worker_queue from multiprocessing import get_context ctx = get_context("spawn") manager = ctx.Manager() @@ -494,14 +489,29 @@ def extract( marker_base.mkdir(parents=True, exist_ok=True) except Exception as exc: self.logger.debug("Unable to prepare marker directory %s: %s", marker_base, exc) - procs: List[Any] = [] - proc_gpu: Dict[int, int] = {} - marker_files: Dict[int, Path] = {dev_id: marker_base / f"gpu{dev_id}.current" for dev_id in devs} + worker_specs: List[Dict[str, Any]] = [] for dev_id in devs: + for worker_slot in range(workers_per_device): + worker_specs.append( + { + "device_id": int(dev_id), + "worker_slot": int(worker_slot), + "worker_key": f"gpu{dev_id}-w{worker_slot}", + } + ) + procs: List[Any] = [] + proc_specs: Dict[int, Dict[str, Any]] = {} + marker_files: Dict[str, Path] = { + spec["worker_key"]: marker_base / f"{spec['worker_key']}.current" + for spec in worker_specs + } + for spec in worker_specs: p = ctx.Process( target=gpu_extract_worker_queue, args=( - dev_id, + spec["device_id"], + spec["worker_slot"], + spec["worker_key"], str(self.input_dir), str(self.output_dir), task_q, @@ -524,7 +534,7 @@ def extract( p.start() procs.append(p) if p.pid is not None: - proc_gpu[p.pid] = 
dev_id + proc_specs[p.pid] = dict(spec) active = list(procs) any_fail = False last_summary = time.time() @@ -541,20 +551,21 @@ def extract( procs.remove(p) pid = p.pid or -1 heartbeat[pid] = time.time() - gpu_id = proc_gpu.pop(pid, None) + worker_spec = proc_specs.pop(pid, None) + worker_key = worker_spec["worker_key"] if worker_spec else None if p.exitcode not in (0, None): any_fail = True self.logger.warning("GPU worker pid=%s exited with code %s", p.pid, p.exitcode) current_paths: List[str] = [] stems_for_skip: List[str] = [] - if gpu_id is not None: - current_entry = status_map.pop(gpu_id, None) + if worker_key is not None: + current_entry = status_map.pop(worker_key, None) if current_entry: if not isinstance(current_entry, (list, tuple, set)): current_entry = [current_entry] current_paths = [str(x) for x in current_entry] stems_for_skip = [canonical_stem(path) for path in current_paths] - marker_path = marker_files.get(gpu_id) + marker_path = marker_files.get(worker_key) if marker_path: try: marker_path.unlink(missing_ok=True) @@ -565,12 +576,17 @@ def extract( state_mgr.save(processed_files, problematic_files) if stems_for_skip: skip_mgr.add(stems_for_skip) - if gpu_id is not None: - self.logger.info("Respawning GPU%s worker after crash.", gpu_id) + if worker_spec is not None: + self.logger.info( + "Respawning %s after crash.", + worker_spec["worker_key"], + ) replacement = ctx.Process( target=gpu_extract_worker_queue, args=( - gpu_id, + worker_spec["device_id"], + worker_spec["worker_slot"], + worker_spec["worker_key"], str(self.input_dir), str(self.output_dir), task_q, @@ -594,13 +610,13 @@ def extract( procs.append(replacement) active.append(replacement) if replacement.pid is not None: - proc_gpu[replacement.pid] = gpu_id + proc_specs[replacement.pid] = dict(worker_spec) heartbeat[replacement.pid] = time.time() continue else: - if gpu_id is not None: - status_map.pop(gpu_id, None) - marker_path = marker_files.get(gpu_id) + if worker_key is not None: + 
status_map.pop(worker_key, None) + marker_path = marker_files.get(worker_key) if marker_path: try: marker_path.unlink(missing_ok=True) @@ -628,7 +644,7 @@ def extract( skip_mgr.add(bad_stems) state_mgr.save(processed_files, problematic_files) self.logger.info( - "GPU%s batch complete: +%d processed, +%d problematic (totals: %d processed, %d problematic)", + "%s batch complete: +%d processed, +%d problematic (totals: %d processed, %d problematic)", result.get("worker"), len(ok_stems), len(bad_stems), @@ -642,25 +658,20 @@ def extract( if result.get("exitcode", 0) not in (0, None): any_fail = True self.logger.warning( - "GPU%s reported non-zero exit: %s", result.get("worker"), result.get("exitcode") + "%s reported non-zero exit: %s", result.get("worker"), result.get("exitcode") ) worker_pid = result.get("pid") if worker_pid is not None: heartbeat[worker_pid] = time.time() - worker_gpu = result.get("worker") - if worker_gpu is not None: - try: - worker_gpu_int = int(worker_gpu) - except Exception: - worker_gpu_int = None - else: - status_map.pop(worker_gpu_int, None) - marker_path = marker_files.get(worker_gpu_int) - if marker_path: - try: - marker_path.unlink(missing_ok=True) - except Exception: - pass + worker_key = result.get("worker") + if worker_key is not None: + status_map.pop(worker_key, None) + marker_path = marker_files.get(str(worker_key)) + if marker_path: + try: + marker_path.unlink(missing_ok=True) + except Exception: + pass now = time.time() if now - last_summary > 30: diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 4dec423..552af09 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -33,7 +33,7 @@ def ocr( *, fix_bad: bool = True, mode: Optional[str] = None, - backend: str = "rapidocr", + backend: str = "deepseek", device: Optional[str] = None, model_dir: Optional[Union[str, Path]] = None, max_pages: Optional[int] = None, @@ -41,6 +41,26 @@ def ocr( limit: 
Optional[int] = None, dpi: Optional[int] = None, # reserved for future use precision: Optional[str] = None, # reserved for future use ("fp16","bf16") + workers_per_gpu: int = 1, + runtime_backend: str = "transformers", + ocr_profile: str = "markdown_grounded", + prompt_override: Optional[str] = None, + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = 2048, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + vllm_batch_size: Optional[int] = None, + gpu_memory_utilization: Optional[float] = None, + disable_fp8_kv: bool = False, + repair_mode: str = "auto", + scheduler: str = "auto", + target_batch_pages: int = 160, + shard_pages: int = 0, + shard_threshold_pages: int = 0, # Integrated math enrichment controls math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, @@ -70,10 +90,34 @@ def ocr( fix_bad only -> 'ocr_bad'; math_enhance only -> 'math_only'; neither -> no‑op. - - backend: 'rapidocr' (default) uses the Docling + RapidOCR path via Phase‑1 extract(). - 'deepseek' uses the DeepSeek‑OCR path (no Docling JSON, math unsupported). + - backend: 'deepseek' (default) uses the DeepSeek OCR remediation path. + Docling layout/json remains Phase-1 infrastructure; OCR remediation itself is DeepSeek-only. - fix_bad: re-run OCR on documents marked bad by the cleaner (default True). - math_enhance: run math/code enrichment after OCR (default True). + - use_gpus/devices/workers_per_gpu: DeepSeek multi-worker controls. Use + ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. + Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers + per visible GPU. + - scheduler/target_batch_pages/shard_pages/shard_threshold_pages: + Multi-GPU scheduling controls. 
``scheduler='auto'`` resolves to + exact-fill page-range batching for multi-GPU vLLM runs and falls back + to whole-document scheduling elsewhere. ``target_batch_pages`` is the + per-lane page budget the scheduler tries to fill. ``fixed_shard`` uses + ``shard_pages`` and ``shard_threshold_pages`` when explicit shard-based + planning is requested. + - runtime_backend: ``transformers`` (default) or ``vllm``. + - ocr_profile/prompt_override/attn_backend/base_size/image_size/crop_mode/render_dpi: + DeepSeek rendering and attention controls used for throughput/quality + benchmarking. + - max_new_tokens/repetition_penalty/no_repeat_ngram_size: + Optional generation controls forwarded to DeepSeek. These are exposed + for runtime experiments; leave them unset unless a benchmark calls for + them explicitly. + - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv/repair_mode: + Optional vLLM controls. ``repair_mode='auto'`` enables the markdown-first + repair pipeline (plain fallback for garbage pages, tiled fallback for + short coverage failures). These are ignored by the transformers runtime + except for ``prompt_override``. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False @@ -82,9 +126,9 @@ def ocr( ``reprocess_completed=False``). Prefer the explicit ``reprocess_completed`` toggle. 
""" # Normalize backend - backend_norm = str(backend or "rapidocr").strip().lower() - if backend_norm not in {"rapidocr", "deepseek"}: - raise ValueError("backend must be 'rapidocr' or 'deepseek'") + backend_norm = str(backend or "deepseek").strip().lower() + if backend_norm != "deepseek": + raise ValueError("backend must be 'deepseek'") # CONTENT_DEBUG override (preferred uppercase alias) # Priority: CONTENT_DEBUG > INTERNAL_DEBUG > content_debug/internal_debug flags @@ -147,13 +191,21 @@ def ocr( reprocess_completed = reprocess_flag # DeepSeek semantics note - if backend_norm == "deepseek": + if backend_norm == "deepseek" and mode_norm in {"ocr_bad", "ocr_bad_then_math"}: try: self.logger.info( "DeepSeek backend: Phase-2 math is not required; equations are included inline via OCR." ) except Exception: pass + if mode_norm == "ocr_bad_then_math": + try: + self.logger.info( + "DeepSeek OCR does not run Phase-2 math; treating mode='ocr_bad_then_math' as 'ocr_bad'." + ) + except Exception: + pass + mode_norm = "ocr_bad" # Identify bad documents from parquet (Rust cleaner output) bad_files: List[str] = [] skipped_completed = 0 @@ -573,29 +625,37 @@ def _run_math(stems: List[str]) -> None: self, bad_files, model_dir=Path(model_dir) if model_dir else None, + max_pages=max_pages, + persist_engine=bool(persist_engine), + precision=precision, + device=device, + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + runtime_backend=runtime_backend, + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + scheduler=scheduler, + 
target_batch_pages=int(max(1, target_batch_pages)), + shard_pages=int(max(0, shard_pages)), + shard_threshold_pages=int(max(0, shard_threshold_pages)), content_debug=bool(content_debug), ) except Exception as _e: self.logger.error("DeepSeek OCR runner failed: %s", _e) raise - else: - # RapidOCR/Docling path via Phase-1 extract - self.extract( - input_format="pdf", - num_threads=os.cpu_count() or 4, - accel_type="CUDA", - force_ocr=True, - formula_enrichment=False, - code_enrichment=False, - filenames=bad_files, - skip_existing=False, - use_gpus=use_gpus, - devices=devices, - # Do not generate Docling JSON for OCR targets; math will skip them - export_doc_json=False, - emit_formula_index=False, - phase1_backend="docling", - ) reran_ocr = True # Update metadata to reflect successful OCR reruns try: diff --git a/src/glossapi/download_policy.py b/src/glossapi/download_policy.py new file mode 100644 index 0000000..36d3ce6 --- /dev/null +++ b/src/glossapi/download_policy.py @@ -0,0 +1,135 @@ +"""Policy routing for downloader selection.""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, Optional +from urllib.parse import urlparse + +import yaml + +VALID_DOWNLOADERS = {"standard", "browser", "auto"} +ROUTE_OPTION_KEYS = { + "request_timeout", + "ssl_verify", + "ssl_cafile", + "request_method", + "sleep", + "per_domain_concurrency", + "domain_concurrency_floor", + "domain_concurrency_ceiling", + "skip_failed_after", + "domain_cookies", + "browser_timeout_ms", + "browser_post_load_wait_ms", + "browser_engine", + "browser_headless", + "browser_session_ttl_seconds", +} + + +def _normalize_downloader(value: Any, default: str = "standard") -> str: + normalized = str(value or default).strip().lower() + if normalized in {"default", "http"}: + normalized = "standard" + if normalized in {"browser_fallback"}: + normalized = "auto" + if normalized in {"browser_protected"}: + 
normalized = "browser" + if normalized not in VALID_DOWNLOADERS: + raise ValueError(f"Unsupported downloader route: {value}") + return normalized + + +@dataclass(frozen=True) +class DownloadPolicyMatch: + domains: tuple[str, ...] = () + url_regex: Optional[re.Pattern[str]] = None + + def matches(self, url: str) -> bool: + parsed = urlparse(url) + hostname = (parsed.hostname or "").lower() + if self.domains: + matched_domain = any( + hostname == domain or hostname.endswith(f".{domain}") + for domain in self.domains + ) + if not matched_domain: + return False + if self.url_regex and not self.url_regex.search(url): + return False + return True + + +@dataclass(frozen=True) +class DownloadPolicyRule: + matcher: DownloadPolicyMatch + downloader: str + options: Dict[str, Any] + + def matches(self, url: str) -> bool: + return self.matcher.matches(url) + + +@dataclass(frozen=True) +class DownloadPolicy: + default_downloader: str = "standard" + default_options: Dict[str, Any] | None = None + rules: tuple[DownloadPolicyRule, ...] 
= () + + def resolve(self, url: str) -> tuple[str, Dict[str, Any]]: + for rule in self.rules: + if rule.matches(url): + return rule.downloader, dict(rule.options) + return self.default_downloader, dict(self.default_options or {}) + + +def _extract_route_options(data: Dict[str, Any]) -> Dict[str, Any]: + return {key: value for key, value in data.items() if key in ROUTE_OPTION_KEYS} + + +def _build_matcher(raw: Dict[str, Any]) -> DownloadPolicyMatch: + domains = tuple(str(item).strip().lower() for item in (raw.get("domains") or []) if str(item).strip()) + url_regex = raw.get("url_regex") + compiled = re.compile(str(url_regex)) if url_regex else None + return DownloadPolicyMatch(domains=domains, url_regex=compiled) + + +def build_download_policy(data: Dict[str, Any]) -> DownloadPolicy: + default_block = dict(data.get("default") or {}) + default_downloader = _normalize_downloader(default_block.get("downloader"), default="standard") + default_options = _extract_route_options(default_block) + + rules = [] + for raw_rule in data.get("rules") or []: + raw_rule = dict(raw_rule or {}) + matcher = _build_matcher(dict(raw_rule.get("match") or {})) + downloader = _normalize_downloader(raw_rule.get("downloader"), default=default_downloader) + options = _extract_route_options(raw_rule) + rules.append(DownloadPolicyRule(matcher=matcher, downloader=downloader, options=options)) + + return DownloadPolicy( + default_downloader=default_downloader, + default_options=default_options, + rules=tuple(rules), + ) + + +def load_download_policy(path: str | Path) -> DownloadPolicy: + policy_path = Path(path).expanduser().resolve() + payload = yaml.safe_load(policy_path.read_text(encoding="utf-8")) or {} + if not isinstance(payload, dict): + raise ValueError("Download policy file must define a mapping at the top level") + return build_download_policy(payload) + + +__all__ = [ + "DownloadPolicy", + "DownloadPolicyMatch", + "DownloadPolicyRule", + "VALID_DOWNLOADERS", + "build_download_policy", + 
"load_download_policy", +] diff --git a/src/glossapi/gloss_browser_downloader.py b/src/glossapi/gloss_browser_downloader.py new file mode 100644 index 0000000..1fc41fa --- /dev/null +++ b/src/glossapi/gloss_browser_downloader.py @@ -0,0 +1,415 @@ +"""Browser-capable downloader mode for browser-gated file endpoints.""" + +from __future__ import annotations + +import asyncio +import os +import time +from dataclasses import dataclass +from urllib.parse import urlparse +from typing import Any, Dict, Optional, Tuple + +import aiofiles +import aiohttp + +from .download_policy import DownloadPolicy, load_download_policy +from .gloss_downloader import GlossDownloader + + +@dataclass +class BrowserSessionState: + user_agent: str + cookie_header: str + cached_at: float + + +class BrowserGlossDownloader(GlossDownloader): + """ + Downloader variant that retries browser-gated file endpoints via Playwright. + + This mode only targets file endpoints that are protected by browser/session + checks. It intentionally does not attempt viewer-style extraction. 
+ """ + + def __init__( + self, + *args, + browser_timeout_ms: int = 60000, + browser_post_load_wait_ms: int = 3000, + browser_engine: str = "chromium", + browser_headless: bool = True, + browser_session_ttl_seconds: int = 900, + browser_max_parallel_bootstraps: int = 2, + default_download_route: str = "auto", + **kwargs, + ): + super().__init__(*args, **kwargs) + self.browser_timeout_ms = int(browser_timeout_ms) + self.browser_post_load_wait_ms = int(browser_post_load_wait_ms) + self.browser_engine = str(browser_engine or "chromium") + self.browser_headless = bool(browser_headless) + self.browser_session_ttl_seconds = int(browser_session_ttl_seconds) + self.browser_max_parallel_bootstraps = max(1, int(browser_max_parallel_bootstraps)) + self.browser_bootstrap_semaphore = asyncio.Semaphore(self.browser_max_parallel_bootstraps) + self._browser_session_cache: Dict[str, BrowserSessionState] = {} + self._browser_session_locks: Dict[str, asyncio.Lock] = {} + self.default_download_route = str(default_download_route or "auto").strip().lower() + self.policy = self._load_policy() + + def _load_policy(self) -> Optional[DownloadPolicy]: + if self.download_policy is not None: + return self.download_policy + if self.download_policy_file: + return load_download_policy(self.download_policy_file) + return None + + def _resolve_route(self, url: str) -> tuple[str, Dict[str, Any]]: + if self.policy is not None: + return self.policy.resolve(url) + return self.default_download_route, {} + + def _route_setting(self, route_options: Dict[str, Any], name: str, fallback: Any) -> Any: + return route_options.get(name, fallback) + + def _domain_key(self, url: str) -> str: + return self._extract_base_domain(url) or (urlparse(url).hostname or "").lower() + + def _choose_browser_bootstrap_url(self, url: str) -> str: + if self._url_looks_like_file_endpoint(url): + return self.get_base_url(url) + return url + + def _should_ignore_navigation_exception(self, url: str, exc: Exception) -> bool: + 
message = str(exc) + if self._url_looks_like_file_endpoint(url) and "net::ERR_ABORTED" in message: + return True + return False + + def _session_lock_for_domain(self, domain_key: str) -> asyncio.Lock: + lock = self._browser_session_locks.get(domain_key) + if lock is None: + lock = asyncio.Lock() + self._browser_session_locks[domain_key] = lock + return lock + + def _is_browser_session_fresh(self, state: BrowserSessionState, route_options: Dict[str, Any]) -> bool: + ttl = int(self._route_setting(route_options, "browser_session_ttl_seconds", self.browser_session_ttl_seconds)) + if ttl <= 0: + return False + return (time.time() - state.cached_at) < ttl + + def _should_attempt_browser_recovery(self, url: str, html_issue: str) -> bool: + issue = str(html_issue or "").lower() + if "document viewer returned" in issue: + return False + if "challenge page returned" in issue: + return True + if "cookie bootstrap is required" in issue: + return True + if "expected a file-like response but received html instead" in issue: + return self._url_looks_like_file_endpoint(url) + return False + + def _build_ssl_connector(self) -> Optional[aiohttp.TCPConnector]: + connector = None + if not self.ssl_verify: + connector = aiohttp.TCPConnector(ssl=False) + elif self.ssl_cafile: + import ssl as _ssl + + ctx = _ssl.create_default_context(cafile=self.ssl_cafile) + connector = aiohttp.TCPConnector(ssl=ctx) + return connector + + def _domain_cookies_for_url(self, url: str) -> Dict[str, str]: + cookies: Dict[str, str] = {} + for domain_pattern, domain_cookies in self.domain_cookies.items(): + if domain_pattern in url: + cookies.update(domain_cookies) + return cookies + + async def _write_recovered_file(self, row_index: int, filename: str, body: bytes) -> None: + tmp_path = self.downloads_dir / f".part_browser_{row_index}" + async with aiofiles.open(tmp_path, "wb") as handle: + await handle.write(body) + final_path = self.downloads_dir / filename + os.replace(tmp_path, final_path) + + async def 
_fetch_with_browser_session_state( + self, + *, + url: str, + referer: Optional[str], + state: BrowserSessionState, + ) -> Tuple[bytes, Dict[str, str], Dict[str, Any]]: + request_headers = { + "User-Agent": state.user_agent, + "Accept": "application/pdf,application/octet-stream,*/*;q=0.8", + } + if state.cookie_header: + request_headers["Cookie"] = state.cookie_header + if referer: + request_headers["Referer"] = referer + + connector = self._build_ssl_connector() + timeout = aiohttp.ClientTimeout(total=min(max(self.request_timeout, 30), 180)) + async with aiohttp.ClientSession(connector=connector) as session: + async with session.get(url, headers=request_headers, timeout=timeout) as response: + response.raise_for_status() + body = await response.read() + response_headers = {str(k): str(v) for k, v in (response.headers or {}).items()} + return body, response_headers, {"candidate_url": url, "session_reused": True} + + async def _bootstrap_browser_session_state( + self, + *, + url: str, + referer: Optional[str], + route_options: Dict[str, Any], + ) -> tuple[BrowserSessionState, list[tuple[str, Dict[str, str], str]]]: + timeout_ms = int(self._route_setting(route_options, "browser_timeout_ms", self.browser_timeout_ms)) + post_load_wait_ms = int( + self._route_setting(route_options, "browser_post_load_wait_ms", self.browser_post_load_wait_ms) + ) + browser_engine = str(self._route_setting(route_options, "browser_engine", self.browser_engine)) + browser_headless = bool(self._route_setting(route_options, "browser_headless", self.browser_headless)) + + try: + from playwright.async_api import async_playwright + except ImportError as exc: # pragma: no cover - exercised via monkeypatch + raise RuntimeError( + "Browser download mode requires the optional 'browser' dependencies " + "(install Playwright and browser binaries)" + ) from exc + + accepted_responses: list[tuple[str, Dict[str, str], str]] = [] + bootstrap_url = self._choose_browser_bootstrap_url(url) + + async with 
self.browser_bootstrap_semaphore: + async with async_playwright() as playwright: + browser_type = getattr(playwright, browser_engine, None) + if browser_type is None: + raise RuntimeError(f"Unsupported browser engine: {browser_engine}") + + browser = await browser_type.launch(headless=browser_headless) + context = await browser.new_context(ignore_https_errors=not self.ssl_verify) + parsed = urlparse(url) + browser_cookies = [ + { + "name": key, + "value": str(value), + "domain": parsed.hostname or "", + "path": "/", + } + for key, value in self._domain_cookies_for_url(url).items() + ] + if browser_cookies: + await context.add_cookies(browser_cookies) + page = await context.new_page() + if referer: + await page.set_extra_http_headers({"Referer": referer}) + + async def _route_filter(route: Any) -> None: + req = route.request + if req.resource_type in {"image", "media", "font"}: + await route.abort() + return + req_url = str(req.url or "") + if "googletagmanager" in req_url or "google-analytics.com" in req_url: + await route.abort() + return + await route.continue_() + + await page.route("**/*", _route_filter) + + def _record_response(response: Any) -> None: + try: + response_headers = {str(k): str(v) for k, v in (response.headers or {}).items()} + file_ext = self.infer_file_extension(response.url, response_headers, b"") + if file_ext and file_ext != "html" and self.is_supported_format(file_ext): + accepted_responses.append((response.url, response_headers, file_ext)) + except Exception: + return + + page.on("response", _record_response) + + try: + main_response = None + try: + main_response = await page.goto(bootstrap_url, wait_until="networkidle", timeout=timeout_ms) + except Exception as exc: + if not self._should_ignore_navigation_exception(bootstrap_url, exc): + raise + if main_response is not None: + main_headers = {str(k): str(v) for k, v in (main_response.headers or {}).items()} + main_ext = self.infer_file_extension(main_response.url, main_headers, b"") + if 
main_ext and main_ext != "html" and self.is_supported_format(main_ext): + accepted_responses.insert(0, (main_response.url, main_headers, main_ext)) + if not accepted_responses and post_load_wait_ms > 0: + await page.wait_for_timeout(post_load_wait_ms) + + browser_user_agent = await page.evaluate("() => navigator.userAgent") + browser_cookies = await context.cookies() + finally: + await browser.close() + + cookie_header = "; ".join( + f"{cookie['name']}={cookie['value']}" for cookie in browser_cookies if cookie.get("name") + ) + return BrowserSessionState( + user_agent=browser_user_agent, + cookie_header=cookie_header, + cached_at=time.time(), + ), accepted_responses + + async def _download_via_browser_session( + self, + *, + url: str, + referer: Optional[str], + route_options: Optional[Dict[str, Any]] = None, + force_refresh: bool = False, + ) -> Tuple[bytes, Dict[str, str], Dict[str, Any]]: + options = dict(route_options or {}) + domain_key = self._domain_key(url) + state = self._browser_session_cache.get(domain_key) + if state and self._is_browser_session_fresh(state, options) and not force_refresh: + try: + return await self._fetch_with_browser_session_state(url=url, referer=referer, state=state) + except Exception: + pass + + lock = self._session_lock_for_domain(domain_key) + async with lock: + state = self._browser_session_cache.get(domain_key) + if state and self._is_browser_session_fresh(state, options) and not force_refresh: + try: + return await self._fetch_with_browser_session_state(url=url, referer=referer, state=state) + except Exception: + pass + + state, accepted_responses = await self._bootstrap_browser_session_state( + url=url, + referer=referer, + route_options=options, + ) + self._browser_session_cache[domain_key] = state + candidate_url = accepted_responses[0][0] if accepted_responses else url + body, response_headers, meta = await self._fetch_with_browser_session_state( + url=candidate_url, + referer=referer, + state=state, + ) + meta.update({ + 
"candidate_url": candidate_url, + "session_reused": False, + "domain_key": domain_key, + }) + return body, response_headers, meta + + async def _download_browser_route( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + route_options: Dict[str, Any], + ) -> Tuple[bool, str, str, str, int]: + try: + body, response_headers, meta = await self._download_via_browser_session( + url=url, + referer=referer, + route_options=route_options, + ) + except Exception as exc: + error_msg = f"Browser-routed download failed: {exc}" + self.logger.warning(error_msg) + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 + return await self._finalize_download_result( + row_index=row_index, + url=meta.get("candidate_url") or url, + resp_headers=response_headers, + content=body, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + + async def _preflight_download( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + route, route_options = self._resolve_route(url) + if route != "browser": + return None + return await self._download_browser_route( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + route_options=route_options, + ) + + async def _recover_html_interstitial( + self, + *, + row_index: int, + url: str, + headers: Dict[str, str], + content: bytes, + html_issue: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + route, route_options = self._resolve_route(url) + if route == "standard": + return None + if route == "auto" and not self._should_attempt_browser_recovery(url, html_issue): + return None + + try: + body, response_headers, meta = await self._download_via_browser_session( + url=url, + 
referer=referer, + route_options=route_options, + ) + except Exception as exc: + message = f"{html_issue}; browser recovery failed: {exc}" + self.logger.warning(message) + return False, "", "html", message, retry_count + 1 + + file_ext = self.infer_file_extension(meta["candidate_url"], response_headers, body) + if file_ext == "html": + message = ( + f"{html_issue}; browser recovery still returned HTML from {meta['candidate_url']}" + ) + self.logger.warning(message) + return False, "", file_ext, message, retry_count + 1 + if not self.is_supported_format(file_ext): + message = ( + f"{html_issue}; browser recovery returned unsupported format: {file_ext}" + ) + self.logger.warning(message) + return False, "", file_ext or "", message, retry_count + 1 + + if filename_base and str(filename_base).strip(): + filename = f"{filename_base}.{file_ext}" + else: + filename = self.generate_filename(row_index, file_ext) + + await self._write_recovered_file(row_index, filename, body) + self.logger.info( + "Recovered browser-gated download via browser mode: %s -> %s", + url, + filename, + ) + return True, filename, file_ext, "", retry_count diff --git a/src/glossapi/gloss_downloader.py b/src/glossapi/gloss_downloader.py index f9a7bf2..b1b6c61 100644 --- a/src/glossapi/gloss_downloader.py +++ b/src/glossapi/gloss_downloader.py @@ -141,6 +141,8 @@ def __init__( error_burst_window: int = 20, error_burst_threshold: float = 0.5, park_403_seconds: float = 600.0, + download_policy_file: Optional[Union[str, Path]] = None, + download_policy: Optional[Any] = None, _used_filename_bases: Optional[Set[str]] = None, ): """ @@ -241,6 +243,8 @@ def verbose_log(self, message, level=logging.DEBUG): self.checkpoint_seconds = float(checkpoint_seconds) if checkpoint_seconds else None # Warnings JSON path self.domain_warnings_path = self.output_dir / 'domain_scheduler_warnings.json' + self.download_policy_file = Path(download_policy_file).expanduser().resolve() if download_policy_file else None + 
self.download_policy = download_policy # Progress logger (separate file; default to output logs dir) self.progress_logger = self.logger @@ -530,12 +534,47 @@ def _extract_base_domain(self, url: str) -> str: except Exception: return '' + def _resolve_route(self, url: str) -> tuple[str, Dict[str, Any]]: + return "standard", {} + + def _route_setting(self, route_options: Optional[Dict[str, Any]], name: str, fallback: Any) -> Any: + if route_options and name in route_options: + return route_options[name] + return fallback + + def _resolve_domain_scheduler_settings( + self, + route_options: Optional[Dict[str, Any]], + ) -> tuple[int, int, int, int]: + floor = max( + 1, + int(self._route_setting(route_options, "domain_concurrency_floor", self.domain_concurrency_floor)), + ) + raw_ceiling = self._route_setting(route_options, "domain_concurrency_ceiling", self.domain_concurrency_ceiling) + if raw_ceiling is None: + ceiling = max(floor, int(self.domain_concurrency_ceiling)) + else: + ceiling = max(floor, int(raw_ceiling)) + start = max( + floor, + min( + int(self._route_setting(route_options, "per_domain_concurrency", self.per_domain_concurrency)), + max(1, self.concurrency), + ceiling, + ), + ) + skip_after = max(1, int(self._route_setting(route_options, "skip_failed_after", self.skip_failed_after))) + return floor, ceiling, start, skip_after + @dataclass class _DomainState: base: str queue: deque = field(default_factory=deque) active: int = 0 concurrency: int = 1 + concurrency_floor: int = 1 + concurrency_ceiling: int = 1 + skip_failed_after: int = 3 successes: int = 0 failures: int = 0 http_429: int = 0 @@ -765,41 +804,115 @@ def infer_file_extension(self, url: str, headers: Dict[str, str], content: bytes # 5) Fall back to URL ext if any, otherwise 'bin' return url_ext if url_ext else 'bin' - - async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], - rate_limiter: RateLimiter, retry_count: int = 0, - filename_base: Optional[str] = 
None, - referer: Optional[str] = None) -> Tuple[bool, str, str, str, int]: + + def _url_looks_like_file_endpoint(self, url: str) -> bool: + """Return True when the URL shape suggests a direct file download endpoint.""" + try: + lowered = str(url or "").lower() + except Exception: + return False + hints = ( + ".pdf", + ".docx", + ".pptx", + ".xml", + ".csv", + "/pdf", + "format=pdf", + "type=pdf", + "download", + "attachment", + "/file", + "getfile.php", + ) + return any(token in lowered for token in hints) + + def _detect_html_interstitial(self, url: str, headers: Dict[str, str], content: bytes) -> Optional[str]: """ - Download a file from a URL - - Args: - row_index: Index in the dataframe - url: URL to download - semaphore: Semaphore for concurrency control - rate_limiter: Rate limiter for API limits - retry_count: Current retry count - Returns: - Tuple[bool, str, str, str, int]: (success, filename, file_ext, error_message, retry_count) + Detect HTML challenge/viewer pages that should not count as successful downloads. + + We still allow regular HTML documents, but fail fast on common interstitials + such as WAF challenge pages and JavaScript-only document viewers. 
""" - if not url or pd.isna(url): - return False, "", "", "Empty URL", retry_count - - # Get a new user-agent for each request - user_agent = next(self.user_agents) - domain = urlparse(url).netloc - - # Ensure URL has scheme + try: + lower_headers = {str(k).lower(): str(v).lower() for k, v in (headers or {}).items()} + lower_body = (content or b"")[: 1 << 17].decode("utf-8", errors="ignore").lower() + except Exception: + lower_headers = {} + lower_body = "" + + if not lower_body: + return None + + if ( + "x-amzn-waf-action" in lower_headers + or "awswafintegration" in lower_body + or "challenge.js" in lower_body + or "verify that you're not a robot" in lower_body + ): + return ( + "HTML challenge page returned instead of a document; " + "browser automation or cookie bootstrap is required" + ) + + viewer_markers = ( + "fliphtml5_pages", + "monitor:player:html5", + "javascript/loadingjs.js", + "javascript/main.js", + "bookconfig.totalpagecount", + "getfile.php?lib=", + ) + viewer_hits = sum(1 for marker in viewer_markers if marker in lower_body) + if viewer_hits >= 2: + return ( + "HTML document viewer returned instead of a downloadable file; " + "a source-specific fetcher with persisted cookies/redirect handling is required" + ) + + content_type = lower_headers.get("content-type", "") + if self._url_looks_like_file_endpoint(url) and "text/html" in content_type: + return "Expected a file-like response but received HTML instead" + + return None + + async def _recover_html_interstitial( + self, + *, + row_index: int, + url: str, + headers: Dict[str, str], + content: bytes, + html_issue: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + """Allow subclasses to recover from HTML interstitials via alternate fetch modes.""" + return None + + async def _preflight_download( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], 
+ ) -> Optional[Tuple[bool, str, str, str, int]]: + """Allow subclasses to short-circuit the direct HTTP path for known routes.""" + return None + + def _normalize_request_url(self, url: str) -> str: if not url.startswith(("http://", "https://")): - url = f"https://{url}" - - # Get base URL for referer header + return f"https://{url}" + return url + + def _build_request_headers(self, url: str, user_agent: str, referer: Optional[str]) -> Dict[str, str]: + domain = urlparse(url).netloc base_url = self.get_base_url(url) - - # Enhanced headers with common browser-like attributes to bypass 403 errors - # Prefer caller-provided referer (e.g., the external_link page) - _referer = (referer or '').strip() - headers = { + referer_value = (referer or '').strip() + return { 'User-Agent': user_agent, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', @@ -813,75 +926,323 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'TE': 'trailers', - 'Referer': _referer if _referer else f"https://www.google.com/search?q={domain}", + 'Referer': referer_value if referer_value else f"https://www.google.com/search?q={domain}", 'Origin': base_url, 'DNT': '1' } - - # Check for domain-specific cookies - cookies = {} + + def _resolve_request_cookies(self, url: str, route_options: Optional[Dict[str, Any]] = None) -> Dict[str, str]: + cookies: Dict[str, str] = {} for domain_pattern, domain_cookies in self.domain_cookies.items(): if domain_pattern in url: cookies.update(domain_cookies) # If the domain needs dynamic values like random IDs - for key, value in cookies.items(): + for key, value in list(cookies.items()): if 'random.randint' in str(value): # Replace with an actual random value (only supporting this pattern for now) - if 'session-id' in value: + if 'session-id' in str(value): cookies[key] = 
f"session-id-{random.randint(100000000, 999999999)}" + extra_cookies = self._route_setting(route_options, "domain_cookies", None) + if isinstance(extra_cookies, dict): + cookies.update({str(k): str(v) for k, v in extra_cookies.items()}) + return cookies + + def _build_request_timeout( + self, + retry_count: int, + route_options: Optional[Dict[str, Any]] = None, + ) -> aiohttp.ClientTimeout: + base_request_timeout = float(self._route_setting(route_options, "request_timeout", self.request_timeout)) + return aiohttp.ClientTimeout( + total=min(base_request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes + connect=min(30 * (1.2 ** retry_count), 60), # Cap connect timeout at 1 minute + sock_connect=min(30 * (1.2 ** retry_count), 60), # Cap socket connect at 1 minute + sock_read=min(60 * (1.2 ** retry_count), 120) # Cap socket read at 2 minutes + ) + + def _build_session_connector( + self, + url: str, + route_options: Optional[Dict[str, Any]] = None, + ) -> Optional[aiohttp.TCPConnector]: + connector = None + url_base = self._extract_base_domain(url) + force_insecure = url_base in getattr(self, '_domains_ssl_insecure', set()) + ssl_verify = bool(self._route_setting(route_options, "ssl_verify", self.ssl_verify)) + ssl_cafile = self._route_setting(route_options, "ssl_cafile", self.ssl_cafile) + if (not ssl_verify) or force_insecure: + connector = aiohttp.TCPConnector(ssl=False) + elif ssl_cafile: + import ssl as _ssl + ctx = _ssl.create_default_context(cafile=str(ssl_cafile)) + connector = aiohttp.TCPConnector(ssl=ctx) + return connector + + async def _bootstrap_download_session( + self, + session: aiohttp.ClientSession, + url: str, + headers: Dict[str, str], + route_options: Optional[Dict[str, Any]] = None, + ) -> Dict[str, str]: + headers = await self.setup_session(session, url, headers) + + # Set a shorter timeout for the initial connection attempt + base_timeout = aiohttp.ClientTimeout(total=10) + try: + # Visit the base domain to establish cookies if needed + 
base_domain = urlparse(url).netloc + all_cookie_domains = set(self.domain_cookies.keys()) + extra_cookies = self._route_setting(route_options, "domain_cookies", None) + if isinstance(extra_cookies, dict) and extra_cookies: + all_cookie_domains.add(base_domain) + if any(domain in base_domain for domain in all_cookie_domains): + base_url = f"https://{base_domain}" + async with session.get(base_url, headers=headers, timeout=base_timeout): + pass + except Exception as e: + # Non-fatal error, just log and continue + self.logger.debug(f"Initial base URL visit failed: {str(e)}") + return headers + + def _best_effort_url_extension(self, url: str) -> str: + try: + return self.get_file_extension_from_url(url) + except Exception: + return "" + + def _build_output_filename(self, row_index: int, file_ext: str, filename_base: Optional[str]) -> str: + if filename_base and str(filename_base).strip(): + return f"{filename_base}.{file_ext}" + return self.generate_filename(row_index, file_ext) + + def _cleanup_temp_file(self, tmp_path: Optional[Path]) -> None: + if not tmp_path: + return + try: + os.remove(tmp_path) + except Exception: + pass + + def _move_temp_file_to_final(self, tmp_path: Path, filename: str) -> None: + final_path = Path(self.downloads_dir) / filename + try: + os.replace(tmp_path, final_path) + except Exception: + try: + os.rename(tmp_path, final_path) + except Exception: + pass + + async def _finalize_download_result( + self, + *, + row_index: int, + url: str, + resp_headers: Dict[str, str], + content: bytes, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + tmp_path: Optional[Path] = None, + ) -> Tuple[bool, str, str, str, int]: + file_ext = self.infer_file_extension(url, resp_headers, content) + if file_ext == 'html': + html_issue = self._detect_html_interstitial(url, resp_headers, content) + if html_issue: + self._cleanup_temp_file(tmp_path) + recovered = await self._recover_html_interstitial( + row_index=row_index, + url=url, + 
headers=resp_headers, + content=content, + html_issue=html_issue, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + if recovered is not None: + return recovered + self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") + return False, "", file_ext, html_issue, retry_count + if not self.is_supported_format(file_ext): + self._cleanup_temp_file(tmp_path) + self.logger.warning( + f"Unsupported file format after inference: {file_ext}. Supported formats: {', '.join(self.supported_formats)}" + ) + return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count + + filename = self._build_output_filename(row_index, file_ext, filename_base) + if tmp_path is not None: + self._move_temp_file_to_final(tmp_path, filename) + else: + await self.write_file(filename, content, self.downloads_dir) + self.logger.info(f"Successfully downloaded {filename} from {url}") + return True, filename, file_ext, "", retry_count + + async def _download_via_streaming_get( + self, + *, + session: aiohttp.ClientSession, + row_index: int, + url: str, + headers: Dict[str, str], + timeout: aiohttp.ClientTimeout, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Tuple[bool, str, str, str, int]: + from tenacity import AsyncRetrying + + head = bytearray() + async for attempt in AsyncRetrying( + stop=stop_after_attempt(max(1, int(self.max_retries))), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=(retry_if_exception_type(aiohttp.ClientError) | + retry_if_exception_type(asyncio.TimeoutError)), + before_sleep=before_sleep_log(logging.getLogger(__name__), logging.INFO), + reraise=True, + ): + with attempt: + async with session.get(url, headers=headers, timeout=timeout) as response: + response.raise_for_status() + resp_headers = dict(response.headers or {}) + tmp_path = Path(self.downloads_dir) / f".part_{row_index}" + async with aiofiles.open(tmp_path, 'wb') as f: + async for chunk in 
response.content.iter_chunked(1 << 16): + if chunk: + if len(head) < (1 << 16): + need = (1 << 16) - len(head) + head.extend(chunk[:need]) + await f.write(chunk) + return await self._finalize_download_result( + row_index=row_index, + url=url, + resp_headers=resp_headers, + content=bytes(head), + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + tmp_path=tmp_path, + ) + return False, "", "", "Retry exhaustion", retry_count + 1 + + async def _download_via_buffered_request( + self, + *, + session: aiohttp.ClientSession, + requester: str, + row_index: int, + url: str, + headers: Dict[str, str], + timeout: aiohttp.ClientTimeout, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Tuple[bool, str, str, str, int]: + content, status, resp_headers = await self.make_request( + session, requester, url, headers, timeout + ) + return await self._finalize_download_result( + row_index=row_index, + url=url, + resp_headers=resp_headers, + content=content, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + + def _build_http_error_result( + self, + url: str, + error: aiohttp.ClientResponseError, + retry_count: int, + ) -> Tuple[bool, str, str, str, int]: + status = error.status + self.logger.warning(f"Received {status} for {url}") + + if self.verbose: + self.logger.debug(f"HTTP Error Details - Status: {error.status}, Message: {error.message}") + self.logger.debug(f"Headers: {error.headers if hasattr(error, 'headers') else 'No headers available'}") + self.logger.debug(f"Request info: {error.request_info if hasattr(error, 'request_info') else 'No request info available'}") + + retry_after = None + try: + hdrs = dict(getattr(error, 'headers', {}) or {}) + for k, v in hdrs.items(): + if k.lower() == 'retry-after': + val = str(v).strip() + if val.isdigit(): + retry_after = int(val) + else: + try: + dt = parsedate_to_datetime(val) + retry_after = max(0, int((dt.timestamp() - time.time()))) + except 
Exception: + retry_after = None + break + except Exception: + retry_after = None + error_msg = f"HTTP {status}: {str(error)}" + if status in (429, 503) and retry_after is not None: + error_msg += f" retry_after={retry_after}" + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 + + async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], + rate_limiter: RateLimiter, retry_count: int = 0, + filename_base: Optional[str] = None, + referer: Optional[str] = None) -> Tuple[bool, str, str, str, int]: + """ + Download a file from a URL + + Args: + row_index: Index in the dataframe + url: URL to download + semaphore: Semaphore for concurrency control + rate_limiter: Rate limiter for API limits + retry_count: Current retry count + Returns: + Tuple[bool, str, str, str, int]: (success, filename, file_ext, error_message, retry_count) + """ + if not url or pd.isna(url): + return False, "", "", "Empty URL", retry_count + + url = self._normalize_request_url(url) + _, route_options = self._resolve_route(url) + user_agent = next(self.user_agents) + headers = self._build_request_headers(url, user_agent, referer) + cookies = self._resolve_request_cookies(url, route_options=route_options) if semaphore: await semaphore.acquire() try: - # Apply rate limiting await rate_limiter.acquire() - - # Implement exponential backoff - sleep_time = self.sleep * (2 ** retry_count) + base_sleep = float(self._route_setting(route_options, "sleep", self.sleep)) + sleep_time = base_sleep * (2 ** retry_count) await asyncio.sleep(random.uniform(sleep_time, sleep_time * 1.5)) - - # Set up timeout with exponential backoff - timeout = aiohttp.ClientTimeout( - total=min(self.request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes - connect=min(30 * (1.2 ** retry_count), 60), # Cap connect timeout at 1 minute - sock_connect=min(30 * (1.2 ** retry_count), 60), # Cap socket connect at 1 minute - sock_read=min(60 * (1.2 ** retry_count), 
120) # Cap socket read at 2 minutes + preflight = await self._preflight_download( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, ) - + if preflight is not None: + return preflight + timeout = self._build_request_timeout(retry_count, route_options=route_options) + try: - # Prepare optional SSL connector - connector = None - # Domain-specific insecure override (discovered via ping) - url_base = self._extract_base_domain(url) - _force_insecure = url_base in getattr(self, '_domains_ssl_insecure', set()) - if (not self.ssl_verify) or _force_insecure: - connector = aiohttp.TCPConnector(ssl=False) - elif self.ssl_cafile: - import ssl as _ssl - ctx = _ssl.create_default_context(cafile=self.ssl_cafile) - connector = aiohttp.TCPConnector(ssl=ctx) - # Create a new session for each download to avoid cookie contamination + connector = self._build_session_connector(url, route_options=route_options) async with aiohttp.ClientSession(cookies=cookies, connector=connector) as session: try: - # Try to access the base domain first to establish cookies - headers = await self.setup_session(session, url, headers) - - # Set a shorter timeout for the initial connection attempt - base_timeout = aiohttp.ClientTimeout(total=10) - try: - # Visit the base domain to establish cookies if needed - base_domain = urlparse(url).netloc - if any(domain in base_domain for domain in self.domain_cookies.keys()): - base_url = f"https://{base_domain}" - async with session.get(base_url, headers=headers, timeout=base_timeout): - pass - except Exception as e: - # Non-fatal error, just log and continue - self.logger.debug(f"Initial base URL visit failed: {str(e)}") - pass - - # Choose request method and perform streaming for GET - requester = self.request_method.lower() + headers = await self._bootstrap_download_session( + session, + url, + headers, + route_options=route_options, + ) + requester = str(self._route_setting(route_options, 
"request_method", self.request_method)).lower() try: self.verbose_log(f"Attempting download request to URL: {url}") @@ -889,112 +1250,30 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn self.verbose_log(f"Headers: {headers}") if requester == 'get': - # Streaming GET with retries - from tenacity import AsyncRetrying - head = bytearray() - resp_headers = {} - async for attempt in AsyncRetrying( - stop=stop_after_attempt(max(1, int(self.max_retries))), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=(retry_if_exception_type(aiohttp.ClientError) | - retry_if_exception_type(asyncio.TimeoutError)), - before_sleep=before_sleep_log(logging.getLogger(__name__), logging.INFO), - reraise=True, - ): - with attempt: - async with session.get(url, headers=headers, timeout=timeout) as response: - response.raise_for_status() - resp_headers = dict(response.headers or {}) - # Write to a temp file first - tmp_path = Path(self.downloads_dir) / f".part_{row_index}" - async with aiofiles.open(tmp_path, 'wb') as f: - async for chunk in response.content.iter_chunked(1 << 16): - if chunk: - if len(head) < (1 << 16): - need = (1 << 16) - len(head) - head.extend(chunk[:need]) - await f.write(chunk) - # Infer extension using URL, headers and first bytes - file_ext = self.infer_file_extension(url, resp_headers, bytes(head)) - if not self.is_supported_format(file_ext): - # Clean up temp and report - try: - os.remove(tmp_path) - except Exception: - pass - self.logger.warning(f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}") - return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count - # Decide final filename - if filename_base and str(filename_base).strip(): - filename = f"{filename_base}.{file_ext}" - else: - filename = self.generate_filename(row_index, file_ext) - final_path = Path(self.downloads_dir) / filename - try: - os.replace(tmp_path, final_path) - except Exception: - # Fallback to copy/rename - try: - os.rename(tmp_path, final_path) - except Exception: - pass - self.logger.info(f"Successfully downloaded {filename} from {url}") - return True, filename, file_ext, "", retry_count - else: - # Fallback to non-streaming POST - content, status, resp_headers = await self.make_request( - session, requester, url, headers, timeout + return await self._download_via_streaming_get( + session=session, + row_index=row_index, + url=url, + headers=headers, + timeout=timeout, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, ) - file_ext = self.infer_file_extension(url, resp_headers, content) - if not self.is_supported_format(file_ext): - self.logger.warning(f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}") - return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count - if filename_base and str(filename_base).strip(): - filename = f"{filename_base}.{file_ext}" - else: - filename = self.generate_filename(row_index, file_ext) - await self.write_file(filename, content, self.downloads_dir) - self.logger.info(f"Successfully downloaded {filename} from {url}") - return True, filename, file_ext, "", retry_count + return await self._download_via_buffered_request( + session=session, + requester=requester, + row_index=row_index, + url=url, + headers=headers, + timeout=timeout, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) except aiohttp.ClientResponseError as e: - # Handle HTTP errors - status = e.status - self.logger.warning(f"Received {status} for {url}") - - # Detailed verbose logging for HTTP errors - if self.verbose: - self.logger.debug(f"HTTP Error Details - Status: {e.status}, Message: {e.message}") - self.logger.debug(f"Headers: {e.headers if hasattr(e, 'headers') else 'No headers available'}") - self.logger.debug(f"Request info: {e.request_info if hasattr(e, 'request_info') else 'No request info available'}") - - # Build error with optional Retry-After info - retry_after = None - try: - hdrs = dict(getattr(e, 'headers', {}) or {}) - for k, v in hdrs.items(): - if k.lower() == 'retry-after': - val = str(v).strip() - if val.isdigit(): - retry_after = int(val) - else: - try: - dt = parsedate_to_datetime(val) - retry_after = max(0, int((dt.timestamp() - time.time()))) - except Exception: - retry_after = None - break - except Exception: - retry_after = None - error_msg = f"HTTP {status}: {str(e)}" - if status in (429, 503) and retry_after is not None: - error_msg += f" retry_after={retry_after}" - # Best-effort ext from URL if possible - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, 
error_msg, retry_count + 1 + return self._build_http_error_result(url, e, retry_count) except Exception as e: error_msg = str(e) @@ -1007,11 +1286,7 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn import traceback self.logger.debug(f"Traceback: {traceback.format_exc()}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 except asyncio.TimeoutError: self.logger.error(f"Overall timeout exceeded for {url}") @@ -1023,22 +1298,14 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn except aiohttp.ClientError as e: error_msg = str(e) self.logger.error(f"ClientError while downloading {url}: {error_msg}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 except asyncio.TimeoutError: self.logger.error(f"Timeout while downloading {url}") return False, "", "", "Timeout", retry_count + 1 except Exception as e: error_msg = str(e) self.logger.error(f"Error while downloading {url}: {error_msg}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 finally: if semaphore: try: @@ -1137,6 +1404,8 @@ def _write_checkpoint() -> None: for i, row_idx in enumerate(batch_indices): url = df.loc[row_idx, self.url_column] retry_count = df.loc[row_idx, 'download_retry_count'] + _, route_options = self._resolve_route(url) + _, _, _, skip_after = self._resolve_domain_scheduler_settings(route_options) # Optional per-row referer (e.g., external_link page) ref_val = None if self.referer_column 
and self.referer_column in df.columns: @@ -1156,7 +1425,7 @@ def _write_checkpoint() -> None: pass # Skip URLs that have failed too many times - if retry_count >= self.skip_failed_after: + if retry_count >= skip_after: self.logger.info(f"Skipping URL at row {row_idx} - too many failures: {retry_count}") continue @@ -1367,6 +1636,7 @@ def _write_checkpoint() -> None: domains: Dict[str, GlossDownloader._DomainState] = {} for idx in row_indices: url = df.at[idx, self.url_column] + _, route_options = self._resolve_route(url) # Determine grouping key if self.scheduler_group_by and self.scheduler_group_by != 'base_domain': key = str(df.at[idx, self.scheduler_group_by]) if self.scheduler_group_by in df.columns else '' @@ -1377,9 +1647,14 @@ def _write_checkpoint() -> None: if not key: key = '' if key not in domains: - # Each group starts with up to per_domain_concurrency, but not exceeding global - start_c = min(self.per_domain_concurrency, max(1, self.concurrency)) - domains[key] = GlossDownloader._DomainState(base=key, concurrency=start_c) + floor_c, ceiling_c, start_c, skip_after = self._resolve_domain_scheduler_settings(route_options) + domains[key] = GlossDownloader._DomainState( + base=key, + concurrency=start_c, + concurrency_floor=floor_c, + concurrency_ceiling=ceiling_c, + skip_failed_after=skip_after, + ) domains[key].queue.append(idx) if not domains: @@ -1638,7 +1913,7 @@ def estimate_eta_s(state: GlossDownloader._DomainState) -> float: if remaining <= 0: return 0.0 avg = state.avg_duration() or 5.0 # default initial guess - eff_c = max(self.domain_concurrency_floor, min(state.concurrency, self.domain_concurrency_ceiling)) + eff_c = max(state.concurrency_floor, min(state.concurrency, state.concurrency_ceiling)) # ETA ≈ remaining * avg / eff_c (assuming steady parallelism) return float(remaining) * avg / max(1, eff_c) @@ -1722,7 +1997,7 @@ async def dispatch_ready(): if pending_domains: active_order.append(pending_domains.popleft()) continue - state.concurrency 
= max(self.domain_concurrency_floor, 1) + state.concurrency = max(state.concurrency_floor, 1) self.progress_logger.info(f"[park] Unparked domain: {dom}; resuming at concurrency={state.concurrency}") # Attempt to launch up to (state.concurrency - state.active) while ( @@ -1734,7 +2009,7 @@ async def dispatch_ready(): url = df.at[row_idx, self.url_column] retry_count = int(df.at[row_idx, 'download_retry_count']) if 'download_retry_count' in df.columns else 0 # Skip rows with too many failures - if retry_count >= self.skip_failed_after: + if retry_count >= state.skip_failed_after: continue # Launch task t0 = time.time() @@ -1916,7 +2191,7 @@ async def dispatch_ready(): # Dynamic tuning: ease if overloaded if self.dynamic_tuning and should_ease(state): - if state.concurrency > self.domain_concurrency_floor: + if state.concurrency > state.concurrency_floor: state.concurrency -= 1 self.logger.info(f"Easing concurrency for {dom} -> {state.concurrency}") @@ -1936,14 +2211,14 @@ async def dispatch_ready(): if retry_after is None: retry_after = max(1, int(self.ping_recheck_seconds)) state.parked_until = now2 + retry_after - state.concurrency = max(self.domain_concurrency_floor, 1) + state.concurrency = max(state.concurrency_floor, 1) self.progress_logger.info(f"[park] Rate limited: {dom}; parked for {retry_after}s") # Timeout streak -> exponential backoff elif state.timeout_streak >= int(getattr(self, 'timeout_streak_threshold', 5)): backoff = min(float(getattr(self, 'backoff_min_s', 60.0)) * (2 ** max(0, state.ping_failures)), float(getattr(self, 'backoff_max_s', 900.0))) state.ping_failures += 1 state.parked_until = now2 + backoff - state.concurrency = max(self.domain_concurrency_floor, 1) + state.concurrency = max(state.concurrency_floor, 1) state.timeout_streak = 0 self.progress_logger.info(f"[park] Timeout streak: {dom}; parked for {int(backoff)}s (level={state.ping_failures})") else: @@ -1965,7 +2240,7 @@ async def dispatch_ready(): state.eta_exceeded_count += 1 if 
state.eta_exceeded_count == 1: # Try to increase concurrency gently to improve ETA, up to ceiling - if state.concurrency < self.domain_concurrency_ceiling: + if state.concurrency < state.concurrency_ceiling: state.concurrency += 1 self.logger.info( f"ETA high for {dom} ({int(eta_s)}s). Bumping concurrency -> {state.concurrency}" diff --git a/src/glossapi/gloss_extract.py b/src/glossapi/gloss_extract.py index 4a2477c..1c21cf1 100644 --- a/src/glossapi/gloss_extract.py +++ b/src/glossapi/gloss_extract.py @@ -10,7 +10,6 @@ AcceleratorDevice, AcceleratorOptions, PdfPipelineOptions, - RapidOcrOptions, LayoutOptions, TableStructureOptions, TableFormerMode, @@ -47,9 +46,9 @@ def _maybe_import_torch(*, force: bool = False): MarkdownFormatOption = None CsvFormatOption = None StandardPdfPipeline = None -DoclingParseV2DocumentBackend = None DoclingParseDocumentBackend = None PyPdfiumDocumentBackend = None +_DOCLING_PARSE_BACKEND_NAME = "docling_parse" class _NoOpOption: # minimal stand-ins for optional helpers @@ -84,19 +83,23 @@ def _ensure_docling_converter_loaded() -> None: def _ensure_docling_pipeline_loaded() -> None: global _DOC_PIPELINE_LOADED, StandardPdfPipeline - global DoclingParseV2DocumentBackend, DoclingParseDocumentBackend, PyPdfiumDocumentBackend + global DoclingParseDocumentBackend, PyPdfiumDocumentBackend, _DOCLING_PARSE_BACKEND_NAME if _DOC_PIPELINE_LOADED: return try: StandardPdfPipeline = importlib.import_module( "docling.pipeline.standard_pdf_pipeline" ).StandardPdfPipeline - DoclingParseV2DocumentBackend = importlib.import_module( - "docling.backend.docling_parse_v2_backend" - ).DoclingParseV2DocumentBackend - DoclingParseDocumentBackend = importlib.import_module( - "docling.backend.docling_parse_backend" - ).DoclingParseDocumentBackend + try: + DoclingParseDocumentBackend = importlib.import_module( + "docling.backend.docling_parse_backend" + ).DoclingParseDocumentBackend + _DOCLING_PARSE_BACKEND_NAME = "docling_parse" + except Exception: + 
DoclingParseDocumentBackend = importlib.import_module( + "docling.backend.docling_parse_v2_backend" + ).DoclingParseV2DocumentBackend + _DOCLING_PARSE_BACKEND_NAME = "docling_parse_v2" PyPdfiumDocumentBackend = importlib.import_module( "docling.backend.pypdfium2_backend" ).PyPdfiumDocumentBackend @@ -106,11 +109,8 @@ def _ensure_docling_pipeline_loaded() -> None: from docling.pipeline.simple_pipeline import SimplePipeline -# Ensure RapidOCR plugin is registered for factory-based OCR construction -import docling.models.rapid_ocr_model # noqa: F401 -from .ocr.rapidocr._paths import resolve_packaged_onnx_and_keys -from .ocr.rapidocr.pool import GLOBAL_RAPID_OCR_POOL import inspect +from .ocr.docling_pipeline import build_layout_pipeline import ftfy import logging @@ -328,7 +328,7 @@ def _apply_thread_caps(self) -> None: self._thread_caps_applied = True def release_resources(self) -> None: - """Release Docling converters, pooled RapidOCR engines, and GPU caches.""" + """Release Docling converters and GPU caches.""" try: self.converter = None except Exception: @@ -343,10 +343,6 @@ def release_resources(self) -> None: setattr(self, attr, None) except Exception: pass - try: - GLOBAL_RAPID_OCR_POOL.clear() - except Exception: - pass torch_mod = _maybe_import_torch() if torch_mod is not None and getattr(torch_mod, "cuda", None): try: @@ -390,7 +386,7 @@ def _convert_all_with_timeout(self, files: Iterable[Path], timeout_s: int, **kwa timeout_kw = None backend_cls = getattr(self, "_active_pdf_backend", None) - is_native_backend = backend_cls is DoclingParseV2DocumentBackend if backend_cls else False + is_native_backend = backend_cls is DoclingParseDocumentBackend if backend_cls else False if timeout_kw and not is_native_backend and len(set(budgets)) == 1: kw = dict(raises_on_error=False) @@ -553,12 +549,7 @@ def create_extractor( ocr_langs: list[str] | None = None, profile_timings: bool = True, ): - """Create a document converter with configured options using the canonical 
builder. - - Delegates PDF pipeline construction to `glossapi.ocr.rapidocr.pipeline.build_rapidocr_pipeline` - to avoid duplicated provider checks and option wiring. Falls back to the legacy - inline path if the canonical builder is unavailable. - """ + """Create a Docling document converter for Phase-1 extraction.""" _ensure_docling_converter_loaded() _ensure_docling_pipeline_loaded() # Enable/disable Docling pipeline timings collection (for benchmarks) @@ -569,176 +560,88 @@ def create_extractor( pass # Record the PDF backend name for provenance (default to native backend) - self.pdf_backend_name = "docling_parse_v2" - self._active_pdf_backend = DoclingParseV2DocumentBackend + self.pdf_backend_name = _DOCLING_PARSE_BACKEND_NAME + self._active_pdf_backend = DoclingParseDocumentBackend # Best-effort Torch preflight only if Phase‑1 is asked to do enrichment try: - if formula_enrichment: + if formula_enrichment or code_enrichment: torch_mod = _maybe_import_torch(force=True) if torch_mod is None: - raise RuntimeError("Torch not available but formula enrichment requested.") + raise RuntimeError("Torch not available but Docling GPU enrichment was requested.") if hasattr(torch_mod, "cuda") and isinstance(getattr(self, "pipeline_options", None), PdfPipelineOptions): dev = getattr(self.pipeline_options, "accelerator_options", None) dv = getattr(dev, "device", None) if (isinstance(dv, str) and dv.lower().startswith("cuda")) and not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") + raise RuntimeError("Torch CUDA not available but Docling GPU enrichment was requested.") except Exception as e: raise RuntimeError(f"Torch CUDA preflight failed: {e}") - # Build PDF pipeline via the canonical builder (preferred) - opts = None - active_backend = DoclingParseV2DocumentBackend - try: - from .ocr.rapidocr.pipeline import build_layout_pipeline, build_rapidocr_pipeline # type: ignore - except Exception: # pragma: no cover 
- adapter fallback - from ._pipeline import build_layout_pipeline, build_rapidocr_pipeline # type: ignore - - device_str = self._current_device_str() or "cuda:0" - builder = build_rapidocr_pipeline if enable_ocr else build_layout_pipeline - - try: - _, opts = builder( - device=device_str, - images_scale=float(images_scale), - formula_enrichment=bool(formula_enrichment), - code_enrichment=bool(code_enrichment), - **({"text_score": float(text_score)} if enable_ocr else {}), - ) - - if enable_ocr and hasattr(opts, "ocr_options") and getattr(opts, "ocr_options", None) is not None: - if use_cls is not None: - setattr(opts.ocr_options, "use_cls", bool(use_cls)) # type: ignore[attr-defined] - if ocr_langs: - setattr(opts.ocr_options, "lang", list(ocr_langs)) # type: ignore[attr-defined] - if force_full_page_ocr is not None: - setattr(opts.ocr_options, "force_full_page_ocr", bool(force_full_page_ocr)) # type: ignore[attr-defined] - + if enable_ocr: try: - setattr(opts, "images_scale", float(images_scale)) + self._log.warning( + "Docling Phase-1 OCR is no longer supported. " + "Ignoring enable_ocr/force_full_page_ocr; use Corpus.ocr(backend='deepseek') instead." 
+ ) except Exception: pass - self._active_pdf_options = opts - self._current_ocr_enabled = bool(enable_ocr) - - # Create a multi-format DocumentConverter using the built PDF options - pdf_backend = DoclingParseV2DocumentBackend - if not enable_ocr: - try: - if getattr(self, "use_pypdfium_backend", False): - pdf_backend = PyPdfiumDocumentBackend - self.pdf_backend_name = "pypdfium" - except Exception: - pdf_backend = DoclingParseV2DocumentBackend - if opts is None: - opts = self.pipeline_options - active_backend = pdf_backend - - self.converter = DocumentConverter( - allowed_formats=[ - InputFormat.PDF, - InputFormat.DOCX, - InputFormat.XML_JATS, - InputFormat.HTML, - InputFormat.PPTX, - InputFormat.CSV, - InputFormat.MD, - ], - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=opts, - pipeline_cls=StandardPdfPipeline, - backend=active_backend, - ), - InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline), - InputFormat.XML_JATS: XMLJatsFormatOption(), - InputFormat.HTML: HTMLFormatOption(), - InputFormat.PPTX: PowerpointFormatOption(), - InputFormat.CSV: CsvFormatOption(), - InputFormat.MD: MarkdownFormatOption(), - }, - ) - self._active_pdf_backend = active_backend + active_backend = DoclingParseDocumentBackend + device_str = self._current_device_str() or "cuda:0" + _, opts = build_layout_pipeline( + device=device_str, + images_scale=float(images_scale), + formula_enrichment=bool(formula_enrichment), + code_enrichment=bool(code_enrichment), + ) + try: + opts.do_ocr = False + setattr(opts, "images_scale", float(images_scale)) except Exception: - # Fallback to legacy inline configuration path - if enable_ocr: - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError( - "RapidOCR ONNX models/keys not found. Ensure models exist under glossapi.models/rapidocr or set GLOSSAPI_RAPIDOCR_ONNX_DIR." 
- ) - langs = ocr_langs or ["el", "en"] - ocr_opts = RapidOcrOptions( - backend="onnxruntime", - lang=langs, - force_full_page_ocr=bool(force_full_page_ocr), - use_det=True, - use_cls=bool(use_cls), - use_rec=True, - text_score=float(text_score), - det_model_path=r.det, - rec_model_path=r.rec, - cls_model_path=r.cls, - print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - self.pipeline_options.ocr_options = ocr_opts - # Attach core toggles to existing pipeline_options - try: - self.pipeline_options.do_ocr = bool(enable_ocr) - self.pipeline_options.do_formula_enrichment = bool(formula_enrichment) - self.pipeline_options.do_code_enrichment = bool(code_enrichment) - try: - setattr(self.pipeline_options, "images_scale", float(images_scale)) - except Exception: - pass - except Exception: - pass - if not enable_ocr: - try: - setattr(self.pipeline_options, "ocr_options", None) - except Exception: - pass + pass - pdf_backend = DoclingParseV2DocumentBackend - if not enable_ocr: - try: - if getattr(self, "use_pypdfium_backend", False): - pdf_backend = PyPdfiumDocumentBackend - self.pdf_backend_name = "pypdfium" - except Exception: - pdf_backend = DoclingParseV2DocumentBackend - - active_backend = pdf_backend - - self.converter = DocumentConverter( - allowed_formats=[ - InputFormat.PDF, - InputFormat.DOCX, - InputFormat.XML_JATS, - InputFormat.HTML, - InputFormat.PPTX, - InputFormat.CSV, - InputFormat.MD, - ], - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=self.pipeline_options, - pipeline_cls=StandardPdfPipeline, - backend=active_backend, - ), - }, - ) + self._active_pdf_options = opts + self._current_ocr_enabled = False - self._active_pdf_options = self.pipeline_options - self._current_ocr_enabled = bool(enable_ocr) - self._active_pdf_backend = active_backend + pdf_backend = DoclingParseDocumentBackend + try: + if getattr(self, "use_pypdfium_backend", False): + pdf_backend = PyPdfiumDocumentBackend + self.pdf_backend_name = "pypdfium" + 
except Exception: + pdf_backend = DoclingParseDocumentBackend + active_backend = pdf_backend + + self.converter = DocumentConverter( + allowed_formats=[ + InputFormat.PDF, + InputFormat.DOCX, + InputFormat.XML_JATS, + InputFormat.HTML, + InputFormat.PPTX, + InputFormat.CSV, + InputFormat.MD, + ], + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=opts, + pipeline_cls=StandardPdfPipeline, + backend=active_backend, + ), + InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline), + InputFormat.XML_JATS: XMLJatsFormatOption(), + InputFormat.HTML: HTMLFormatOption(), + InputFormat.PPTX: PowerpointFormatOption(), + InputFormat.CSV: CsvFormatOption(), + InputFormat.MD: MarkdownFormatOption(), + }, + ) + self._active_pdf_backend = active_backend # Record last configuration for reuse try: self._last_extractor_cfg = self._cfg_signature( - enable_ocr=enable_ocr, + enable_ocr=False, force_full_page_ocr=force_full_page_ocr, text_score=text_score, images_scale=images_scale, @@ -914,6 +817,17 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: except Exception as e: self._log.error(f"Failed to write chunk manifest for {file_path.name}: {e}") + # Always attempt to assemble whatever chunks succeeded (best-effort) + out_md_path = output_dir / f"{stem}.md" + final_md_written = False + if all_segments: + try: + final_md = "\n\n".join(all_segments) + out_md_path.write_text(final_md, encoding="utf-8") + final_md_written = True + except Exception as e: + self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}") + if not completed: # Record failure/timeout provenance in parquet try: @@ -928,6 +842,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: chunk_size=self.chunk_size, chunk_count=len(manifest.get("entries", [])), chunk_manifest_path=manifest_path, + no_partial_output=not final_md_written, ) except Exception as e: self._log.warning(f"Failed to record chunked 
extraction metadata for {file_path.name}: {e}") @@ -939,14 +854,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: self._log.error(f"Failed to copy timeout/failed file {file_path.name}: {e}") return False - # Assemble final markdown - try: - final_md = "\n\n".join(all_segments) - out_md_path = output_dir / f"{stem}.md" - with out_md_path.open("w", encoding="utf-8") as fp: - fp.write(final_md) - except Exception as e: - self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}") + if not final_md_written: return False # Record success provenance in parquet try: @@ -1294,7 +1202,7 @@ def _update_extraction_metadata( if chunk_manifest_path is not None: data["chunk_manifest_path"] = str(chunk_manifest_path) # Backend and failure - backend_name = getattr(self, "pdf_backend_name", None) or ("docling_parse_v2" if getattr(self, "USE_V2", True) else "docling_parse") + backend_name = getattr(self, "pdf_backend_name", None) or _DOCLING_PARSE_BACKEND_NAME data["extraction_backend"] = backend_name if status in ("timeout", "error", "failure"): data["failure_mode"] = status diff --git a/src/glossapi/ocr/__init__.py b/src/glossapi/ocr/__init__.py index bb167c4..df79456 100644 --- a/src/glossapi/ocr/__init__.py +++ b/src/glossapi/ocr/__init__.py @@ -1,7 +1,7 @@ """Lightweight OCR backend package. Exports minimal, import-safe helpers for OCR backends. Heavy -dependencies (vLLM, transformers, PyMuPDF) are imported lazily +dependencies (transformers, PyMuPDF) are imported lazily inside the specific backend functions so importing this package does not require GPU stacks or model weights. 
""" @@ -12,17 +12,14 @@ __all__ = [ "deepseek", - "rapidocr", "math", "utils", "deepseek_runner", - "rapidocr_dispatch", ] -_SUBPACKAGES = {"deepseek", "rapidocr", "math", "utils"} +_SUBPACKAGES = {"deepseek", "math", "utils"} _ALIASES = { "deepseek_runner": "glossapi.ocr.deepseek.runner", - "rapidocr_dispatch": "glossapi.ocr.rapidocr.dispatch", } diff --git a/src/glossapi/ocr/deepseek/__init__.py b/src/glossapi/ocr/deepseek/__init__.py index 5326c42..a5fb1ca 100644 --- a/src/glossapi/ocr/deepseek/__init__.py +++ b/src/glossapi/ocr/deepseek/__init__.py @@ -1,4 +1,4 @@ -"""DeepSeek OCR backend with a lightweight stub fallback.""" +"""DeepSeek OCR backend.""" from .runner import run_for_files from . import preflight diff --git a/src/glossapi/ocr/deepseek/preflight.py b/src/glossapi/ocr/deepseek/preflight.py index 76810e6..6669707 100644 --- a/src/glossapi/ocr/deepseek/preflight.py +++ b/src/glossapi/ocr/deepseek/preflight.py @@ -1,17 +1,16 @@ -"""Preflight checks for the DeepSeek OCR CLI environment.""" +"""Preflight checks for the DeepSeek OCR environment.""" from __future__ import annotations import dataclasses import os -import shutil import sys from pathlib import Path from typing import Dict, Iterable, List, Optional -DEFAULT_SCRIPT = Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py" -DEFAULT_MODEL_DIR = Path.cwd() / "deepseek-ocr" / "DeepSeek-OCR" -DEFAULT_LIB_DIR = Path.cwd() / "deepseek-ocr" / "libjpeg-turbo" / "lib" +REPO_ROOT = Path(__file__).resolve().parents[4] +DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" +DEFAULT_MODEL_DIR = REPO_ROOT / "deepseek-ocr-2-model" / "DeepSeek-OCR-2" @dataclasses.dataclass(frozen=True) @@ -46,9 +45,6 @@ def summarize(self) -> str: def _ensure_path(path: Path, label: str, errors: List[CheckResult]) -> Optional[Path]: - if not path: - errors.append(CheckResult(label, False, "Not provided")) - return None if not path.exists(): errors.append(CheckResult(label, False, 
f"Missing at {path}")) return None @@ -58,38 +54,45 @@ def _ensure_path(path: Path, label: str, errors: List[CheckResult]) -> Optional[ def check_deepseek_env( env: Optional[Dict[str, str]] = None, *, - check_flashinfer: bool = True, + check_torch: bool = True, ) -> PreflightReport: - """Validate DeepSeek CLI prerequisites without running the model.""" + """Validate DeepSeek OCR prerequisites without running the model.""" env = dict(env or os.environ) errors: List[CheckResult] = [] warnings: List[CheckResult] = [] infos: List[CheckResult] = [] - allow_cli = env.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "0") == "1" - allow_stub = env.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") == "1" + allow_cli = env.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") == "1" + allow_stub = env.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1" if not allow_cli: - warnings.append( + errors.append( CheckResult( "allow_cli", False, - "Set GLOSSAPI_DEEPSEEK_ALLOW_CLI=1 to force the real CLI.", + "DeepSeek OCR requires the real CLI/runtime. Set GLOSSAPI_DEEPSEEK_ALLOW_CLI=1.", ) ) if allow_stub: - warnings.append( + errors.append( CheckResult( "allow_stub", False, - "Set GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 to fail instead of falling back to stub output.", + "Stub execution is no longer supported. 
Set GLOSSAPI_DEEPSEEK_ALLOW_STUB=0.", ) ) - script = Path(env.get("GLOSSAPI_DEEPSEEK_VLLM_SCRIPT") or DEFAULT_SCRIPT) - _ensure_path(script, "vllm_script", errors) + script = Path( + env.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT") + or DEFAULT_SCRIPT + ) + _ensure_path(script, "runner_script", errors) - python_bin = Path(env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON") or sys.executable) + python_bin = Path( + env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON") + or env.get("GLOSSAPI_DEEPSEEK_PYTHON") + or sys.executable + ) _ensure_path(python_bin, "deepseek_python", errors) model_dir = Path( @@ -99,7 +102,7 @@ def check_deepseek_env( ) model_dir = _ensure_path(model_dir, "model_dir", errors) if model_dir: - has_weights = any(model_dir.glob("*.safetensors")) or (model_dir / "model-00001-of-000001.safetensors").exists() + has_weights = any(model_dir.glob("*.safetensors")) has_config = (model_dir / "config.json").exists() if not has_weights or not has_config: errors.append( @@ -110,34 +113,21 @@ def check_deepseek_env( ) ) - ld_path_env = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") - lib_dir = Path(ld_path_env) if ld_path_env else DEFAULT_LIB_DIR - _ensure_path(lib_dir, "ld_library_path", errors) - - cc1plus_path = shutil.which("cc1plus", path=env.get("PATH", "")) - if not cc1plus_path: - errors.append( - CheckResult( - "cc1plus", - False, - "C++ toolchain missing (cc1plus not on PATH); install g++ and ensure PATH includes gcc's cc1plus.", - ) - ) - else: - infos.append(CheckResult("cc1plus", True, f"Found at {cc1plus_path}")) - - if check_flashinfer: + if check_torch: try: - import flashinfer # type: ignore + import torch # type: ignore - infos.append(CheckResult("flashinfer", True, f"flashinfer {flashinfer.__version__} import ok")) + infos.append(CheckResult("torch", True, f"torch {torch.__version__} import ok")) + if not torch.cuda.is_available(): + warnings.append(CheckResult("cuda", False, "Torch CUDA is not available.")) except Exception as exc: # pragma: no cover - depends on env - 
errors.append(CheckResult("flashinfer", False, f"flashinfer import failed: {exc}")) + errors.append(CheckResult("torch", False, f"torch import failed: {exc}")) return PreflightReport(errors=errors, warnings=warnings, infos=infos) def main(argv: Optional[Iterable[str]] = None) -> int: + del argv report = check_deepseek_env() summary = report.summarize() if summary: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py new file mode 100644 index 0000000..213fdcf --- /dev/null +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -0,0 +1,499 @@ +"""CLI wrapper for DeepSeek-OCR-2 inference over PDF files.""" + +from __future__ import annotations + +import argparse +import json +import logging +import re +import sys +import tempfile +import time +from pathlib import Path +from typing import Iterable, List + +from PIL import Image + +SRC_ROOT = Path(__file__).resolve().parents[3] +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from glossapi.ocr.utils.cleaning import ( # noqa: E402 + apply_early_stop, + canonicalize_markdown, + clean_output, + strip_prompt_echo, +) + +LOGGER = logging.getLogger(__name__) +PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " +PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." 
def _parse_args(argv: List[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the DeepSeek-OCR-2 PDF runner.

    Args:
        argv: Argument strings to parse. ``None`` (the default) keeps the
            original behaviour of reading ``sys.argv[1:]``; passing a list
            lets callers and tests drive the parser programmatically.

    Returns:
        The populated ``argparse.Namespace``. ``--input-dir``,
        ``--output-dir`` and ``--model-dir`` are required. ``crop_mode`` is
        tri-state: ``True``/``False`` when one of the crop flags is given and
        ``None`` when neither is.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--model-dir", required=True)
    parser.add_argument("--files", nargs="*", default=[])
    parser.add_argument("--page-ranges", nargs="*", default=[])
    parser.add_argument("--max-pages", type=int, default=None)
    parser.add_argument("--device", default="cuda")
    parser.add_argument(
        "--ocr-profile",
        default="markdown_grounded",
        choices=["markdown_grounded", "plain_ocr"],
    )
    parser.add_argument("--prompt-override", default=None)
    parser.add_argument(
        "--attn-backend",
        default="auto",
        choices=["auto", "flash_attention_2", "sdpa", "eager"],
    )
    parser.add_argument("--base-size", type=int, default=None)
    parser.add_argument("--image-size", type=int, default=None)
    parser.add_argument("--render-dpi", type=int, default=144)
    parser.add_argument("--max-new-tokens", type=int, default=DEFAULT_MAX_NEW_TOKENS)
    parser.add_argument("--repetition-penalty", type=float, default=None)
    parser.add_argument("--no-repeat-ngram-size", type=int, default=None)
    # Both flags write to the same destination; the explicit None default
    # below makes "neither flag given" distinguishable from either choice.
    parser.add_argument("--crop-mode", dest="crop_mode", action="store_true")
    parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false")
    parser.set_defaults(crop_mode=None)
    parser.add_argument("--content-debug", action="store_true")
    # argparse falls back to sys.argv[1:] when handed None.
    return parser.parse_args(argv)
start_raw, end_raw = str(spec).rsplit(":", 2) + except ValueError as exc: + raise ValueError(f"Invalid page range spec: {spec}") from exc + start_page = int(start_raw) + end_page = int(end_raw) + if start_page <= 0 or end_page < start_page: + raise ValueError(f"Invalid page range bounds: {spec}") + pdf_path = (input_dir / name).resolve() + return { + "pdf_path": pdf_path, + "source_name": str(name), + "source_stem": pdf_path.stem, + "start_page": start_page, + "end_page": end_page, + "stem": f"{pdf_path.stem}__p{start_page:05d}-{end_page:05d}", + } + + +def _iter_pdf_jobs(input_dir: Path, files: List[str], page_ranges: List[str]) -> List[dict]: + jobs: List[dict] = [] + if files: + for name in files: + pdf_path = (input_dir / name).resolve() + jobs.append( + { + "pdf_path": pdf_path, + "source_name": str(name), + "source_stem": pdf_path.stem, + "start_page": 1, + "end_page": None, + "stem": pdf_path.stem, + } + ) + if page_ranges: + jobs.extend(_parse_page_range_spec(input_dir, spec) for spec in page_ranges) + if jobs: + return jobs + return [ + { + "pdf_path": path.resolve(), + "source_name": path.name, + "source_stem": path.stem, + "start_page": 1, + "end_page": None, + "stem": path.stem, + } + for path in sorted(input_dir.glob("*.pdf")) + ] + + +def _render_pages( + pdf_path: Path, + max_pages: int | None, + render_dpi: int, + *, + start_page: int = 1, + end_page: int | None = None, +) -> List[Image.Image]: + import fitz + + images: List[Image.Image] = [] + doc = fitz.open(pdf_path) + try: + doc_page_count = int(doc.page_count) + first_idx = max(0, int(start_page) - 1) + last_idx = doc_page_count - 1 if end_page is None else min(doc_page_count - 1, int(end_page) - 1) + if max_pages is not None: + last_idx = min(last_idx, first_idx + int(max_pages) - 1) + if last_idx < first_idx: + return images + zoom = float(render_dpi) / 72.0 + matrix = fitz.Matrix(zoom, zoom) + for idx in range(first_idx, last_idx + 1): + page = doc[idx] + pixmap = 
page.get_pixmap(matrix=matrix, alpha=False) + img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples) + images.append(img) + finally: + doc.close() + return images + + +def _clean_markdown(text: str) -> str: + text = (text or "").replace("<|end▁of▁sentence|>", "").strip() + pattern = re.compile(r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)", re.DOTALL) + matches = pattern.findall(text) + for full_match, label, _coords in matches: + if label == "image": + text = text.replace(full_match, "") + else: + text = text.replace(full_match, "") + return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() + + +def _postprocess_page_text( + text: str, + *, + prompt: str, + content_debug: bool, +) -> tuple[str, dict]: + metrics: dict = {} + cleaned = _clean_markdown(text) + cleaned = strip_prompt_echo(cleaned, prompt) + cleaned = clean_output(cleaned, keep_refdet=False, metrics=metrics) + cleaned = canonicalize_markdown(cleaned) + cleaned = apply_early_stop(cleaned, content_debug=content_debug, metrics=metrics) + return cleaned.strip(), metrics + + +def _resolve_attn_backend(attn_backend: str) -> str: + requested = str(attn_backend or "auto").strip().lower() + if requested != "auto": + return requested + try: + import flash_attn # noqa: F401 + return "flash_attention_2" + except Exception: + # DeepSeek-OCR-2's custom decoder path has not behaved reliably with SDPA + # on the stacks we have exercised; if FA2 is unavailable, prefer the known + # fallback instead of silently selecting a backend that then downgrades. 
+ return "eager" + + +def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: + if str(attn_impl) == "eager": + return False + message = str(exc) + markers = ( + "does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention", + 'load your model with the argument `attn_implementation="eager"` meanwhile', + ) + return any(marker in message for marker in markers) + + +def _configure_generate( + model, + *, + max_new_tokens: int | None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): + if ( + max_new_tokens is None + and repetition_penalty is None + and no_repeat_ngram_size is None + ): + return + capped = None + if max_new_tokens is not None: + capped = int(max_new_tokens) + if capped <= 0: + raise ValueError("max_new_tokens must be > 0") + repetition_penalty_value = None + if repetition_penalty is not None: + repetition_penalty_value = float(repetition_penalty) + if repetition_penalty_value <= 0: + raise ValueError("repetition_penalty must be > 0") + no_repeat_ngram_value = None + if no_repeat_ngram_size is not None: + no_repeat_ngram_value = int(no_repeat_ngram_size) + if no_repeat_ngram_value <= 0: + raise ValueError("no_repeat_ngram_size must be > 0") + original_generate = model.generate + + def _wrapped_generate(*args, **kwargs): + if capped is not None: + current = kwargs.get("max_new_tokens") + if current is None: + kwargs["max_new_tokens"] = capped + else: + kwargs["max_new_tokens"] = min(int(current), capped) + if repetition_penalty_value is not None and kwargs.get("repetition_penalty") is None: + kwargs["repetition_penalty"] = repetition_penalty_value + if no_repeat_ngram_value is not None and kwargs.get("no_repeat_ngram_size") is None: + kwargs["no_repeat_ngram_size"] = no_repeat_ngram_value + return original_generate(*args, **kwargs) + + model.generate = _wrapped_generate + + +def _load_model( + model_dir: Path, + device: str, + attn_backend: str, + max_new_tokens: int | 
def _infer_page(
    model,
    tokenizer,
    image_path: Path,
    output_dir: Path,
    *,
    prompt: str,
    base_size: int,
    image_size: int,
    crop_mode: bool,
) -> str:
    """Run ``model.infer`` on one rendered page and return its cleaned text.

    The call is made with ``save_results=False`` and ``eval_mode=True``; the
    returned value is stringified and passed through ``_clean_markdown``.
    """
    infer_options = {
        "prompt": prompt,
        "image_file": str(image_path),
        "output_path": str(output_dir),
        "base_size": base_size,
        "image_size": image_size,
        "crop_mode": crop_mode,
        "save_results": False,
        "eval_mode": True,
    }
    raw_result = model.infer(tokenizer, **infer_options)
    return _clean_markdown(str(raw_result))
def _write_progress(
    output_dir: Path,
    stem: str,
    page_outputs: List[str],
    total_pages: int,
    completed_pages: int,
) -> None:
    """Emit lightweight progress artifacts during long OCR runs.

    Writes ``json/metrics/<stem>.progress.json`` on every call and, when the
    joined page text is non-empty, ``sidecars/ocr_progress/<stem>.partial.md``.

    Args:
        output_dir: Root directory of the OCR run outputs.
        stem: Job stem used to name the artifacts.
        page_outputs: Page texts produced so far, joined with ``PAGE_SPLIT``.
        total_pages: Number of pages in the job.
        completed_pages: Number of pages finished so far.
    """
    metrics_dir = output_dir / "json" / "metrics"
    progress_dir = output_dir / "sidecars" / "ocr_progress"
    metrics_dir.mkdir(parents=True, exist_ok=True)
    progress_dir.mkdir(parents=True, exist_ok=True)
    partial_markdown = PAGE_SPLIT.join(page_outputs).strip()
    if partial_markdown:
        (progress_dir / f"{stem}.partial.md").write_text(partial_markdown + "\n", encoding="utf-8")
    progress = {
        "completed_pages": completed_pages,
        "total_pages": total_pages,
        # A job with zero pages reports "complete" immediately (0 < 0 is false).
        "status": "running" if completed_pages < total_pages else "complete",
        "model": "deepseek-ai/DeepSeek-OCR-2",
    }
    (metrics_dir / f"{stem}.progress.json").write_text(
        json.dumps(progress, indent=2),
        encoding="utf-8",
    )
= bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + tokenizer, model, attn_impl = _load_model( + model_dir, + args.device, + args.attn_backend, + args.max_new_tokens, + args.repetition_penalty, + args.no_repeat_ngram_size, + ) + + for job in jobs: + pdf_path = Path(job["pdf_path"]) + stem = str(job["stem"]) + doc_start = time.perf_counter() + render_start = time.perf_counter() + images = _render_pages( + pdf_path, + args.max_pages, + args.render_dpi, + start_page=int(job["start_page"]), + end_page=job["end_page"], + ) + render_sec = time.perf_counter() - render_start + page_outputs: List[str] = [] + page_metrics: List[dict] = [] + total_pages = len(images) + _write_progress(output_dir, stem, page_outputs, total_pages, 0) + with tempfile.TemporaryDirectory(prefix=f"{stem}_deepseek_") as tmp_dir_str: + tmp_dir = Path(tmp_dir_str) + for idx, image in enumerate(images): + page_png = tmp_dir / f"page_{idx + 1:04d}.png" + image.save(page_png, format="PNG") + infer_start = time.perf_counter() + raw_page_text = _infer_page( + model, + tokenizer, + page_png, + tmp_dir / f"page_{idx + 1:04d}", + prompt=prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + ) + infer_sec = time.perf_counter() - infer_start + page_text, postprocess_metrics = _postprocess_page_text( + raw_page_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + page_text = f"\n{page_text}".strip() + page_outputs.append(page_text) + page_metrics.append( + { + "page_number": int(idx + 1), + "infer_sec": float(infer_sec), + "raw_chars": int(len(str(raw_page_text or "").strip())), + "final_chars": int(len(page_text.strip())), + **postprocess_metrics, + } + ) + _write_progress( + output_dir, + stem, + page_outputs, + total_pages, + idx + 1, + ) + markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" + _write_outputs( + output_dir, + stem, + markdown, + len(images), + 
extra_metrics={ + "source_file": str(job["source_name"]), + "source_stem": str(job["source_stem"]), + "source_start_page": int(job["start_page"]), + "source_end_page": int(job["start_page"]) + max(0, len(images) - 1), + "ocr_profile": args.ocr_profile, + "attn_backend": attn_impl, + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "repetition_penalty": args.repetition_penalty, + "no_repeat_ngram_size": args.no_repeat_ngram_size, + "render_sec": float(render_sec), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - doc_start), + "page_metrics": page_metrics, + }, + ) + + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py new file mode 100644 index 0000000..6368f81 --- /dev/null +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -0,0 +1,562 @@ +"""CLI wrapper for DeepSeek-OCR-2 inference over PDF files using vLLM.""" + +from __future__ import annotations + +import argparse +import logging +import tempfile +import time +from pathlib import Path +from typing import Dict, List + +from PIL import Image + +from glossapi.ocr.deepseek.run_pdf_ocr_transformers import ( + DEFAULT_MAX_NEW_TOKENS, + PAGE_SPLIT, + _iter_pdf_jobs, + _postprocess_page_text, + _profile_defaults, + _render_pages, + _write_outputs, + _write_progress, +) +from glossapi.ocr.utils.cleaning import StreamingGarbageDetector + +LOGGER = logging.getLogger(__name__) +REPAIR_DARK_THRESHOLD = 235 +EMPTY_PAGE_OVERALL_DARK_MAX = 0.0015 +EMPTY_PAGE_BAND_DARK_MAX = 0.0025 +GARBAGE_EARLY_STOP_MIN_OUTPUT_TOKENS = 48 +GARBAGE_EARLY_STOP_WINDOW_TOKENS = 160 + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--input-dir", 
def _load_vllm(model_dir: Path, gpu_memory_utilization: float, disable_fp8_kv: bool):
    """Construct the vLLM ``LLM`` engine for the model stored at ``model_dir``.

    Up to two logits processors are registered with the engine. Each one is
    set up inside its own ``try`` block; if its imports or definitions fail,
    a warning is logged and the engine is built without it.

    Args:
        model_dir: Directory used both as the model and the tokenizer path.
        gpu_memory_utilization: Forwarded to the engine as a float.
        disable_fp8_kv: When true, ``kv_cache_dtype`` is set to ``"auto"``.

    Returns:
        The constructed ``LLM`` instance.
    """
    from vllm import LLM

    logits_processors = []
    try:
        from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

        logits_processors.append(NGramPerReqLogitsProcessor)
    except Exception as exc:  # pragma: no cover - environment dependent
        LOGGER.warning("DeepSeek OCR logits processor unavailable in vLLM; continuing without it: %s", exc)

    try:
        from transformers import AutoTokenizer
        from vllm.sampling_params import SamplingParams
        from vllm.v1.sample.logits_processor import AdapterLogitsProcessor

        class _GarbageStopPerReqLogitsProcessor:
            """Per-request state that forces EOS once the garbage detector fires."""

            def __init__(
                self,
                tokenizer,
                eos_token_id: int | None,
                *,
                min_output_tokens: int,
                window_tokens: int,
            ) -> None:
                self.tokenizer = tokenizer
                self.eos_token_id = eos_token_id
                self.min_output_tokens = int(min_output_tokens)
                # NOTE(review): stored but never read by __call__ below.
                self.window_tokens = int(window_tokens)
                self.detector = StreamingGarbageDetector()
                # Number of output tokens already decoded and fed to the detector.
                self.seen_output_tokens = 0

            def __call__(self, prompt_ids: list[int], output_ids: list[int], logits):
                del prompt_ids
                # Without an EOS id there is nothing to force, so pass through.
                if self.eos_token_id is None:
                    return logits
                current_len = len(output_ids)
                if current_len <= self.seen_output_tokens:
                    return logits
                # Decode only the tokens generated since the previous call.
                new_ids = output_ids[self.seen_output_tokens :]
                self.seen_output_tokens = current_len
                if not new_ids:
                    return logits
                new_text = self.tokenizer.decode(new_ids, skip_special_tokens=False)
                if new_text:
                    self.detector.feed(new_text)
                # Leave logits alone until the minimum length is reached and the
                # detector has reported a trigger reason.
                if current_len < self.min_output_tokens or self.detector.triggered_reason is None:
                    return logits
                # Mask every token except EOS, so EOS is the only finite logit.
                eos_token_id = int(self.eos_token_id)
                eos_value = logits[eos_token_id].clone()
                logits[:] = float("-inf")
                logits[eos_token_id] = eos_value
                return logits

        class GarbageEarlyStopLogitsProcessor(AdapterLogitsProcessor):
            """Adapter that creates a garbage-stop processor per opted-in request."""

            @classmethod
            def validate_params(cls, params: SamplingParams):
                # Validation applies only when "garbage_early_stop" is present
                # in the request's extra_args.
                extra = params.extra_args or {}
                enabled = extra.get("garbage_early_stop")
                if enabled is None:
                    return
                if not isinstance(enabled, bool):
                    raise ValueError("garbage_early_stop must be a bool when provided")
                min_output_tokens = extra.get("garbage_min_output_tokens")
                if min_output_tokens is not None and int(min_output_tokens) <= 0:
                    raise ValueError("garbage_min_output_tokens must be > 0")
                window_tokens = extra.get("garbage_window_tokens")
                if window_tokens is not None and int(window_tokens) <= 0:
                    raise ValueError("garbage_window_tokens must be > 0")

            def __init__(self, vllm_config, device, is_pin_memory):
                super().__init__(vllm_config, device, is_pin_memory)
                # The tokenizer is loaded from the enclosing function's model_dir.
                self._tokenizer = AutoTokenizer.from_pretrained(str(model_dir), trust_remote_code=True)
                self._eos_token_id = self._tokenizer.eos_token_id

            def is_argmax_invariant(self) -> bool:
                # NOTE(review): presumably False because forcing EOS can change
                # the greedy token - confirm against the vLLM interface docs.
                return False

            def new_req_logits_processor(self, params: SamplingParams):
                extra = params.extra_args or {}
                # Requests that did not opt in get no per-request processor.
                if not bool(extra.get("garbage_early_stop", False)):
                    return None
                return _GarbageStopPerReqLogitsProcessor(
                    self._tokenizer,
                    self._eos_token_id,
                    min_output_tokens=int(
                        extra.get("garbage_min_output_tokens", GARBAGE_EARLY_STOP_MIN_OUTPUT_TOKENS)
                    ),
                    window_tokens=int(
                        extra.get("garbage_window_tokens", GARBAGE_EARLY_STOP_WINDOW_TOKENS)
                    ),
                )

        logits_processors.append(GarbageEarlyStopLogitsProcessor)
    except Exception as exc:  # pragma: no cover - environment dependent
        LOGGER.warning("Garbage-stop logits processor unavailable in vLLM; continuing without it: %s", exc)

    engine_kwargs = {
        "model": str(model_dir),
        "tokenizer": str(model_dir),
        "trust_remote_code": True,
        "dtype": "bfloat16",
        "enable_prefix_caching": False,
        "mm_processor_cache_gb": 0,
        "gpu_memory_utilization": float(gpu_memory_utilization),
        "tensor_parallel_size": 1,
    }
    # NOTE(review): kv_cache_dtype is only set on the disable path; otherwise
    # the key is omitted and vLLM's own default applies. Confirm that default
    # actually selects FP8, or this flag has no effect.
    if disable_fp8_kv:
        engine_kwargs["kv_cache_dtype"] = "auto"
    if logits_processors:
        engine_kwargs["logits_processors"] = logits_processors
    return LLM(**engine_kwargs)
int(GARBAGE_EARLY_STOP_MIN_OUTPUT_TOKENS), + "garbage_window_tokens": int(GARBAGE_EARLY_STOP_WINDOW_TOKENS), + }, + ) + + +def _batched(items: List[dict], batch_size: int) -> List[List[dict]]: + size = max(1, int(batch_size)) + return [items[idx : idx + size] for idx in range(0, len(items), size)] + + +def _image_content_stats(image: Image.Image) -> dict: + sample = image.convert("L") + sample.thumbnail((256, 256)) + width, height = sample.size + pixels = list(sample.getdata()) + + def _dark_ratio(y0: int, y1: int) -> float: + values = [] + for row in range(y0, y1): + start = row * width + values.extend(pixels[start : start + width]) + total = len(values) + if total <= 0: + return 0.0 + dark = sum(1 for value in values if value < REPAIR_DARK_THRESHOLD) + return float(dark) / float(total) + + half = max(1, height // 2) + third = max(1, height // 3) + top_third_end = min(height, third) + middle_third_end = min(height, third * 2) + dark_total = sum(1 for value in pixels if value < REPAIR_DARK_THRESHOLD) + return { + "top_dark_ratio": _dark_ratio(0, half), + "bottom_dark_ratio": _dark_ratio(half, height), + "top_third_dark_ratio": _dark_ratio(0, top_third_end), + "middle_third_dark_ratio": _dark_ratio(top_third_end, middle_third_end), + "bottom_third_dark_ratio": _dark_ratio(middle_third_end, height), + "overall_dark_ratio": float(dark_total) / float(max(1, len(pixels))), + } + + +def _text_quality_metrics(text: str) -> dict: + stripped = str(text or "").strip() + letters = sum(1 for ch in stripped if ch.isalpha()) + digits = sum(1 for ch in stripped if ch.isdigit()) + pua_chars = sum( + 1 + for ch in stripped + if 0xE000 <= ord(ch) <= 0xF8FF + or 0xF0000 <= ord(ch) <= 0xFFFFD + or 0x100000 <= ord(ch) <= 0x10FFFD + ) + lines = [line.strip() for line in stripped.splitlines() if line.strip()] + avg_line_length = (sum(len(line) for line in lines) / float(len(lines))) if lines else 0.0 + score = float(letters) + (0.10 * float(len(stripped))) + (0.05 * float(digits)) - 
def _generate_batch_outputs(
    llm,
    *,
    jobs: List[dict],
    prompt: str,
    batch_size: int,
    sampling_params,
) -> List[dict]:
    """Run ``llm.generate`` over ``jobs`` in batches and collect raw page text.

    Args:
        llm: Engine exposing ``generate(prompts, sampling_params=...)``.
        jobs: Page jobs; each provides ``stem``, ``page_number`` and whatever
            ``_load_job_image`` reads to open the page image.
        prompt: Prompt text sent with every page image.
        batch_size: Maximum number of pages per ``generate`` call.
        sampling_params: Passed through to ``llm.generate`` unchanged.

    Returns:
        One dict per job, in the order of ``jobs``, with keys ``item`` (the
        job), ``raw_text`` (first candidate text, or ``""`` when the request
        has no outputs) and ``infer_sec`` (the batch wall time divided evenly
        across the pages in that batch).
    """
    outputs_by_key: Dict[tuple[str, int], dict] = {}
    for batch in _batched(jobs, batch_size):
        prompt_batch = []
        opened_images: List[Image.Image] = []
        keys: List[tuple[str, int]] = []
        try:
            for item in batch:
                image = _load_job_image(item)
                opened_images.append(image)
                keys.append((str(item["stem"]), int(item["page_number"])))
                prompt_batch.append(
                    {
                        "prompt": prompt,
                        "multi_modal_data": {"image": image},
                    }
                )
            infer_start = time.perf_counter()
            batch_outputs = llm.generate(prompt_batch, sampling_params=sampling_params)
            infer_sec = time.perf_counter() - infer_start
        finally:
            # Release the page images even when loading a later page or the
            # generate call raises; previously they stayed open on failure.
            for image in opened_images:
                image.close()
        per_item_sec = infer_sec / max(1, len(batch))
        for item, key, output in zip(batch, keys, batch_outputs):
            raw_text = ""
            if getattr(output, "outputs", None):
                raw_text = str(output.outputs[0].text)
            outputs_by_key[key] = {
                "item": item,
                "raw_text": raw_text,
                "infer_sec": float(per_item_sec),
            }
    # Results are returned in job order, looked up by (stem, page number).
    return [outputs_by_key[(str(item["stem"]), int(item["page_number"]))] for item in jobs]
* total_pages, + "render_sec": float(render_sec), + "doc_start": float(doc_start), + "completed_pages": 0, + "total_pages": total_pages, + } + doc_states[stem] = state + _write_progress(output_dir, stem, [], total_pages, 0) + for idx, image in enumerate(images): + page_path = tmp_dir / f"{stem}_page_{idx + 1:04d}.png" + image_stats = _image_content_stats(image) + if _is_effectively_empty_page(image_stats, args.repair_mode): + state["page_metrics"][idx] = { + "page_number": int(idx + 1), + "infer_sec": 0.0, + "raw_chars": 0, + "final_chars": 0, + "first_pass_quality_score": 0.0, + "first_pass_letters": 0, + "first_pass_digits": 0, + "first_pass_pua_chars": 0, + "repair_strategy": "skip_empty", + "repair_reason": "empty_page", + "repair_attempted": False, + "repair_applied": False, + "empty_page_skipped": True, + "garbage_early_stop_applied": False, + **image_stats, + } + state["completed_pages"] = int(state["completed_pages"]) + 1 + _write_progress( + output_dir, + stem, + [page for page in state["page_outputs"] if page], + int(state["total_pages"]), + int(state["completed_pages"]), + ) + image.close() + continue + image.save(page_path, format="PNG") + image.close() + jobs.append( + { + "stem": stem, + "page_number": int(idx + 1), + "image_path": page_path, + "image_stats": image_stats, + } + ) + + first_pass_outputs = _generate_batch_outputs( + llm, + jobs=jobs, + prompt=prompt, + batch_size=int(args.batch_size), + sampling_params=sampling_params, + ) + for result in first_pass_outputs: + item = result["item"] + state = doc_states[item["stem"]] + raw_text = str(result["raw_text"]) + image_stats = dict(item.get("image_stats", {})) + page_text, postprocess_metrics = _postprocess_page_text( + raw_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + page_text = f"\n{page_text}".strip() + state["page_outputs"][item["page_number"] - 1] = page_text + quality = _text_quality_metrics(page_text) + metric = { + "page_number": 
int(item["page_number"]), + "infer_sec": float(result["infer_sec"]), + "raw_chars": int(len(raw_text.strip())), + "final_chars": int(len(page_text.strip())), + "first_pass_quality_score": float(quality["quality_score"]), + "first_pass_letters": int(quality["letters"]), + "first_pass_digits": int(quality["digits"]), + "first_pass_pua_chars": int(quality["pua_chars"]), + "repair_strategy": "plain" if bool(postprocess_metrics.get("early_stops", 0)) else "none", + "repair_reason": "early_stop_markdown_garbage" if bool(postprocess_metrics.get("early_stops", 0)) else None, + "repair_attempted": False, + "repair_applied": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": bool(postprocess_metrics.get("early_stops", 0)), + **image_stats, + **postprocess_metrics, + } + state["page_metrics"][item["page_number"] - 1] = metric + if bool(postprocess_metrics.get("early_stops", 0)) and str(args.repair_mode or "off").strip().lower() == "auto": + plain_retry_jobs.append(item) + state["completed_pages"] = int(state["completed_pages"]) + 1 + _write_progress( + output_dir, + item["stem"], + [page for page in state["page_outputs"] if page], + int(state["total_pages"]), + int(state["completed_pages"]), + ) + + if plain_retry_jobs: + plain_repair_outputs = _generate_batch_outputs( + llm, + jobs=plain_retry_jobs, + prompt=plain_prompt, + batch_size=int(args.batch_size), + sampling_params=sampling_params, + ) + for result in plain_repair_outputs: + item = result["item"] + state = doc_states[item["stem"]] + metric = state["page_metrics"][item["page_number"] - 1] + repair_text, repair_postprocess = _postprocess_page_text( + str(result["raw_text"]), + prompt=plain_prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + repair_text = f"\n{repair_text}".strip() + metric["repair_attempted"] = True + metric["repair_infer_sec"] = float(result["infer_sec"]) + metric["repair_raw_chars"] = int(len(str(result["raw_text"]).strip())) + 
metric["repair_final_chars"] = int(len(repair_text.strip())) + metric["repair_profile"] = "plain_ocr" + metric["repair_quality_score"] = float(_text_quality_metrics(repair_text)["quality_score"]) + metric["repair_garbage_early_stop_applied"] = bool(repair_postprocess.get("early_stops", 0)) + metric.update({f"repair_{key}": value for key, value in repair_postprocess.items()}) + metric["infer_sec"] = float(metric["infer_sec"]) + float(result["infer_sec"]) + if repair_text.strip(): + state["page_outputs"][item["page_number"] - 1] = repair_text + metric["repair_applied"] = True + metric["final_chars"] = int(len(repair_text.strip())) + _write_progress( + output_dir, + item["stem"], + [page for page in state["page_outputs"] if page], + int(state["total_pages"]), + int(state["completed_pages"]), + ) + + for stem, state in doc_states.items(): + markdown = PAGE_SPLIT.join(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]" + page_metrics = sorted( + [item for item in state["page_metrics"] if item], + key=lambda item: int(item["page_number"]), + ) + repair_summary = { + "repair_mode": str(args.repair_mode), + "pages_flagged": int(sum(1 for item in page_metrics if str(item.get("repair_strategy")) != "none")), + "pages_repaired": int(sum(1 for item in page_metrics if bool(item.get("repair_applied")))), + "plain_repairs": int(sum(1 for item in page_metrics if str(item.get("repair_profile")) == "plain_ocr" and bool(item.get("repair_applied")))), + "tiled_repairs": 0, + "empty_pages_skipped": int(sum(1 for item in page_metrics if bool(item.get("empty_page_skipped")))), + "pages_with_early_stop": int(sum(1 for item in page_metrics if bool(item.get("garbage_early_stop_applied")))), + } + _write_outputs( + output_dir, + stem, + markdown, + int(state["total_pages"]), + extra_metrics={ + "source_file": str(state["source_name"]), + "source_stem": str(state["source_stem"]), + "source_start_page": int(state["source_start_page"]), + "source_end_page": 
int(state["source_start_page"]) + max(0, len(page_metrics) - 1), + "ocr_profile": args.ocr_profile, + "attn_backend": "vllm", + "runtime_backend": "vllm", + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "batch_size": int(args.batch_size), + "gpu_memory_utilization": float(args.gpu_memory_utilization), + "disable_fp8_kv": bool(args.disable_fp8_kv), + "repair_mode": str(args.repair_mode), + "render_sec": float(state["render_sec"]), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - float(state["doc_start"])), + "repair_summary": repair_summary, + "page_metrics": page_metrics, + }, + ) + + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index d68f05c..7a22018 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -1,7 +1,8 @@ -"""DeepSeek OCR runner with stub and optional CLI dispatch.""" +"""DeepSeek OCR runner.""" from __future__ import annotations +from contextlib import ExitStack import json import logging import os @@ -11,12 +12,26 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Optional +from glossapi.ocr.deepseek.scheduling import ( + SourceDocument, + assign_batches_to_lanes, + build_exact_fill_batches, + build_fixed_shard_slices, + build_whole_document_slices, + pack_slices_into_batches, +) + try: import pypdfium2 as _pypdfium2 except Exception: # pragma: no cover - optional dependency _pypdfium2 = None LOGGER = logging.getLogger(__name__) +REPO_ROOT = Path(__file__).resolve().parents[4] +DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" +DEFAULT_VLLM_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_vllm.py" 
+AUTO_VLLM_BATCH_PAGE_CAP = 160 +DEFAULT_MAX_NEW_TOKENS = 2048 def _page_count(pdf_path: Path) -> int: @@ -28,17 +43,34 @@ def _page_count(pdf_path: Path) -> int: return 0 -def _run_cli( +def _build_cli_command( input_dir: Path, output_dir: Path, *, + files: List[str], + page_ranges: Optional[List[str]], + model_dir: Path, python_bin: Optional[Path], script: Path, max_pages: Optional[int], content_debug: bool, - gpu_memory_utilization: Optional[float] = None, - disable_fp8_kv: bool = False, -) -> None: + device: Optional[str], + ocr_profile: str, + prompt_override: Optional[str], + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, + repair_mode: Optional[str], +) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ str(python_exe), @@ -47,151 +79,705 @@ def _run_cli( str(input_dir), "--output-dir", str(output_dir), + "--model-dir", + str(model_dir), ] + if files: + cmd += ["--files", *files] + if page_ranges: + cmd += ["--page-ranges", *page_ranges] if max_pages is not None: cmd += ["--max-pages", str(max_pages)] if content_debug: cmd.append("--content-debug") - if gpu_memory_utilization is not None: - cmd += ["--gpu-memory-utilization", str(gpu_memory_utilization)] - if disable_fp8_kv: - cmd.append("--no-fp8-kv") + if device: + cmd += ["--device", str(device)] + if ocr_profile: + cmd += ["--ocr-profile", str(ocr_profile)] + if prompt_override: + cmd += ["--prompt-override", str(prompt_override)] + if attn_backend: + cmd += ["--attn-backend", str(attn_backend)] + if base_size is not None: + cmd += ["--base-size", str(int(base_size))] + if image_size is not None: + cmd += ["--image-size", 
str(int(image_size))] + if crop_mode is True: + cmd.append("--crop-mode") + elif crop_mode is False: + cmd.append("--no-crop-mode") + if render_dpi is not None: + cmd += ["--render-dpi", str(int(render_dpi))] + if max_new_tokens is not None: + cmd += ["--max-new-tokens", str(int(max_new_tokens))] + if repetition_penalty is not None: + cmd += ["--repetition-penalty", str(float(repetition_penalty))] + if no_repeat_ngram_size is not None: + cmd += ["--no-repeat-ngram-size", str(int(no_repeat_ngram_size))] + runtime_backend_norm = str(runtime_backend or "transformers").strip().lower() + if runtime_backend_norm == "vllm": + if vllm_batch_size is not None: + cmd += ["--batch-size", str(int(vllm_batch_size))] + if gpu_memory_utilization is not None: + cmd += ["--gpu-memory-utilization", str(float(gpu_memory_utilization))] + if disable_fp8_kv: + cmd.append("--disable-fp8-kv") + if repair_mode: + cmd += ["--repair-mode", str(repair_mode)] + return cmd + +def _build_env(*, python_bin: Optional[Path], visible_device: Optional[int] = None) -> Dict[str, str]: env = os.environ.copy() + if python_bin: + python_path = Path(python_bin).expanduser() + venv_bin = str(python_path.parent) + env["PATH"] = f"{venv_bin}:{env.get('PATH', '')}" + env["VIRTUAL_ENV"] = str(python_path.parent.parent) + env.pop("PYTHONHOME", None) + if visible_device is not None: + env["CUDA_VISIBLE_DEVICES"] = str(visible_device) if shutil.which("cc1plus", path=env.get("PATH", "")) is None: - # FlashInfer JIT (via vLLM) needs a C++ toolchain; add a known cc1plus location if missing. 
for candidate in sorted(Path("/usr/lib/gcc/x86_64-linux-gnu").glob("*/cc1plus")): - env["PATH"] = f"{candidate.parent}:{env.get('PATH','')}" + env["PATH"] = f"{candidate.parent}:{env.get('PATH', '')}" break ld_path = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") if ld_path: - env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH','')}" + env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH', '')}" + return env + + +def _run_cli( + input_dir: Path, + output_dir: Path, + *, + files: List[str], + model_dir: Path, + python_bin: Optional[Path], + script: Path, + max_pages: Optional[int], + content_debug: bool, + device: Optional[str], + ocr_profile: str, + prompt_override: Optional[str], + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, + repair_mode: Optional[str], + visible_device: Optional[int] = None, +) -> None: + cmd = _build_cli_command( + input_dir=input_dir, + output_dir=output_dir, + files=files, + page_ranges=None, + model_dir=model_dir, + python_bin=python_bin, + script=script, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + ) + env = _build_env(python_bin=python_bin, visible_device=visible_device) - LOGGER.info("Running 
DeepSeek CLI: %s", " ".join(cmd)) + LOGGER.info("Running DeepSeek OCR CLI: %s", " ".join(cmd)) subprocess.run(cmd, check=True, env=env) # nosec: controlled arguments -def _run_one_pdf(pdf_path: Path, md_out: Path, metrics_out: Path, cfg: Dict[str, Any]) -> Dict[str, Any]: - """Stub processor for a single PDF.""" - page_count = _page_count(pdf_path) - max_pages = cfg.get("max_pages") - if max_pages is not None and page_count: - page_count = min(page_count, max_pages) +def _parse_device_index(device: Optional[str]) -> Optional[int]: + if not device: + return None + value = str(device).strip().lower() + if value.startswith("cuda:"): + suffix = value.split(":", 1)[1] + if suffix.isdigit(): + return int(suffix) + return None - md_lines = [ - f"# DeepSeek OCR (stub) — {pdf_path.name}", - "", - f"Pages: {page_count if page_count else 'unknown'}", - ] - if cfg.get("content_debug"): - md_lines.append("") - md_lines.append("") - md_out.parent.mkdir(parents=True, exist_ok=True) - md_out.write_text("\n".join(md_lines) + "\n", encoding="utf-8") - metrics = {"page_count": page_count} - metrics_out.parent.mkdir(parents=True, exist_ok=True) - metrics_out.write_text(json.dumps(metrics, indent=2), encoding="utf-8") - return metrics +def _detect_visible_gpus() -> List[int]: + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + parsed = [piece.strip() for piece in visible.split(",") if piece.strip()] + if parsed and all(piece.isdigit() for piece in parsed): + return [int(piece) for piece in parsed] + torch_mod = None + try: # pragma: no cover - best effort + import torch as torch_mod # type: ignore + except Exception: # pragma: no cover - optional import + torch_mod = None + if torch_mod is not None: + try: + if torch_mod.cuda.is_available(): + return list(range(torch_mod.cuda.device_count())) + except Exception: + pass + try: # pragma: no cover - shell fallback + proc = subprocess.run( + ["nvidia-smi", "-L"], + check=False, + capture_output=True, + text=True, 
+ timeout=5, + ) + devices: List[int] = [] + if proc.returncode == 0: + for line in proc.stdout.splitlines(): + if line.startswith("GPU "): + prefix = line.split(":", 1)[0] + idx = prefix.split()[1] + if idx.isdigit(): + devices.append(int(idx)) + return devices + except Exception: + return [] + + +def _resolve_lane_devices( + *, + use_gpus: Optional[str], + devices: Optional[List[int]], + workers_per_gpu: int, + device: Optional[str], +) -> List[int]: + if devices: + resolved = [int(dev) for dev in devices] + if resolved: + return resolved + if str(use_gpus or "single").strip().lower() == "multi": + resolved = _detect_visible_gpus() + if resolved: + return resolved + if workers_per_gpu > 1: + from_device = _parse_device_index(device) + if from_device is not None: + return [from_device] + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + first = visible.split(",", 1)[0].strip() + if first.isdigit(): + return [int(first)] + return [0] + return [] + + +def _effective_page_count(pdf_path: Path, max_pages: Optional[int]) -> int: + count = _page_count(pdf_path) + if max_pages is not None and count > 0: + return min(count, int(max_pages)) + return max(1, count) + + +def _source_documents( + *, + file_list: List[str], + input_root: Path, + max_pages: Optional[int], +) -> List[SourceDocument]: + documents: List[SourceDocument] = [] + for name in file_list: + pdf_path = (input_root / name).resolve() + documents.append( + SourceDocument( + name=str(name), + pages=int(_effective_page_count(pdf_path, max_pages)), + ) + ) + return documents + + +def _plan_lanes( + *, + file_list: List[str], + input_root: Path, + lane_devices: List[int], + workers_per_gpu: int, + max_pages: Optional[int], +) -> List[Dict[str, Any]]: + lanes: List[Dict[str, Any]] = [] + lane_id = 0 + for visible_device in lane_devices: + for _ in range(max(1, int(workers_per_gpu))): + lanes.append( + { + "lane_id": lane_id, + "visible_device": int(visible_device), + "files": [], + 
"weight": 0, + } + ) + lane_id += 1 + if not lanes: + return [] + + weighted_files = [] + for name in file_list: + pdf_path = (input_root / name).resolve() + weighted_files.append((name, _effective_page_count(pdf_path, max_pages))) + weighted_files.sort(key=lambda item: (-item[1], item[0])) + + for name, weight in weighted_files: + lane = min(lanes, key=lambda item: int(item["weight"])) + lane["files"].append(name) + lane["weight"] = int(lane["weight"]) + int(weight) + return lanes + + +def _resolve_scheduler( + *, + scheduler: Optional[str], + runtime_backend: str, + lane_devices: List[int], + workers_per_gpu: int, +) -> str: + scheduler_norm = str(scheduler or "auto").strip().lower() + if scheduler_norm not in {"auto", "whole_doc", "fixed_shard", "exact_fill"}: + raise ValueError("scheduler must be one of 'auto', 'whole_doc', 'fixed_shard', or 'exact_fill'") + if scheduler_norm != "auto": + return scheduler_norm + runtime_backend_norm = str(runtime_backend or "transformers").strip().lower() + lane_count = max(1, len(lane_devices)) * max(1, int(workers_per_gpu)) + if runtime_backend_norm == "vllm" and lane_count > 1: + return "exact_fill" + return "whole_doc" + + +def _plan_lane_batches( + *, + file_list: List[str], + input_root: Path, + lane_devices: List[int], + workers_per_gpu: int, + max_pages: Optional[int], + runtime_backend: str, + scheduler: Optional[str], + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, +) -> List[Dict[str, Any]]: + documents = _source_documents( + file_list=file_list, + input_root=input_root, + max_pages=max_pages, + ) + scheduler_norm = _resolve_scheduler( + scheduler=scheduler, + runtime_backend=runtime_backend, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + ) + if scheduler_norm == "exact_fill": + batches = build_exact_fill_batches( + documents, + target_batch_pages=max(1, int(target_batch_pages)), + ) + else: + if scheduler_norm == "fixed_shard": + slices = build_fixed_shard_slices( + 
documents, + shard_pages=max(1, int(shard_pages)), + shard_threshold_pages=max(0, int(shard_threshold_pages)), + ) + else: + slices = build_whole_document_slices(documents) + batches = pack_slices_into_batches( + slices, + target_batch_pages=max(1, int(target_batch_pages)), + ) + lanes = assign_batches_to_lanes( + batches, + devices=lane_devices, + workers_per_gpu=workers_per_gpu, + ) + return [lane.to_dict() for lane in lanes if lane.batches] + + +def _auto_vllm_batch_size( + *, + runtime_backend: str, + file_list: List[str], + input_root: Path, + max_pages: Optional[int], +) -> Optional[int]: + if str(runtime_backend or "").strip().lower() != "vllm": + return None + total_pages = 0 + for name in file_list: + pdf_path = (input_root / name).resolve() + total_pages += int(_effective_page_count(pdf_path, max_pages)) + if total_pages <= 0: + return 1 + return min(int(total_pages), int(AUTO_VLLM_BATCH_PAGE_CAP)) + + +def _auto_vllm_batch_size_for_pages(*, runtime_backend: str, pages: int) -> Optional[int]: + if str(runtime_backend or "").strip().lower() != "vllm": + return None + if int(pages) <= 0: + return 1 + return min(int(pages), int(AUTO_VLLM_BATCH_PAGE_CAP)) + + +def _flatten_lane_batches(lane: Dict[str, Any]) -> Dict[str, Any]: + files: List[str] = [] + page_ranges: List[str] = [] + pages = 0 + planned_batch_pages: List[int] = [] + for batch in list(lane.get("batches") or []): + batch_pages = int(batch.get("pages", 0)) + pages += batch_pages + planned_batch_pages.append(batch_pages) + files.extend(list(batch.get("files") or [])) + page_ranges.extend(list(batch.get("page_ranges") or [])) + return { + "files": files, + "page_ranges": page_ranges, + "pages": int(pages), + "planned_batch_count": len(planned_batch_pages), + "planned_batch_pages": planned_batch_pages, + } + + +def _run_multi_cli( + *, + input_root: Path, + out_root: Path, + file_list: List[str], + lane_devices: List[int], + workers_per_gpu: int, + model_root: Path, + python_exe: Path, + script_path: 
Path, + max_pages: Optional[int], + content_debug: bool, + log_dir: Path, + ocr_profile: str, + prompt_override: Optional[str], + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, + repair_mode: Optional[str], + scheduler: Optional[str], + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, +) -> None: + lanes = _plan_lane_batches( + file_list=file_list, + input_root=input_root, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + max_pages=max_pages, + runtime_backend=runtime_backend, + scheduler=scheduler, + target_batch_pages=target_batch_pages, + shard_pages=shard_pages, + shard_threshold_pages=shard_threshold_pages, + ) + if not lanes: + return + + log_dir.mkdir(parents=True, exist_ok=True) + failures: List[str] = [] + with ExitStack() as stack: + procs = [] + + for lane in lanes: + lane_id = int(lane["lane_id"]) + visible_device = int(lane["visible_device"]) + lane_plan = _flatten_lane_batches(lane) + files = list(lane_plan["files"]) + page_ranges = list(lane_plan["page_ranges"]) + pages = int(lane_plan["pages"]) + if pages <= 0: + continue + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size_for_pages( + runtime_backend=runtime_backend, + pages=min(int(target_batch_pages), int(pages)), + ) + ) + log_path = log_dir / f"lane_{lane_id:02d}_gpu{visible_device}.log" + fh = stack.enter_context(log_path.open("w", encoding="utf-8")) + cmd = _build_cli_command( + input_dir=input_root, + output_dir=out_root, + files=files, + page_ranges=page_ranges, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + 
content_debug=content_debug, + device="cuda", + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + ) + env = _build_env(python_bin=python_exe, visible_device=visible_device) + LOGGER.info( + "Running DeepSeek OCR lane=%s visible_gpu=%s pages=%s planned_batches=%s files=%d ranges=%d: %s", + lane_id, + visible_device, + pages, + lane_plan["planned_batch_count"], + len(files), + len(page_ranges), + " ".join(cmd), + ) + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + procs.append((lane, log_path, proc)) + + for lane, log_path, proc in procs: + rc = proc.wait() + if rc != 0: + failures.append( + f"lane={lane['lane_id']} gpu={lane['visible_device']} rc={rc} log={log_path}" + ) + if failures: + raise RuntimeError("DeepSeek OCR multi-worker failure(s): " + "; ".join(failures)) def run_for_files( self_ref: Any, files: Iterable[str], *, - model_dir: Optional[Path] = None, # kept for API compatibility + model_dir: Optional[Path] = None, output_dir: Optional[Path] = None, - log_dir: Optional[Path] = None, # unused placeholder to mirror rapidocr + log_dir: Optional[Path] = None, # kept for API compatibility max_pages: Optional[int] = None, - allow_stub: bool = True, - allow_cli: bool = False, + allow_stub: bool = False, # ignored after stub removal; kept for compatibility + allow_cli: bool = True, # ignored after stub removal; kept for compatibility python_bin: Optional[Path] = None, vllm_script: Optional[Path] = None, content_debug: bool = False, persist_engine: bool = True, # placeholder 
for future session reuse precision: Optional[str] = None, # reserved - device: Optional[str] = None, # reserved + device: Optional[str] = None, + runtime_backend: str = "transformers", + ocr_profile: str = "markdown_grounded", + prompt_override: Optional[str] = None, + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = DEFAULT_MAX_NEW_TOKENS, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + use_gpus: Optional[str] = None, + devices: Optional[List[int]] = None, + workers_per_gpu: int = 1, gpu_memory_utilization: Optional[float] = None, disable_fp8_kv: bool = False, + vllm_batch_size: Optional[int] = None, + repair_mode: str = "auto", + scheduler: str = "auto", + target_batch_pages: int = AUTO_VLLM_BATCH_PAGE_CAP, + shard_pages: int = 0, + shard_threshold_pages: int = 0, **_: Any, ) -> Dict[str, Any]: - """Run DeepSeek OCR for the provided files. + """Run DeepSeek OCR for the provided files.""" + + requested_stub = bool(allow_stub) + del allow_stub, allow_cli, persist_engine, precision + if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": + raise RuntimeError( + "DeepSeek stub execution has been removed. " + "Unset GLOSSAPI_DEEPSEEK_ALLOW_STUB and configure the real DeepSeek runtime." + ) - Returns a mapping of stem -> minimal metadata (page_count). 
- """ + runtime_backend_norm = str( + runtime_backend or os.environ.get("GLOSSAPI_DEEPSEEK_RUNTIME_BACKEND", "transformers") + ).strip().lower() + if runtime_backend_norm not in {"transformers", "vllm"}: + raise ValueError("runtime_backend must be 'transformers' or 'vllm'") file_list = [str(f) for f in files or []] if not file_list: return {} input_root = Path(getattr(self_ref, "input_dir", ".")).resolve() + pdf_root = (input_root / "downloads") if (input_root / "downloads").exists() else input_root out_root = Path(output_dir) if output_dir else Path(getattr(self_ref, "output_dir", input_root)) md_dir = out_root / "markdown" metrics_dir = out_root / "json" / "metrics" md_dir.mkdir(parents=True, exist_ok=True) metrics_dir.mkdir(parents=True, exist_ok=True) - env_allow_stub = os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") == "1" - env_allow_cli = os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "0") == "1" + model_root = Path( + model_dir + or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR", "") + or (REPO_ROOT / "deepseek-ocr-2-model" / "DeepSeek-OCR-2") + ) + if not model_root.exists(): + raise FileNotFoundError( + "DeepSeek model directory not found. Set model_dir or GLOSSAPI_DEEPSEEK_MODEL_DIR." 
+ ) - use_cli = allow_cli or env_allow_cli - use_stub = allow_stub and env_allow_stub + default_script = DEFAULT_VLLM_SCRIPT if runtime_backend_norm == "vllm" else DEFAULT_SCRIPT + script_path = Path( + vllm_script + or os.environ.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", "") + or default_script + ) + if not script_path.exists(): + raise FileNotFoundError(f"DeepSeek OCR runner script not found: {script_path}") - script_path = Path(vllm_script) if vllm_script else Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py" - # Optional GPU memory utilization override (env wins over kwarg) - env_gpu_mem = os.environ.get("GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION") - gpu_mem_fraction = gpu_memory_utilization - if env_gpu_mem: - try: - gpu_mem_fraction = float(env_gpu_mem) - except Exception: - gpu_mem_fraction = gpu_memory_utilization - disable_fp8_kv = disable_fp8_kv or os.environ.get("GLOSSAPI_DEEPSEEK_NO_FP8_KV") == "1" + python_exe = Path( + python_bin + or os.environ.get("GLOSSAPI_DEEPSEEK_PYTHON", "") + or os.environ.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON", "") + or sys.executable + ) + if not python_exe.exists(): + raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") - if use_cli and script_path.exists(): - try: - _run_cli( - input_root, - out_root, - python_bin=python_bin, - script=script_path, + lane_devices = _resolve_lane_devices( + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + device=device, + ) + multi_requested = str(use_gpus or "single").strip().lower() == "multi" or int(max(1, workers_per_gpu)) > 1 + if multi_requested and lane_devices: + _run_multi_cli( + input_root=pdf_root, + out_root=out_root, + file_list=file_list, + lane_devices=lane_devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + model_root=model_root, + python_exe=python_exe, + script_path=script_path, + max_pages=max_pages, + content_debug=content_debug, + log_dir=Path(log_dir) if log_dir else (out_root / "logs" / 
"deepseek_workers"), + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend_norm, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + scheduler=scheduler, + target_batch_pages=int(max(1, target_batch_pages)), + shard_pages=int(max(0, shard_pages)), + shard_threshold_pages=int(max(0, shard_threshold_pages)), + ) + else: + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size( + runtime_backend=runtime_backend_norm, + file_list=file_list, + input_root=pdf_root, max_pages=max_pages, - content_debug=content_debug, - gpu_memory_utilization=gpu_mem_fraction, - disable_fp8_kv=disable_fp8_kv, ) - results: Dict[str, Any] = {} - for name in file_list: - pdf_path = (input_root / name).resolve() - stem = Path(name).stem - md_path = md_dir / f"{stem}.md" - metrics_path = metrics_dir / f"{stem}.metrics.json" - if not md_path.exists() or not md_path.read_text(encoding="utf-8").strip(): - placeholder = [ - f"# DeepSeek OCR — {pdf_path.name}", - "", - "[[Blank page]]", - ] - md_path.parent.mkdir(parents=True, exist_ok=True) - md_path.write_text("\n".join(placeholder) + "\n", encoding="utf-8") - page_count = _page_count(pdf_path) - if not metrics_path.exists(): - metrics_path.parent.mkdir(parents=True, exist_ok=True) - metrics_path.write_text(json.dumps({"page_count": page_count}, indent=2), encoding="utf-8") - results[stem] = {"page_count": page_count} - return results - except Exception as exc: - if not use_stub: - raise - LOGGER.warning("DeepSeek CLI failed (%s); falling back to stub output", exc) - - cfg = {"max_pages": max_pages, "content_debug": 
content_debug} + ) + _run_cli( + input_dir=pdf_root, + output_dir=out_root, + files=file_list, + page_ranges=None, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend_norm, + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + ) + results: Dict[str, Any] = {} for name in file_list: - pdf_path = (input_root / name).resolve() + pdf_path = (pdf_root / name).resolve() stem = Path(name).stem md_path = md_dir / f"{stem}.md" metrics_path = metrics_dir / f"{stem}.metrics.json" - results[stem] = _run_one_pdf(pdf_path, md_path, metrics_path, cfg) + if not md_path.exists(): + raise FileNotFoundError(f"DeepSeek OCR did not produce markdown for {name}: {md_path}") + if not md_path.read_text(encoding="utf-8").strip(): + raise RuntimeError(f"DeepSeek OCR produced empty markdown for {name}: {md_path}") + page_count = _page_count(pdf_path) + if metrics_path.exists(): + try: + results[stem] = json.loads(metrics_path.read_text(encoding="utf-8")) + continue + except Exception: + pass + results[stem] = {"page_count": page_count} + metrics_path.write_text(json.dumps(results[stem], indent=2), encoding="utf-8") return results diff --git a/src/glossapi/ocr/deepseek/scheduling.py b/src/glossapi/ocr/deepseek/scheduling.py new file mode 100644 index 0000000..339b3e6 --- /dev/null +++ b/src/glossapi/ocr/deepseek/scheduling.py @@ -0,0 +1,242 @@ +"""Scheduling helpers for DeepSeek OCR page-range planning. + +The core abstraction is a divisible PDF page stream. 
We can cut a document into +page ranges exactly where a batch boundary needs it, then reconstruct outputs +later by `(doc_id, page_number)`. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +import heapq +from typing import Iterable, List, Optional + + +@dataclass(frozen=True) +class SourceDocument: + name: str + pages: int + + +@dataclass(frozen=True) +class WorkSlice: + source_name: str + source_pages: int + start_page: int + end_page: int + + @property + def pages(self) -> int: + return int(self.end_page) - int(self.start_page) + 1 + + @property + def is_full_document(self) -> bool: + return int(self.start_page) == 1 and int(self.end_page) == int(self.source_pages) + + @property + def item_id(self) -> str: + if self.is_full_document: + return str(self.source_name) + return f"{self.source_name}:{int(self.start_page)}:{int(self.end_page)}" + + @property + def cli_file(self) -> Optional[str]: + return str(self.source_name) if self.is_full_document else None + + @property + def cli_page_range(self) -> Optional[str]: + if self.is_full_document: + return None + return self.item_id + + def to_dict(self) -> dict: + return { + "item_id": self.item_id, + "pages": int(self.pages), + "file": self.cli_file, + "page_range": self.cli_page_range, + "source_name": str(self.source_name), + "start_page": int(self.start_page), + "end_page": int(self.end_page), + "is_full_document": bool(self.is_full_document), + } + + +@dataclass +class DocumentCursor: + name: str + total_pages: int + next_page: int = 1 + + @property + def remaining_pages(self) -> int: + return max(0, int(self.total_pages) - int(self.next_page) + 1) + + def take(self, requested_pages: int) -> WorkSlice: + take_pages = min(max(1, int(requested_pages)), int(self.remaining_pages)) + start_page = int(self.next_page) + end_page = start_page + take_pages - 1 + self.next_page = end_page + 1 + return WorkSlice( + source_name=str(self.name), + source_pages=int(self.total_pages), + 
start_page=int(start_page), + end_page=int(end_page), + ) + + +@dataclass +class BatchPlan: + batch_id: int + items: List[WorkSlice] = field(default_factory=list) + + @property + def pages(self) -> int: + return sum(int(item.pages) for item in self.items) + + def to_dict(self) -> dict: + return { + "batch_id": int(self.batch_id), + "item_ids": [item.item_id for item in self.items], + "files": [item.cli_file for item in self.items if item.cli_file], + "page_ranges": [item.cli_page_range for item in self.items if item.cli_page_range], + "pages": int(self.pages), + "items": [item.to_dict() for item in self.items], + } + + +@dataclass +class LanePlan: + lane_id: int + visible_device: int + batches: List[BatchPlan] = field(default_factory=list) + + @property + def assigned_pages(self) -> int: + return sum(int(batch.pages) for batch in self.batches) + + def to_dict(self) -> dict: + return { + "lane_id": int(self.lane_id), + "visible_device": int(self.visible_device), + "assigned_pages": int(self.assigned_pages), + "batches": [batch.to_dict() for batch in self.batches], + } + + +def build_whole_document_slices(documents: Iterable[SourceDocument]) -> List[WorkSlice]: + return [ + WorkSlice( + source_name=str(doc.name), + source_pages=int(doc.pages), + start_page=1, + end_page=int(doc.pages), + ) + for doc in documents + ] + + +def build_fixed_shard_slices( + documents: Iterable[SourceDocument], + *, + shard_pages: int, + shard_threshold_pages: int, +) -> List[WorkSlice]: + shard_size = max(0, int(shard_pages)) + threshold = max(0, int(shard_threshold_pages)) + slices: List[WorkSlice] = [] + for doc in documents: + total_pages = int(doc.pages) + if shard_size <= 0 or total_pages <= max(threshold, shard_size): + slices.extend(build_whole_document_slices([doc])) + continue + start_page = 1 + while start_page <= total_pages: + end_page = min(total_pages, start_page + shard_size - 1) + slices.append( + WorkSlice( + source_name=str(doc.name), + source_pages=total_pages, + 
start_page=int(start_page), + end_page=int(end_page), + ) + ) + start_page = end_page + 1 + return slices + + +def build_exact_fill_batches( + documents: Iterable[SourceDocument], + *, + target_batch_pages: int, +) -> List[BatchPlan]: + target = max(1, int(target_batch_pages)) + heap: List[tuple[int, int, DocumentCursor]] = [] + for idx, doc in enumerate(documents): + cursor = DocumentCursor(name=str(doc.name), total_pages=int(doc.pages)) + if cursor.remaining_pages > 0: + heapq.heappush(heap, (-int(cursor.remaining_pages), idx, cursor)) + + batches: List[BatchPlan] = [] + while heap: + remaining_capacity = int(target) + items: List[WorkSlice] = [] + while remaining_capacity > 0 and heap: + _neg_remaining, idx, cursor = heapq.heappop(heap) + take_pages = min(int(cursor.remaining_pages), int(remaining_capacity)) + items.append(cursor.take(take_pages)) + remaining_capacity -= int(take_pages) + if cursor.remaining_pages > 0: + heapq.heappush(heap, (-int(cursor.remaining_pages), idx, cursor)) + batches.append(BatchPlan(batch_id=len(batches), items=items)) + return batches + + +def pack_slices_into_batches( + slices: Iterable[WorkSlice], + *, + target_batch_pages: int, +) -> List[BatchPlan]: + target = max(1, int(target_batch_pages)) + ordered = sorted(list(slices), key=lambda item: (-int(item.pages), item.item_id)) + batches: List[BatchPlan] = [] + current: List[WorkSlice] = [] + current_pages = 0 + + def flush() -> None: + nonlocal current, current_pages + if not current: + return + batches.append(BatchPlan(batch_id=len(batches), items=list(current))) + current = [] + current_pages = 0 + + for item in ordered: + item_pages = int(item.pages) + if current and current_pages + item_pages > target: + flush() + current.append(item) + current_pages += item_pages + if current_pages >= target: + flush() + flush() + return batches + + +def assign_batches_to_lanes( + batches: Iterable[BatchPlan], + *, + devices: List[int], + workers_per_gpu: int, +) -> List[LanePlan]: + lanes: 
List[LanePlan] = [] + lane_id = 0 + for visible_device in devices: + for _ in range(max(1, int(workers_per_gpu))): + lanes.append(LanePlan(lane_id=lane_id, visible_device=int(visible_device))) + lane_id += 1 + for batch in batches: + lane = min(lanes, key=lambda item: (int(item.assigned_pages), int(item.lane_id))) + lane.batches.append(batch) + return lanes + diff --git a/src/glossapi/ocr/docling/__init__.py b/src/glossapi/ocr/docling/__init__.py new file mode 100644 index 0000000..28d4b0a --- /dev/null +++ b/src/glossapi/ocr/docling/__init__.py @@ -0,0 +1,5 @@ +"""Docling PDF pipeline helpers used by GlossAPI.""" + +from .pipeline import build_layout_pipeline + +__all__ = ["build_layout_pipeline"]
diff --git a/src/glossapi/ocr/docling/pipeline.py b/src/glossapi/ocr/docling/pipeline.py
new file mode 100644
index 0000000..8162e60
--- /dev/null
+++ b/src/glossapi/ocr/docling/pipeline.py
@@ -0,0 +1,157 @@
+from __future__ import annotations
+
+import logging
+import math
+import os
+from typing import Tuple
+
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    LayoutOptions,
+    PdfPipelineOptions,
+    PictureDescriptionApiOptions,
+    TableFormerMode,
+    TableStructureOptions,
+)
+
+log = logging.getLogger(__name__)
+
+# (env var, PdfPipelineOptions attribute, parser) triples, applied in this order.
+_RUNTIME_OVERRIDES = (
+    ("GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE", "layout_batch_size", int),
+    ("GLOSSAPI_DOCLING_TABLE_BATCH_SIZE", "table_batch_size", int),
+    ("GLOSSAPI_DOCLING_OCR_BATCH_SIZE", "ocr_batch_size", int),
+    ("GLOSSAPI_DOCLING_QUEUE_MAX_SIZE", "queue_max_size", int),
+    ("GLOSSAPI_DOCLING_DOCUMENT_TIMEOUT", "document_timeout", int),
+    ("GLOSSAPI_DOCLING_BATCH_POLL_INTERVAL", "batch_polling_interval_seconds", float),
+)
+
+
+def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]:
+    """Return accelerator options and whether CUDA was requested.
+
+    A falsy *device* defaults to ``"cuda:0"``.  Strings starting with ``cuda``,
+    ``mps`` or ``cpu`` (case-insensitive) are passed to ``AcceleratorOptions``
+    unchanged; anything else is mapped to the CUDA enum member when its string
+    form starts with ``cuda`` and to the CPU member otherwise.
+    """
+    dev = device or "cuda:0"
+    name = str(dev).lower()
+    want_cuda = name.startswith("cuda")
+    if isinstance(dev, str) and name.startswith(("cuda", "mps", "cpu")):
+        acc = AcceleratorOptions(device=dev)
+    else:
+        acc = AcceleratorOptions(
+            device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU
+        )
+    return acc, want_cuda
+
+
+def _apply_common_pdf_options(
+    *,
+    acc: AcceleratorOptions,
+    images_scale: float,
+    formula_enrichment: bool,
+    code_enrichment: bool,
+) -> PdfPipelineOptions:
+    """Build the ``PdfPipelineOptions`` shared by GlossAPI's Docling pipelines.
+
+    OCR is disabled and table structure is enabled in ACCURATE mode.  Picture
+    description and remote services are switched off where the installed
+    Docling exposes those fields.  Optional tweaks are best-effort: a failure
+    is logged at debug level and the remaining options are still applied.
+    """
+    table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE)
+    try:
+        if hasattr(table_opts, "do_cell_matching"):
+            table_opts.do_cell_matching = True
+    except Exception:
+        log.debug("Could not enable table cell matching", exc_info=True)
+
+    opts = PdfPipelineOptions(
+        accelerator_options=acc,
+        layout_options=LayoutOptions(),
+        do_ocr=False,
+        do_table_structure=True,
+        do_formula_enrichment=bool(formula_enrichment),
+        do_code_enrichment=bool(code_enrichment),
+        force_backend_text=False,
+        generate_parsed_pages=False,
+        table_structure_options=table_opts,
+        allow_external_plugins=True,
+    )
+    try:
+        if hasattr(opts, "do_picture_description"):
+            opts.do_picture_description = False
+        if getattr(opts, "picture_description_options", None) is None:
+            opts.picture_description_options = PictureDescriptionApiOptions()
+        if hasattr(opts, "enable_remote_services"):
+            opts.enable_remote_services = False
+    except Exception:
+        log.debug("Could not disable picture description/remote services", exc_info=True)
+    try:
+        setattr(opts, "images_scale", images_scale)
+    except Exception:
+        log.debug("Could not set images_scale=%r", images_scale, exc_info=True)
+    _apply_runtime_overrides(opts)
+    return opts
+
+
+def _apply_runtime_overrides(opts: PdfPipelineOptions) -> None:
+    """Apply optional runtime tuning knobs exposed by newer Docling releases.
+
+    Each environment variable in ``_RUNTIME_OVERRIDES`` is parsed and copied to
+    the matching attribute of *opts*.  A variable is skipped when it is unset or
+    empty, fails to parse, is not a finite number greater than zero, or names
+    an attribute that *opts* does not have.
+    """
+    for env_name, attr_name, parse in _RUNTIME_OVERRIDES:
+        raw = os.getenv(env_name)
+        if not raw:
+            continue
+        try:
+            value = parse(raw)
+        except ValueError:
+            log.debug("Ignoring %s=%r: not a valid %s", env_name, raw, parse.__name__)
+            continue
+        # ``nan <= 0`` is False, so NaN/inf must be rejected explicitly.
+        if isinstance(value, float) and not math.isfinite(value):
+            continue
+        if value <= 0 or not hasattr(opts, attr_name):
+            continue
+        try:
+            setattr(opts, attr_name, value)
+        except Exception:
+            # Best-effort: keep going if the options object rejects the value.
+            log.debug("Could not set %s=%r", attr_name, value, exc_info=True)
+
+
+def build_layout_pipeline(
+    *,
+    device: str = "cuda:0",
+    images_scale: float = 1.25,
+    formula_enrichment: bool = False,
+    code_enrichment: bool = False,
+) -> Tuple[object, PdfPipelineOptions]:
+    """Create a Docling layout-only PDF pipeline.
+
+    Returns the ``StandardPdfPipeline`` instance together with the options it
+    was built from.  OCR is disabled in those options.
+    """
+    acc, _ = _resolve_accelerator(device)
+    opts = _apply_common_pdf_options(
+        acc=acc,
+        images_scale=float(images_scale),
+        formula_enrichment=formula_enrichment,
+        code_enrichment=code_enrichment,
+    )
+
+    # Two import locations are tried; only a missing module triggers the fallback.
+    try:
+        from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline  # type: ignore
+    except ImportError:  # pragma: no cover
+        from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline  # type: ignore
+
+    pipeline = StandardPdfPipeline(opts)  # type: ignore[arg-type]
+    return pipeline, opts
diff --git a/src/glossapi/ocr/docling_pipeline.py b/src/glossapi/ocr/docling_pipeline.py new file mode 100644 index 0000000..4a96e09 --- /dev/null +++ b/src/glossapi/ocr/docling_pipeline.py @@ -0,0 +1,5 @@ +"""Compatibility wrapper for the canonical Docling pipeline builder.""" + +from .docling.pipeline import build_layout_pipeline + +__all__ = ["build_layout_pipeline"] diff --git a/src/glossapi/ocr/rapidocr/__init__.py b/src/glossapi/ocr/rapidocr/__init__.py deleted file mode 100644 index c0d1232..0000000 --- a/src/glossapi/ocr/rapidocr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -"""RapidOCR subpackage with lazy re-exports.""" - -from __future__ import annotations - -from importlib import import_module -from typing import Any - -__all__ = [ - "dispatch", - "docling_pipeline", - "pool", - "safe", - "onnx", - "_paths", - "pipeline", -] - - -def __getattr__(name: str) -> Any: - if name in __all__: - return import_module(f"glossapi.ocr.rapidocr.{name}") - raise AttributeError(name) - - -def __dir__() -> list[str]: - return sorted(set(globals().keys()) | set(__all__)) diff --git a/src/glossapi/ocr/rapidocr/__init__.py.backup
b/src/glossapi/ocr/rapidocr/__init__.py.backup deleted file mode 100644 index 865f119..0000000 --- a/src/glossapi/ocr/rapidocr/__init__.py.backup +++ /dev/null @@ -1,6 +0,0 @@ -"""RapidOCR subpackage (shim).""" - -from __future__ import annotations - -__all__ = ["dispatch"] - diff --git a/src/glossapi/ocr/rapidocr/_paths.py b/src/glossapi/ocr/rapidocr/_paths.py deleted file mode 100644 index 4c1cc2a..0000000 --- a/src/glossapi/ocr/rapidocr/_paths.py +++ /dev/null @@ -1,114 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, Tuple -import importlib -import os - - -@dataclass -class ResolvedOnnx: - det: Optional[str] - rec: Optional[str] - cls: Optional[str] - keys: Optional[str] - - -def _find_first(base: Path, patterns: list[str]) -> Optional[str]: - for pat in patterns: - for p in base.rglob(pat): - if p.is_file(): - return str(p) - return None - - -def _resolve_packaged_cls_fallback() -> Optional[str]: - try: - rapidocr = importlib.import_module("rapidocr") - base = Path(rapidocr.__file__).resolve().parent / "models" - pref = base / "ch_ppocr_mobile_v2.0_cls_infer.onnx" - if pref.exists(): - return str(pref) - return _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - except Exception: - return None - - -def resolve_packaged_onnx_and_keys() -> ResolvedOnnx: - """Locate ONNX det/rec/cls and Greek keys packaged with the glossapi package. 
- - Search order: - 1) GLOSSAPI_RAPIDOCR_ONNX_DIR (env var) with heuristic file names - 2) Under the installed glossapi package folder `models/` and common subfolders - 3) CLS only: fallback to RapidOCR’s bundled cls model if missing - """ - # 1) Explicit override directory - override = os.getenv("GLOSSAPI_RAPIDOCR_ONNX_DIR") - if override: - base = Path(override) - det = _find_first(base, [ - "**/det/**/inference.onnx", - "*det*server*onnx", - "*PP*det*.onnx", - "det*.onnx", - ]) - rec = _find_first(base, [ - "**/rec/**/inference.onnx", - "*el*rec*onnx", - "*greek*rec*onnx", - "*PP*rec*.onnx", - "rec*.onnx", - ]) - cls = _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - keys = _find_first(base, ["*greek*keys*.txt", "*ppocr*keys*.txt", "*keys*.txt"]) - if det or rec or cls or keys: - return ResolvedOnnx(det, rec, cls, keys) - - # 2) Search inside installed glossapi package - try: - glossapi = importlib.import_module("glossapi") - pkg_root = Path(glossapi.__file__).resolve().parent - # Candidate asset directories inside the package - candidates = [ - pkg_root / "models", - pkg_root / "models" / "rapidocr", - pkg_root / "models" / "rapidocr" / "onnx", - pkg_root / "models" / "rapidocr" / "keys", - pkg_root / "resources", - pkg_root / "assets", - pkg_root / "data", - ] - det = rec = cls = keys = None - for base in candidates: - if not base.exists(): - continue - det = det or _find_first(base, [ - "**/det/**/inference.onnx", - "*det*server*onnx", - "*PP*det*.onnx", - "det*.onnx", - ]) - rec = rec or _find_first(base, [ - "**/rec/**/inference.onnx", - "*el*rec*onnx", - "*greek*rec*onnx", - "*PP*rec*.onnx", - "rec*.onnx", - ]) - cls = cls or _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - keys = keys or _find_first(base, ["*greek*keys*.txt", "*ppocr*keys*.txt", "*keys*.txt"]) - - if cls is None: - cls = _resolve_packaged_cls_fallback() - return ResolvedOnnx(det, rec, cls, keys) - except Exception: - return ResolvedOnnx(None, None, 
_resolve_packaged_cls_fallback(), None) - - -def summarize_resolution() -> Tuple[bool, str]: - r = resolve_packaged_onnx_and_keys() - ok = bool(r.det and r.rec and r.cls and r.keys) - msg = f"det={bool(r.det)} rec={bool(r.rec)} cls={bool(r.cls)} keys={bool(r.keys)}" - return ok, msg - diff --git a/src/glossapi/ocr/rapidocr/dispatch.py b/src/glossapi/ocr/rapidocr/dispatch.py deleted file mode 100644 index 7deeba2..0000000 --- a/src/glossapi/ocr/rapidocr/dispatch.py +++ /dev/null @@ -1,33 +0,0 @@ -from __future__ import annotations - -from typing import Iterable, Optional - - -def run_via_extract( - corpus, - files: Iterable[str], - *, - export_doc_json: bool = False, - internal_debug: bool = False, - content_debug: Optional[bool] = None, -) -> None: - """Thin adapter that forwards to Corpus.extract for RapidOCR/Docling. - - This exists for symmetry with deepseek_runner and to keep the OCR package - as the single entry point for OCR backends. - """ - # Note: internal_debug/content_debug are no-ops for the Docling/RapidOCR path. - # Docling's output already produces a single concatenated Markdown document. - corpus.extract( - input_format="pdf", - num_threads=1, # let extract decide; override in tests if needed - accel_type="CUDA", - force_ocr=True, - formula_enrichment=False, - code_enrichment=False, - filenames=list(files), - skip_existing=False, - export_doc_json=bool(export_doc_json), - emit_formula_index=bool(export_doc_json), - phase1_backend="docling", - ) diff --git a/src/glossapi/ocr/rapidocr/docling_pipeline.py b/src/glossapi/ocr/rapidocr/docling_pipeline.py deleted file mode 100644 index bb8988f..0000000 --- a/src/glossapi/ocr/rapidocr/docling_pipeline.py +++ /dev/null @@ -1,501 +0,0 @@ -"""Docling + RapidOCR (ONNX) pipeline for batch PDF OCR. - -Provides build_pipeline() and convert_dir() mirroring the behavior of the -repro script greek_pdf_ocr.py, but self-contained inside glossapi and with -packaged ONNX models/keys. 
Includes robust logging and native Docling timeout. -""" -from __future__ import annotations - -import argparse -import logging -import os -import sys -import time -import inspect -import importlib -from pathlib import Path -from typing import Iterable, Optional, Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import ( - ConversionResult, - DocumentConverter, - PdfFormatOption, -) -from docling.datamodel.settings import settings - -from glossapi.ocr.rapidocr._paths import resolve_packaged_onnx_and_keys -from glossapi.metrics import compute_per_page_metrics -# Ensure RapidOCR factory is registered (avoids masked errors in older paths) -import docling.models.rapid_ocr_model # noqa: F401 - - -log = logging.getLogger(__name__) - - -def _maybe_import_torch(*, force: bool = False): - torch_mod = sys.modules.get("torch") - if torch_mod is not None: - return torch_mod - try: - return importlib.import_module("torch") # type: ignore - except Exception: - return None - return None - - -def _available_ort_providers() -> str: - try: - import onnxruntime as ort # type: ignore - return ",".join(ort.get_available_providers()) - except Exception as e: - return f"unavailable: {e}" - - -def _supports_native_timeout(converter: DocumentConverter) -> Optional[str]: - try: - sig = inspect.signature(converter.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - pass - return None - - -def _convert_with_timeout(converter: DocumentConverter, *, source: str, raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = 
int(timeout_s) - return converter.convert(source=source, **kw) - - -def _convert_all_with_timeout(converter: DocumentConverter, *, sources: Iterable[str], raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return list(converter.convert_all(sources, **kw)) - - -def build_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - # Delegate to canonical pipeline builder to avoid duplication - try: - from glossapi.ocr.rapidocr.pipeline import build_rapidocr_pipeline # type: ignore - except Exception as _e: # pragma: no cover - # Backward-compat fallback: inline builder (kept minimal to satisfy tests) - from docling.datamodel.pipeline_options import AcceleratorOptions, TableStructureOptions, TableFormerMode, LayoutOptions, PdfPipelineOptions, RapidOcrOptions # type: ignore - dev = device or "cuda:0" - acc = AcceleratorOptions(device=dev) - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError("Packaged RapidOCR ONNX models/keys not found under glossapi.models.") - ocr_opts = RapidOcrOptions( - backend="onnxruntime", lang=["el", "en"], force_full_page_ocr=False, - use_det=True, use_cls=False, use_rec=True, text_score=text_score, - det_model_path=r.det, rec_model_path=r.rec, cls_model_path=r.cls, print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - opts = PdfPipelineOptions( - accelerator_options=acc, - ocr_options=ocr_opts, - layout_options=LayoutOptions(), - do_ocr=True, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - 
force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - from docling.document_converter import DocumentConverter, PdfFormatOption # type: ignore - from docling.datamodel.base_models import InputFormat # type: ignore - return DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}), opts - return build_rapidocr_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - -def convert_dir( - input_dir: Path, - output_dir: Path, - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, - normalize_output: bool = True, - timeout_s: Optional[int] = 600, -) -> None: - input_dir = Path(input_dir) - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Device-aware preflight: only enforce CUDA provider when device requests CUDA - want_cuda = isinstance(device, str) and device.lower().startswith("cuda") - if want_cuda: - try: - import onnxruntime as _ort # type: ignore - _providers = _ort.get_available_providers() - if "CUDAExecutionProvider" not in _providers: - raise RuntimeError(f"CUDAExecutionProvider not available in onnxruntime providers={_providers}") - except Exception as e: - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - if formula_enrichment and want_cuda: - try: - torch_mod = _maybe_import_torch(force=True) - if torch_mod is None or not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") - except Exception as e: - raise RuntimeError(f"Torch CUDA preflight failed: {e}") - - # Optional: tune CodeFormula batch size and math precision when enrichment is 
requested - if formula_enrichment: - try: - torch_mod = _maybe_import_torch() - if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - try: - torch_mod.set_float32_matmul_precision("high") - except Exception: - pass - except Exception: - pass - - engine, opts = build_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - # Logging block - log.info("Docling+RapidOCR pipeline ready") - log.info("device=%s text_score=%.2f images_scale=%.2f formula=%s code=%s", device, text_score, images_scale, formula_enrichment, code_enrichment) - log.info("ORT providers: %s", _available_ort_providers()) - log.info("Caches: HF_HOME=%s XDG_CACHE_HOME=%s DOCLING_CACHE_DIR=%s", os.getenv("HF_HOME"), os.getenv("XDG_CACHE_HOME"), os.getenv("DOCLING_CACHE_DIR")) - try: - r = resolve_packaged_onnx_and_keys() - import os as _os - log.info( - "Models: det=%s rec=%s cls=%s keys=%s", - _os.path.basename(r.det) if r.det else None, - _os.path.basename(r.rec) if r.rec else None, - _os.path.basename(r.cls) if r.cls else None, - _os.path.basename(r.keys) if r.keys else None, - ) - except Exception: - pass - - # Collect PDFs - pdfs = sorted(str(p) for p in input_dir.rglob("*.pdf") if p.is_file()) - if not pdfs: - log.warning("No PDFs under %s", input_dir) - return - - # Enable timing profile - try: - settings.debug.profile_pipeline_timings = True - except Exception: - pass - - total_start = time.time() - # If we got a StandardPdfPipeline, it has a .convert method similar in spirit - # to DocumentConverter.convert; detect native timeout support by signature. 
- def _native_timeout_kw(obj) -> Optional[str]: - try: - sig = inspect.signature(obj.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - return None - return None - - tkw = _native_timeout_kw(engine) - for src in pdfs: - try: - kwargs = {} - if tkw and timeout_s is not None: - kwargs[tkw] = int(timeout_s) - conv = engine.convert(source=src, **kwargs) # type: ignore - _export(conv, output_dir, normalize_output=normalize_output) - # Per-page metrics and per-page console logs - try: - per_page = compute_per_page_metrics(conv) - # Harmonize with GlossExtract: write to sibling json/metrics/ - metrics_dir = output_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - pp = metrics_dir / f"{Path(src).stem}.per_page.metrics.json" - import json as _json - pp.write_text(_json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8") - for row in per_page.get("pages", []): - log.info("[PAGE] %s p%d: parse=%.3fs ocr=%.3fs formulas=%d code=%d", - Path(src).name, - int(row.get("page_no", 0)), - float(row.get("parse_sec", 0.0)), - float(row.get("ocr_sec", 0.0)), - int(row.get("formula_count", 0)), - int(row.get("code_count", 0))) - except Exception as _e: - log.warning("Failed to compute per-page metrics for %s: %s", src, _e) - log.info("[OK] %s", src) - except Exception as e: - log.error("[FAIL] %s: %s", src, e) - log.info("Done in %.2fs", time.time() - total_start) - - -def _normalize_text(s: str) -> str: - import unicodedata, re - zw = re.compile(r"[\u200B\u200C\u200D\uFEFF]") - s = unicodedata.normalize("NFC", s) - return zw.sub("", s) - - -def _normalize_obj(o): - if isinstance(o, str): - return _normalize_text(o) - if isinstance(o, list): - return [_normalize_obj(x) for x in o] - if isinstance(o, dict): - return {k: _normalize_obj(v) for k, v in o.items()} - return o - - -def _export(conv: ConversionResult, out_dir: Path, *, normalize_output: bool) -> None: - doc = conv.document 
- p = Path(conv.input.file) - md_path = out_dir / f"{p.stem}.md" - # Write Docling JSON under sibling json/ directory (no JSON in markdown dir) - json_dir = out_dir.parent / "json" - json_dir.mkdir(parents=True, exist_ok=True) - json_path = json_dir / f"{p.stem}.docling.json" - # Harmonize metrics location with GlossExtract: sibling json/metrics/ - metrics_dir = out_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - metrics_path = metrics_dir / f"{p.stem}.metrics.json" - - md = doc.export_to_markdown() - if normalize_output: - md = _normalize_text(md) - md_path.write_text(md, encoding="utf-8") - # Export DoclingDocument JSON via helper (compressed by default) - try: - from glossapi.ocr.utils.json_io import export_docling_json # type: ignore - # Attach minimal meta for provenance - meta = {"source_pdf_relpath": str(p)} - export_docling_json(doc, json_path, compress="zstd", meta=meta) # type: ignore[arg-type] - except Exception: - # Fallback: write plain JSON under json/ without compression - try: - import json as _json - dd = doc.export_to_dict() - if normalize_output: - dd = _normalize_obj(dd) - json_path.write_text(_json.dumps(dd, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - # Timings if present - try: - from typing import Any, Dict, List - def _q(vals: list[float], q: float) -> float: - if not vals: - return 0.0 - s = sorted(vals) - i = int(round((len(s) - 1) * q)) - return float(s[i]) - metrics: Dict[str, Any] = {"file": str(p), "timings": {}} - for key, item in conv.timings.items(): - times = list(item.times) - cnt = int(item.count) - tot = float(sum(times)) if times else 0.0 - avg = float(tot / cnt) if cnt else 0.0 - metrics["timings"][key] = { - "scope": str(item.scope.value) if hasattr(item, "scope") else "unknown", - "count": cnt, - "total_sec": tot, - "avg_sec": avg, - "p50_sec": _q(times, 0.50), - "p90_sec": _q(times, 0.90), - } - import json as _json - 
metrics_path.write_text(_json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - -def _compute_per_page_metrics(conv: ConversionResult): - try: - doc = conv.document - except Exception: - return {"pages": []} - try: - page_count = len(doc.pages) # type: ignore[attr-defined] - except Exception: - page_count = 0 - timings = {} - try: - for key, item in conv.timings.items(): - times = list(item.times) - timings[key] = { - "scope": str(getattr(getattr(item, 'scope', None), 'value', 'unknown')), - "times": times, - "total": float(sum(times)) if times else float(getattr(item, 'total', 0.0)), - } - except Exception: - pass - def _pt(k): - arr = timings.get(k, {}).get("times", []) or [] - if page_count and len(arr) == page_count: - return [float(x) for x in arr] - return [float(x) for x in (arr + [0.0] * page_count)[:page_count]] - ocr = _pt("ocr") - parse = _pt("page_parse") - layout = _pt("layout") - table = _pt("table_structure") - # counts with sanitization and capping - fcnt = [0] * max(1, page_count) - fch = [0] * max(1, page_count) - ftr = [0] * max(1, page_count) - ftrc = [0] * max(1, page_count) - ccnt = [0] * max(1, page_count) - try: - as_dict = doc.export_to_dict() - import re as _re - _run_pat = _re.compile(r"\\\\\s*&(?P(?:\\quad|\\;|\\:|\\,|\\\\s|\s){200,})") - _ws_collapse = _re.compile(r"(?:(?:\\quad|\\;|\\:|\\,|\\\\s)|\s){2,}") - _CAP = 3000 - def _sanitize(s: str): - dropped=0 - m=_run_pat.search(s) - if m: - s_new=s[:m.start('ws')]; dropped+=len(s)-len(s_new); s=s_new - if len(s)>_CAP: - cut=s.rfind('\\\\',0,_CAP); cut = cut if cut>=0 else _CAP; dropped+=len(s)-cut; s=s[:cut] - s2=_ws_collapse.sub(' ', s) - return s2, dropped - def _walk(label, cnt, chars=False): - for node in as_dict.get("texts", []): - if str(node.get("label")) != label: - continue - raw = str(node.get("text") or node.get("orig") or "") - txt, dropped = _sanitize(raw) if label=='formula' else (raw,0) - ch = len(txt) - for prov in 
node.get("prov", []) or []: - pno = int(prov.get("page_no") or 0) - if 1 <= pno <= len(cnt): - cnt[pno - 1] += 1 - if chars: - fch[pno - 1] += ch - if label=='formula' and dropped: - ftr[pno - 1] += 1 - ftrc[pno - 1] += int(dropped) - _walk("formula", fcnt, True) - _walk("code", ccnt, False) - except Exception: - pass - try: - den_total = float(timings.get("doc_enrich", {}).get("total", 0.0)) - except Exception: - den_total = 0.0 - shares = [0.0] * max(1, page_count) - if den_total and page_count: - s = float(sum(fch)) or float(sum(fcnt)) or 0.0 - if s > 0: - base = fch if sum(fch) > 0 else fcnt - shares = [den_total * (float(x) / s) for x in base] - rows = [] - n = max(page_count, len(ocr), len(parse)) - for i in range(n): - rows.append({ - "page_no": i + 1, - "ocr_sec": float(ocr[i]) if i < len(ocr) else 0.0, - "parse_sec": float(parse[i]) if i < len(parse) else 0.0, - "layout_sec": float(layout[i]) if i < len(layout) else 0.0, - "table_sec": float(table[i]) if i < len(table) else 0.0, - "formula_count": int(fcnt[i]) if i < len(fcnt) else 0, - "formula_chars": int(fch[i]) if i < len(fch) else 0, - "formula_truncated": int(ftr[i]) if i < len(ftr) else 0, - "formula_truncated_chars": int(ftrc[i]) if i < len(ftrc) else 0, - "code_count": int(ccnt[i]) if i < len(ccnt) else 0, - "doc_enrich_share_sec": float(shares[i]) if i < len(shares) else 0.0, - }) - return {"file": str(getattr(conv.input.file, 'name', 'unknown')), "page_count": int(page_count), "totals": {"doc_enrich_total_sec": den_total}, "pages": rows} - - -def _setup_logging(level: int = logging.INFO) -> None: - logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(name)s: %(message)s") - - -if __name__ == "__main__": - _setup_logging() - ap = argparse.ArgumentParser(description="Batch OCR with Docling + RapidOCR (ONNX)") - ap.add_argument("input_dir", type=Path) - ap.add_argument("output_dir", type=Path) - ap.add_argument("--device", default=os.getenv("GLOSSAPI_DOCLING_DEVICE", "cuda:0")) - 
ap.add_argument("--text-score", type=float, default=float(os.getenv("GLOSSAPI_TEXT_SCORE", "0.45"))) - ap.add_argument("--images-scale", type=float, default=float(os.getenv("GLOSSAPI_IMAGES_SCALE", "1.25"))) - ap.add_argument("--docling-formula", dest="docling_formula", action="store_true", help="Enable formula enrichment (CodeFormula)") - ap.add_argument("--no-docling-formula", dest="docling_formula", action="store_false") - ap.set_defaults(docling_formula=False) - ap.add_argument("--formula-batch", type=int, default=int(os.getenv("GLOSSAPI_FORMULA_BATCH", "8")), help="CodeFormula batch size (default 8)") - ap.add_argument("--docling-code", dest="docling_code", action="store_true", help="Enable code enrichment") - ap.add_argument("--no-docling-code", dest="docling_code", action="store_false") - ap.set_defaults(docling_code=False) - ap.add_argument("--normalize-output", action="store_true") - ap.add_argument("--no-normalize-output", dest="normalize_output", action="store_false") - ap.set_defaults(normalize_output=True) - ap.add_argument("--timeout-s", type=int, default=int(os.getenv("GLOSSAPI_DOCLING_TIMEOUT", "600"))) - args = ap.parse_args() - # Apply formula batch size if requested - try: - if getattr(args, "docling_formula", False): - from docling.models.code_formula_model import CodeFormulaModel # type: ignore - if isinstance(args.formula_batch, int) and args.formula_batch > 0: - CodeFormulaModel.elements_batch_size = int(args.formula_batch) # type: ignore[attr-defined] - except Exception: - pass - convert_dir( - args.input_dir, - args.output_dir, - device=args.device, - text_score=args["text_score"] if isinstance(args, dict) else args.text_score, - images_scale=args.images_scale, - formula_enrichment=args.docling_formula, - code_enrichment=args.docling_code, - normalize_output=args.normalize_output, - timeout_s=args.timeout_s, - ) diff --git a/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup b/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup deleted 
file mode 100644 index f80344d..0000000 --- a/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup +++ /dev/null @@ -1,501 +0,0 @@ -"""Docling + RapidOCR (ONNX) pipeline for batch PDF OCR. - -Provides build_pipeline() and convert_dir() mirroring the behavior of the -repro script greek_pdf_ocr.py, but self-contained inside glossapi and with -packaged ONNX models/keys. Includes robust logging and native Docling timeout. -""" -from __future__ import annotations - -import argparse -import logging -import os -import sys -import time -import inspect -import importlib -from pathlib import Path -from typing import Iterable, Optional, Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import ( - ConversionResult, - DocumentConverter, - PdfFormatOption, -) -from docling.datamodel.settings import settings - -from glossapi._rapidocr_paths import resolve_packaged_onnx_and_keys -from glossapi.metrics import compute_per_page_metrics -# Ensure RapidOCR factory is registered (avoids masked errors in older paths) -import docling.models.rapid_ocr_model # noqa: F401 - - -log = logging.getLogger(__name__) - - -def _maybe_import_torch(*, force: bool = False): - torch_mod = sys.modules.get("torch") - if torch_mod is not None: - return torch_mod - try: - return importlib.import_module("torch") # type: ignore - except Exception: - return None - return None - - -def _available_ort_providers() -> str: - try: - import onnxruntime as ort # type: ignore - return ",".join(ort.get_available_providers()) - except Exception as e: - return f"unavailable: {e}" - - -def _supports_native_timeout(converter: DocumentConverter) -> Optional[str]: - try: - sig = inspect.signature(converter.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - 
return name - except Exception: - pass - return None - - -def _convert_with_timeout(converter: DocumentConverter, *, source: str, raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return converter.convert(source=source, **kw) - - -def _convert_all_with_timeout(converter: DocumentConverter, *, sources: Iterable[str], raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return list(converter.convert_all(sources, **kw)) - - -def build_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - # Delegate to canonical pipeline builder to avoid duplication - try: - from glossapi._pipeline import build_rapidocr_pipeline # type: ignore - except Exception as _e: # pragma: no cover - # Backward-compat fallback: inline builder (kept minimal to satisfy tests) - from docling.datamodel.pipeline_options import AcceleratorOptions, TableStructureOptions, TableFormerMode, LayoutOptions, PdfPipelineOptions, RapidOcrOptions # type: ignore - dev = device or "cuda:0" - acc = AcceleratorOptions(device=dev) - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError("Packaged RapidOCR ONNX models/keys not found under glossapi.models.") - ocr_opts = RapidOcrOptions( - backend="onnxruntime", lang=["el", "en"], force_full_page_ocr=False, - use_det=True, use_cls=False, use_rec=True, text_score=text_score, - det_model_path=r.det, rec_model_path=r.rec, cls_model_path=r.cls, print_verbose=False, - ) - 
ocr_opts.rec_keys_path = r.keys - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - opts = PdfPipelineOptions( - accelerator_options=acc, - ocr_options=ocr_opts, - layout_options=LayoutOptions(), - do_ocr=True, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - from docling.document_converter import DocumentConverter, PdfFormatOption # type: ignore - from docling.datamodel.base_models import InputFormat # type: ignore - return DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}), opts - return build_rapidocr_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - -def convert_dir( - input_dir: Path, - output_dir: Path, - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, - normalize_output: bool = True, - timeout_s: Optional[int] = 600, -) -> None: - input_dir = Path(input_dir) - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Device-aware preflight: only enforce CUDA provider when device requests CUDA - want_cuda = isinstance(device, str) and device.lower().startswith("cuda") - if want_cuda: - try: - import onnxruntime as _ort # type: ignore - _providers = _ort.get_available_providers() - if "CUDAExecutionProvider" not in _providers: - raise RuntimeError(f"CUDAExecutionProvider not available in onnxruntime providers={_providers}") - except Exception as e: - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - if formula_enrichment and want_cuda: - try: - 
torch_mod = _maybe_import_torch(force=True) - if torch_mod is None or not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") - except Exception as e: - raise RuntimeError(f"Torch CUDA preflight failed: {e}") - - # Optional: tune CodeFormula batch size and math precision when enrichment is requested - if formula_enrichment: - try: - torch_mod = _maybe_import_torch() - if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - try: - torch_mod.set_float32_matmul_precision("high") - except Exception: - pass - except Exception: - pass - - engine, opts = build_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - # Logging block - log.info("Docling+RapidOCR pipeline ready") - log.info("device=%s text_score=%.2f images_scale=%.2f formula=%s code=%s", device, text_score, images_scale, formula_enrichment, code_enrichment) - log.info("ORT providers: %s", _available_ort_providers()) - log.info("Caches: HF_HOME=%s XDG_CACHE_HOME=%s DOCLING_CACHE_DIR=%s", os.getenv("HF_HOME"), os.getenv("XDG_CACHE_HOME"), os.getenv("DOCLING_CACHE_DIR")) - try: - r = resolve_packaged_onnx_and_keys() - import os as _os - log.info( - "Models: det=%s rec=%s cls=%s keys=%s", - _os.path.basename(r.det) if r.det else None, - _os.path.basename(r.rec) if r.rec else None, - _os.path.basename(r.cls) if r.cls else None, - _os.path.basename(r.keys) if r.keys else None, - ) - except Exception: - pass - - # Collect PDFs - pdfs = sorted(str(p) for p in input_dir.rglob("*.pdf") if p.is_file()) - if not pdfs: - log.warning("No PDFs under %s", input_dir) - return - - # Enable timing profile - try: - settings.debug.profile_pipeline_timings = True - except Exception: - pass - - total_start = time.time() - # If we got a StandardPdfPipeline, it has a .convert method similar in spirit - # to 
DocumentConverter.convert; detect native timeout support by signature. - def _native_timeout_kw(obj) -> Optional[str]: - try: - sig = inspect.signature(obj.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - return None - return None - - tkw = _native_timeout_kw(engine) - for src in pdfs: - try: - kwargs = {} - if tkw and timeout_s is not None: - kwargs[tkw] = int(timeout_s) - conv = engine.convert(source=src, **kwargs) # type: ignore - _export(conv, output_dir, normalize_output=normalize_output) - # Per-page metrics and per-page console logs - try: - per_page = compute_per_page_metrics(conv) - # Harmonize with GlossExtract: write to sibling json/metrics/ - metrics_dir = output_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - pp = metrics_dir / f"{Path(src).stem}.per_page.metrics.json" - import json as _json - pp.write_text(_json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8") - for row in per_page.get("pages", []): - log.info("[PAGE] %s p%d: parse=%.3fs ocr=%.3fs formulas=%d code=%d", - Path(src).name, - int(row.get("page_no", 0)), - float(row.get("parse_sec", 0.0)), - float(row.get("ocr_sec", 0.0)), - int(row.get("formula_count", 0)), - int(row.get("code_count", 0))) - except Exception as _e: - log.warning("Failed to compute per-page metrics for %s: %s", src, _e) - log.info("[OK] %s", src) - except Exception as e: - log.error("[FAIL] %s: %s", src, e) - log.info("Done in %.2fs", time.time() - total_start) - - -def _normalize_text(s: str) -> str: - import unicodedata, re - zw = re.compile(r"[\u200B\u200C\u200D\uFEFF]") - s = unicodedata.normalize("NFC", s) - return zw.sub("", s) - - -def _normalize_obj(o): - if isinstance(o, str): - return _normalize_text(o) - if isinstance(o, list): - return [_normalize_obj(x) for x in o] - if isinstance(o, dict): - return {k: _normalize_obj(v) for k, v in o.items()} - return o - - -def _export(conv: ConversionResult, 
out_dir: Path, *, normalize_output: bool) -> None: - doc = conv.document - p = Path(conv.input.file) - md_path = out_dir / f"{p.stem}.md" - # Write Docling JSON under sibling json/ directory (no JSON in markdown dir) - json_dir = out_dir.parent / "json" - json_dir.mkdir(parents=True, exist_ok=True) - json_path = json_dir / f"{p.stem}.docling.json" - # Harmonize metrics location with GlossExtract: sibling json/metrics/ - metrics_dir = out_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - metrics_path = metrics_dir / f"{p.stem}.metrics.json" - - md = doc.export_to_markdown() - if normalize_output: - md = _normalize_text(md) - md_path.write_text(md, encoding="utf-8") - # Export DoclingDocument JSON via helper (compressed by default) - try: - from glossapi.ocr.utils.json_io import export_docling_json # type: ignore - # Attach minimal meta for provenance - meta = {"source_pdf_relpath": str(p)} - export_docling_json(doc, json_path, compress="zstd", meta=meta) # type: ignore[arg-type] - except Exception: - # Fallback: write plain JSON under json/ without compression - try: - import json as _json - dd = doc.export_to_dict() - if normalize_output: - dd = _normalize_obj(dd) - json_path.write_text(_json.dumps(dd, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - # Timings if present - try: - from typing import Any, Dict, List - def _q(vals: list[float], q: float) -> float: - if not vals: - return 0.0 - s = sorted(vals) - i = int(round((len(s) - 1) * q)) - return float(s[i]) - metrics: Dict[str, Any] = {"file": str(p), "timings": {}} - for key, item in conv.timings.items(): - times = list(item.times) - cnt = int(item.count) - tot = float(sum(times)) if times else 0.0 - avg = float(tot / cnt) if cnt else 0.0 - metrics["timings"][key] = { - "scope": str(item.scope.value) if hasattr(item, "scope") else "unknown", - "count": cnt, - "total_sec": tot, - "avg_sec": avg, - "p50_sec": _q(times, 0.50), - "p90_sec": _q(times, 
0.90), - } - import json as _json - metrics_path.write_text(_json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - -def _compute_per_page_metrics(conv: ConversionResult): - try: - doc = conv.document - except Exception: - return {"pages": []} - try: - page_count = len(doc.pages) # type: ignore[attr-defined] - except Exception: - page_count = 0 - timings = {} - try: - for key, item in conv.timings.items(): - times = list(item.times) - timings[key] = { - "scope": str(getattr(getattr(item, 'scope', None), 'value', 'unknown')), - "times": times, - "total": float(sum(times)) if times else float(getattr(item, 'total', 0.0)), - } - except Exception: - pass - def _pt(k): - arr = timings.get(k, {}).get("times", []) or [] - if page_count and len(arr) == page_count: - return [float(x) for x in arr] - return [float(x) for x in (arr + [0.0] * page_count)[:page_count]] - ocr = _pt("ocr") - parse = _pt("page_parse") - layout = _pt("layout") - table = _pt("table_structure") - # counts with sanitization and capping - fcnt = [0] * max(1, page_count) - fch = [0] * max(1, page_count) - ftr = [0] * max(1, page_count) - ftrc = [0] * max(1, page_count) - ccnt = [0] * max(1, page_count) - try: - as_dict = doc.export_to_dict() - import re as _re - _run_pat = _re.compile(r"\\\\\s*&(?P(?:\\quad|\\;|\\:|\\,|\\\\s|\s){200,})") - _ws_collapse = _re.compile(r"(?:(?:\\quad|\\;|\\:|\\,|\\\\s)|\s){2,}") - _CAP = 3000 - def _sanitize(s: str): - dropped=0 - m=_run_pat.search(s) - if m: - s_new=s[:m.start('ws')]; dropped+=len(s)-len(s_new); s=s_new - if len(s)>_CAP: - cut=s.rfind('\\\\',0,_CAP); cut = cut if cut>=0 else _CAP; dropped+=len(s)-cut; s=s[:cut] - s2=_ws_collapse.sub(' ', s) - return s2, dropped - def _walk(label, cnt, chars=False): - for node in as_dict.get("texts", []): - if str(node.get("label")) != label: - continue - raw = str(node.get("text") or node.get("orig") or "") - txt, dropped = _sanitize(raw) if label=='formula' else (raw,0) - ch = 
len(txt) - for prov in node.get("prov", []) or []: - pno = int(prov.get("page_no") or 0) - if 1 <= pno <= len(cnt): - cnt[pno - 1] += 1 - if chars: - fch[pno - 1] += ch - if label=='formula' and dropped: - ftr[pno - 1] += 1 - ftrc[pno - 1] += int(dropped) - _walk("formula", fcnt, True) - _walk("code", ccnt, False) - except Exception: - pass - try: - den_total = float(timings.get("doc_enrich", {}).get("total", 0.0)) - except Exception: - den_total = 0.0 - shares = [0.0] * max(1, page_count) - if den_total and page_count: - s = float(sum(fch)) or float(sum(fcnt)) or 0.0 - if s > 0: - base = fch if sum(fch) > 0 else fcnt - shares = [den_total * (float(x) / s) for x in base] - rows = [] - n = max(page_count, len(ocr), len(parse)) - for i in range(n): - rows.append({ - "page_no": i + 1, - "ocr_sec": float(ocr[i]) if i < len(ocr) else 0.0, - "parse_sec": float(parse[i]) if i < len(parse) else 0.0, - "layout_sec": float(layout[i]) if i < len(layout) else 0.0, - "table_sec": float(table[i]) if i < len(table) else 0.0, - "formula_count": int(fcnt[i]) if i < len(fcnt) else 0, - "formula_chars": int(fch[i]) if i < len(fch) else 0, - "formula_truncated": int(ftr[i]) if i < len(ftr) else 0, - "formula_truncated_chars": int(ftrc[i]) if i < len(ftrc) else 0, - "code_count": int(ccnt[i]) if i < len(ccnt) else 0, - "doc_enrich_share_sec": float(shares[i]) if i < len(shares) else 0.0, - }) - return {"file": str(getattr(conv.input.file, 'name', 'unknown')), "page_count": int(page_count), "totals": {"doc_enrich_total_sec": den_total}, "pages": rows} - - -def _setup_logging(level: int = logging.INFO) -> None: - logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(name)s: %(message)s") - - -if __name__ == "__main__": - _setup_logging() - ap = argparse.ArgumentParser(description="Batch OCR with Docling + RapidOCR (ONNX)") - ap.add_argument("input_dir", type=Path) - ap.add_argument("output_dir", type=Path) - ap.add_argument("--device", 
default=os.getenv("GLOSSAPI_DOCLING_DEVICE", "cuda:0")) - ap.add_argument("--text-score", type=float, default=float(os.getenv("GLOSSAPI_TEXT_SCORE", "0.45"))) - ap.add_argument("--images-scale", type=float, default=float(os.getenv("GLOSSAPI_IMAGES_SCALE", "1.25"))) - ap.add_argument("--docling-formula", dest="docling_formula", action="store_true", help="Enable formula enrichment (CodeFormula)") - ap.add_argument("--no-docling-formula", dest="docling_formula", action="store_false") - ap.set_defaults(docling_formula=False) - ap.add_argument("--formula-batch", type=int, default=int(os.getenv("GLOSSAPI_FORMULA_BATCH", "8")), help="CodeFormula batch size (default 8)") - ap.add_argument("--docling-code", dest="docling_code", action="store_true", help="Enable code enrichment") - ap.add_argument("--no-docling-code", dest="docling_code", action="store_false") - ap.set_defaults(docling_code=False) - ap.add_argument("--normalize-output", action="store_true") - ap.add_argument("--no-normalize-output", dest="normalize_output", action="store_false") - ap.set_defaults(normalize_output=True) - ap.add_argument("--timeout-s", type=int, default=int(os.getenv("GLOSSAPI_DOCLING_TIMEOUT", "600"))) - args = ap.parse_args() - # Apply formula batch size if requested - try: - if getattr(args, "docling_formula", False): - from docling.models.code_formula_model import CodeFormulaModel # type: ignore - if isinstance(args.formula_batch, int) and args.formula_batch > 0: - CodeFormulaModel.elements_batch_size = int(args.formula_batch) # type: ignore[attr-defined] - except Exception: - pass - convert_dir( - args.input_dir, - args.output_dir, - device=args.device, - text_score=args["text_score"] if isinstance(args, dict) else args.text_score, - images_scale=args.images_scale, - formula_enrichment=args.docling_formula, - code_enrichment=args.docling_code, - normalize_output=args.normalize_output, - timeout_s=args.timeout_s, - ) diff --git a/src/glossapi/ocr/rapidocr/onnx.py 
b/src/glossapi/ocr/rapidocr/onnx.py deleted file mode 100644 index 57430d1..0000000 --- a/src/glossapi/ocr/rapidocr/onnx.py +++ /dev/null @@ -1,105 +0,0 @@ -"""OCR helpers for GlossAPI using Docling + RapidOCR (ONNXRuntime). - -GPU-first OCR that auto-discovers packaged ONNX models and Greek keys within -the installed `glossapi` package. Designed as a drop-in for Corpus.ocr(). -""" -from __future__ import annotations - -from pathlib import Path -from typing import Optional, Dict, Any, Tuple - -_PIPELINE_CACHE: dict[str, Tuple[object, object]] = {} - - -def _build_pipeline( - device: Optional[str] = None, - *, - use_cls: Optional[bool] = None, - text_score: Optional[float] = None, - images_scale: Optional[float] = None, -): - # Delegate to canonical builder to avoid duplication - from glossapi.ocr.rapidocr.pipeline import build_rapidocr_pipeline - - engine, opts = build_rapidocr_pipeline( - device=(device or "cuda:0"), - text_score=(0.45 if text_score is None else float(text_score)), - images_scale=(1.25 if images_scale is None else float(images_scale)), - formula_enrichment=False, - code_enrichment=False, - ) - # Apply use_cls override if requested - try: - if use_cls is not None and hasattr(opts, "ocr_options"): - setattr(opts.ocr_options, "use_cls", bool(use_cls)) # type: ignore[attr-defined] - except Exception: - pass - return engine, opts - - -def run_rapidocr_onnx( - pdf_path: Path | str, - *, - device: Optional[str] = None, - use_cls: Optional[bool] = None, - text_score: Optional[float] = None, - images_scale: Optional[float] = None, - max_pages: Optional[int] = None, -) -> Dict[str, Any]: - """Run Docling + RapidOCR (ONNX) OCR on a PDF and return markdown text. 
- - Returns - ------- - dict with keys: - - markdown_text: str - - duration_s: float - - pages: int - - models: dict with file names of det/rec/cls/keys - """ - from time import perf_counter - pdf_p = Path(pdf_path) - if not pdf_p.exists(): - raise FileNotFoundError(pdf_p) - - key = str(device or "cuda:0").lower() - cached = _PIPELINE_CACHE.get(key) - if cached is None: - pipe, r = _build_pipeline(device=device, use_cls=use_cls, text_score=text_score, images_scale=images_scale) - _PIPELINE_CACHE[key] = (pipe, r) - else: - pipe, r = cached # type: ignore[misc] - - t0 = perf_counter() - conv = pipe.convert(source=str(pdf_p)) # type: ignore[attr-defined] - doc = conv.document - md_text = doc.export_to_markdown() - duration = perf_counter() - t0 - - # Attempt to get page count from conv/document - pages = 0 - try: - if hasattr(doc, "pages"): - pages = len(doc.pages) # type: ignore[attr-defined] - except Exception: - pages = 0 - - # Return model identifiers as file names only (no full paths) - import os as _os - models = { - "det": _os.path.basename(r.det) if r.det else None, - "rec": _os.path.basename(r.rec) if r.rec else None, - "cls": _os.path.basename(r.cls) if r.cls else None, - "keys": _os.path.basename(r.keys) if r.keys else None, - } - - return { - "markdown_text": md_text or "", - "duration_s": duration, - "pages": int(pages), - "models": models, - } - - -__all__ = [ - "run_rapidocr_onnx", -] diff --git a/src/glossapi/ocr/rapidocr/pipeline.py b/src/glossapi/ocr/rapidocr/pipeline.py deleted file mode 100644 index a623c3d..0000000 --- a/src/glossapi/ocr/rapidocr/pipeline.py +++ /dev/null @@ -1,229 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PictureDescriptionApiOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, 
-) -from docling.document_converter import DocumentConverter, PdfFormatOption - -from ._paths import resolve_packaged_onnx_and_keys -from .pool import GLOBAL_RAPID_OCR_POOL -from .safe import SafeRapidOcrModel, patch_docling_rapidocr - -_logger = logging.getLogger(__name__) - -patch_docling_rapidocr() - - -def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: - """Return accelerator options and whether CUDA was requested.""" - dev = device or "cuda:0" - if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): - acc = AcceleratorOptions(device=dev) - want_cuda = dev.lower().startswith("cuda") - else: - want_cuda = str(dev).lower().startswith("cuda") - acc = AcceleratorOptions( - device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU - ) - return acc, want_cuda - - -def _apply_common_pdf_options( - *, - acc: AcceleratorOptions, - images_scale: float, - formula_enrichment: bool, - code_enrichment: bool, -) -> PdfPipelineOptions: - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - try: - if hasattr(table_opts, "do_cell_matching"): - table_opts.do_cell_matching = True - except Exception: - pass - - opts = PdfPipelineOptions( - accelerator_options=acc, - layout_options=LayoutOptions(), - do_ocr=False, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - # Prefer lightweight placeholder picture descriptions to avoid heavy VLM backends. 
- try: - if hasattr(opts, "do_picture_description"): - opts.do_picture_description = False - if getattr(opts, "picture_description_options", None) is None: - opts.picture_description_options = PictureDescriptionApiOptions() - if hasattr(opts, "enable_remote_services"): - opts.enable_remote_services = False - except Exception: - pass - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - return opts - - -def build_layout_pipeline( - *, - device: str = "cuda:0", - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Builder for a Docling PDF pipeline without RapidOCR. - - Returns ``(converter, PdfPipelineOptions)`` where ``converter`` is a - ``StandardPdfPipeline`` configured for layout extraction only. - """ - - acc, _ = _resolve_accelerator(device) - opts = _apply_common_pdf_options( - acc=acc, - images_scale=float(images_scale), - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] - return pipeline, opts - - -def build_rapidocr_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Canonical builder for Docling + RapidOCR pipeline. - - Returns a tuple (engine, PdfPipelineOptions). Prefers explicit RapidOCR injection - when supported; otherwise returns a DocumentConverter using the factory path. - """ - - def _fallback_layout(reason: str) -> Tuple[object, PdfPipelineOptions]: - _logger.warning( - "RapidOCR pipeline fallback: %s. 
Using Docling layout-only configuration.", - reason, - ) - pipeline, opts = build_layout_pipeline( - device=device, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - return pipeline, opts - - acc, want_cuda = _resolve_accelerator(device) - - # Optional provider preflight only when CUDA requested - if want_cuda: - try: - import onnxruntime as ort # type: ignore - - prov = ort.get_available_providers() - if "CUDAExecutionProvider" not in prov: - raise RuntimeError(f"CUDAExecutionProvider not available: {prov}") - except Exception as e: # pragma: no cover - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - return _fallback_layout("packaged RapidOCR ONNX assets missing") - - ocr_opts = RapidOcrOptions( - backend="onnxruntime", - lang=["el", "en"], - force_full_page_ocr=False, - use_det=True, - use_cls=False, - use_rec=True, - text_score=text_score, - det_model_path=r.det, - rec_model_path=r.rec, - cls_model_path=r.cls, - print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - - opts = _apply_common_pdf_options( - acc=acc, - images_scale=float(images_scale), - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - opts.do_ocr = True - opts.ocr_options = ocr_opts - - # Prefer explicit injection of RapidOCR model when available - try: - from docling.models.rapid_ocr_model import RapidOcrModel # type: ignore - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - import inspect - - sig = inspect.signature(StandardPdfPipeline.__init__) - if "ocr_model" not in sig.parameters: - raise RuntimeError("Docling build does not support RapidOCR injection") - - def _factory(): - try: - return 
SafeRapidOcrModel(True, None, ocr_opts, acc) # type: ignore[arg-type] - except Exception: # pragma: no cover - # Fall back to the stock implementation if our wrapper misbehaves. - return RapidOcrModel(True, None, ocr_opts, acc) # type: ignore[arg-type] - - pooled_model = GLOBAL_RAPID_OCR_POOL.get( - str(acc.device), - ocr_opts, - _factory, - expected_type=SafeRapidOcrModel, - ) - pipeline = StandardPdfPipeline(opts, ocr_model=pooled_model) # type: ignore - return pipeline, opts - except Exception as exc: - _logger.warning( - "RapidOCR injection unavailable (%s); using DocumentConverter factory path.", - exc, - ) - - # Fallback: use DocumentConverter factory - try: - converter = DocumentConverter( - format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)} - ) - return converter, opts - except Exception as exc: - return _fallback_layout(f"DocumentConverter failed: {exc}") - - -__all__ = ["build_layout_pipeline", "build_rapidocr_pipeline"] diff --git a/src/glossapi/ocr/rapidocr/pool.py b/src/glossapi/ocr/rapidocr/pool.py deleted file mode 100644 index db1e8f2..0000000 --- a/src/glossapi/ocr/rapidocr/pool.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Shared RapidOCR engine pooling utilities.""" -from __future__ import annotations - -from dataclasses import dataclass -from threading import Lock -from typing import Callable, Dict, Optional, Union, Type - -from docling.datamodel.pipeline_options import RapidOcrOptions - - -@dataclass(frozen=True) -class _PoolKey: - device: str - det_model_path: str - rec_model_path: str - cls_model_path: str - lang: Tuple[str, ...] 
- text_score: float - use_det: bool - use_cls: bool - use_rec: bool - - -class RapidOcrEnginePool: - """Process-local cache of RapidOCR models keyed by configuration.""" - - def __init__(self) -> None: - self._lock = Lock() - self._cache: Dict[_PoolKey, object] = {} - - def _make_key(self, device: str, opts: RapidOcrOptions) -> _PoolKey: - lang = tuple(opts.lang or []) - return _PoolKey( - device=str(device), - det_model_path=str(getattr(opts, "det_model_path", "")), - rec_model_path=str(getattr(opts, "rec_model_path", "")), - cls_model_path=str(getattr(opts, "cls_model_path", "")), - lang=lang, - text_score=float(getattr(opts, "text_score", 0.0)), - use_det=bool(getattr(opts, "use_det", True)), - use_cls=bool(getattr(opts, "use_cls", False)), - use_rec=bool(getattr(opts, "use_rec", True)), - ) - - def get( - self, - device: str, - opts: RapidOcrOptions, - factory: Callable[[], object], - *, - expected_type: Optional[Union[Type[object], tuple[Type[object], ...]]] = None, - ) -> object: - key = self._make_key(device, opts) - with self._lock: - model = self._cache.get(key) - if expected_type is not None and model is not None and not isinstance(model, expected_type): - self._cache.pop(key, None) - model = None - if model is None: - model = factory() - if expected_type is None or isinstance(model, expected_type): - self._cache[key] = model - return model - - def clear(self) -> None: - with self._lock: - self._cache.clear() - - -GLOBAL_RAPID_OCR_POOL = RapidOcrEnginePool() - -__all__ = ["RapidOcrEnginePool", "GLOBAL_RAPID_OCR_POOL"] diff --git a/src/glossapi/ocr/rapidocr/safe.py b/src/glossapi/ocr/rapidocr/safe.py deleted file mode 100644 index 5534563..0000000 --- a/src/glossapi/ocr/rapidocr/safe.py +++ /dev/null @@ -1,301 +0,0 @@ -"""Temporary wrappers around Docling's RapidOCR integration. - -The upstream Docling release (2.48.x) does not tolerate RapidOCR returning -``None`` for a given crop. 
That bubbles up as an AttributeError inside the -conversion loop and the entire document fails. Until Docling includes a fix, we -wrap the loader so that ``None`` simply means "no detections" and processing -continues. Once Docling ships a release with the guard we can drop this shim and -revert to the vanilla ``RapidOcrModel``. -""" - -from __future__ import annotations - -import importlib.util -import sys -from collections.abc import Iterable -from pathlib import Path -from typing import Optional, Type - -import numpy - -from docling.datamodel.base_models import Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import OcrOptions, RapidOcrOptions -from docling.models.rapid_ocr_model import RapidOcrModel as _RapidOcrModel -from docling.models.rapid_ocr_model import TextCell, _log -from docling.utils.profiling import TimeRecorder -from docling_core.types.doc import BoundingBox, CoordOrigin -from docling_core.types.doc.page import BoundingRectangle - -from ._paths import resolve_packaged_onnx_and_keys - - -class SafeRapidOcrModel(_RapidOcrModel): - """Drop-in RapidOCR wrapper that copes with ``None`` OCR results. - - Docling 2.48.0 assumes ``self.reader`` always returns an object with - ``boxes/txts/scores``. RapidOCR occasionally yields ``None`` for problematic - crops, which crashes the extractor. We normalise the return value before the - original list(zip(...)) call and treat anything unexpected as "no boxes". - Remove this once Docling hardens the upstream implementation. - """ - - # NOTE: keep signature identical so StandardPdfPipeline can instantiate it. 
- _rapidocr_available: Optional[bool] = None - - def __init__( - self, - enabled: bool, - artifacts_path: Optional[Path], - options: RapidOcrOptions, - accelerator_options, - ): - rapidocr_available = self._rapidocr_available - if rapidocr_available is None: - rapidocr_available = bool( - importlib.util.find_spec("rapidocr") is not None or "rapidocr" in sys.modules - ) - SafeRapidOcrModel._rapidocr_available = rapidocr_available - - effective_enabled = bool(enabled and rapidocr_available) - if enabled and not rapidocr_available: - _log.warning( - "RapidOCR python package not found; continuing with Docling pipeline OCR disabled." - ) - - if effective_enabled: - try: - resolved = resolve_packaged_onnx_and_keys() - - _log.warning( - 'SafeRapidOcrModel initial options: det=%s rec=%s cls=%s keys=%s', - getattr(options, 'det_model_path', None), - getattr(options, 'rec_model_path', None), - getattr(options, 'cls_model_path', None), - getattr(options, 'rec_keys_path', None), - ) - - if resolved.det: - options.det_model_path = resolved.det - if resolved.rec: - options.rec_model_path = resolved.rec - if resolved.cls: - options.cls_model_path = resolved.cls - if resolved.keys: - options.rec_keys_path = resolved.keys - - try: - from rapidocr.ch_ppocr_rec import main as _rapidocr_rec_main - - if not getattr(_rapidocr_rec_main.TextRecognizer, '_glossapi_patch', False): - original_get_character_dict = _rapidocr_rec_main.TextRecognizer.get_character_dict - - def _patched_get_character_dict(self, cfg): - try: - current_keys = cfg.get('keys_path', None) - current_rec_keys = cfg.get('rec_keys_path', None) - if current_rec_keys is None and current_keys is not None: - cfg['rec_keys_path'] = current_keys - _log.warning('Patched RapidOCR cfg: set rec_keys_path from keys_path=%s', current_keys) - else: - _log.warning('Patched RapidOCR cfg: existing rec_keys_path=%s keys_path=%s', current_rec_keys, current_keys) - except Exception: - _log.warning('RapidOCR cfg inspection failed', 
exc_info=True) - return original_get_character_dict(self, cfg) - - _rapidocr_rec_main.TextRecognizer.get_character_dict = _patched_get_character_dict - _rapidocr_rec_main.TextRecognizer._glossapi_patch = True - except Exception: - _log.warning('Failed to patch RapidOCR TextRecognizer for keys fallback', exc_info=True) - - _log.warning( - 'SafeRapidOcrModel using packaged assets: det=%s rec=%s cls=%s keys=%s', - options.det_model_path, - options.rec_model_path, - options.cls_model_path, - options.rec_keys_path, - ) - except Exception: - _log.warning( - 'SafeRapidOcrModel bootstrap failed to resolve packaged assets', - exc_info=True, - ) - - super().__init__( - enabled=effective_enabled, - artifacts_path=artifacts_path, - options=options, - accelerator_options=accelerator_options, - ) - - @classmethod - def get_options_type(cls) -> Type[OcrOptions]: - return RapidOcrOptions - - def _normalise_result(self, result): - """Return an iterable of (bbox, text, score) triples. - - RapidOCR returns ``None`` or semi-populated structures in some corner - cases. We swallow those and log a one-line warning so the page still - progresses through the pipeline. 
- """ - - if result is None: - _log.warning("RapidOCR returned None; skipping crop") - return [] - boxes = getattr(result, "boxes", None) - txts = getattr(result, "txts", None) - scores = getattr(result, "scores", None) - if boxes is None or txts is None or scores is None: - _log.warning("RapidOCR returned incomplete data; treating crop as empty") - return [] - try: - return list(zip(boxes.tolist(), txts, scores)) - except Exception as exc: # pragma: no cover - defensive only - _log.warning("RapidOCR result normalisation failed: %s", exc) - return [] - - def __call__( - self, conv_res: ConversionResult, page_batch: Iterable[Page] - ) -> Iterable[Page]: - if not self.enabled: - yield from page_batch - return - - for page in page_batch: - assert page._backend is not None - if not page._backend.is_valid(): - yield page - continue - - with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) - - all_ocr_cells = [] - for ocr_rect in ocr_rects: - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) - im = numpy.array(high_res_image) - raw_result = self.reader( - im, - use_det=self.options.use_det, - use_cls=self.options.use_cls, - use_rec=self.options.use_rec, - ) - result = self._normalise_result(raw_result) - del high_res_image - del im - - if not result: - continue - - cells = [ - TextCell( - index=ix, - text=line[1], - orig=line[1], - confidence=line[2], - from_ocr=True, - rect=BoundingRectangle.from_bounding_box( - BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, - ) - ), - ) - for ix, line in enumerate(result) - ] - all_ocr_cells.extend(cells) - - self.post_process_cells(all_ocr_cells, page) - - from docling.datamodel.settings import settings - - if settings.debug.visualize_ocr: - 
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) - - yield page - - -def patch_docling_rapidocr() -> bool: - """Replace Docling's RapidOcrModel with the safe shim if available.""" - - try: - import docling.models.rapid_ocr_model as rapid_module - except Exception: # pragma: no cover - Docling missing - return False - - current = getattr(rapid_module, "RapidOcrModel", None) - if current is SafeRapidOcrModel: - return False - - rapid_module.RapidOcrModel = SafeRapidOcrModel - try: - from docling.models.factories import get_ocr_factory # type: ignore - import logging - except Exception: - return True - - try: - factory = get_ocr_factory() - options_type = SafeRapidOcrModel.get_options_type() - - if hasattr(factory, "classes"): - factory.classes[options_type] = SafeRapidOcrModel - elif hasattr(factory, "_classes"): - factory._classes[options_type] = SafeRapidOcrModel - logging.getLogger(__name__).info( - "Registered SafeRapidOcrModel for %s", options_type - ) - try: - from docling.pipeline import standard_pdf_pipeline as _std_pdf # type: ignore - from docling.datamodel.pipeline_options import RapidOcrOptions # type: ignore - from functools import lru_cache - except Exception as _exc: # pragma: no cover - best effort - logging.getLogger(__name__).warning( - "Docling factory patch limited to local mutation: %s", _exc - ) - else: - original_get_factory = getattr( - _std_pdf.get_ocr_factory, "__wrapped__", _std_pdf.get_ocr_factory - ) - - def _ensure_safe(factory_obj): - try: - current = factory_obj.classes.get(RapidOcrOptions) - if current is not SafeRapidOcrModel: - factory_obj.classes[RapidOcrOptions] = SafeRapidOcrModel - except AttributeError: - current = getattr(factory_obj, "_classes", {}).get(RapidOcrOptions) - if current is not SafeRapidOcrModel: - getattr(factory_obj, "_classes", {})[RapidOcrOptions] = SafeRapidOcrModel - return factory_obj - - @lru_cache(maxsize=None) - def _patched_get_ocr_factory(allow_external_plugins: bool = False): - return 
_ensure_safe(original_get_factory(allow_external_plugins)) - - _patched_get_ocr_factory.__wrapped__ = original_get_factory # type: ignore[attr-defined] - _std_pdf.get_ocr_factory = _patched_get_ocr_factory # type: ignore[attr-defined] - try: - _ensure_safe(_std_pdf.get_ocr_factory(False)) - except Exception: - pass - except Exception as exc: # pragma: no cover - best effort - import logging - - logging.getLogger(__name__).warning( - "Failed to re-register SafeRapidOcrModel: %s", exc - ) - return True - - -__all__ = ["SafeRapidOcrModel", "patch_docling_rapidocr"] diff --git a/src/glossapi/ocr/utils/cleaning.py b/src/glossapi/ocr/utils/cleaning.py index 9b4e287..c194c72 100644 --- a/src/glossapi/ocr/utils/cleaning.py +++ b/src/glossapi/ocr/utils/cleaning.py @@ -260,11 +260,207 @@ def _detect_repeated_lines_cut(text: str, *, threshold: int = 10) -> Optional[in return None +def _is_private_use_char(ch: str) -> bool: + codepoint = ord(ch) + return ( + 0xE000 <= codepoint <= 0xF8FF + or 0xF0000 <= codepoint <= 0xFFFFD + or 0x100000 <= codepoint <= 0x10FFFD + ) + + +def _is_symbol_garbage_char(ch: str) -> bool: + if _is_private_use_char(ch): + return True + return ch in { + "•", + "", + "·", + "◦", + "▪", + "▫", + "‣", + "∙", + "⋅", + "●", + "○", + "◉", + "◌", + "◆", + "◇", + "■", + "□", + "▲", + "△", + "▼", + "▽", + "►", + "◄", + "◊", + "", + "", + "", + "", + "", + "", + } + + +def _detect_symbol_garbage_cut(text: str, *, threshold: int = 16) -> Optional[int]: + """Cut on long runs of isolated bullet/dingbat/private-use symbols. + + This targets the common DeepSeek garbage mode where the model emits long + whitespace-separated runs of bullets or private-use glyphs instead of text. 
+ """ + if threshold <= 1: + return 0 + run_count = 0 + run_start: Optional[int] = None + last_non_ws = -10_000 + for index, ch in enumerate(text): + if ch.isspace(): + continue + if _is_symbol_garbage_char(ch): + if run_count == 0 or (index - last_non_ws) > 3: + run_start = index + run_count = 1 + else: + run_count += 1 + last_non_ws = index + if run_count >= threshold: + return run_start + continue + run_count = 0 + run_start = None + last_non_ws = index + return None + + +NUMERIC_LIST_TOKEN_PATTERN = re.compile(r"(? Optional[int]: + """Cut on degenerate `1. 2. 3. ...` style list output.""" + if threshold <= 1: + return 0 + matches = list(NUMERIC_LIST_TOKEN_PATTERN.finditer(text)) + if len(matches) < threshold: + return None + run_start = matches[0].start() + run_count = 1 + prev_value = int(matches[0].group(1)) + prev_end = matches[0].end() + for match in matches[1:]: + current_value = int(match.group(1)) + gap = text[prev_end : match.start()] + if current_value == prev_value + 1 and len(gap) <= 4 and gap.strip() == "": + run_count += 1 + else: + run_start = match.start() + run_count = 1 + if run_count >= threshold: + return run_start + prev_value = current_value + prev_end = match.end() + return None + + +class StreamingGarbageDetector: + """Incremental detector for common OCR garbage generation modes. + + This is designed for hot decode loops: feed only newly decoded text chunks + and keep O(1) mutable state instead of rescanning the whole suffix. 
+ """ + + def __init__( + self, + *, + symbol_threshold: int = 16, + numeric_list_threshold: int = 12, + ) -> None: + self.symbol_threshold = int(symbol_threshold) + self.numeric_list_threshold = int(numeric_list_threshold) + self._symbol_run = 0 + self._numeric_run = 0 + self._expected_next_number: Optional[int] = None + self._digits_buffer: str = "" + self.triggered_reason: Optional[str] = None + + def reset(self) -> None: + self._symbol_run = 0 + self._numeric_run = 0 + self._expected_next_number = None + self._digits_buffer = "" + self.triggered_reason = None + + def _reset_numeric(self) -> None: + self._numeric_run = 0 + self._expected_next_number = None + self._digits_buffer = "" + + def _feed_symbol_char(self, ch: str) -> bool: + if ch.isspace(): + return False + if _is_symbol_garbage_char(ch): + self._symbol_run += 1 + if self._symbol_run >= self.symbol_threshold: + self.triggered_reason = "symbol_garbage" + return True + return False + self._symbol_run = 0 + return False + + def _feed_numeric_char(self, ch: str) -> bool: + if ch.isspace(): + if self._digits_buffer: + self._reset_numeric() + return False + if "0" <= ch <= "9": + self._digits_buffer += ch + return False + if ch in {".", ")"} and self._digits_buffer: + value = int(self._digits_buffer) + self._digits_buffer = "" + if self._expected_next_number is None: + if value == 1: + self._numeric_run = 1 + self._expected_next_number = 2 + else: + self._reset_numeric() + else: + if value == self._expected_next_number: + self._numeric_run += 1 + self._expected_next_number += 1 + elif value == 1: + self._numeric_run = 1 + self._expected_next_number = 2 + else: + self._reset_numeric() + if self._numeric_run >= self.numeric_list_threshold: + self.triggered_reason = "numeric_list_garbage" + return True + return False + self._reset_numeric() + return False + + def feed(self, text: str) -> bool: + if self.triggered_reason is not None: + return True + for ch in str(text or ""): + if self._feed_symbol_char(ch): + 
return True + if self._feed_numeric_char(ch): + return True + return False + + def detect_early_stop_index( text: str, *, line_repeat_threshold: int = 10, char_repeat_threshold: int = 200, + symbol_garbage_threshold: int = 16, + numeric_list_threshold: int = 12, ) -> Optional[int]: """Find earliest cut index based on repetition heuristics. @@ -273,11 +469,12 @@ def detect_early_stop_index( """ idx_char = _detect_repeated_char_cut(text, threshold=char_repeat_threshold) idx_line = _detect_repeated_lines_cut(text, threshold=line_repeat_threshold) - if idx_char is None: - return idx_line - if idx_line is None: - return idx_char - return min(idx_char, idx_line) + idx_symbol = _detect_symbol_garbage_cut(text, threshold=symbol_garbage_threshold) + idx_numeric = _detect_numeric_list_garbage_cut(text, threshold=numeric_list_threshold) + candidates = [idx for idx in (idx_char, idx_line, idx_symbol, idx_numeric) if idx is not None] + if not candidates: + return None + return min(candidates) def apply_early_stop( @@ -286,6 +483,8 @@ def apply_early_stop( content_debug: bool = False, line_repeat_threshold: int = 10, char_repeat_threshold: int = 200, + symbol_garbage_threshold: int = 16, + numeric_list_threshold: int = 12, metrics: Optional[dict] = None, ) -> str: """Apply early termination heuristics to ``text`` and optionally append notice. 
@@ -299,6 +498,8 @@ def apply_early_stop( text, line_repeat_threshold=line_repeat_threshold, char_repeat_threshold=char_repeat_threshold, + symbol_garbage_threshold=symbol_garbage_threshold, + numeric_list_threshold=numeric_list_threshold, ) if cut is None: return text diff --git a/src/glossapi/scripts/deepseek_pipeline_benchmark.py b/src/glossapi/scripts/deepseek_pipeline_benchmark.py new file mode 100644 index 0000000..83a8a8b --- /dev/null +++ b/src/glossapi/scripts/deepseek_pipeline_benchmark.py @@ -0,0 +1,388 @@ +from __future__ import annotations + +import argparse +import json +import random +import shutil +import subprocess +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +from glossapi.ocr.deepseek.scheduling import ( + SourceDocument, + assign_batches_to_lanes, + build_exact_fill_batches, + build_fixed_shard_slices, + build_whole_document_slices, + pack_slices_into_batches, +) + + +def _parse_devices(spec: str) -> List[int]: + tokens = [piece.strip() for piece in str(spec or "").split(",") if piece.strip()] + if not tokens: + raise argparse.ArgumentTypeError("--devices must contain at least one GPU id") + try: + return [int(token) for token in tokens] + except ValueError as exc: + raise argparse.ArgumentTypeError(f"Invalid GPU list: {spec}") from exc + + +def _parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.deepseek_pipeline_benchmark", + description="Benchmark DeepSeek OCR pipeline throughput for different scheduling strategies.", + ) + p.add_argument("--repo", required=True) + p.add_argument("--input-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--python-bin", required=True) + p.add_argument("--model-dir", required=True) + p.add_argument("--label", required=True) + p.add_argument("--mode", default="static", choices=["static", "streaming"]) + p.add_argument( + "--scheduler", + default="whole_doc", + choices=["whole_doc", 
"fixed_shard", "exact_fill"], + ) + p.add_argument("--devices", default="0,1,2,3,4,5,6,7") + p.add_argument("--workers-per-gpu", type=int, default=1) + p.add_argument("--max-docs", type=int, default=None) + p.add_argument("--doc-order", default="name", choices=["name", "random", "largest_first"]) + p.add_argument("--seed", type=int, default=20260330) + p.add_argument("--target-batch-pages", type=int, default=160) + p.add_argument("--stream-batch-pages", type=int, default=160) + p.add_argument("--shard-pages", type=int, default=0) + p.add_argument("--shard-threshold-pages", type=int, default=0) + p.add_argument("--runtime-backend", default="vllm", choices=["transformers", "vllm"]) + p.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + p.add_argument("--prompt-override", default=None) + p.add_argument("--repair-mode", default="auto", choices=["auto", "off"]) + p.add_argument("--attn-backend", default="auto") + p.add_argument("--base-size", type=int, default=None) + p.add_argument("--image-size", type=int, default=None) + p.add_argument("--render-dpi", type=int, default=144) + p.add_argument("--max-new-tokens", type=int, default=2048) + p.add_argument("--vllm-batch-size", type=int, default=None) + p.add_argument("--gpu-memory-utilization", type=float, default=0.9) + p.add_argument("--disable-fp8-kv", action="store_true") + p.add_argument("--clean", action="store_true") + return p.parse_args() + + +def _weighted_documents( + *, + input_dir: Path, + max_docs: Optional[int], + doc_order: str, + seed: int, +) -> List[SourceDocument]: + from glossapi.ocr.deepseek import runner as deepseek_runner + + documents = [ + SourceDocument(name=path.name, pages=int(deepseek_runner._effective_page_count(path, None))) + for path in sorted(input_dir.glob("*.pdf")) + ] + if doc_order == "largest_first": + documents.sort(key=lambda item: (-int(item.pages), str(item.name))) + elif doc_order == "random": + rng = random.Random(int(seed)) 
+ rng.shuffle(documents) + if max_docs is not None: + documents = documents[: max(0, int(max_docs))] + return documents + + +def _plan_lanes( + *, + documents: List[SourceDocument], + devices: List[int], + workers_per_gpu: int, + scheduler: str, + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, +) -> List[Dict[str, Any]]: + scheduler_norm = str(scheduler or "whole_doc").strip().lower() + if scheduler_norm == "exact_fill": + batches = build_exact_fill_batches(documents, target_batch_pages=max(1, int(target_batch_pages))) + else: + if scheduler_norm == "fixed_shard": + slices = build_fixed_shard_slices( + documents, + shard_pages=max(1, int(shard_pages)), + shard_threshold_pages=max(0, int(shard_threshold_pages)), + ) + else: + slices = build_whole_document_slices(documents) + batches = pack_slices_into_batches(slices, target_batch_pages=max(1, int(target_batch_pages))) + lanes = assign_batches_to_lanes( + batches, + devices=devices, + workers_per_gpu=max(1, int(workers_per_gpu)), + ) + return [lane.to_dict() for lane in lanes if lane.batches] + + +def _collect_repair_metrics(run_dir: Path) -> Dict[str, int]: + metrics_dir = run_dir / "json" / "metrics" + totals = { + "docs_with_metrics": 0, + "pages_flagged": 0, + "pages_repaired": 0, + "plain_repairs": 0, + "tiled_repairs": 0, + } + if not metrics_dir.exists(): + return totals + for path in metrics_dir.glob("*.metrics.json"): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + totals["docs_with_metrics"] += 1 + summary = data.get("repair_summary") or {} + totals["pages_flagged"] += int(summary.get("pages_flagged", 0)) + totals["pages_repaired"] += int(summary.get("pages_repaired", 0)) + totals["plain_repairs"] += int(summary.get("plain_repairs", 0)) + totals["tiled_repairs"] += int(summary.get("tiled_repairs", 0)) + return totals + + +def _flatten_lane_batches(lane: Dict[str, Any]) -> Dict[str, Any]: + files: List[str] = [] + page_ranges: 
List[str] = [] + pages = 0 + planned_batch_pages: List[int] = [] + for batch in list(lane.get("batches") or []): + batch_pages = int(batch.get("pages", 0)) + pages += batch_pages + planned_batch_pages.append(batch_pages) + files.extend(list(batch.get("files") or [])) + page_ranges.extend(list(batch.get("page_ranges") or [])) + return { + "files": files, + "page_ranges": page_ranges, + "pages": int(pages), + "planned_batch_count": len(planned_batch_pages), + "planned_batch_pages": planned_batch_pages, + } + + +def main() -> int: + args = _parse_args() + repo = Path(args.repo).resolve() + input_dir = Path(args.input_dir).resolve() + output_root = Path(args.output_dir).resolve() + python_bin = Path(args.python_bin).expanduser() + model_dir = Path(args.model_dir).resolve() + devices = _parse_devices(args.devices) + + from glossapi.ocr.deepseek import runner as deepseek_runner + + documents = _weighted_documents( + input_dir=input_dir, + max_docs=args.max_docs, + doc_order=args.doc_order, + seed=int(args.seed), + ) + if not documents: + raise SystemExit("No PDFs found for benchmark input set.") + lanes = _plan_lanes( + documents=documents, + devices=devices, + workers_per_gpu=max(1, int(args.workers_per_gpu)), + scheduler=str(args.scheduler), + target_batch_pages=int(args.target_batch_pages), + shard_pages=int(args.shard_pages), + shard_threshold_pages=int(args.shard_threshold_pages), + ) + + run_dir = output_root / args.label + if args.clean and run_dir.exists(): + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True, exist_ok=True) + logs_dir = run_dir / "logs" + logs_dir.mkdir(parents=True, exist_ok=True) + (run_dir / "lane_plan.json").write_text(json.dumps(lanes, indent=2), encoding="utf-8") + + script_path = ( + deepseek_runner.DEFAULT_VLLM_SCRIPT + if str(args.runtime_backend) == "vllm" + else deepseek_runner.DEFAULT_SCRIPT + ) + py_env = {"PYTHONPATH": str(repo / "src")} + + def start_lane(lane: Dict[str, Any]) -> Dict[str, Any]: + lane_id = int(lane["lane_id"]) + 
visible_device = int(lane["visible_device"]) + lane_plan = _flatten_lane_batches(lane) + files = list(lane_plan["files"]) + page_ranges = list(lane_plan["page_ranges"]) + pages = int(lane_plan["pages"]) + resolved_vllm_batch_size = ( + int(args.vllm_batch_size) + if args.vllm_batch_size is not None + else min(max(1, int(args.target_batch_pages)), max(1, pages)) + ) + log_path = logs_dir / f"lane_{lane_id:02d}_gpu{visible_device}.log" + fh = log_path.open("w", encoding="utf-8") + cmd = deepseek_runner._build_cli_command( + input_dir=input_dir, + output_dir=run_dir, + files=files, + page_ranges=page_ranges, + model_dir=model_dir, + python_bin=python_bin, + script=script_path, + max_pages=None, + content_debug=False, + device="cuda", + ocr_profile=str(args.ocr_profile), + prompt_override=args.prompt_override, + attn_backend=str(args.attn_backend), + base_size=args.base_size, + image_size=args.image_size, + crop_mode=None, + render_dpi=int(args.render_dpi), + max_new_tokens=args.max_new_tokens, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend=str(args.runtime_backend), + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=float(args.gpu_memory_utilization), + disable_fp8_kv=bool(args.disable_fp8_kv), + repair_mode=str(args.repair_mode), + ) + env = deepseek_runner._build_env(python_bin=python_bin, visible_device=visible_device) + env["PYTHONPATH"] = f"{py_env['PYTHONPATH']}:{env['PYTHONPATH']}" if env.get("PYTHONPATH") else py_env["PYTHONPATH"] + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + return { + "lane_id": lane_id, + "visible_device": visible_device, + "batch_id": 0, + "pages": pages, + "files": files, + "page_ranges": page_ranges, + "planned_batch_count": int(lane_plan["planned_batch_count"]), + "planned_batch_pages": list(lane_plan["planned_batch_pages"]), + "resolved_vllm_batch_size": resolved_vllm_batch_size, + "log_path": str(log_path), + "fh": fh, + "proc": 
proc, + "start_ts": time.perf_counter(), + "cmd": cmd, + } + + global_start = time.perf_counter() + active: List[Dict[str, Any]] = [start_lane(lane) for lane in lanes] + + batch_results: List[Dict[str, Any]] = [] + while active: + time.sleep(0.2) + for item in list(active): + rc = item["proc"].poll() + if rc is None: + continue + end_ts = time.perf_counter() + item["fh"].close() + elapsed = max(0.000001, float(end_ts - item["start_ts"])) + batch_results.append( + { + "lane_id": int(item["lane_id"]), + "visible_device": int(item["visible_device"]), + "batch_id": int(item["batch_id"]), + "pages": int(item["pages"]), + "files": list(item["files"]), + "page_ranges": list(item.get("page_ranges") or []), + "planned_batch_count": int(item.get("planned_batch_count", 1)), + "planned_batch_pages": list(item.get("planned_batch_pages") or []), + "return_code": int(rc), + "resolved_vllm_batch_size": int(item["resolved_vllm_batch_size"]), + "start_offset_sec": float(item["start_ts"] - global_start), + "end_offset_sec": float(end_ts - global_start), + "elapsed_sec": float(elapsed), + "sec_per_page": float(elapsed / max(1, int(item["pages"]))), + "log_path": str(item["log_path"]), + "cmd": item["cmd"], + } + ) + active.remove(item) + + total_elapsed = max(0.000001, time.perf_counter() - global_start) + total_pages = sum(int(doc.pages) for doc in documents) + failures = [item for item in batch_results if int(item["return_code"]) != 0] + + lane_results: List[Dict[str, Any]] = [] + for lane in lanes: + lane_batches = [item for item in batch_results if int(item["lane_id"]) == int(lane["lane_id"])] + if not lane_batches: + continue + lane_start = min(float(item["start_offset_sec"]) for item in lane_batches) + lane_end = max(float(item["end_offset_sec"]) for item in lane_batches) + lane_elapsed = max(0.000001, lane_end - lane_start) + lane_pages = sum(int(item["pages"]) for item in lane_batches) + lane_results.append( + { + "lane_id": int(lane["lane_id"]), + "visible_device": 
int(lane["visible_device"]), + "batch_count": len(lane_batches), + "pages": int(lane_pages), + "active_elapsed_sec": float(lane_elapsed), + "sec_per_page": float(lane_elapsed / max(1, lane_pages)), + "all_return_codes_zero": all(int(item["return_code"]) == 0 for item in lane_batches), + } + ) + + gpu_results: List[Dict[str, Any]] = [] + for visible_device in sorted({int(item["visible_device"]) for item in batch_results}): + gpu_batches = [item for item in batch_results if int(item["visible_device"]) == visible_device] + gpu_start = min(float(item["start_offset_sec"]) for item in gpu_batches) + gpu_end = max(float(item["end_offset_sec"]) for item in gpu_batches) + gpu_elapsed = max(0.000001, gpu_end - gpu_start) + gpu_pages = sum(int(item["pages"]) for item in gpu_batches) + gpu_results.append( + { + "visible_device": visible_device, + "batch_count": len(gpu_batches), + "pages": int(gpu_pages), + "active_elapsed_sec": float(gpu_elapsed), + "sec_per_page": float(gpu_elapsed / max(1, gpu_pages)), + "all_return_codes_zero": all(int(item["return_code"]) == 0 for item in gpu_batches), + } + ) + + repair_metrics = _collect_repair_metrics(run_dir) + summary = { + "label": str(args.label), + "status": "pass" if not failures else "fail", + "mode": str(args.mode), + "scheduler": str(args.scheduler), + "runtime_backend": str(args.runtime_backend), + "ocr_profile": str(args.ocr_profile), + "repair_mode": str(args.repair_mode), + "devices": devices, + "workers_per_gpu": int(args.workers_per_gpu), + "doc_order": str(args.doc_order), + "target_batch_pages": int(args.target_batch_pages), + "stream_batch_pages": int(args.stream_batch_pages), + "docs": len(documents), + "pages": int(total_pages), + "shard_pages": int(args.shard_pages), + "shard_threshold_pages": int(args.shard_threshold_pages), + "wall_time_sec": float(total_elapsed), + "sec_per_page": float(total_elapsed / max(1, total_pages)), + "batch_results": batch_results, + "lane_results": lane_results, + "gpu_results": 
gpu_results, + "repair_metrics": repair_metrics, + "failures": failures, + } + (run_dir / "pipeline_benchmark_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") + print(json.dumps(summary, indent=2)) + return 1 if failures else 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/install_glossapi.py b/src/glossapi/scripts/install_glossapi.py new file mode 100644 index 0000000..195d662 --- /dev/null +++ b/src/glossapi/scripts/install_glossapi.py @@ -0,0 +1,230 @@ +"""Guided installer for GlossAPI extras.""" + +from __future__ import annotations + +import argparse +import os +import shlex +import subprocess +import shutil +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Set + + +PHASE_TO_EXTRAS: Dict[str, Set[str]] = { + "download": set(), + "browser_download": {"browser"}, + "extract": {"docling"}, + "ocr": set(), + "docs": {"docs"}, +} + + +@dataclass(frozen=True) +class InstallPlan: + phases: tuple[str, ...] + extras: tuple[str, ...] 
+ editable: bool + include_cuda: bool + needs_deepseek_runtime: bool + + +def _supports_color() -> bool: + return sys.stdout.isatty() and os.environ.get("TERM") not in {"", "dumb", None} + + +def _style(text: str, code: str) -> str: + if not _supports_color(): + return text + return f"\033[{code}m{text}\033[0m" + + +def _prompt_yes_no(question: str, default: bool = False) -> bool: + suffix = "[Y/n]" if default else "[y/N]" + while True: + raw = input(f"{question} {suffix} ").strip().lower() + if not raw: + return default + if raw in {"y", "yes"}: + return True + if raw in {"n", "no"}: + return False + print("Please answer 'y' or 'n'.") + + +def _resolve_phase_selection(tokens: Iterable[str]) -> List[str]: + resolved: List[str] = [] + seen: Set[str] = set() + for token in tokens: + phase = str(token).strip().lower() + if not phase: + continue + if phase not in PHASE_TO_EXTRAS: + raise ValueError(f"Unsupported phase '{token}'. Valid phases: {', '.join(sorted(PHASE_TO_EXTRAS))}") + if phase not in seen: + seen.add(phase) + resolved.append(phase) + return resolved + + +def build_install_plan( + *, + phases: Sequence[str], + editable: bool, + include_cuda: bool, +) -> InstallPlan: + selected = _resolve_phase_selection(phases) + extras: Set[str] = set() + for phase in selected: + extras.update(PHASE_TO_EXTRAS[phase]) + if include_cuda: + extras.add("cuda") + return InstallPlan( + phases=tuple(selected), + extras=tuple(sorted(extras)), + editable=bool(editable), + include_cuda=bool(include_cuda), + needs_deepseek_runtime=("ocr" in selected), + ) + + +def build_pip_command(plan: InstallPlan, repo_root: Path) -> List[str]: + target = "." 
+ if plan.extras: + target = f".[{','.join(plan.extras)}]" + cmd = [sys.executable, "-m", "pip", "install"] + if plan.editable: + cmd.append("-e") + cmd.append(target) + return cmd + + +def build_deepseek_command(repo_root: Path) -> Optional[List[str]]: + script = repo_root / "dependency_setup" / "setup_deepseek_uv.sh" + if not script.exists(): + return None + shell = shutil.which("bash") or shutil.which("sh") + if not shell: + return None + return [shell, str(script)] + + +def _interactive_plan(default_editable: bool) -> InstallPlan: + print(_style("GlossAPI Installer", "1;36")) + print("Select only the phases you plan to use so optional dependencies stay minimal.\n") + + selected: List[str] = ["download"] + print(_style("Core", "1;37")) + print(" download: base downloader/data pipeline dependencies") + if _prompt_yes_no("Add browser-gated download support?", default=False): + selected.append("browser_download") + if _prompt_yes_no("Add extraction support (Docling)?", default=False): + selected.append("extract") + if _prompt_yes_no("Add OCR support (DeepSeek backend)?", default=False): + selected.append("ocr") + if _prompt_yes_no("Add docs tooling?", default=False): + selected.append("docs") + include_cuda = _prompt_yes_no("Include CUDA extras where relevant?", default=False) + editable = _prompt_yes_no("Install in editable mode?", default=default_editable) + return build_install_plan(phases=selected, editable=editable, include_cuda=include_cuda) + + +def _plan_summary(plan: InstallPlan, command: Sequence[str]) -> str: + extras = ", ".join(plan.extras) if plan.extras else "(none)" + phases = ", ".join(plan.phases) if plan.phases else "(none)" + return "\n".join( + [ + _style("Install plan", "1;32"), + f" phases: {phases}", + f" extras: {extras}", + f" editable: {'yes' if plan.editable else 'no'}", + f" command: {shlex.join(command)}", + f" deepseek runtime: {'separate setup required' if plan.needs_deepseek_runtime else 'not requested'}", + ] + ) + + +def 
build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="python install_glossapi.py", + description="Guided installer for GlossAPI optional dependency groups.", + ) + parser.add_argument( + "--phases", + default="", + help=( + "Comma-separated phases to install. Valid values: " + + ", ".join(sorted(PHASE_TO_EXTRAS)) + + ". If omitted, an interactive wizard is shown." + ), + ) + parser.add_argument( + "--cuda", + action="store_true", + help="Include the CUDA extra.", + ) + parser.add_argument( + "--editable", + dest="editable", + action="store_true", + help="Install in editable mode.", + ) + parser.add_argument( + "--no-editable", + dest="editable", + action="store_false", + help="Install as a regular package.", + ) + parser.set_defaults(editable=True) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print the computed pip command without running it.", + ) + parser.add_argument( + "--yes", + action="store_true", + help="Skip confirmation prompts in non-interactive mode.", + ) + return parser + + +def main(argv: Sequence[str] | None = None) -> int: + args = build_arg_parser().parse_args(argv) + repo_root = Path(__file__).resolve().parents[3] + + if args.phases.strip(): + plan = build_install_plan( + phases=[token for token in args.phases.split(",") if token.strip()], + editable=args.editable, + include_cuda=bool(args.cuda), + ) + else: + plan = _interactive_plan(default_editable=bool(args.editable)) + + command = build_pip_command(plan, repo_root) + print(_plan_summary(plan, command)) + deepseek_command = build_deepseek_command(repo_root) if plan.needs_deepseek_runtime else None + if deepseek_command: + print(f" deepseek command: {shlex.join(deepseek_command)}") + + if args.dry_run: + return 0 + if not args.yes and not args.phases.strip(): + if not _prompt_yes_no("Run this install command now?", default=True): + print("Aborted.") + return 1 + + completed = subprocess.run(command, cwd=repo_root) + if 
completed.returncode != 0: + return int(completed.returncode) + if plan.needs_deepseek_runtime and deepseek_command: + print(_style("Provisioning dedicated DeepSeek runtime…", "1;33")) + completed = subprocess.run(deepseek_command, cwd=repo_root) + return int(completed.returncode) + + +if __name__ == "__main__": # pragma: no cover - CLI entrypoint + raise SystemExit(main()) diff --git a/src/glossapi/scripts/ocr_gpu_batch.py b/src/glossapi/scripts/ocr_gpu_batch.py index 2183664..2646baa 100644 --- a/src/glossapi/scripts/ocr_gpu_batch.py +++ b/src/glossapi/scripts/ocr_gpu_batch.py @@ -115,15 +115,21 @@ def main(argv: Optional[List[str]] = None) -> int: "--force-ocr", dest="force_ocr", action="store_true", - help="Force GPU OCR during extraction (default).", + help="Deprecated no-op retained for compatibility; OCR now runs through Corpus.ocr(...).", ) parser.add_argument( "--no-force-ocr", dest="force_ocr", action="store_false", - help="Skip forced OCR (only run math/layout).", + help="Explicitly disable the deprecated Phase-1 OCR flag.", + ) + parser.set_defaults(force_ocr=False) + parser.add_argument( + "--workers-per-device", + type=int, + default=1, + help="Number of extraction workers to bind to each visible GPU (default: 1).", ) - parser.set_defaults(force_ocr=True) parser.add_argument( "--dry-run", action="store_true", @@ -182,6 +188,7 @@ def main(argv: Optional[List[str]] = None) -> int: export_doc_json=True, emit_formula_index=emit_formula_index, phase1_backend=args.phase1_backend, + workers_per_device=max(1, int(args.workers_per_device)), ) print("[ocr_gpu_batch] Extraction complete.") @@ -190,4 +197,3 @@ def main(argv: Optional[List[str]] = None) -> int: if __name__ == "__main__": # pragma: no cover - CLI entrypoint raise SystemExit(main()) - diff --git a/src/glossapi/scripts/openarchives_download_freeze.py b/src/glossapi/scripts/openarchives_download_freeze.py new file mode 100644 index 0000000..8188e9a --- /dev/null +++ 
b/src/glossapi/scripts/openarchives_download_freeze.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import argparse +import logging +from pathlib import Path +from typing import List, Optional + +from glossapi import Corpus +from glossapi.scripts.openarchives_ocr_run_node import ( + DEFAULT_DOWNLOAD_CONCURRENCY, + DEFAULT_DOWNLOAD_TIMEOUT, + _load_frame, + _normalize_download_results, + _prepare_download_input, + _write_canonical_metadata, +) + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_download_freeze", + description=( + "Materialize one OpenArchives manifest into a canonical GlossAPI downloads root " + "without starting OCR. This is the reproducible PDF-freeze entrypoint." + ), + ) + p.add_argument("--input-parquet", required=True) + p.add_argument("--work-root", required=True) + p.add_argument("--python-log-level", default="INFO") + p.add_argument("--download-concurrency", type=int, default=DEFAULT_DOWNLOAD_CONCURRENCY) + p.add_argument("--download-timeout", type=int, default=DEFAULT_DOWNLOAD_TIMEOUT) + p.add_argument("--download-scheduler-mode", default="per_domain") + p.add_argument("--download-group-by", default="base_domain") + p.add_argument("--download-policy-file", default="") + p.add_argument("--dry-run", action="store_true") + return p.parse_args(argv) + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + input_path = Path(args.input_parquet).expanduser().resolve() + work_root = Path(args.work_root).expanduser().resolve() + work_root.mkdir(parents=True, exist_ok=True) + manifests_dir = work_root / "manifests" + manifests_dir.mkdir(parents=True, exist_ok=True) + + manifest_df = _prepare_download_input(_load_frame(input_path)) + download_input = manifests_dir / "download_input.parquet" + manifest_df.to_parquet(download_input, index=False) + + metadata_path = work_root / "download_results" / 
"download_results.parquet" + if not metadata_path.exists(): + metadata_path.parent.mkdir(parents=True, exist_ok=True) + _write_canonical_metadata(work_root, manifest_df) + + if args.dry_run: + return 0 + + corpus = Corpus( + input_dir=work_root / "downloads", + output_dir=work_root, + metadata_path=metadata_path, + log_level=getattr(logging, str(args.python_log_level).upper(), logging.INFO), + verbose=False, + ) + dl_df = corpus.download( + input_parquet=download_input, + links_column="url", + parallelize_by=str(args.download_group_by), + concurrency=int(args.download_concurrency), + request_timeout=int(args.download_timeout), + scheduler_mode=str(args.download_scheduler_mode), + download_policy_file=(str(args.download_policy_file) if str(args.download_policy_file or "").strip() else None), + ) + canonical_df = _normalize_download_results(shard_df=manifest_df, download_results_df=dl_df, url_column="url") + _write_canonical_metadata(work_root, canonical_df) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_download_probe.py b/src/glossapi/scripts/openarchives_download_probe.py new file mode 100644 index 0000000..d253b9b --- /dev/null +++ b/src/glossapi/scripts/openarchives_download_probe.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Iterable, Optional +from urllib.parse import urlparse + +import pandas as pd + +from glossapi import Corpus + + +def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_download_probe", + description=( + "Sample OpenArchives OCR-target PDFs by host, run a controlled download probe, " + "and write per-host success summaries." 
+ ), + ) + p.add_argument("--parquet", required=True, help="needs_ocr_enriched parquet with pdf_url and filename columns") + p.add_argument("--output-dir", required=True) + p.add_argument("--policy-file", default="") + p.add_argument("--samples-per-host", type=int, default=12) + p.add_argument("--max-hosts", type=int, default=12) + p.add_argument("--seed", type=int, default=42) + p.add_argument("--concurrency", type=int, default=12) + p.add_argument("--request-timeout", type=int, default=60) + p.add_argument("--scheduler-mode", default="per_domain") + p.add_argument("--download-group-by", default="base_domain") + p.add_argument("--hosts", nargs="*", default=None, help="Optional explicit host allowlist") + p.add_argument("--dry-run", action="store_true") + return p.parse_args(argv) + + +def _host_from_url(url: str) -> str: + try: + return (urlparse(str(url)).hostname or "").lower() + except Exception: + return "" + + +def _prepare_probe_frame( + df: pd.DataFrame, + *, + samples_per_host: int, + max_hosts: int, + seed: int, + hosts: Optional[Iterable[str]] = None, +) -> pd.DataFrame: + frame = df.copy() + if "pdf_url" not in frame.columns or "filename" not in frame.columns: + raise SystemExit("Probe parquet must include at least 'pdf_url' and 'filename' columns") + frame["host"] = frame["pdf_url"].astype(str).map(_host_from_url) + frame = frame[frame["host"].astype(bool)].copy() + if hosts: + allowed = {str(h).strip().lower() for h in hosts if str(h).strip()} + frame = frame[frame["host"].isin(allowed)].copy() + ranked_hosts = ( + frame.groupby("host", dropna=False) + .size() + .sort_values(ascending=False) + .head(max(1, int(max_hosts))) + .index.tolist() + ) + probe = frame[frame["host"].isin(ranked_hosts)].copy() + sampled = ( + probe.groupby("host", group_keys=True) + .apply( + lambda grp: grp.sample(n=min(len(grp), int(samples_per_host)), random_state=int(seed)), + include_groups=False, + ) + .reset_index(level=0) + .reset_index(drop=True) + ) + sampled["url"] = 
sampled["pdf_url"].astype(str) + sampled["base_domain"] = sampled["pdf_url"].astype(str).map( + lambda s: f"{urlparse(str(s)).scheme or 'https'}://{(urlparse(str(s)).netloc or '').lower()}".rstrip("/") + if _host_from_url(str(s)) + else "" + ) + return sampled + + +def _summary_payload(df: pd.DataFrame, *, source_rows: int) -> dict: + out = df.copy() + if "download_success" not in out.columns: + out["download_success"] = False + grouped = ( + out.groupby("host", dropna=False) + .agg( + docs=("host", "size"), + successes=("download_success", lambda s: int(pd.Series(s).fillna(False).sum())), + failures=("download_success", lambda s: int((~pd.Series(s).fillna(False)).sum())), + ) + .reset_index() + .sort_values(["docs", "successes"], ascending=[False, False]) + ) + return { + "source_rows": int(source_rows), + "probe_rows": int(len(out)), + "hosts": grouped.to_dict(orient="records"), + } + + +def main(argv: Optional[list[str]] = None) -> int: + args = _parse_args(argv) + parquet_path = Path(args.parquet).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + source_df = pd.read_parquet(parquet_path) + probe_df = _prepare_probe_frame( + source_df, + samples_per_host=int(args.samples_per_host), + max_hosts=int(args.max_hosts), + seed=int(args.seed), + hosts=args.hosts, + ) + probe_input = output_dir / "probe_input.parquet" + probe_df.to_parquet(probe_input, index=False) + + if args.dry_run: + summary = _summary_payload(probe_df, source_rows=len(source_df)) + (output_dir / "probe_summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return 0 + + corpus = Corpus( + input_dir=output_dir / "downloads", + output_dir=output_dir, + log_level="INFO", + verbose=False, + ) + results = corpus.download( + input_parquet=probe_input, + links_column="url", + parallelize_by=str(args.download_group_by), + concurrency=int(args.concurrency), + 
request_timeout=int(args.request_timeout), + scheduler_mode=str(args.scheduler_mode), + download_policy_file=(str(args.policy_file) if str(args.policy_file or "").strip() else None), + ) + merged = results.merge( + probe_df[["url", "host", "filename"]], + on="url", + how="left", + suffixes=("", "_probe"), + ) + merged_path = output_dir / "probe_results.parquet" + merged.to_parquet(merged_path, index=False) + summary = _summary_payload(merged, source_rows=len(source_df)) + (output_dir / "probe_summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_hf_refresh.py b/src/glossapi/scripts/openarchives_hf_refresh.py new file mode 100644 index 0000000..911c795 --- /dev/null +++ b/src/glossapi/scripts/openarchives_hf_refresh.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import argparse +import io +import json +import re +from pathlib import Path +from typing import Dict, Iterable, Optional, Sequence + +import pandas as pd +import zstandard as zstd + +from glossapi.scripts.openarchives_ocr_enrich import _resolve_jsonl_path + + +PIPELINE_FIELDS = ( + "greek_badness_score", + "mojibake_badness_score", + "latin_percentage", + "polytonic_ratio", + "char_count_no_comments", + "is_empty", + "filter", + "needs_ocr", + "ocr_success", + "quality_method", + "reevaluated_at", +) + + +def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_hf_refresh", + description=( + "Refresh the canonical OpenArchives HF jsonl.zst shards in place from a refreshed " + "document-level parquet and update the dataset card counts." 
+ ), + ) + p.add_argument("--dataset-root", required=True, help="Local clone/snapshot root of the HF dataset repo.") + p.add_argument("--metadata-parquet", required=True, help="Refreshed document-level parquet with source_jsonl/doc ids.") + p.add_argument("--output-root", default="", help="Optional separate output root. Defaults to in-place dataset-root.") + p.add_argument("--readme-path", default="README.md", help="Dataset card path relative to dataset-root/output-root.") + p.add_argument("--dry-run", action="store_true") + return p.parse_args(argv) + + +def _normalize_source_key(dataset_root: Path, recorded_path: str) -> str: + resolved = _resolve_jsonl_path(dataset_root, recorded_path) + return str(resolved.relative_to(dataset_root)) + + +def _clean_value(value: object) -> object: + if pd.isna(value): # type: ignore[arg-type] + return None + if isinstance(value, pd.Timestamp): + return value.isoformat() + if hasattr(value, "item"): + try: + return value.item() + except Exception: + return value + return value + + +def _build_update_index(metadata_df: pd.DataFrame, *, dataset_root: Path) -> Dict[str, Dict[str, dict]]: + required = {"source_doc_id", "source_jsonl"} + missing = sorted(required - set(metadata_df.columns)) + if missing: + raise SystemExit(f"Metadata parquet missing required column(s): {', '.join(missing)}") + updates: Dict[str, Dict[str, dict]] = {} + work = metadata_df.copy() + work["_source_key"] = work["source_jsonl"].astype(str).map(lambda p: _normalize_source_key(dataset_root, p)) + for _, row in work.iterrows(): + source_key = str(row["_source_key"]) + doc_id = str(row["source_doc_id"] or "") + payload = {field: _clean_value(row[field]) for field in PIPELINE_FIELDS if field in row.index} + updates.setdefault(source_key, {})[doc_id] = payload + return updates + + +def _iter_jsonl_rows(path: Path) -> Iterable[dict]: + dctx = zstd.ZstdDecompressor() + with path.open("rb") as fh, dctx.stream_reader(fh) as reader: + text_reader = 
io.TextIOWrapper(reader, encoding="utf-8") + for line in text_reader: + yield json.loads(line) + + +def _write_jsonl_rows(path: Path, rows: Iterable[dict]) -> int: + path.parent.mkdir(parents=True, exist_ok=True) + cctx = zstd.ZstdCompressor(level=3) + count = 0 + with path.open("wb") as fh: + with cctx.stream_writer(fh) as writer: + for row in rows: + payload = (json.dumps(row, ensure_ascii=False) + "\n").encode("utf-8") + writer.write(payload) + count += 1 + return count + + +def _refresh_readme(readme_text: str, *, total_docs: int, needs_ocr_docs: int) -> str: + title_text = f"OpenArchives.gr {total_docs:,} docs".replace(",", ",") + percent = (100.0 * needs_ocr_docs / total_docs) if total_docs else 0.0 + pct_text = f"{percent:.2f}%" + + replacements = [ + (r"pretty_name:\s*OpenArchives\.gr [^\n]+", f"pretty_name: {title_text}"), + (r"# OpenArchives\.gr [^\n]+", f"# {title_text}"), + ( + r"- Σύνολο markdown αρχείων: \*\*[0-9,]+\*\* from openarchives\.gr", + f"- Σύνολο markdown αρχείων: **{total_docs:,}** from openarchives.gr", + ), + ( + r"- Total markdown files: \*\*[0-9,]+\*\* from openarchives\.gr", + f"- Total markdown files: **{total_docs:,}** from openarchives.gr", + ), + ( + r"- Τα χαμηλής ποιότητας αρχεία που ενδέχεται να χρειάζονται OCR επεξεργασία επισημαίνονται με τη στήλη `needs_ocr`: \*\*[0-9,]+ / [0-9,]+ \([0-9.]+%\)\*\*", + f"- Τα χαμηλής ποιότητας αρχεία που ενδέχεται να χρειάζονται OCR επεξεργασία επισημαίνονται με τη στήλη `needs_ocr`: **{needs_ocr_docs:,} / {total_docs:,} ({pct_text})**", + ), + ( + r"- Lower-quality files that may require OCR reprocessing are marked by the `needs_ocr` indicator: \*\*[0-9,]+ / [0-9,]+ \([0-9.]+%\)\*\*", + f"- Lower-quality files that may require OCR reprocessing are marked by the `needs_ocr` indicator: **{needs_ocr_docs:,} / {total_docs:,} ({pct_text})**", + ), + ] + updated = readme_text + for pattern, replacement in replacements: + updated = re.sub(pattern, replacement, updated) + return updated + + +def 
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Refresh the OpenArchives shards and dataset card from a metadata parquet.

    Every shard referenced by the metadata parquet is passed through
    ``_refresh_shard``; unless ``--dry-run`` is given the dataset card is then
    rewritten with the new counts. A JSON summary is printed to stdout.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code (always ``0``; failures propagate as exceptions).
    """
    args = _parse_args(argv)
    dataset_root = Path(args.dataset_root).expanduser().resolve()
    output_root = Path(args.output_root).expanduser().resolve() if str(args.output_root).strip() else dataset_root
    output_root.mkdir(parents=True, exist_ok=True)
    metadata_path = Path(args.metadata_parquet).expanduser().resolve()

    metadata_df = pd.read_parquet(metadata_path).copy()
    updates_by_shard = _build_update_index(metadata_df, dataset_root=dataset_root)

    # ``needs_ocr`` is optional in the metadata parquet. Compute the count once
    # so the README refresh and the summary apply the same guard.
    metadata_needs_ocr: Optional[int] = None
    if "needs_ocr" in metadata_df.columns:
        metadata_needs_ocr = int(metadata_df["needs_ocr"].fillna(False).sum())

    summaries: list[dict] = []
    total_rows = 0
    matched_rows = 0
    needs_ocr_rows = 0
    for rel_key, updates in sorted(updates_by_shard.items()):
        shard_summary = _refresh_shard(
            input_path=dataset_root / rel_key,
            output_path=output_root / rel_key,
            updates=updates,
            dry_run=bool(args.dry_run),
        )
        summaries.append(shard_summary)
        total_rows += int(shard_summary["total_rows"])
        matched_rows += int(shard_summary["matched_rows"])
        needs_ocr_rows += int(shard_summary["needs_ocr_rows"])

    readme_rel = Path(args.readme_path)
    readme_in = dataset_root / readme_rel
    readme_out = output_root / readme_rel
    if readme_in.exists() and not args.dry_run:
        readme_text = readme_in.read_text(encoding="utf-8")
        # Without a metadata-level flag, fall back to the count observed in
        # the refreshed shards rather than failing after shards were rewritten.
        readme_needs_ocr = metadata_needs_ocr if metadata_needs_ocr is not None else needs_ocr_rows
        readme_out.parent.mkdir(parents=True, exist_ok=True)
        readme_out.write_text(
            _refresh_readme(readme_text, total_docs=matched_rows, needs_ocr_docs=readme_needs_ocr),
            encoding="utf-8",
        )

    summary = {
        "dataset_root": str(dataset_root),
        "output_root": str(output_root),
        "metadata_parquet": str(metadata_path),
        "shards_touched": len(summaries),
        "total_rows_seen": total_rows,
        "matched_rows": matched_rows,
        "unmatched_rows": total_rows - matched_rows,
        "needs_ocr_rows_after_refresh": needs_ocr_rows,
        "metadata_rows": int(len(metadata_df)),
        "metadata_needs_ocr_rows": metadata_needs_ocr,
        "sample_shards": summaries[:5],
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
parquet after OpenArchives cleaning/fill.") + p.add_argument("--raw-repo-root", required=True, help="Local root of the raw HF OpenArchives dataset.") + p.add_argument("--output-parquet", required=True, help="Where the enriched parquet will be written.") + p.add_argument("--filename-column", default="filename") + p.add_argument("--doc-id-column", default="source_doc_id") + p.add_argument("--source-jsonl-column", default="source_jsonl") + p.add_argument("--needs-ocr-column", default="needs_ocr") + p.add_argument( + "--allow-threshold-derive", + action="store_true", + help="If needs_ocr is missing, derive targets from greek/mojibake thresholds.", + ) + p.add_argument("--greek-threshold", type=float, default=60.0) + p.add_argument("--mojibake-threshold", type=float, default=0.1) + return p.parse_args(argv) + + +def _coerce_bool_series(series: pd.Series) -> pd.Series: + if series.dtype == bool: + return series.fillna(False) + lowered = series.astype(str).str.strip().str.lower() + return lowered.isin({"1", "true", "t", "yes", "y"}) + + +def _resolve_targets( + df: pd.DataFrame, + *, + needs_ocr_column: str, + allow_threshold_derive: bool, + greek_threshold: float, + mojibake_threshold: float, +) -> pd.Series: + if needs_ocr_column in df.columns: + return _coerce_bool_series(df[needs_ocr_column]) + if not allow_threshold_derive: + raise SystemExit( + f"Column '{needs_ocr_column}' not found and threshold derivation is disabled." + ) + greek = pd.to_numeric(df.get("greek_badness_score"), errors="coerce") + moj = pd.to_numeric(df.get("mojibake_badness_score"), errors="coerce") + if greek is None and moj is None: + raise SystemExit( + "Cannot derive OCR targets: neither needs_ocr nor greek/mojibake badness columns are present." 
+ ) + greek_mask = (greek > float(greek_threshold)).fillna(False) if greek is not None else False + moj_mask = (moj > float(mojibake_threshold)).fillna(False) if moj is not None else False + return greek_mask | moj_mask + + +def _resolve_jsonl_path(raw_repo_root: Path, recorded_path: str) -> Path: + candidate = Path(recorded_path) + if candidate.exists(): + return candidate + + marker = "data/openarchives/" + text = str(recorded_path) + idx = text.find(marker) + if idx != -1: + rel = Path(text[idx:]) + rewritten = raw_repo_root / rel + if rewritten.exists(): + return rewritten + + name = Path(recorded_path).name + matches = list((raw_repo_root / "data" / "openarchives").glob(f"**/{name}")) + if len(matches) == 1: + return matches[0] + raise FileNotFoundError(f"could not resolve JSONL path for {recorded_path}") + + +def _pick_pdf_url(source_meta: dict) -> str: + for key in ("refined_pdf_links_json", "pdf_links_json"): + value = source_meta.get(key) + url = _normalize_pdf_link(value) + if url: + return url + for key in ("external_link", "handle_url", "url"): + value = source_meta.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + return "" + + +def _normalize_pdf_link(value: object) -> str: + if value is None: + return "" + if isinstance(value, str): + text = value.strip() + if not text: + return "" + if text.startswith("http://") or text.startswith("https://"): + return text + try: + parsed = json.loads(text) + except Exception: + return text + return _normalize_pdf_link(parsed) + if isinstance(value, list): + for item in value: + normalized = _normalize_pdf_link(item) + if normalized: + return normalized + return "" + if isinstance(value, dict): + for key in ("url", "href", "pdf_url", "link"): + if key in value: + normalized = _normalize_pdf_link(value[key]) + if normalized: + return normalized + return "" + return "" + + +def _coerce_page_count(value: object) -> Optional[int]: + if value is None: + return None + try: + return max(1, 
def _enrich_targets(
    targets: pd.DataFrame,
    *,
    raw_repo_root: Path,
    doc_id_column: str,
    source_jsonl_column: str,
) -> pd.DataFrame:
    """Attach page counts and source metadata from the raw JSONL shards.

    Each referenced shard is streamed once. For every record whose ``doc_id``
    belongs to a target row, the page counts, PDF URL, collection slug and
    language code are copied onto all rows carrying that id.

    Args:
        targets: Rows selected for OCR. Not modified.
        raw_repo_root: Local root of the raw dataset checkout.
        doc_id_column: Column holding the document id used in the shards.
        source_jsonl_column: Column holding the recorded shard path.

    Returns:
        A copy of ``targets`` that always contains ``page_count_source``,
        ``pages_total_source``, ``pdf_url``, ``source_collection_slug`` and
        ``source_language_code``. Rows with no matching record keep their
        previous value, or a missing value if the column is new.

    Raises:
        FileNotFoundError: If a recorded shard path cannot be resolved.
    """
    work = targets.copy()
    row_total = len(work)

    # Shard path -> doc id -> row positions. Positions (not index labels) keep
    # this correct for non-integer or duplicated indexes, and the list keeps
    # every row when the same doc id occurs more than once.
    resolved_cache: Dict[str, str] = {}
    grouped: Dict[str, Dict[str, list]] = {}
    pairs = zip(work[source_jsonl_column].tolist(), work[doc_id_column].tolist())
    for position, (recorded, doc_id) in enumerate(pairs):
        recorded_text = str(recorded)
        shard = resolved_cache.get(recorded_text)
        if shard is None:
            # Resolve each distinct recorded path once; resolution may glob.
            shard = str(_resolve_jsonl_path(raw_repo_root, recorded_text))
            resolved_cache[recorded_text] = shard
        grouped.setdefault(shard, {}).setdefault(str(doc_id), []).append(position)

    numeric_columns = ("page_count_source", "pages_total_source")
    text_columns = ("pdf_url", "source_collection_slug", "source_language_code")
    # Start from existing values so unmatched rows are preserved, and so the
    # columns exist even when no record matches at all.
    enriched = {
        column: (work[column].tolist() if column in work.columns else [None] * row_total)
        for column in numeric_columns + text_columns
    }

    dctx = zstd.ZstdDecompressor()
    for jsonl_path, doc_map in grouped.items():
        with Path(jsonl_path).open("rb") as fh, dctx.stream_reader(fh) as reader:
            for line in io.TextIOWrapper(reader, encoding="utf-8"):
                record = json.loads(line)
                positions = doc_map.get(str(record.get("doc_id") or ""))
                if not positions:
                    continue
                pipeline = record.get("pipeline_metadata") or {}
                source_meta = record.get("source_metadata") or {}
                page_count = _coerce_page_count(pipeline.get("page_count"))
                pages_total = _coerce_page_count(pipeline.get("pages_total"))
                # Either counter may be missing; each backfills the other.
                if page_count is None:
                    page_count = pages_total
                if pages_total is None:
                    pages_total = page_count
                pdf_url = _pick_pdf_url(source_meta)
                collection_slug = source_meta.get("collection_slug") or ""
                language_code = source_meta.get("language_code") or ""
                for position in positions:
                    enriched["page_count_source"][position] = page_count
                    enriched["pages_total_source"][position] = pages_total
                    enriched["pdf_url"][position] = pdf_url
                    enriched["source_collection_slug"][position] = collection_slug
                    enriched["source_language_code"][position] = language_code

    # Assign plain arrays so pandas does not try to realign on the index.
    for column in numeric_columns:
        work[column] = pd.to_numeric(pd.Series(enriched[column], dtype="object"), errors="coerce").to_numpy()
    for column in text_columns:
        work[column] = pd.Series(enriched[column], dtype="object").to_numpy()
    return work
exist_ok=True) + + df = pd.read_parquet(parquet_path) + for required in (args.filename_column, args.doc_id_column, args.source_jsonl_column): + if required not in df.columns: + raise SystemExit(f"Required column '{required}' not found in parquet.") + + target_mask = _resolve_targets( + df, + needs_ocr_column=str(args.needs_ocr_column), + allow_threshold_derive=bool(args.allow_threshold_derive), + greek_threshold=float(args.greek_threshold), + mojibake_threshold=float(args.mojibake_threshold), + ) + targets = df.loc[target_mask].copy() + if targets.empty: + raise SystemExit("No OCR target rows selected; enriched parquet was not created.") + + enriched_targets = _enrich_targets( + targets, + raw_repo_root=raw_repo_root, + doc_id_column=str(args.doc_id_column), + source_jsonl_column=str(args.source_jsonl_column), + ) + + enriched_targets.to_parquet(output_path, index=False) + summary = { + "source_parquet": str(parquet_path), + "output_parquet": str(output_path), + "target_docs": int(len(enriched_targets)), + "page_count_source_non_null": int(enriched_targets["page_count_source"].notna().sum()), + "pdf_url_non_empty": int(enriched_targets["pdf_url"].fillna("").astype(str).str.len().gt(0).sum()), + "pages_total_sum": int(pd.to_numeric(enriched_targets["page_count_source"], errors="coerce").fillna(0).sum()), + } + print(json.dumps(summary, indent=2)) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_ocr_merge.py b/src/glossapi/scripts/openarchives_ocr_merge.py new file mode 100644 index 0000000..a66f564 --- /dev/null +++ b/src/glossapi/scripts/openarchives_ocr_merge.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import List + +import pandas as pd + + +def _parse_args(argv: List[str] | None = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_ocr_merge", + 
def main(argv: List[str] | None = None) -> int:
    """Merge shard-level OCR metadata back into the master parquet.

    Shard rows are matched to master rows on the stripped string form of the
    key column. When several shard rows share a key the last one wins. Every
    shard column is copied onto the matching master rows; columns missing
    from the master are created.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code (always ``0``; failures propagate as exceptions).
    """
    args = _parse_args(argv)
    key_column = str(args.key_column)
    master_path = Path(args.master_parquet).expanduser().resolve()
    out_path = Path(args.output_parquet).expanduser().resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # A fresh RangeIndex keeps the row-wise assignments below unambiguous even
    # when the master contains duplicate keys.
    master = pd.read_parquet(master_path).reset_index(drop=True)
    master_keys = _normalize_key(master, key_column)

    shard_frames: List[pd.DataFrame] = []
    for shard in args.shard_parquets:
        shard_df = pd.read_parquet(Path(shard).expanduser().resolve()).copy()
        shard_df["_merge_key"] = _normalize_key(shard_df, key_column)
        shard_frames.append(shard_df)
    shards = pd.concat(shard_frames, ignore_index=True)
    shards = shards.drop_duplicates(subset=["_merge_key"], keep="last").set_index("_merge_key")

    # Shard keys absent from the master are skipped and reported; indexing the
    # master with them via .loc would raise KeyError and abort the merge.
    shard_known = shards.index.isin(master_keys)
    matched = int(shard_known.sum())
    unmatched = int(len(shards) - matched)

    hit_mask = master_keys.isin(shards.index)
    hit_keys = master_keys[hit_mask]
    for column in shards.columns:
        # The shard index is unique after drop_duplicates, so it can serve as
        # a lookup table from key to value.
        master.loc[hit_mask, column] = hit_keys.map(shards[column])

    master.to_parquet(out_path, index=False)
    print(
        f"Merged {matched} shard row(s) into {master_path} -> {out_path}"
    )
    if unmatched:
        print(f"Skipped {unmatched} shard row(s) whose key is not present in {master_path}")
    return 0
index 0000000..6970161 --- /dev/null +++ b/src/glossapi/scripts/openarchives_ocr_run_node.py @@ -0,0 +1,316 @@ +from __future__ import annotations + +import argparse +import json +import logging +import os +import socket +import threading +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pandas as pd + +from glossapi import Corpus +from glossapi.parquet_schema import ParquetSchema + + +DEFAULT_DOWNLOAD_CONCURRENCY = 24 +DEFAULT_DOWNLOAD_TIMEOUT = 60 +DEFAULT_HEARTBEAT_INTERVAL = 60 + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_ocr_run_node", + description=( + "Materialize one OpenArchives OCR shard into a normal GlossAPI corpus root, " + "download its PDFs, and run DeepSeek OCR with the standardized settings." + ), + ) + p.add_argument("--shard-parquet", required=True) + p.add_argument("--work-root", required=True) + p.add_argument("--python-log-level", default="INFO") + p.add_argument("--download-concurrency", type=int, default=DEFAULT_DOWNLOAD_CONCURRENCY) + p.add_argument("--download-timeout", type=int, default=DEFAULT_DOWNLOAD_TIMEOUT) + p.add_argument("--download-scheduler-mode", default="per_domain") + p.add_argument("--download-group-by", default="base_domain") + p.add_argument("--download-policy-file", default="") + p.add_argument("--heartbeat-path") + p.add_argument("--heartbeat-interval", type=int, default=DEFAULT_HEARTBEAT_INTERVAL) + p.add_argument("--instance-id", default="") + p.add_argument("--node-id", default="") + p.add_argument("--dry-run", action="store_true") + p.add_argument("--scheduler", default="whole_doc") + p.add_argument("--target-batch-pages", type=int, default=160) + p.add_argument("--shard-pages", type=int, default=0) + p.add_argument("--shard-threshold-pages", type=int, default=0) + p.add_argument("--workers-per-gpu", type=int, default=1) + p.add_argument("--runtime-backend", 
default="vllm") + p.add_argument("--ocr-profile", default="markdown_grounded") + p.add_argument("--max-new-tokens", type=int, default=2048) + p.add_argument("--render-dpi", type=int, default=144) + p.add_argument("--repair-mode", default="auto") + p.add_argument("--gpu-memory-utilization", type=float, default=0.9) + return p.parse_args(argv) + + +def _hostname() -> str: + try: + return socket.gethostname() + except Exception: + return "" + + +def _atomic_write_json(path: Path, payload: Dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + os.replace(tmp, path) + + +def _prepare_download_input(df: pd.DataFrame) -> pd.DataFrame: + required = {"filename", "pdf_url"} + missing = sorted(required - set(df.columns)) + if missing: + raise SystemExit(f"Shard parquet missing required column(s): {', '.join(missing)}") + out = df.copy() + out["url"] = out["pdf_url"].astype(str) + out["filename_base"] = out["filename"].astype(str).map(lambda s: Path(s).stem) + return out + + +def _load_frame(path: Path) -> pd.DataFrame: + return pd.read_parquet(path).copy() + + +def _normalize_download_results( + *, + shard_df: pd.DataFrame, + download_results_df: pd.DataFrame, + url_column: str = "url", +) -> pd.DataFrame: + shard = shard_df.copy() + if "filename_base" not in shard.columns: + shard["filename_base"] = shard["filename"].astype(str).map(lambda s: Path(s).stem) + + dl = download_results_df.copy() + if "filename_base" not in dl.columns: + dl["filename_base"] = dl["filename"].astype(str).map(lambda s: Path(s).stem) + + merged = dl.merge( + shard, + on="filename_base", + how="left", + suffixes=("", "_shard"), + ) + if "filename_shard" in merged.columns: + merged["filename"] = merged["filename_shard"].fillna(merged["filename"]) + merged = merged.drop(columns=["filename_shard"]) + if "pdf_url" in merged.columns and url_column in 
merged.columns: + merged[url_column] = merged["pdf_url"].fillna(merged[url_column]) + elif "pdf_url" in merged.columns and url_column not in merged.columns: + merged[url_column] = merged["pdf_url"] + if "download_success" not in merged.columns: + merged["download_success"] = False + if "download_error" not in merged.columns: + merged["download_error"] = "" + if "ocr_success" not in merged.columns: + merged["ocr_success"] = False + if "needs_ocr" not in merged.columns: + merged["needs_ocr"] = True + return merged + + +def _write_canonical_metadata(work_root: Path, df: pd.DataFrame) -> Path: + schema = ParquetSchema({"url_column": "url"}) + canonical = work_root / "download_results" / "download_results.parquet" + canonical.parent.mkdir(parents=True, exist_ok=True) + normalized = schema.normalize_metadata_frame(df) + schema.write_metadata_parquet(normalized, canonical) + return canonical + + +def _read_progress(parquet_path: Path, page_col: str = "page_count_source") -> Dict[str, Any]: + try: + df = pd.read_parquet(parquet_path) + except Exception as exc: + return {"parquet_error": str(exc)} + total_docs = int(len(df)) + docs_done = int(df.get("ocr_success", pd.Series(dtype=bool)).fillna(False).sum()) if "ocr_success" in df.columns else 0 + total_pages = 0 + pages_done = 0 + if page_col in df.columns: + page_values = pd.to_numeric(df[page_col], errors="coerce").fillna(0) + total_pages = int(page_values.sum()) + if "ocr_success" in df.columns: + pages_done = int(page_values[df["ocr_success"].fillna(False)].sum()) + return { + "docs_total": total_docs, + "docs_done": docs_done, + "pages_total": total_pages, + "pages_done": pages_done, + } + + +class _HeartbeatThread(threading.Thread): + def __init__( + self, + *, + heartbeat_path: Path, + interval: int, + parquet_path: Path, + context: Dict[str, Any], + ) -> None: + super().__init__(daemon=True) + self.heartbeat_path = heartbeat_path + self.interval = max(10, int(interval)) + self.parquet_path = parquet_path + 
def main(argv: Optional[List[str]] = None) -> int:
    """Run one OCR shard end to end: materialise, download, OCR.

    Steps:
      1. write the download manifest under ``<work-root>/manifests``;
      2. seed the canonical metadata parquet if it does not exist yet;
      3. optionally start the heartbeat thread;
      4. unless ``--dry-run``: download the PDFs, rewrite the canonical
         metadata from the download results, then run DeepSeek OCR.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code (``0``; failures propagate as exceptions after the
        heartbeat has been marked ``failed``).
    """
    args = _parse_args(argv)
    shard_path = Path(args.shard_parquet).expanduser().resolve()
    work_root = Path(args.work_root).expanduser().resolve()
    work_root.mkdir(parents=True, exist_ok=True)
    manifests_dir = work_root / "manifests"
    manifests_dir.mkdir(parents=True, exist_ok=True)

    shard_df = _prepare_download_input(_load_frame(shard_path))
    download_input = manifests_dir / "download_input.parquet"
    shard_df.to_parquet(download_input, index=False)

    metadata_path = work_root / "download_results" / "download_results.parquet"
    if not metadata_path.exists():
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
        _write_canonical_metadata(work_root, shard_df)

    log_level = getattr(logging, str(args.python_log_level).upper(), logging.INFO)
    policy_file = str(args.download_policy_file) if str(args.download_policy_file or "").strip() else None

    def _open_corpus(current_metadata: Path) -> Corpus:
        # The corpus is opened once per stage so it picks up the metadata
        # parquet as it exists at that point.
        return Corpus(
            input_dir=work_root / "downloads",
            output_dir=work_root,
            metadata_path=current_metadata,
            log_level=log_level,
            verbose=False,
        )

    heartbeat: Optional[_HeartbeatThread] = None
    if args.heartbeat_path:
        heartbeat = _HeartbeatThread(
            heartbeat_path=Path(args.heartbeat_path).expanduser().resolve(),
            interval=int(args.heartbeat_interval),
            parquet_path=metadata_path,
            context={
                "instance_id": str(args.instance_id or ""),
                "node_id": str(args.node_id or ""),
                "shard_parquet": str(shard_path),
                "work_root": str(work_root),
            },
        )
        heartbeat.start()

    try:
        if args.dry_run:
            if heartbeat:
                heartbeat.set_stage("dry_run")
            return 0

        if heartbeat:
            heartbeat.set_stage("download")
        dl_df = _open_corpus(metadata_path).download(
            input_parquet=download_input,
            links_column="url",
            parallelize_by=str(args.download_group_by),
            concurrency=int(args.download_concurrency),
            request_timeout=int(args.download_timeout),
            scheduler_mode=str(args.download_scheduler_mode),
            download_policy_file=policy_file,
        )
        canonical_df = _normalize_download_results(shard_df=shard_df, download_results_df=dl_df, url_column="url")
        metadata_path = _write_canonical_metadata(work_root, canonical_df)
        if heartbeat:
            heartbeat.parquet_path = metadata_path
            heartbeat.set_stage("ocr")

        _open_corpus(metadata_path).ocr(
            fix_bad=True,
            mode="ocr_bad",
            backend="deepseek",
            runtime_backend=str(args.runtime_backend),
            ocr_profile=str(args.ocr_profile),
            use_gpus="multi",
            workers_per_gpu=int(args.workers_per_gpu),
            render_dpi=int(args.render_dpi),
            max_new_tokens=int(args.max_new_tokens),
            repair_mode=str(args.repair_mode),
            scheduler=str(args.scheduler),
            target_batch_pages=int(args.target_batch_pages),
            shard_pages=int(args.shard_pages),
            shard_threshold_pages=int(args.shard_threshold_pages),
            gpu_memory_utilization=float(args.gpu_memory_utilization),
            math_enhance=False,
        )
        if heartbeat:
            heartbeat.set_stage("done")
        return 0
    except BaseException as exc:
        # BaseException on purpose: KeyboardInterrupt / SystemExit would
        # otherwise leave the last heartbeat on a healthy-looking stage.
        # The exception is always re-raised.
        if heartbeat:
            heartbeat.set_stage("failed")
            heartbeat.set_error(str(exc) or type(exc).__name__)
        raise
    finally:
        if heartbeat:
            # stop() triggers one last heartbeat write; wait briefly for it.
            heartbeat.stop()
            heartbeat.join(timeout=5)
If omitted, the script searches common page columns.", + ) + p.add_argument( + "--copy-columns", + default="", + help="Comma-separated extra metadata columns to preserve in every shard manifest.", + ) + p.add_argument( + "--allow-threshold-derive", + action="store_true", + help="If needs_ocr is missing, derive the target set from greek/mojibake thresholds.", + ) + p.add_argument("--greek-threshold", type=float, default=60.0) + p.add_argument("--mojibake-threshold", type=float, default=0.1) + return p.parse_args(argv) + + +def _resolve_page_column(df: pd.DataFrame, explicit: Optional[str]) -> str: + if explicit: + if explicit not in df.columns: + raise SystemExit(f"--page-column '{explicit}' not found in parquet.") + return explicit + for candidate in PAGE_COLUMN_CANDIDATES: + if candidate in df.columns: + return candidate + raise SystemExit( + "No page-count column found. Expected one of: " + + ", ".join(PAGE_COLUMN_CANDIDATES) + + " or pass --page-column." + ) + + +def _coerce_bool_series(series: pd.Series) -> pd.Series: + if series.dtype == bool: + return series.fillna(False) + lowered = series.astype(str).str.strip().str.lower() + return lowered.isin({"1", "true", "t", "yes", "y"}) + + +def _resolve_targets( + df: pd.DataFrame, + *, + needs_ocr_column: str, + allow_threshold_derive: bool, + greek_threshold: float, + mojibake_threshold: float, +) -> pd.Series: + if needs_ocr_column in df.columns: + return _coerce_bool_series(df[needs_ocr_column]) + if not allow_threshold_derive: + raise SystemExit( + f"Column '{needs_ocr_column}' not found and threshold derivation is disabled." + ) + greek = pd.to_numeric(df.get("greek_badness_score"), errors="coerce") + moj = pd.to_numeric(df.get("mojibake_badness_score"), errors="coerce") + if greek is None and moj is None: + raise SystemExit( + "Cannot derive OCR targets: neither needs_ocr nor greek/mojibake badness columns are present." 
+ ) + greek_mask = (greek > float(greek_threshold)).fillna(False) if greek is not None else False + moj_mask = (moj > float(mojibake_threshold)).fillna(False) if moj is not None else False + return greek_mask | moj_mask + + +def _page_int(value: object) -> int: + try: + return max(1, int(value)) + except Exception: + return 1 + + +def _make_node_bins(node_count: int) -> List[Dict[str, object]]: + return [ + { + "node_id": idx, + "pages_total": 0, + "docs_total": 0, + "rows": [], + } + for idx in range(max(1, int(node_count))) + ] + + +def _assign_rows(df: pd.DataFrame, *, page_column: str, node_count: int) -> List[Dict[str, object]]: + ordered = df.copy() + ordered["_pages_int"] = ordered[page_column].map(_page_int) + ordered = ordered.sort_values(["_pages_int"], ascending=[False]).reset_index(drop=True) + bins = _make_node_bins(node_count) + for row in ordered.to_dict(orient="records"): + node = min(bins, key=lambda item: (int(item["pages_total"]), int(item["node_id"]))) + row["node_id"] = int(node["node_id"]) + node["rows"].append(row) + node["docs_total"] = int(node["docs_total"]) + 1 + node["pages_total"] = int(node["pages_total"]) + int(row["_pages_int"]) + return bins + + +def main(argv: Optional[Sequence[str]] = None) -> int: + args = _parse_args(argv) + parquet_path = Path(args.parquet).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + df = pd.read_parquet(parquet_path) + if args.filename_column not in df.columns: + raise SystemExit(f"Filename column '{args.filename_column}' not found in parquet.") + + page_column = _resolve_page_column(df, args.page_column) + target_mask = _resolve_targets( + df, + needs_ocr_column=str(args.needs_ocr_column), + allow_threshold_derive=bool(args.allow_threshold_derive), + greek_threshold=float(args.greek_threshold), + mojibake_threshold=float(args.mojibake_threshold), + ) + shard_df = df.loc[target_mask].copy() + if shard_df.empty: + raise 
SystemExit("No OCR target rows selected; shard manifests were not created.") + + copy_columns = [c.strip() for c in str(args.copy_columns or "").split(",") if c.strip()] + selected_columns = [args.filename_column, page_column] + for optional in [ + "needs_ocr", + "greek_badness_score", + "mojibake_badness_score", + "ocr_success", + "source_row", + "document_type", + ] + copy_columns: + if optional in shard_df.columns and optional not in selected_columns: + selected_columns.append(optional) + shard_df = shard_df[selected_columns].copy() + + bins = _assign_rows(shard_df, page_column=page_column, node_count=int(args.nodes)) + summaries: List[Dict[str, object]] = [] + total_pages = 0 + total_docs = 0 + for node in bins: + node_id = int(node["node_id"]) + rows = list(node["rows"]) + node_df = pd.DataFrame(rows) + if "_pages_int" in node_df.columns: + node_df = node_df.drop(columns=["_pages_int"]) + node_df["shard_id"] = f"node-{node_id:02d}" + node_df["node_id"] = node_id + out_path = output_dir / f"openarchives_ocr_shard_node_{node_id:02d}.parquet" + node_df.to_parquet(out_path, index=False) + + node_pages = int(node["pages_total"]) + node_docs = int(node["docs_total"]) + total_pages += node_pages + total_docs += node_docs + summaries.append( + { + "node_id": node_id, + "manifest_path": str(out_path), + "docs_total": node_docs, + "pages_total": node_pages, + "eta_hours_at_validated_speed": float(node_pages / float(args.pages_per_hour_per_node)), + } + ) + + overall = { + "source_parquet": str(parquet_path), + "nodes": int(args.nodes), + "filename_column": str(args.filename_column), + "page_column": str(page_column), + "docs_total": int(total_docs), + "pages_total": int(total_pages), + "pages_per_hour_per_node": float(args.pages_per_hour_per_node), + "eta_hours_one_node": float(total_pages / float(args.pages_per_hour_per_node)), + "eta_hours_all_nodes": float(total_pages / (float(args.pages_per_hour_per_node) * max(1, int(args.nodes)))), + "node_summaries": summaries, + 
} + (output_dir / "openarchives_ocr_shard_summary.json").write_text( + json.dumps(overall, indent=2), + encoding="utf-8", + ) + print(json.dumps(overall, indent=2)) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_pdf_stage_pull.py b/src/glossapi/scripts/openarchives_pdf_stage_pull.py new file mode 100644 index 0000000..4165a08 --- /dev/null +++ b/src/glossapi/scripts/openarchives_pdf_stage_pull.py @@ -0,0 +1,457 @@ +from __future__ import annotations + +import argparse +import csv +import json +import os +import shutil +import signal +import sqlite3 +import subprocess +import sys +import time +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Iterable, Optional, Sequence + + +def utc_now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +@dataclass(frozen=True) +class TransferItem: + canonical_filename: str + remote_path: str + remote_size_bytes: int + remote_name: str + + +SCHEMA = """ +CREATE TABLE IF NOT EXISTS transfer_items ( + canonical_filename TEXT PRIMARY KEY, + remote_path TEXT NOT NULL, + remote_size_bytes INTEGER NOT NULL, + remote_name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + attempts INTEGER NOT NULL DEFAULT 0, + last_error TEXT NOT NULL DEFAULT '', + transfer_started_at TEXT, + transfer_finished_at TEXT, + last_seen_size_bytes INTEGER NOT NULL DEFAULT 0 +); +""" + + +def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_pdf_stage_pull", + description="Resumable staged pull of OpenArchives PDFs from the Greece storage box.", + ) + p.add_argument("--manifest", required=True, help="TSV manifest with canonical_filename, remote_path, remote_size_bytes, remote_name.") + p.add_argument("--work-root", required=True, help="Root directory for downloads, 
partials, logs, and state.") + p.add_argument("--remote-host", default="debian@83.212.80.170") + p.add_argument("--password-env", default="GREECE_BOX_PASSWORD", help="Environment variable containing the remote SSH password.") + p.add_argument("--max-attempts", type=int, default=20) + p.add_argument("--connect-timeout", type=int, default=30) + p.add_argument("--io-timeout", type=int, default=180) + p.add_argument("--sleep-after-failure", type=float, default=10.0) + p.add_argument("--summary-interval-seconds", type=float, default=5.0) + p.add_argument("--limit", type=int, default=0, help="Optional limit for testing.") + return p.parse_args(argv) + + +class TransferState: + def __init__(self, db_path: Path): + self.db_path = db_path + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self.conn = sqlite3.connect(str(self.db_path)) + self.conn.execute("PRAGMA journal_mode=WAL") + self.conn.execute(SCHEMA) + self.conn.commit() + + def close(self) -> None: + self.conn.close() + + def sync_manifest(self, items: Iterable[TransferItem]) -> None: + rows = [ + (item.canonical_filename, item.remote_path, int(item.remote_size_bytes), item.remote_name) + for item in items + ] + self.conn.executemany( + """ + INSERT INTO transfer_items ( + canonical_filename, remote_path, remote_size_bytes, remote_name, status + ) VALUES (?, ?, ?, ?, 'pending') + ON CONFLICT(canonical_filename) DO UPDATE SET + remote_path=excluded.remote_path, + remote_size_bytes=excluded.remote_size_bytes, + remote_name=excluded.remote_name + """, + rows, + ) + self.conn.commit() + + def reset_stale_in_progress(self) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='pending', + last_error=CASE + WHEN last_error = '' THEN 'Recovered from interrupted transfer' + ELSE last_error || ' | Recovered from interrupted transfer' + END + WHERE status='in_progress' + """ + ) + self.conn.commit() + + def mark_completed_if_present(self, downloads_dir: Path, partial_dir: Path) -> None: + cur = 
self.conn.execute( + "SELECT canonical_filename, remote_size_bytes, status FROM transfer_items" + ) + updates = [] + for canonical_filename, remote_size_bytes, status in cur.fetchall(): + final_path = downloads_dir / canonical_filename + if final_path.exists() and final_path.stat().st_size == int(remote_size_bytes): + updates.append((int(remote_size_bytes), utc_now(), canonical_filename)) + continue + part_path = partial_dir / f"{canonical_filename}.part" + if part_path.exists() and status == "completed": + self.conn.execute( + """ + UPDATE transfer_items + SET status='pending', + last_error='Final file missing; resuming from partial', + transfer_finished_at=NULL + WHERE canonical_filename=? + """, + (canonical_filename,), + ) + if updates: + self.conn.executemany( + """ + UPDATE transfer_items + SET status='completed', + last_seen_size_bytes=?, + transfer_finished_at=?, + last_error='' + WHERE canonical_filename=? + """, + updates, + ) + self.conn.commit() + + def next_item(self, *, max_attempts: int) -> Optional[sqlite3.Row]: + self.conn.row_factory = sqlite3.Row + cur = self.conn.execute( + """ + SELECT * + FROM transfer_items + WHERE status IN ('pending', 'failed') + AND attempts < ? + ORDER BY attempts ASC, canonical_filename ASC + LIMIT 1 + """, + (max_attempts,), + ) + return cur.fetchone() + + def mark_in_progress(self, canonical_filename: str, current_size: int) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='in_progress', + attempts=attempts+1, + transfer_started_at=?, + last_seen_size_bytes=?, + last_error='' + WHERE canonical_filename=? + """, + (utc_now(), int(current_size), canonical_filename), + ) + self.conn.commit() + + def mark_completed(self, canonical_filename: str, size_bytes: int) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='completed', + transfer_finished_at=?, + last_seen_size_bytes=?, + last_error='' + WHERE canonical_filename=? 
+ """, + (utc_now(), int(size_bytes), canonical_filename), + ) + self.conn.commit() + + def mark_failed(self, canonical_filename: str, error: str, size_bytes: int) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='failed', + last_error=?, + last_seen_size_bytes=? + WHERE canonical_filename=? + """, + (str(error), int(size_bytes), canonical_filename), + ) + self.conn.commit() + + def counts(self) -> dict[str, int]: + cur = self.conn.execute( + """ + SELECT status, COUNT(*) AS c + FROM transfer_items + GROUP BY status + """ + ) + counts = {"pending": 0, "in_progress": 0, "completed": 0, "failed": 0} + for status, count in cur.fetchall(): + counts[str(status)] = int(count) + counts["total"] = sum(counts.values()) + return counts + + def byte_counts(self) -> dict[str, int]: + cur = self.conn.execute( + """ + SELECT + COALESCE(SUM(remote_size_bytes), 0) AS bytes_total, + COALESCE(SUM(CASE WHEN status = 'completed' THEN remote_size_bytes ELSE 0 END), 0) AS bytes_completed, + COALESCE(SUM(CASE WHEN status = 'in_progress' THEN last_seen_size_bytes ELSE 0 END), 0) AS bytes_in_progress + FROM transfer_items + """ + ) + row = cur.fetchone() + bytes_total = int(row[0] or 0) + bytes_completed = int(row[1] or 0) + bytes_in_progress = int(row[2] or 0) + bytes_remaining = max(0, bytes_total - bytes_completed) + return { + "bytes_total": bytes_total, + "bytes_completed": bytes_completed, + "bytes_in_progress": bytes_in_progress, + "bytes_remaining": bytes_remaining, + } + + +def read_manifest(path: Path) -> list[TransferItem]: + items: list[TransferItem] = [] + with path.open("r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + required = {"canonical_filename", "remote_path", "remote_size_bytes", "remote_name"} + if not required.issubset(reader.fieldnames or set()): + raise SystemExit(f"Manifest missing required columns: {sorted(required)}") + for row in reader: + items.append( + TransferItem( + 
canonical_filename=str(row["canonical_filename"]).strip(), + remote_path=str(row["remote_path"]).strip(), + remote_size_bytes=int(row["remote_size_bytes"]), + remote_name=str(row["remote_name"]).strip(), + ) + ) + return items + + +def write_json(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + os.replace(tmp, path) + + +def append_event(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(payload, ensure_ascii=False) + "\n") + + +def sftp_one( + *, + remote_host: str, + remote_path: str, + temp_path: Path, + password_env: str, + connect_timeout: int, + io_timeout: int, +) -> subprocess.CompletedProcess[str]: + cmd = [ + "sshpass", + "-e", + "sftp", + "-o", + "BatchMode=no", + "-o", + "PreferredAuthentications=password", + "-o", + "PubkeyAuthentication=no", + "-o", + "KbdInteractiveAuthentication=yes", + "-o", + f"ConnectTimeout={int(connect_timeout)}", + "-o", + "ServerAliveInterval=15", + "-o", + "ServerAliveCountMax=3", + "-o", + "ConnectionAttempts=3", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/tmp/greece_box_known_hosts", + "-b", + "-", + remote_host, + ] + env = os.environ.copy() + secret = env.get(password_env) + if not secret: + raise SystemExit(f"Password env var '{password_env}' is not set.") + env["SSHPASS"] = secret + batch = f'reget "{remote_path}" "{temp_path}"\n' + return subprocess.run(cmd, capture_output=True, text=True, env=env, input=batch) + + +def run(argv: Optional[Sequence[str]] = None) -> int: + args = parse_args(argv) + manifest_path = Path(args.manifest).expanduser().resolve() + work_root = Path(args.work_root).expanduser().resolve() + downloads_dir = work_root / "downloads" + partial_dir = work_root / "partials" + logs_dir = work_root / 
"logs" + state_dir = work_root / "state" + downloads_dir.mkdir(parents=True, exist_ok=True) + partial_dir.mkdir(parents=True, exist_ok=True) + logs_dir.mkdir(parents=True, exist_ok=True) + state_dir.mkdir(parents=True, exist_ok=True) + + state = TransferState(state_dir / "transfer_state.sqlite3") + items = read_manifest(manifest_path) + if args.limit and int(args.limit) > 0: + items = items[: int(args.limit)] + state.sync_manifest(items) + state.reset_stale_in_progress() + state.mark_completed_if_present(downloads_dir, partial_dir) + + stop_requested = False + + def _handle_signal(signum, _frame) -> None: + nonlocal stop_requested + stop_requested = True + print(f"[transfer] signal {signum} received; stopping after current file", file=sys.stderr) + + signal.signal(signal.SIGINT, _handle_signal) + signal.signal(signal.SIGTERM, _handle_signal) + + last_summary_ts = 0.0 + current_path = state_dir / "current_transfer.json" + summary_path = state_dir / "summary.json" + events_path = logs_dir / "events.jsonl" + + while not stop_requested: + row = state.next_item(max_attempts=int(args.max_attempts)) + if row is None: + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), "done": True}) + break + + canonical = str(row["canonical_filename"]) + remote_path = str(row["remote_path"]) + remote_size = int(row["remote_size_bytes"]) + final_path = downloads_dir / canonical + temp_path = partial_dir / f"{canonical}.part" + current_size = temp_path.stat().st_size if temp_path.exists() else 0 + + state.mark_in_progress(canonical, current_size) + write_json( + current_path, + { + "updated_at": utc_now(), + "canonical_filename": canonical, + "remote_path": remote_path, + "remote_size_bytes": remote_size, + "partial_path": str(temp_path), + "partial_size_bytes": current_size, + "attempt_number": int(row["attempts"]) + 1, + }, + ) + append_event( + events_path, + { + "ts": utc_now(), + "event": "start", + "canonical_filename": canonical, + 
"remote_path": remote_path, + "remote_size_bytes": remote_size, + "partial_size_bytes": current_size, + "attempt_number": int(row["attempts"]) + 1, + }, + ) + + result = sftp_one( + remote_host=str(args.remote_host), + remote_path=remote_path, + temp_path=temp_path, + password_env=str(args.password_env), + connect_timeout=int(args.connect_timeout), + io_timeout=int(args.io_timeout), + ) + + if result.returncode == 0 and temp_path.exists(): + actual_size = temp_path.stat().st_size + if remote_size > 0 and actual_size != remote_size: + state.mark_failed( + canonical, + f"Size mismatch after transfer: expected {remote_size}, got {actual_size}", + actual_size, + ) + else: + final_path.parent.mkdir(parents=True, exist_ok=True) + os.replace(temp_path, final_path) + state.mark_completed(canonical, actual_size) + append_event( + events_path, + { + "ts": utc_now(), + "event": "completed", + "canonical_filename": canonical, + "size_bytes": actual_size, + }, + ) + else: + actual_size = temp_path.stat().st_size if temp_path.exists() else 0 + error = (result.stderr or result.stdout or "").strip()[-4000:] + state.mark_failed(canonical, error or f"transfer failed with code {result.returncode}", actual_size) + append_event( + events_path, + { + "ts": utc_now(), + "event": "failed", + "canonical_filename": canonical, + "return_code": int(result.returncode), + "partial_size_bytes": actual_size, + "error": error or f"transfer failed with code {result.returncode}", + }, + ) + time.sleep(float(args.sleep_after_failure)) + + now = time.time() + if now - last_summary_ts >= float(args.summary_interval_seconds): + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), "done": False}) + last_summary_ts = now + + if current_path.exists(): + try: + current_path.unlink() + except Exception: + pass + + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), "done": True}) + state.close() + return 0 + + +if __name__ == 
"__main__": # pragma: no cover + raise SystemExit(run()) diff --git a/tests/test_browser_gloss_downloader.py b/tests/test_browser_gloss_downloader.py new file mode 100644 index 0000000..20707e7 --- /dev/null +++ b/tests/test_browser_gloss_downloader.py @@ -0,0 +1,376 @@ +import asyncio + +import pandas as pd + +from glossapi import Corpus +from glossapi.download_policy import build_download_policy +from glossapi.gloss_browser_downloader import BrowserGlossDownloader, BrowserSessionState +import glossapi.corpus.phase_download as phase_download_mod + + +def test_browser_downloader_skips_viewer_interstitial(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + called = False + + async def _fake_browser_download(**kwargs): + nonlocal called + called = True + return b"%PDF-1.7\n", {"Content-Type": "application/pdf"}, {"candidate_url": kwargs["url"]} + + monkeypatch.setattr(downloader, "_download_via_browser_session", _fake_browser_download) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + url="https://freader.ekt.gr/eadd/index.php?doc=60819&lang=el", + headers={"Content-Type": "text/html"}, + content=b"", + html_issue=( + "HTML document viewer returned instead of a downloadable file; " + "a source-specific fetcher with persisted cookies/redirect handling is required" + ), + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result is None + assert called is False + + +def test_browser_downloader_recovers_challenge_page(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + async def _fake_browser_download(**kwargs): + return ( + b"%PDF-1.7\n%dummy\n", + {"Content-Type": "application/pdf"}, + {"candidate_url": "https://example.org/file.pdf"}, + ) + + monkeypatch.setattr(downloader, "_download_via_browser_session", _fake_browser_download) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + 
url="https://example.org/file.pdf", + headers={"Content-Type": "text/html"}, + content=b"challenge", + html_issue=( + "HTML challenge page returned instead of a document; " + "browser automation or cookie bootstrap is required" + ), + retry_count=1, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 1) + assert (tmp_path / "downloads" / "AAA_000.pdf").read_bytes().startswith(b"%PDF-1.7") + assert not (tmp_path / "downloads" / ".part_browser_0").exists() + + +def test_browser_downloader_domain_cookie_lookup(tmp_path): + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + domain_cookies={"eur-lex.europa.eu": {"token": "abc123"}}, + ) + + cookies = downloader._domain_cookies_for_url( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + ) + + assert cookies == {"token": "abc123"} + + +def test_browser_downloader_bootstrap_url_uses_base_for_file_endpoints(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + assert downloader._choose_browser_bootstrap_url( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + ) == "https://eur-lex.europa.eu" + + +def test_browser_downloader_ignores_err_aborted_for_file_navigation(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + assert downloader._should_ignore_navigation_exception( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360", + RuntimeError("Page.goto: net::ERR_ABORTED"), + ) + assert not downloader._should_ignore_navigation_exception( + "https://example.org/article", + RuntimeError("Page.goto: net::ERR_ABORTED"), + ) + + +def test_browser_downloader_uses_default_browser_route_for_preflight(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="browser") + + async def _fake_download_browser_route(**kwargs): + return True, "AAA_000.pdf", "pdf", "", 0 + + 
monkeypatch.setattr(downloader, "_download_browser_route", _fake_download_browser_route) + + result = asyncio.run( + downloader._preflight_download( + row_index=0, + url="https://example.org/file.pdf", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + + +def test_browser_downloader_reuses_cached_domain_session(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="auto") + bootstraps = 0 + fetches = 0 + + async def _fake_fetch_with_browser_session_state(**kwargs): + nonlocal fetches + fetches += 1 + return b"%PDF-1.7\n", {"Content-Type": "application/pdf"}, {"candidate_url": kwargs["url"]} + + async def _bootstrap(**kwargs): + nonlocal bootstraps + bootstraps += 1 + return BrowserSessionState(user_agent="UA", cookie_header="a=b", cached_at=10_000.0), [] + + monkeypatch.setattr(downloader, "_bootstrap_browser_session_state", _bootstrap) + monkeypatch.setattr(downloader, "_fetch_with_browser_session_state", _fake_fetch_with_browser_session_state) + monkeypatch.setattr("glossapi.gloss_browser_downloader.time.time", lambda: 10_100.0) + + first = asyncio.run( + downloader._download_via_browser_session(url="https://eur-lex.europa.eu/file.pdf", referer=None) + ) + second = asyncio.run( + downloader._download_via_browser_session(url="https://eur-lex.europa.eu/file2.pdf", referer=None) + ) + + assert first[0].startswith(b"%PDF") + assert second[0].startswith(b"%PDF") + assert bootstraps == 1 + assert fetches == 2 + + +def test_browser_downloader_policy_routes_domain_to_browser(tmp_path, monkeypatch): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["eur-lex.europa.eu"]}, + "downloader": "browser", + "browser_timeout_ms": 1234, + } + ], + } + ) + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + download_policy=policy, + default_download_route="standard", + 
) + + observed = {} + + async def _fake_download_browser_route(**kwargs): + observed.update(kwargs) + return True, "AAA_000.pdf", "pdf", "", 0 + + monkeypatch.setattr(downloader, "_download_browser_route", _fake_download_browser_route) + + result = asyncio.run( + downloader._preflight_download( + row_index=0, + url="https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + assert observed["route_options"]["browser_timeout_ms"] == 1234 + + +def test_download_policy_preserves_transport_and_scheduler_options(): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["ikee.lib.auth.gr"]}, + "downloader": "standard", + "request_timeout": 120, + "ssl_verify": False, + "per_domain_concurrency": 2, + "domain_concurrency_floor": 1, + "domain_concurrency_ceiling": 3, + "skip_failed_after": 5, + "domain_cookies": {"sessionid": "abc"}, + } + ], + } + ) + + route, options = policy.resolve("https://ikee.lib.auth.gr/record/123/files/file.pdf") + + assert route == "standard" + assert options["request_timeout"] == 120 + assert options["ssl_verify"] is False + assert options["per_domain_concurrency"] == 2 + assert options["domain_concurrency_floor"] == 1 + assert options["domain_concurrency_ceiling"] == 3 + assert options["skip_failed_after"] == 5 + assert options["domain_cookies"] == {"sessionid": "abc"} + + +def test_browser_downloader_route_options_apply_standard_transport_settings(tmp_path): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["ktisis.cut.ac.cy"]}, + "downloader": "standard", + "request_timeout": 90, + "ssl_verify": False, + "per_domain_concurrency": 2, + "domain_concurrency_floor": 1, + "domain_concurrency_ceiling": 2, + "skip_failed_after": 4, + "domain_cookies": {"sessionid": "abc"}, + } + 
], + } + ) + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + download_policy=policy, + default_download_route="standard", + ) + + async def _build_connector(): + return downloader._build_session_connector( + "https://ktisis.cut.ac.cy/items/123/file.pdf", + route_options=route_options, + ) + + route, route_options = downloader._resolve_route("https://ktisis.cut.ac.cy/items/123/file.pdf") + timeout = downloader._build_request_timeout(0, route_options=route_options) + connector = asyncio.run(_build_connector()) + cookies = downloader._resolve_request_cookies( + "https://ktisis.cut.ac.cy/items/123/file.pdf", + route_options=route_options, + ) + floor, ceiling, start, skip_after = downloader._resolve_domain_scheduler_settings(route_options) + + assert route == "standard" + assert timeout.total == 90 + assert connector is not None + assert cookies["sessionid"] == "abc" + assert (floor, ceiling, start, skip_after) == (1, 2, 2, 4) + + +def test_corpus_download_mode_selects_browser_downloader(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://example.org/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["cls"] = "browser" + observed["kwargs"] = kwargs + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://example.org/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + result = corpus.download(input_parquet=input_parquet, download_mode="browser") + + assert observed["cls"] == "browser" + assert observed["kwargs"]["default_download_route"] == "browser" + assert bool(result["download_success"].iloc[0]) is True + assert 
(tmp_path / "download_results" / f"download_results_{input_parquet.name}").exists() + + +def test_corpus_browser_mode_alias_selects_browser_downloader(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://example.org/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["cls"] = "browser" + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://example.org/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + corpus.download(input_parquet=input_parquet, browser_mode=True) + + assert observed["cls"] == "browser" + + +def test_corpus_policy_file_selects_browser_router(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://eur-lex.europa.eu/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + policy_path = tmp_path / "download_policy.yml" + policy_path.write_text( + "default:\n downloader: standard\nrules:\n - match:\n domains: [eur-lex.europa.eu]\n downloader: browser\n", + encoding="utf-8", + ) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["kwargs"] = kwargs + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://eur-lex.europa.eu/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + corpus.download(input_parquet=input_parquet, download_policy_file=policy_path) + + 
assert observed["kwargs"]["download_policy_file"] == policy_path.resolve() + assert observed["kwargs"]["default_download_route"] == "standard" diff --git a/tests/test_corpus_guards.py b/tests/test_corpus_guards.py index 29db5be..a5ea0b1 100644 --- a/tests/test_corpus_guards.py +++ b/tests/test_corpus_guards.py @@ -50,12 +50,6 @@ def make_corpus(tmp_path): return Corpus(input_dir=input_dir, output_dir=output_dir) -def set_onnx_providers(monkeypatch, providers): - stub = SimpleNamespace(get_available_providers=lambda: providers) - monkeypatch.setitem(sys.modules, "onnxruntime", stub) - return stub - - def set_torch_stub(monkeypatch, *, available: bool, device_count: int): cuda_ns = SimpleNamespace( is_available=lambda: available, @@ -66,22 +60,23 @@ def set_torch_stub(monkeypatch, *, available: bool, device_count: int): return torch_ns -def test_prime_extractor_requires_cuda_for_ocr(tmp_path, monkeypatch): +def test_prime_extractor_force_ocr_is_ignored_for_backend_selection(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() - set_torch_stub(monkeypatch, available=True, device_count=1) - set_onnx_providers(monkeypatch, ["CPUExecutionProvider"]) + set_torch_stub(monkeypatch, available=False, device_count=0) - with pytest.raises(RuntimeError) as exc: - corpus.prime_extractor( - input_format="pdf", - accel_type="CUDA", - force_ocr=True, - phase1_backend="docling", - ) + corpus.prime_extractor( + input_format="pdf", + accel_type="CPU", + force_ocr=True, + phase1_backend="auto", + ) - assert "CUDAExecutionProvider" in str(exc.value) + assert corpus.extractor.last_policy == "safe" + ensure_kwargs = corpus.extractor.ensure_calls[0] + assert ensure_kwargs["enable_ocr"] is False + assert ensure_kwargs["force_full_page_ocr"] is False def test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch): @@ -89,8 +84,6 @@ def test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch corpus.extractor = 
DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - with pytest.raises(RuntimeError) as exc: corpus.prime_extractor( input_format="pdf", @@ -106,8 +99,6 @@ def test_prime_extractor_configures_safe_backend_for_text_layer(tmp_path, monkey corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=True, device_count=1) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - corpus.prime_extractor( input_format="pdf", accel_type="CPU", @@ -120,26 +111,23 @@ def test_prime_extractor_configures_safe_backend_for_text_layer(tmp_path, monkey assert corpus.extractor.ensure_calls[0]["enable_ocr"] is False -def test_prime_extractor_configures_docling_backend_for_ocr(tmp_path, monkeypatch): +def test_prime_extractor_configures_docling_backend_explicitly(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=True, device_count=2) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - corpus.prime_extractor( input_format="pdf", accel_type="CUDA", - force_ocr=True, - phase1_backend="auto", + phase1_backend="docling", ) assert corpus.extractor.last_policy == "docling" assert corpus.extractor.last_max_batch_files == 1 assert corpus.extractor.last_prefer_safe_backend is False ensure_kwargs = corpus.extractor.ensure_calls[0] - assert ensure_kwargs["enable_ocr"] is True - assert ensure_kwargs["force_full_page_ocr"] is True + assert ensure_kwargs["enable_ocr"] is False + assert ensure_kwargs["force_full_page_ocr"] is False def test_prime_extractor_requires_cuda_for_formula_enrichment(tmp_path, monkeypatch): @@ -147,8 +135,6 @@ def test_prime_extractor_requires_cuda_for_formula_enrichment(tmp_path, monkeypa corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - with pytest.raises(RuntimeError) 
as exc: corpus.prime_extractor( input_format="pdf", @@ -203,6 +189,8 @@ def extract(self, *, file_paths=None, **kwargs): with pytest.raises(SystemExit) as exit_info: corpus_mod.gpu_extract_worker_queue( device_id=0, + worker_slot=0, + worker_key="gpu0-w0", in_dir=str(tmp_path), out_dir=str(tmp_path), work_q=work_q, diff --git a/tests/test_deepseek_preflight.py b/tests/test_deepseek_preflight.py index 1900a2b..73e761d 100644 --- a/tests/test_deepseek_preflight.py +++ b/tests/test_deepseek_preflight.py @@ -1,5 +1,4 @@ import sys -from pathlib import Path from glossapi.ocr.deepseek.preflight import check_deepseek_env @@ -9,45 +8,34 @@ def test_preflight_reports_missing_components(tmp_path): "GLOSSAPI_DEEPSEEK_ALLOW_CLI": "0", "GLOSSAPI_DEEPSEEK_ALLOW_STUB": "1", "GLOSSAPI_DEEPSEEK_TEST_PYTHON": str(tmp_path / "missing_python"), - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT": str(tmp_path / "missing_script.py"), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT": str(tmp_path / "missing_script.py"), "GLOSSAPI_DEEPSEEK_MODEL_DIR": str(tmp_path / "missing_model"), - "GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH": str(tmp_path / "missing_lib"), - "PATH": str(tmp_path), # no cc1plus here } - report = check_deepseek_env(env, check_flashinfer=False) + report = check_deepseek_env(env, check_torch=False) names = {c.name for c in report.errors} + assert "allow_cli" in names + assert "allow_stub" in names assert "deepseek_python" in names - assert "vllm_script" in names + assert "runner_script" in names assert "model_dir" in names - assert "ld_library_path" in names - assert "cc1plus" in names assert not report.ok def test_preflight_passes_with_complete_env(tmp_path): - script = tmp_path / "run_pdf_ocr_vllm.py" + script = tmp_path / "run_pdf_ocr_transformers.py" script.write_text("#!/usr/bin/env python3\n", encoding="utf-8") - model_dir = tmp_path / "DeepSeek-OCR" + model_dir = tmp_path / "DeepSeek-OCR-2" model_dir.mkdir() (model_dir / "config.json").write_text("{}", encoding="utf-8") (model_dir / 
"model-00001-of-000001.safetensors").write_bytes(b"stub") - lib_dir = tmp_path / "libjpeg" - lib_dir.mkdir() - fake_bin = tmp_path / "bin" - fake_bin.mkdir() - cc1plus = fake_bin / "cc1plus" - cc1plus.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") - cc1plus.chmod(0o755) env = { "GLOSSAPI_DEEPSEEK_ALLOW_CLI": "1", "GLOSSAPI_DEEPSEEK_ALLOW_STUB": "0", "GLOSSAPI_DEEPSEEK_TEST_PYTHON": sys.executable, - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT": str(script), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT": str(script), "GLOSSAPI_DEEPSEEK_MODEL_DIR": str(model_dir), - "GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH": str(lib_dir), - "PATH": str(fake_bin), } - report = check_deepseek_env(env, check_flashinfer=False) + report = check_deepseek_env(env, check_torch=False) assert report.ok assert not report.errors diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py new file mode 100644 index 0000000..1e39cd5 --- /dev/null +++ b/tests/test_deepseek_runner_contract.py @@ -0,0 +1,342 @@ +import sys +from pathlib import Path + +import pandas as pd +import pytest + + +def _mk_corpus(tmp_path: Path): + from glossapi import Corpus + + root = tmp_path / "corpus" + root.mkdir() + return Corpus(input_dir=root, output_dir=root) + + +def test_deepseek_backend_rejects_stub_mode(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + df = pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ) + parquet_path = dl_dir / "download_results.parquet" + df.to_parquet(parquet_path, index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%real\n") + + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") + + with pytest.raises(RuntimeError, match="stub execution has been removed"): + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) + + updated = 
pd.read_parquet(parquet_path).set_index("filename") + assert bool(updated.loc[fname, "ocr_success"]) is False + assert bool(updated.loc[fname, "needs_ocr"]) is True + + +def test_progress_artifacts_stay_out_of_canonical_markdown(tmp_path): + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _write_outputs, _write_progress + + output_dir = tmp_path / "output" + _write_progress( + output_dir=output_dir, + stem="doc", + page_outputs=["page one"], + total_pages=5, + completed_pages=1, + ) + + canonical_markdown = output_dir / "markdown" / "doc.md" + progress_markdown = output_dir / "sidecars" / "ocr_progress" / "doc.partial.md" + progress_json = output_dir / "json" / "metrics" / "doc.progress.json" + + assert not canonical_markdown.exists() + assert progress_markdown.exists() + assert progress_json.exists() + + _write_outputs(output_dir=output_dir, stem="doc", markdown="final", page_count=5) + + assert canonical_markdown.exists() + assert canonical_markdown.read_text(encoding="utf-8") == "final\n" + assert not progress_markdown.exists() + + +def test_auto_attn_backend_prefers_eager_when_flash_attn_is_unavailable(monkeypatch): + import builtins + + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _resolve_attn_backend + + original_import = builtins.__import__ + + def fake_import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "flash_attn": + raise ImportError("flash_attn unavailable") + return original_import(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, "__import__", fake_import) + assert _resolve_attn_backend("auto") == "eager" + + +def test_runner_uses_downloads_subdir_when_present(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + downloads_dir = corpus.input_dir / "downloads" + downloads_dir.mkdir(parents=True, exist_ok=True) + (downloads_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_cli(input_dir, output_dir, 
**kwargs): + calls["input_dir"] = input_dir + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv( + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + str(Path(runner.__file__).resolve().parent / "run_pdf_ocr_transformers.py"), + ) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files(corpus, ["doc.pdf"]) + + assert calls["input_dir"] == downloads_dir.resolve() + assert result["doc"]["page_count"] == 1 + + +def test_build_cli_command_includes_speed_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=["a.pdf"], + page_ranges=None, + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run.py", + max_pages=1, + content_debug=False, + device="cuda", + ocr_profile="plain_ocr", + prompt_override="custom prompt", + attn_backend="flash_attention_2", + base_size=768, + image_size=512, + crop_mode=True, + render_dpi=144, + max_new_tokens=1024, + repetition_penalty=1.05, + no_repeat_ngram_size=12, + runtime_backend="transformers", + vllm_batch_size=None, + gpu_memory_utilization=None, + disable_fp8_kv=False, + repair_mode=None, + ) + + assert "--ocr-profile" in cmd and "plain_ocr" in cmd + assert "--prompt-override" in cmd and "custom prompt" in cmd + assert "--attn-backend" in cmd and "flash_attention_2" in cmd + assert "--base-size" in cmd and "768" in cmd + assert "--image-size" in cmd and "512" in cmd + assert "--crop-mode" in cmd + assert "--render-dpi" in cmd and "144" in 
cmd + assert "--max-new-tokens" in cmd and "1024" in cmd + + +def test_deepseek_default_max_new_tokens_is_standardized(): + from glossapi.ocr.deepseek import runner + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import DEFAULT_MAX_NEW_TOKENS + + assert DEFAULT_MAX_NEW_TOKENS == 2048 + assert runner.DEFAULT_MAX_NEW_TOKENS == 2048 + + +def test_build_cli_command_includes_vllm_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=["a.pdf"], + page_ranges=None, + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run_vllm.py", + max_pages=1, + content_debug=False, + device="cuda", + ocr_profile="markdown_grounded", + prompt_override=None, + attn_backend="auto", + base_size=None, + image_size=None, + crop_mode=None, + render_dpi=110, + max_new_tokens=768, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend="vllm", + vllm_batch_size=16, + gpu_memory_utilization=0.92, + disable_fp8_kv=True, + repair_mode="auto", + ) + + assert "--batch-size" in cmd and "16" in cmd + assert "--gpu-memory-utilization" in cmd and "0.92" in cmd + assert "--disable-fp8-kv" in cmd + assert "--repair-mode" in cmd and "auto" in cmd + + +def test_build_cli_command_includes_page_ranges(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=[], + page_ranges=["a.pdf:1:64", "b.pdf:65:128"], + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run_vllm.py", + max_pages=None, + content_debug=False, + device="cuda", + ocr_profile="markdown_grounded", + prompt_override=None, + attn_backend="auto", + base_size=None, + image_size=None, + crop_mode=None, + render_dpi=144, + max_new_tokens=1024, + repetition_penalty=None, + no_repeat_ngram_size=None, + 
runtime_backend="vllm", + vllm_batch_size=32, + gpu_memory_utilization=0.9, + disable_fp8_kv=False, + repair_mode="auto", + ) + + assert "--page-ranges" in cmd + assert "a.pdf:1:64" in cmd + assert "b.pdf:65:128" in cmd + + +def test_vllm_empty_page_detector_is_conservative(): + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _is_effectively_empty_page + + empty_page = { + "top_dark_ratio": 0.0004, + "bottom_dark_ratio": 0.0006, + "top_third_dark_ratio": 0.0002, + "middle_third_dark_ratio": 0.0005, + "bottom_third_dark_ratio": 0.0007, + "overall_dark_ratio": 0.0008, + } + non_empty_sparse_page = { + "top_dark_ratio": 0.003, + "bottom_dark_ratio": 0.004, + "top_third_dark_ratio": 0.0028, + "middle_third_dark_ratio": 0.0031, + "bottom_third_dark_ratio": 0.0042, + "overall_dark_ratio": 0.0022, + } + assert _is_effectively_empty_page(empty_page, "auto") is True + assert _is_effectively_empty_page(non_empty_sparse_page, "auto") is False + assert _is_effectively_empty_page(empty_page, "off") is False + + +def test_early_stop_detects_symbol_and_numeric_list_garbage(): + from glossapi.ocr.utils.cleaning import detect_early_stop_index + + symbol_garbage = "Κανονικό κείμενο\n" + (" " * 20) + numeric_list_garbage = "Πρόλογος\n" + " ".join(f"{idx}." 
for idx in range(1, 20)) + + symbol_cut = detect_early_stop_index(symbol_garbage) + numeric_cut = detect_early_stop_index(numeric_list_garbage) + + assert symbol_cut is not None + assert "Κανονικό κείμενο" in symbol_garbage[:symbol_cut] + assert numeric_cut is not None + assert "Πρόλογος" in numeric_list_garbage[:numeric_cut] + + +def test_runner_selects_vllm_script_when_requested(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_cli(input_dir, output_dir, **kwargs): + calls["script"] = kwargs["script"] + calls["runtime_backend"] = kwargs["runtime_backend"] + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files(corpus, ["doc.pdf"], runtime_backend="vllm") + + assert calls["runtime_backend"] == "vllm" + assert Path(calls["script"]).name == "run_pdf_ocr_vllm.py" + assert result["doc"]["page_count"] == 1 + + +def test_runner_forwards_scheduler_controls_to_multi_cli(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_multi_cli(**kwargs): + calls.update(kwargs) + md_dir = kwargs["out_root"] / "markdown" + metrics_dir = kwargs["out_root"] / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", 
encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_multi_cli", fake_run_multi_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files( + corpus, + ["doc.pdf"], + runtime_backend="vllm", + use_gpus="multi", + devices=[0, 1], + scheduler="exact_fill", + target_batch_pages=196, + shard_pages=64, + shard_threshold_pages=256, + ) + + assert calls["scheduler"] == "exact_fill" + assert calls["target_batch_pages"] == 196 + assert calls["shard_pages"] == 64 + assert calls["shard_threshold_pages"] == 256 + assert result["doc"]["page_count"] == 1 diff --git a/tests/test_deepseek_runner_stub.py b/tests/test_deepseek_runner_stub.py deleted file mode 100644 index aee5177..0000000 --- a/tests/test_deepseek_runner_stub.py +++ /dev/null @@ -1,59 +0,0 @@ -from pathlib import Path - -import pandas as pd - - -def _mk_corpus(tmp_path: Path): - from glossapi import Corpus - - root = tmp_path / "corpus" - root.mkdir() - return Corpus(input_dir=root, output_dir=root) - - -def test_deepseek_backend_stub_runs_and_updates_parquet(tmp_path, monkeypatch): - corpus = _mk_corpus(tmp_path) - - # Seed a minimal metadata parquet with one bad file - dl_dir = corpus.output_dir / "download_results" - dl_dir.mkdir(parents=True, exist_ok=True) - fname = "doc.pdf" - df = pd.DataFrame( - [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] - ) - parquet_path = dl_dir / "download_results.parquet" - df.to_parquet(parquet_path, index=False) - - # Create an empty placeholder file for the PDF - (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") - - # Monkeypatch the runner internal to avoid heavy imports - from glossapi.ocr.deepseek import runner - - def fake_run_one(pdf_path, md_out, metrics_out, cfg): - md_out.parent.mkdir(parents=True, exist_ok=True) - 
metrics_out.parent.mkdir(parents=True, exist_ok=True) - md_out.write_text("deepseek stub output\n", encoding="utf-8") - metrics_out.write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") - return {"page_count": 1} - - monkeypatch.setattr(runner, "_run_one_pdf", fake_run_one) - - # Run OCR via dispatcher - corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) - - # Artifacts exist - stem = "doc" - md = corpus.output_dir / "markdown" / f"{stem}.md" - metrics = corpus.output_dir / "json" / "metrics" / f"{stem}.metrics.json" - assert md.exists(), "Markdown output should be created by deepseek stub" - assert metrics.exists(), "Metrics JSON should be created by deepseek stub" - - # Parquet updated - updated = pd.read_parquet(parquet_path).set_index("filename") - row = updated.loc[fname] - assert bool(row["ocr_success"]) is True - assert bool(row["needs_ocr"]) is False - # extraction_mode is optional; if present assert value - if "extraction_mode" in updated.columns: - assert updated.loc[fname, "extraction_mode"] == "deepseek" diff --git a/tests/test_deepseek_scheduling.py b/tests/test_deepseek_scheduling.py new file mode 100644 index 0000000..25983a8 --- /dev/null +++ b/tests/test_deepseek_scheduling.py @@ -0,0 +1,238 @@ +from pathlib import Path + + +def _touch_files(root: Path, names: list[str]) -> None: + root.mkdir(parents=True, exist_ok=True) + for name in names: + (root / name).write_bytes(b"%PDF-1.4\n%stub\n") + + +def test_plan_lanes_balances_weighted_docs_greedily(monkeypatch, tmp_path): + from glossapi.ocr.deepseek import runner + + weights = { + "huge.pdf": 500, + "mid_a.pdf": 300, + "mid_b.pdf": 300, + "small_a.pdf": 200, + "tiny_a.pdf": 100, + "tiny_b.pdf": 100, + } + _touch_files(tmp_path, list(weights)) + + monkeypatch.setattr(runner, "_page_count", lambda path: weights[path.name]) + lanes = runner._plan_lanes( + file_list=["tiny_b.pdf", "mid_a.pdf", "huge.pdf", "small_a.pdf", "tiny_a.pdf", "mid_b.pdf"], + input_root=tmp_path, + 
lane_devices=[0, 1, 2], + workers_per_gpu=1, + max_pages=None, + ) + + assert [int(lane["weight"]) for lane in lanes] == [500, 500, 500] + assigned = [name for lane in lanes for name in lane["files"]] + assert sorted(assigned) == sorted(weights) + assert len(assigned) == len(set(assigned)) + + +def test_auto_vllm_batch_size_caps_total_pages(monkeypatch, tmp_path): + from glossapi.ocr.deepseek import runner + + weights = { + "a.pdf": 90, + "b.pdf": 120, + "c.pdf": 400, + } + _touch_files(tmp_path, list(weights)) + monkeypatch.setattr(runner, "_page_count", lambda path: weights[path.name]) + + capped = runner._auto_vllm_batch_size( + runtime_backend="vllm", + file_list=list(weights), + input_root=tmp_path, + max_pages=None, + ) + reduced = runner._auto_vllm_batch_size( + runtime_backend="vllm", + file_list=list(weights), + input_root=tmp_path, + max_pages=20, + ) + + assert capped == 160 + assert reduced == 60 + + +def test_auto_scheduler_prefers_exact_fill_for_multi_gpu_vllm(): + from glossapi.ocr.deepseek import runner + + assert runner._resolve_scheduler( + scheduler="auto", + runtime_backend="vllm", + lane_devices=[0, 1], + workers_per_gpu=1, + ) == "exact_fill" + assert runner._resolve_scheduler( + scheduler="auto", + runtime_backend="transformers", + lane_devices=[0, 1], + workers_per_gpu=1, + ) == "whole_doc" + + +def test_fixed_shard_builder_only_splits_large_docs(): + from glossapi.ocr.deepseek.scheduling import SourceDocument, build_fixed_shard_slices + + documents = [ + SourceDocument(name="huge.pdf", pages=310), + SourceDocument(name="mid.pdf", pages=120), + SourceDocument(name="small.pdf", pages=40), + ] + + slices = build_fixed_shard_slices(documents, shard_pages=128, shard_threshold_pages=200) + + assert [item.item_id for item in slices] == [ + "huge.pdf:1:128", + "huge.pdf:129:256", + "huge.pdf:257:310", + "mid.pdf", + "small.pdf", + ] + + +def test_exact_fill_batches_split_documents_to_fill_target(): + from glossapi.ocr.deepseek.scheduling import 
SourceDocument, build_exact_fill_batches + + documents = [ + SourceDocument(name="a.pdf", pages=200), + SourceDocument(name="b.pdf", pages=60), + SourceDocument(name="c.pdf", pages=60), + SourceDocument(name="d.pdf", pages=20), + ] + + batches = build_exact_fill_batches(documents, target_batch_pages=160) + + assert [batch.pages for batch in batches] == [160, 160, 20] + assert [item.item_id for item in batches[0].items] == ["a.pdf:1:160"] + assert set(item.item_id for item in batches[1].items) == {"a.pdf:161:200", "b.pdf", "c.pdf"} + assert [item.item_id for item in batches[2].items] == ["d.pdf"] + + +def test_assign_batches_to_lanes_balances_full_batches(): + from glossapi.ocr.deepseek.scheduling import ( + BatchPlan, + WorkSlice, + assign_batches_to_lanes, + ) + + batches = [ + BatchPlan(batch_id=0, items=[WorkSlice("a.pdf", 160, 1, 160)]), + BatchPlan(batch_id=1, items=[WorkSlice("b.pdf", 160, 1, 160)]), + BatchPlan(batch_id=2, items=[WorkSlice("c.pdf", 160, 1, 160)]), + BatchPlan(batch_id=3, items=[WorkSlice("d.pdf", 20, 1, 20)]), + ] + + lanes = assign_batches_to_lanes(batches, devices=[0, 1], workers_per_gpu=1) + + assert sorted(lane.assigned_pages for lane in lanes) == [180, 320] + assert [len(lane.batches) for lane in lanes] == [2, 2] + + +def test_benchmark_planner_exact_fill_mixes_ranges_and_whole_docs(): + from glossapi.ocr.deepseek.scheduling import SourceDocument + from glossapi.scripts.deepseek_pipeline_benchmark import _plan_lanes + + lanes = _plan_lanes( + documents=[ + SourceDocument(name="monster.pdf", pages=200), + SourceDocument(name="tiny.pdf", pages=20), + SourceDocument(name="mid.pdf", pages=60), + SourceDocument(name="mid2.pdf", pages=60), + ], + devices=[0, 1], + workers_per_gpu=1, + scheduler="exact_fill", + target_batch_pages=160, + shard_pages=0, + shard_threshold_pages=0, + ) + + all_ranges = [ + spec + for lane in lanes + for batch in lane["batches"] + for spec in batch.get("page_ranges", []) + ] + all_files = [ + name + for lane in 
lanes + for batch in lane["batches"] + for name in batch.get("files", []) + ] + assert "monster.pdf:1:160" in all_ranges + assert "monster.pdf:161:200" in all_ranges + assert sorted(all_files) == ["mid.pdf", "mid2.pdf", "tiny.pdf"] + + +def test_benchmark_planner_whole_doc_preserves_whole_files(): + from glossapi.ocr.deepseek.scheduling import SourceDocument + from glossapi.scripts.deepseek_pipeline_benchmark import _plan_lanes + + lanes = _plan_lanes( + documents=[ + SourceDocument(name="monster.pdf", pages=1085), + SourceDocument(name="a.pdf", pages=200), + SourceDocument(name="b.pdf", pages=200), + ], + devices=[0, 1], + workers_per_gpu=1, + scheduler="whole_doc", + target_batch_pages=160, + shard_pages=0, + shard_threshold_pages=0, + ) + + assigned = [name for lane in lanes for batch in lane["batches"] for name in batch["files"]] + assert sorted(assigned) == ["a.pdf", "b.pdf", "monster.pdf"] + + +def test_runner_lane_batches_exact_fill_split_large_docs(monkeypatch, tmp_path): + from glossapi.ocr.deepseek import runner + + weights = { + "monster.pdf": 200, + "mid.pdf": 60, + "mid2.pdf": 60, + "tiny.pdf": 20, + } + _touch_files(tmp_path, list(weights)) + monkeypatch.setattr(runner, "_page_count", lambda path: weights[path.name]) + + lanes = runner._plan_lane_batches( + file_list=list(weights), + input_root=tmp_path, + lane_devices=[0, 1], + workers_per_gpu=1, + max_pages=None, + runtime_backend="vllm", + scheduler="exact_fill", + target_batch_pages=160, + shard_pages=0, + shard_threshold_pages=0, + ) + + all_ranges = [ + spec + for lane in lanes + for batch in lane["batches"] + for spec in batch.get("page_ranges", []) + ] + all_files = [ + name + for lane in lanes + for batch in lane["batches"] + for name in batch.get("files", []) + ] + assert "monster.pdf:1:160" in all_ranges + assert "monster.pdf:161:200" in all_ranges + assert sorted(all_files) == ["mid.pdf", "mid2.pdf", "tiny.pdf"] diff --git a/tests/test_gloss_downloader_dynamic_html.py 
b/tests/test_gloss_downloader_dynamic_html.py new file mode 100644 index 0000000..a1bd678 --- /dev/null +++ b/tests/test_gloss_downloader_dynamic_html.py @@ -0,0 +1,53 @@ +from glossapi.gloss_downloader import GlossDownloader + + +def test_detects_waf_challenge_html(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + headers = { + "Content-Type": "text/html; charset=UTF-8", + "x-amzn-waf-action": "challenge", + } + body = b""" + + + """ + + assert downloader.infer_file_extension(url, headers, body) == "html" + error = downloader._detect_html_interstitial(url, headers, body) + + assert error is not None + assert "challenge page" in error.lower() + + +def test_detects_js_document_viewer_html(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://freader.ekt.gr/eadd/index.php?doc=60819&lang=el" + headers = { + "Content-Type": "text/html; charset=UTF-8", + } + body = b""" + + + + + """ + + assert downloader.infer_file_extension(url, headers, body) == "html" + error = downloader._detect_html_interstitial(url, headers, body) + + assert error is not None + assert "document viewer" in error.lower() + + +def test_regular_html_document_is_still_allowed(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://example.org/article" + headers = { + "Content-Type": "text/html; charset=UTF-8", + } + body = b"""Article +

Normal HTML document

Body text.

""" + + assert downloader.infer_file_extension(url, headers, body) == "html" + assert downloader._detect_html_interstitial(url, headers, body) is None diff --git a/tests/test_install_glossapi.py b/tests/test_install_glossapi.py new file mode 100644 index 0000000..5226429 --- /dev/null +++ b/tests/test_install_glossapi.py @@ -0,0 +1,51 @@ +from pathlib import Path + +from glossapi.scripts.install_glossapi import ( + build_deepseek_command, + build_install_plan, + build_pip_command, +) + + +def test_build_install_plan_collects_phase_extras(): + plan = build_install_plan( + phases=["download", "browser_download", "extract", "ocr"], + editable=True, + include_cuda=False, + ) + + assert plan.phases == ("download", "browser_download", "extract", "ocr") + assert set(plan.extras) == {"browser", "docling"} + assert plan.editable is True + assert plan.needs_deepseek_runtime is True + + +def test_build_install_plan_adds_cuda_extra(): + plan = build_install_plan( + phases=["download"], + editable=False, + include_cuda=True, + ) + + assert set(plan.extras) == {"cuda"} + assert plan.editable is False + assert plan.needs_deepseek_runtime is False + + +def test_build_pip_command_uses_editable_install(): + plan = build_install_plan( + phases=["download", "browser_download"], + editable=True, + include_cuda=False, + ) + command = build_pip_command(plan, Path("/tmp/repo")) + + assert command[:4] == [command[0], "-m", "pip", "install"] + assert "-e" in command + assert command[-1] == ".[browser]" + + +def test_build_deepseek_command_points_to_setup_script(): + command = build_deepseek_command(Path("/tmp/repo")) + + assert command is None or command[0] diff --git a/tests/test_jsonl_export.py b/tests/test_jsonl_export.py index e05caa0..aecd7a3 100644 --- a/tests/test_jsonl_export.py +++ b/tests/test_jsonl_export.py @@ -458,6 +458,39 @@ def test_jsonl_export_sharded(tmp_path): assert len(seen_doc_ids) == len(texts) +def test_jsonl_prefers_base_markdown_when_chunks_exist(tmp_path): + 
corpus = Corpus(input_dir=tmp_path / "in_chunks", output_dir=tmp_path / "out_chunks") + + base_text = "## Base Title\n\nMerged body from extraction." + base_path = corpus.cleaned_markdown_dir / "chunked.md" + base_path.parent.mkdir(parents=True, exist_ok=True) + base_path.write_text(base_text, encoding="utf-8") + + chunk_dir = corpus.cleaned_markdown_dir / "chunks" / "chunked" + chunk_dir.mkdir(parents=True, exist_ok=True) + (chunk_dir / "chunked__p0001-0002.md").write_text("chunk-one", encoding="utf-8") + (chunk_dir / "chunked__p0003-0004.md").write_text("chunk-two", encoding="utf-8") + + _write_download_results( + corpus.output_dir / "download_results" / "download_results.parquet", + [ + { + "filename": "chunked.pdf", + "filter": "ok", + "needs_ocr": False, + "is_empty": False, + "char_count_no_comments": 10, + } + ], + ) + + out_path = corpus.output_dir / "chunked.jsonl" + corpus.jsonl(out_path) + + record = json.loads(out_path.read_text(encoding="utf-8").strip()) + assert record["document"] == base_text + + @pytest.mark.skipif(not _HAS_DATASETS, reason="datasets package is not installed") def test_hf_streaming_loader_example(tmp_path): corpus = Corpus(input_dir=tmp_path / "in7", output_dir=tmp_path / "out7") @@ -531,5 +564,6 @@ def test_pyarrow_filter_example(tmp_path): table = dataset.to_table(filter=(ds.field("lang") == "el") & (ds.field("year") >= 2019)) assert set(table.column("doc_id").to_pylist()) == {"a"} + def _expected_doc_id(filename: str) -> str: return hashlib.sha256(filename.encode("utf-8")).hexdigest() diff --git a/tests/test_ocr_backends_smoke.py b/tests/test_ocr_backends_smoke.py index 0419ba4..6c410c5 100644 --- a/tests/test_ocr_backends_smoke.py +++ b/tests/test_ocr_backends_smoke.py @@ -11,7 +11,7 @@ def _mk_corpus(tmp_path: Path): return Corpus(input_dir=root, output_dir=root) -def test_cross_backend_smoke_with_stubs(tmp_path, monkeypatch): +def test_deepseek_ocr_then_math_only_smoke(tmp_path, monkeypatch): corpus = _mk_corpus(tmp_path) # 
Two PDFs: one needs OCR, one does not (for math-only later) @@ -28,7 +28,7 @@ def test_cross_backend_smoke_with_stubs(tmp_path, monkeypatch): parquet_path = dl_dir / "download_results.parquet" df.to_parquet(parquet_path, index=False) - # DeepSeek stub for OCR + # DeepSeek runner is stubbed here only to avoid the heavy model during unit tests. from glossapi.ocr.deepseek import runner def fake_run_for_files(self_ref, files, **kwargs): @@ -45,7 +45,7 @@ def fake_run_for_files(self_ref, files, **kwargs): # Run DeepSeek OCR for bad files corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=True, mode="ocr_bad_then_math") - # RapidOCR math-only pass: ensure JSON for clean.pdf and run math + # Math-only pass: ensure JSON for clean.pdf and run math json_dir = corpus.output_dir / "json" json_dir.mkdir(parents=True, exist_ok=True) (json_dir / "clean.docling.json").write_text("{}", encoding="utf-8") @@ -58,7 +58,7 @@ def fake_enrich(files=None, **kwargs): monkeypatch.setattr(corpus, "formula_enrich_from_json", fake_enrich) - corpus.ocr(backend="rapidocr", fix_bad=False, math_enhance=True, mode="math_only") + corpus.ocr(backend="deepseek", fix_bad=False, math_enhance=True, mode="math_only") # Verify updated = pd.read_parquet(parquet_path).set_index("filename") diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 965692b..e2198b7 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -51,29 +51,7 @@ def fail_math(*args, **kwargs): assert calls.get("files") == [fname] -def test_rapidocr_backend_routes_to_extract_with_docling(tmp_path, monkeypatch): +def test_invalid_backend_is_rejected(tmp_path): corpus = _mk_corpus(tmp_path) - - # Seed minimal metadata parquet that flags a single file for OCR - dl_dir = corpus.output_dir / "download_results" - dl_dir.mkdir(parents=True, exist_ok=True) - df = pd.DataFrame([ - {"filename": "doc.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False} 
- ]) - df.to_parquet(dl_dir / "download_results.parquet", index=False) - - captured = {} - - def fake_extract(**kwargs): - captured.update(kwargs) - return None - - monkeypatch.setattr(corpus, "extract", fake_extract) - - corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False, use_gpus="single", devices=[0]) - - assert captured, "Expected extract() to be called for rapidocr backend" - assert captured.get("force_ocr") is True - assert captured.get("phase1_backend") == "docling" - files = captured.get("filenames") or [] - assert files and files[0] == "doc.pdf" + with pytest.raises(ValueError, match="backend must be 'deepseek'"): + corpus.ocr(backend="bogus", fix_bad=True, math_enhance=False) diff --git a/tests/test_ocr_imports.py b/tests/test_ocr_imports.py index 3487619..094e72b 100644 --- a/tests/test_ocr_imports.py +++ b/tests/test_ocr_imports.py @@ -8,32 +8,19 @@ def test_import_ocr_package_is_lightweight(): import glossapi.ocr as ocr assert hasattr(ocr, "deepseek") - assert hasattr(ocr, "rapidocr") # New subpackages remain importable lazily import glossapi.ocr.deepseek.runner as deepseek_runner - import glossapi.ocr.rapidocr.dispatch as rapid_dispatch assert ocr.deepseek.runner is deepseek_runner - assert ocr.rapidocr.dispatch is rapid_dispatch assert ocr.deepseek_runner is deepseek_runner - assert ocr.rapidocr_dispatch is rapid_dispatch assert hasattr(deepseek_runner, "run_for_files") - assert hasattr(rapid_dispatch, "run_via_extract") # Utilities module always available (pure Python) from glossapi.ocr.utils import json_io as utils_json assert hasattr(utils_json, "export_docling_json") - if importlib.util.find_spec("docling") is not None: - try: - from glossapi.ocr.rapidocr import pool as rapid_pool - except ModuleNotFoundError: - pytest.skip("Docling optional dependencies not available") - else: - assert hasattr(rapid_pool, "GLOBAL_RAPID_OCR_POOL") - if importlib.util.find_spec("docling_core") is not None: try: from glossapi.ocr.math import 
enrich_from_docling_json, RoiEntry diff --git a/tests/test_openarchives_download_freeze.py b/tests/test_openarchives_download_freeze.py new file mode 100644 index 0000000..6420372 --- /dev/null +++ b/tests/test_openarchives_download_freeze.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +from glossapi.scripts.openarchives_download_freeze import main + + +def test_download_freeze_dry_run_materializes_manifest(tmp_path: Path) -> None: + src = tmp_path / "input.parquet" + pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "needs_ocr": True, + } + ] + ).to_parquet(src, index=False) + + work_root = tmp_path / "work" + rc = main(["--input-parquet", str(src), "--work-root", str(work_root), "--dry-run"]) + assert rc == 0 + assert (work_root / "manifests" / "download_input.parquet").exists() + assert (work_root / "download_results" / "download_results.parquet").exists() diff --git a/tests/test_openarchives_download_probe.py b/tests/test_openarchives_download_probe.py new file mode 100644 index 0000000..0213438 --- /dev/null +++ b/tests/test_openarchives_download_probe.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +from glossapi.scripts.openarchives_download_probe import _prepare_probe_frame + + +def test_prepare_probe_frame_limits_per_host_and_adds_runtime_columns() -> None: + df = pd.DataFrame( + [ + {"filename": "a.pdf", "pdf_url": "https://ikee.lib.auth.gr/file/a.pdf"}, + {"filename": "b.pdf", "pdf_url": "https://ikee.lib.auth.gr/file/b.pdf"}, + {"filename": "c.pdf", "pdf_url": "https://ikee.lib.auth.gr/file/c.pdf"}, + {"filename": "d.pdf", "pdf_url": "https://dspace.lib.ntua.gr/file/d.pdf"}, + {"filename": "e.pdf", "pdf_url": "https://dspace.lib.ntua.gr/file/e.pdf"}, + ] + ) + + out = _prepare_probe_frame( + df, + samples_per_host=2, + max_hosts=2, + seed=7, + ) + + counts = 
out.groupby("host").size().to_dict() + assert counts["ikee.lib.auth.gr"] == 2 + assert counts["dspace.lib.ntua.gr"] == 2 + assert set(out["url"]) <= set(df["pdf_url"]) + assert set(out["base_domain"]) == {"https://ikee.lib.auth.gr", "https://dspace.lib.ntua.gr"} diff --git a/tests/test_openarchives_hf_refresh.py b/tests/test_openarchives_hf_refresh.py new file mode 100644 index 0000000..81f015e --- /dev/null +++ b/tests/test_openarchives_hf_refresh.py @@ -0,0 +1,160 @@ +from __future__ import annotations + +import io +import json +from pathlib import Path + +import pandas as pd +import zstandard as zstd + +from glossapi.scripts.openarchives_hf_refresh import main + + +def _write_jsonl_zst(path: Path, rows: list[dict]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + cctx = zstd.ZstdCompressor(level=3) + with path.open("wb") as fh: + with cctx.stream_writer(fh) as writer: + for row in rows: + writer.write((json.dumps(row, ensure_ascii=False) + "\n").encode("utf-8")) + + +def _read_jsonl_zst(path: Path) -> list[dict]: + dctx = zstd.ZstdDecompressor() + with path.open("rb") as fh, dctx.stream_reader(fh) as reader: + text = io.TextIOWrapper(reader, encoding="utf-8").read() + return [json.loads(line) for line in text.splitlines() if line.strip()] + + +def test_openarchives_hf_refresh_updates_pipeline_metadata_and_readme(tmp_path: Path) -> None: + dataset_root = tmp_path / "openarchives.gr" + shard_path = dataset_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst" + _write_jsonl_zst( + shard_path, + [ + { + "doc_id": "doc-a", + "filename": "AAA_000", + "text": "alpha", + "source_metadata": {"filename": "AAA_000.pdf"}, + "pipeline_metadata": {"needs_ocr": False, "greek_badness_score": 1.0}, + }, + { + "doc_id": "doc-b", + "filename": "BBB_000", + "text": "beta", + "source_metadata": {"filename": "BBB_000.pdf"}, + "pipeline_metadata": {"needs_ocr": False, "greek_badness_score": 2.0}, + }, + ], + ) + (dataset_root / "README.md").write_text( + 
"---\npretty_name: OpenArchives.gr 191,000 docs\n---\n\n# OpenArchives.gr 191,000 docs\n\n" + "- Σύνολο markdown αρχείων: **191,301** from openarchives.gr\n" + "- Τα χαμηλής ποιότητας αρχεία που ενδέχεται να χρειάζονται OCR επεξεργασία επισημαίνονται με τη στήλη `needs_ocr`: **23,083 / 191,301 (12.07%)**\n" + "- Total markdown files: **191,301** from openarchives.gr\n" + "- Lower-quality files that may require OCR reprocessing are marked by the `needs_ocr` indicator: **23,083 / 191,301 (12.07%)**\n", + encoding="utf-8", + ) + + metadata = tmp_path / "filled_document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-a", + "source_jsonl": str(shard_path), + "needs_ocr": True, + "ocr_success": False, + "greek_badness_score": 72.0, + "mojibake_badness_score": 0.2, + "latin_percentage": 33.3, + "polytonic_ratio": 0.0, + "char_count_no_comments": 1234.0, + "is_empty": False, + "filter": "ok", + "quality_method": "refresh", + "reevaluated_at": "2026-03-31T12:00:00+00:00", + }, + { + "source_doc_id": "doc-b", + "source_jsonl": str(shard_path), + "needs_ocr": False, + "ocr_success": False, + "greek_badness_score": 2.0, + "mojibake_badness_score": 0.0, + "latin_percentage": 22.0, + "polytonic_ratio": 0.0, + "char_count_no_comments": 456.0, + "is_empty": True, + "filter": "empty_text==0", + "quality_method": "refresh", + "reevaluated_at": "2026-03-31T12:00:00+00:00", + }, + ] + ).to_parquet(metadata, index=False) + + out_root = tmp_path / "out" + rc = main( + [ + "--dataset-root", + str(dataset_root), + "--metadata-parquet", + str(metadata), + "--output-root", + str(out_root), + ] + ) + assert rc == 0 + + rows = _read_jsonl_zst(out_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst") + assert rows[0]["pipeline_metadata"]["needs_ocr"] is True + assert rows[0]["pipeline_metadata"]["greek_badness_score"] == 72.0 + assert rows[1]["pipeline_metadata"]["is_empty"] is True + assert rows[1]["pipeline_metadata"]["filter"] == "empty_text==0" + + readme = 
(out_root / "README.md").read_text(encoding="utf-8") + assert "OpenArchives.gr 2 docs" in readme + assert "**1 / 2 (50.00%)**" in readme + + +def test_openarchives_hf_refresh_dry_run_does_not_write_outputs(tmp_path: Path) -> None: + dataset_root = tmp_path / "openarchives.gr" + shard_path = dataset_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst" + _write_jsonl_zst( + shard_path, + [ + { + "doc_id": "doc-a", + "filename": "AAA_000", + "text": "alpha", + "source_metadata": {}, + "pipeline_metadata": {"needs_ocr": False}, + } + ], + ) + (dataset_root / "README.md").write_text("# OpenArchives.gr 191,000 docs\n", encoding="utf-8") + metadata = tmp_path / "filled_document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-a", + "source_jsonl": str(shard_path), + "needs_ocr": True, + } + ] + ).to_parquet(metadata, index=False) + + out_root = tmp_path / "out" + rc = main( + [ + "--dataset-root", + str(dataset_root), + "--metadata-parquet", + str(metadata), + "--output-root", + str(out_root), + "--dry-run", + ] + ) + assert rc == 0 + assert not (out_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst").exists() diff --git a/tests/test_openarchives_ocr_enrich.py b/tests/test_openarchives_ocr_enrich.py new file mode 100644 index 0000000..16d683a --- /dev/null +++ b/tests/test_openarchives_ocr_enrich.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import io +import json +from pathlib import Path + +import pandas as pd +import zstandard as zstd + +from glossapi.scripts.openarchives_ocr_enrich import main + + +def _write_jsonl_zst(path: Path, rows: list[dict]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = "\n".join(json.dumps(row, ensure_ascii=False) for row in rows).encode("utf-8") + cctx = zstd.ZstdCompressor() + path.write_bytes(cctx.compress(payload)) + + +def test_openarchives_ocr_enrich_extracts_page_counts_and_pdf_url(tmp_path): + raw_root = tmp_path / "raw" / "openarchives.gr" + 
jsonl_path = raw_root / "data" / "openarchives" / "shard_01" / "chunk-000.jsonl.zst" + _write_jsonl_zst( + jsonl_path, + [ + { + "doc_id": "doc-a", + "filename": "AAA_000", + "text": "alpha", + "pipeline_metadata": {"page_count": 98, "pages_total": 98}, + "source_metadata": { + "pdf_links_json": "https://example.com/a.pdf", + "collection_slug": "Dione", + "language_code": "el", + }, + }, + { + "doc_id": "doc-b", + "filename": "BBB_000", + "text": "beta", + "pipeline_metadata": {"pages_total": 12}, + "source_metadata": { + "pdf_links_json": json.dumps( + [ + {"url": "https://example.com/b.pdf"}, + {"url": "https://example.com/b2.pdf"}, + ] + ), + "collection_slug": "Pandemos", + "language_code": "el", + }, + }, + ], + ) + + parquet = tmp_path / "document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-a", + "filename": "AAA_000.pdf", + "source_jsonl": str(jsonl_path), + "needs_ocr": True, + }, + { + "source_doc_id": "doc-b", + "filename": "BBB_000.pdf", + "source_jsonl": str(jsonl_path), + "needs_ocr": True, + }, + { + "source_doc_id": "doc-c", + "filename": "CCC_000.pdf", + "source_jsonl": str(jsonl_path), + "needs_ocr": False, + }, + ] + ).to_parquet(parquet, index=False) + + output = tmp_path / "enriched.parquet" + rc = main( + [ + "--parquet", + str(parquet), + "--raw-repo-root", + str(raw_root), + "--output-parquet", + str(output), + ] + ) + assert rc == 0 + + enriched = pd.read_parquet(output).sort_values("filename").reset_index(drop=True) + assert enriched["filename"].tolist() == ["AAA_000.pdf", "BBB_000.pdf"] + assert enriched["page_count_source"].tolist() == [98, 12] + assert enriched["pages_total_source"].tolist() == [98, 12] + assert enriched["pdf_url"].tolist() == ["https://example.com/a.pdf", "https://example.com/b.pdf"] + assert enriched["source_collection_slug"].tolist() == ["Dione", "Pandemos"] + + +def test_openarchives_ocr_enrich_resolves_rewritten_source_jsonl_path(tmp_path): + raw_root = tmp_path / "raw" / "openarchives.gr" + 
jsonl_path = raw_root / "data" / "openarchives" / "shard_02" / "chunk-001.jsonl.zst" + _write_jsonl_zst( + jsonl_path, + [ + { + "doc_id": "doc-x", + "filename": "XXX_000", + "text": "x", + "pipeline_metadata": {"page_count": 7}, + "source_metadata": {"external_link": "https://example.com/x"}, + } + ], + ) + + parquet = tmp_path / "document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-x", + "filename": "XXX_000.pdf", + "source_jsonl": "/home/foivos/data/glossapi_raw/hf/openarchives.gr/data/openarchives/shard_02/chunk-001.jsonl.zst", + "needs_ocr": True, + } + ] + ).to_parquet(parquet, index=False) + + output = tmp_path / "enriched.parquet" + rc = main( + [ + "--parquet", + str(parquet), + "--raw-repo-root", + str(raw_root), + "--output-parquet", + str(output), + ] + ) + assert rc == 0 + + enriched = pd.read_parquet(output) + assert int(enriched.loc[0, "page_count_source"]) == 7 + assert enriched.loc[0, "pdf_url"] == "https://example.com/x" diff --git a/tests/test_openarchives_ocr_run_node.py b/tests/test_openarchives_ocr_run_node.py new file mode 100644 index 0000000..0b66d52 --- /dev/null +++ b/tests/test_openarchives_ocr_run_node.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +from glossapi.scripts.openarchives_ocr_run_node import ( + _normalize_download_results, + _prepare_download_input, +) + + +def test_prepare_download_input_adds_url_and_filename_base() -> None: + df = pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "needs_ocr": True, + } + ] + ) + out = _prepare_download_input(df) + assert out.loc[0, "url"] == "https://example.com/a.pdf" + assert out.loc[0, "filename_base"] == "ABC_001" + + +def test_normalize_download_results_preserves_shard_filename_and_metadata() -> None: + shard = pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "filename_base": "ABC_001", + "needs_ocr": True, + 
"source_doc_id": "doc-1", + } + ] + ) + dl = pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "filename_base": "ABC_001", + "download_success": True, + "download_error": "", + "url": "https://example.com/a.pdf", + } + ] + ) + out = _normalize_download_results(shard_df=shard, download_results_df=dl) + assert out.loc[0, "filename"] == "ABC_001.pdf" + assert out.loc[0, "source_doc_id"] == "doc-1" + assert bool(out.loc[0, "download_success"]) is True + assert bool(out.loc[0, "needs_ocr"]) is True diff --git a/tests/test_openarchives_ocr_shards.py b/tests/test_openarchives_ocr_shards.py new file mode 100644 index 0000000..314b785 --- /dev/null +++ b/tests/test_openarchives_ocr_shards.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +import json +from pathlib import Path + +import pandas as pd + +from glossapi.scripts import openarchives_ocr_merge, openarchives_ocr_shards + + +def test_openarchives_ocr_shards_balances_pages(tmp_path: Path) -> None: + df = pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": True, "pages_total": 100}, + {"filename": "b.pdf", "needs_ocr": True, "pages_total": 90}, + {"filename": "c.pdf", "needs_ocr": True, "pages_total": 40}, + {"filename": "d.pdf", "needs_ocr": True, "pages_total": 30}, + {"filename": "skip.pdf", "needs_ocr": False, "pages_total": 999}, + ] + ) + source = tmp_path / "download_results.parquet" + out_dir = tmp_path / "shards" + df.to_parquet(source, index=False) + + rc = openarchives_ocr_shards.main( + [ + "--parquet", + str(source), + "--output-dir", + str(out_dir), + "--nodes", + "2", + ] + ) + assert rc == 0 + + summary = json.loads((out_dir / "openarchives_ocr_shard_summary.json").read_text()) + assert summary["docs_total"] == 4 + assert summary["pages_total"] == 260 + manifests = sorted(out_dir.glob("openarchives_ocr_shard_node_*.parquet")) + assert len(manifests) == 2 + page_totals = [int(pd.read_parquet(path)["pages_total"].sum()) for path in manifests] + assert max(page_totals) - min(page_totals) <= 
20 + + +def test_openarchives_ocr_merge_updates_master(tmp_path: Path) -> None: + master = pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": True, "ocr_success": False}, + {"filename": "b.pdf", "needs_ocr": True, "ocr_success": False}, + ] + ) + shard = pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": False, "ocr_success": True, "ocr_node_id": 2}, + ] + ) + master_path = tmp_path / "master.parquet" + shard_path = tmp_path / "shard.parquet" + out_path = tmp_path / "merged.parquet" + master.to_parquet(master_path, index=False) + shard.to_parquet(shard_path, index=False) + + rc = openarchives_ocr_merge.main( + [ + "--master-parquet", + str(master_path), + "--shard-parquets", + str(shard_path), + "--output-parquet", + str(out_path), + ] + ) + assert rc == 0 + + merged = pd.read_parquet(out_path).set_index("filename") + assert bool(merged.loc["a.pdf", "ocr_success"]) is True + assert bool(merged.loc["a.pdf", "needs_ocr"]) is False + assert int(merged.loc["a.pdf", "ocr_node_id"]) == 2 + assert bool(merged.loc["b.pdf", "ocr_success"]) is False diff --git a/tests/test_openarchives_pdf_stage_pull.py b/tests/test_openarchives_pdf_stage_pull.py new file mode 100644 index 0000000..f115370 --- /dev/null +++ b/tests/test_openarchives_pdf_stage_pull.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from pathlib import Path + +from glossapi.scripts.openarchives_pdf_stage_pull import TransferItem, TransferState, read_manifest + + +def _write_manifest(path: Path) -> None: + path.write_text( + "\t".join(["canonical_filename", "remote_path", "remote_size_bytes", "remote_name"]) + + "\n" + + "\t".join(["AAA_456.pdf", "/remote/AAA_456.pdf", "10", "AAA_456.pdf"]) + + "\n" + + "\t".join(["VFK_368.pdf", "/remote/VFK_368.pdf.Ac6Dc3BA", "20", "VFK_368.pdf.Ac6Dc3BA"]) + + "\n", + encoding="utf-8", + ) + + +def test_read_manifest_parses_rows(tmp_path: Path) -> None: + manifest = tmp_path / "manifest.tsv" + _write_manifest(manifest) + + items = read_manifest(manifest) + + 
assert items == [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("VFK_368.pdf", "/remote/VFK_368.pdf.Ac6Dc3BA", 20, "VFK_368.pdf.Ac6Dc3BA"), + ] + + +def test_transfer_state_resets_stale_and_marks_completed(tmp_path: Path) -> None: + db_path = tmp_path / "state.sqlite3" + downloads = tmp_path / "downloads" + partials = tmp_path / "partials" + downloads.mkdir() + partials.mkdir() + state = TransferState(db_path) + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + ] + ) + + state.mark_in_progress("AAA_456.pdf", 5) + (downloads / "BBB_001.pdf").write_bytes(b"x" * 12) + + state.reset_stale_in_progress() + state.mark_completed_if_present(downloads, partials) + + cur = state.conn.execute( + "SELECT canonical_filename, status, last_seen_size_bytes, last_error FROM transfer_items ORDER BY canonical_filename" + ) + rows = cur.fetchall() + assert rows[0][0] == "AAA_456.pdf" + assert rows[0][1] == "pending" + assert "Recovered from interrupted transfer" in rows[0][3] + assert rows[1][0] == "BBB_001.pdf" + assert rows[1][1] == "completed" + assert rows[1][2] == 12 + + counts = state.counts() + assert counts["pending"] == 1 + assert counts["completed"] == 1 + state.close() + + +def test_transfer_state_next_item_respects_attempt_limit(tmp_path: Path) -> None: + state = TransferState(tmp_path / "state.sqlite3") + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + ] + ) + state.conn.execute( + "UPDATE transfer_items SET status='failed', attempts=25 WHERE canonical_filename='AAA_456.pdf'" + ) + state.conn.execute( + "UPDATE transfer_items SET status='failed', attempts=2 WHERE canonical_filename='BBB_001.pdf'" + ) + state.conn.commit() + + row = state.next_item(max_attempts=20) + + assert row is not 
None + assert row["canonical_filename"] == "BBB_001.pdf" + state.close() diff --git a/tests/test_pipeline_smoke.py b/tests/test_pipeline_smoke.py index 4fe7464..f673a83 100644 --- a/tests/test_pipeline_smoke.py +++ b/tests/test_pipeline_smoke.py @@ -1,4 +1,5 @@ import os +import sys from pathlib import Path import pandas as pd @@ -7,10 +8,6 @@ pytest.importorskip("docling") pytest.importorskip("glossapi_rs_cleaner") -pytest.importorskip( - "onnxruntime", reason="RapidOCR/DeepSeek end-to-end tests require onnxruntime" -) -import onnxruntime as ort # noqa: E402 from glossapi import Corpus from glossapi.corpus import _resolve_skiplist_path @@ -106,11 +103,8 @@ def _assert_dir_contents( pytest.fail(f"Unexpected file {entry} in {root}") -@pytest.mark.rapidocr -def test_pipeline_smoke_and_artifacts(tmp_path): +def test_pipeline_smoke_and_artifacts(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for pipeline smoke test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -132,7 +126,6 @@ def test_pipeline_smoke_and_artifacts(tmp_path): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -146,6 +139,21 @@ def test_pipeline_smoke_and_artifacts(tmp_path): assert bool(needs.get("blank.pdf")), "Blank PDF should be flagged for OCR" assert not bool(needs.get("text.pdf")) + from glossapi.ocr.deepseek import runner as deepseek_runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for name in files: + stem = Path(name).stem + (markdown_dir / f"{stem}.md").write_text("[[Blank page]]\n", encoding="utf-8") + (metrics_dir / 
f"{stem}.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {Path(name).stem: {"page_count": 1} for name in files} + + monkeypatch.setattr(deepseek_runner, "run_for_files", fake_run_for_files) + corpus.ocr( mode="ocr_bad", use_gpus="single", @@ -193,15 +201,8 @@ def test_pipeline_smoke_and_artifacts(tmp_path): assert sections_file.exists() -@pytest.mark.rapidocr def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for docling pipeline test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" - - assert torch.cuda.is_available(), "CUDA GPU expected for docling pipeline test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -242,7 +243,6 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -256,6 +256,25 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert bool(greek_row["needs_ocr"]), "Greek consonant doc should require OCR rerun" assert "non_greek_text" in str(greek_row.get("filter", "")), "Filter should record non-Greek text" + from glossapi.ocr.deepseek import runner as deepseek_runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for name in files: + stem = Path(name).stem + if stem == "greek_consonants": + text = documents["greek_consonants"] + else: + text = documents.get(stem) or "[[Blank page]]" + (markdown_dir / 
f"{stem}.md").write_text(f"{text}\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {Path(name).stem: {"page_count": 1} for name in files} + + monkeypatch.setattr(deepseek_runner, "run_for_files", fake_run_for_files) + corpus.ocr( fix_bad=True, math_enhance=True, @@ -268,6 +287,15 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert not bool(greek_after["needs_ocr"]), "Greek consonant doc should be resolved after OCR rerun" assert bool(greek_after.get("ocr_success", False)), "OCR rerun should mark greek consonant doc as success" + corpus.ocr( + backend="deepseek", + fix_bad=False, + math_enhance=True, + mode="math_only", + use_gpus="single", + devices=[device_idx], + ) + json_dir = corpus_dir / "json" assert json_dir.exists(), "Docling JSON directory should exist after extraction" for stem in documents: @@ -304,11 +332,8 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert not skiplist_path.read_text(encoding="utf-8").strip(), "Fatal skip-list should remain empty" -@pytest.mark.rapidocr def test_clean_skips_files_with_successful_ocr(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for OCR recovery test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -351,7 +376,6 @@ def test_clean_skips_files_with_successful_ocr(tmp_path, monkeypatch): accel_type="CUDA", num_threads=1, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -384,8 +408,8 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): script = Path( os.environ.get( - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", - Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py", + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + Path.cwd() / "src" / "glossapi" / "ocr" / "deepseek" / 
"run_pdf_ocr_transformers.py", ) ) if not script.exists(): @@ -393,8 +417,8 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): python_bin = Path( os.environ.get( - "GLOSSAPI_DEEPSEEK_TEST_PYTHON", - Path("/mnt/data/glossAPI/deepseek_venv/bin/python"), + "GLOSSAPI_DEEPSEEK_PYTHON", + os.environ.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON", sys.executable), ) ) if not python_bin.exists(): @@ -409,29 +433,17 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): if not model_dir.exists(): pytest.skip(f"DeepSeek model directory missing: {model_dir}") - lib_path = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") - if not lib_path: - candidate = Path.cwd() / "deepseek-ocr" / "libjpeg-turbo" / "lib" - if candidate.exists(): - lib_path = str(candidate) - if not lib_path or not Path(lib_path).exists(): - pytest.skip("Set GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH to the libjpeg-turbo library directory") - - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" - device_idx = 0 if torch.cuda.device_count() > 1: device_idx = torch.cuda.current_device() - # Force the CLI path (no stub fallback) and point to the desired interpreter/script. + # Force the real runner path and point to the desired interpreter/script. 
monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", str(python_bin)) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", str(script)) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH", lib_path) - monkeypatch.setenv("VLLM_ALLOW_REMOTE_CODE", "1") + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", str(script)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir)) existing_py_path = os.environ.get("PYTHONPATH", "") src_path = str(Path.cwd() / "src") if existing_py_path: @@ -439,13 +451,6 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): else: monkeypatch.setenv("PYTHONPATH", src_path) - import glossapi.ocr.deepseek.runner as deepseek_runner - - def _raise_if_stub(*_args, **_kwargs): - raise AssertionError("DeepSeek fallback stub should not run in CLI smoke test") - - monkeypatch.setattr(deepseek_runner, "_run_one_pdf", _raise_if_stub) - corpus_dir = tmp_path / "corpus" corpus_dir.mkdir() @@ -461,7 +466,6 @@ def _raise_if_stub(*_args, **_kwargs): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) diff --git a/tests/test_rapidocr_patch.py b/tests/test_rapidocr_patch.py deleted file mode 100644 index 93a8ca5..0000000 --- a/tests/test_rapidocr_patch.py +++ /dev/null @@ -1,368 +0,0 @@ -import importlib -import sys -import types -from pathlib import Path -from types import SimpleNamespace - -import numpy as np -import pytest - - -def _clear_modules(prefix: str) -> None: - for name in list(sys.modules): - if name == prefix or name.startswith(f"{prefix}."): - sys.modules.pop(name, None) - - -def _install_docling_stub(*, supports_injection: bool) -> None: - _clear_modules("docling") - _clear_modules("docling_core") - 
_clear_modules("glossapi") - - def register(name: str) -> types.ModuleType: - module = types.ModuleType(name) - sys.modules[name] = module - return module - - docling = register("docling") - register("docling.backend") - register("docling.backend.docling_parse_backend").DoclingParseDocumentBackend = object - register("docling.backend.docling_parse_v2_backend").DoclingParseV2DocumentBackend = object - register("docling.backend.pypdfium2_backend").PyPdfiumDocumentBackend = object - - base_models = register("docling.datamodel.base_models") - - class InputFormat: - PDF = "pdf" - DOCX = "docx" - XML_JATS = "xml" - HTML = "html" - PPTX = "pptx" - CSV = "csv" - MD = "md" - - class ConversionStatus: - SUCCESS = "success" - PARTIAL_SUCCESS = "partial" - - class Page: - def __init__(self): - self._backend = types.SimpleNamespace( - is_valid=lambda: True, - get_page_image=lambda *args, **kwargs: types.SimpleNamespace() - ) - - base_models.InputFormat = InputFormat - base_models.ConversionStatus = ConversionStatus - base_models.Page = Page - - pipeline_opts = register("docling.datamodel.pipeline_options") - - class AcceleratorDevice: - AUTO = "auto" - CUDA = "cuda" - MPS = "mps" - CPU = "cpu" - - class AcceleratorOptions: - def __init__(self, num_threads=None, device=None): - self.num_threads = num_threads - self.device = device - - class PdfPipelineOptions: - def __init__(self, **_kwargs): - self.ocr_options = None - self.do_ocr = False - - class RapidOcrOptions: - def __init__(self, **kwargs): - for key, value in kwargs.items(): - setattr(self, key, value) - self.rec_keys_path = None - - class OcrOptions: - pass - - class LayoutOptions: - pass - - class TableStructureOptions: - def __init__(self, mode=None): - self.mode = mode - self.do_cell_matching = False - - class TableFormerMode: - ACCURATE = "accurate" - - class PictureDescriptionApiOptions: - pass - - pipeline_opts.AcceleratorDevice = AcceleratorDevice - pipeline_opts.AcceleratorOptions = AcceleratorOptions - 
pipeline_opts.PdfPipelineOptions = PdfPipelineOptions - pipeline_opts.RapidOcrOptions = RapidOcrOptions - pipeline_opts.OcrOptions = OcrOptions - pipeline_opts.LayoutOptions = LayoutOptions - pipeline_opts.TableStructureOptions = TableStructureOptions - pipeline_opts.TableFormerMode = TableFormerMode - pipeline_opts.PictureDescriptionApiOptions = PictureDescriptionApiOptions - - register("docling.datamodel.document").ConversionResult = object - - settings_mod = register("docling.datamodel.settings") - - class _Debug: - def __init__(self): - self.profile_pipeline_timings = False - self.visualize_ocr = False - - class _Settings: - def __init__(self): - self.debug = _Debug() - - settings_mod.settings = _Settings() - - converter_mod = register("docling.document_converter") - - class DocumentConverter: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - class PdfFormatOption: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - converter_mod.DocumentConverter = DocumentConverter - converter_mod.PdfFormatOption = PdfFormatOption - converter_mod.WordFormatOption = object - converter_mod.HTMLFormatOption = object - converter_mod.XMLJatsFormatOption = object - converter_mod.PowerpointFormatOption = object - converter_mod.MarkdownFormatOption = object - converter_mod.CsvFormatOption = object - - register("docling.pipeline.simple_pipeline").SimplePipeline = object - - pipelines_mod = register("docling.pipelines.standard_pdf_pipeline") - pipeline_mod = register("docling.pipeline.standard_pdf_pipeline") - - if supports_injection: - class StandardPdfPipeline: - def __init__(self, opts, ocr_model=None, **_): - self.opts = opts - self.ocr_model = ocr_model - else: - class StandardPdfPipeline: - def __init__(self, opts, **_): - self.opts = opts - - pipelines_mod.StandardPdfPipeline = StandardPdfPipeline - pipeline_mod.StandardPdfPipeline = StandardPdfPipeline - - rapid_module = 
register("docling.models.rapid_ocr_model") - - class DummyReader: - def __call__(self, *_args, **_kwargs): - return [] - - class RapidOcrModel: - def __init__(self, enabled, artifacts_path, options, accelerator_options): - self.enabled = enabled - self.reader = DummyReader() - self.options = options - - def get_ocr_rects(self, _page): - return [] - - def post_process_cells(self, _cells, _page): - pass - - class TextCell: - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - class _Log: - @staticmethod - def warning(_msg, *args, **kwargs): - return None - - rapid_module.RapidOcrModel = RapidOcrModel - rapid_module.TextCell = TextCell - rapid_module._log = _Log() - - utils_mod = register("docling.utils") - profiling_mod = register("docling.utils.profiling") - - class TimeRecorder: - def __init__(self, *_args, **_kwargs): - pass - - def __enter__(self): - return self - - def __exit__(self, *exc): - return False - - profiling_mod.TimeRecorder = TimeRecorder - utils_mod.profiling = profiling_mod - - register("docling.models") - - core_doc = register("docling_core.types.doc") - - class BoundingBox: - @staticmethod - def from_tuple(coord, origin=None): - return SimpleNamespace(coord=coord, origin=origin) - - class CoordOrigin: - TOPLEFT = "topleft" - - core_doc.BoundingBox = BoundingBox - core_doc.CoordOrigin = CoordOrigin - - core_page = register("docling_core.types.doc.page") - - class BoundingRectangle: - @staticmethod - def from_bounding_box(box): - return box - - core_page.BoundingRectangle = BoundingRectangle - - -def _install_onnxruntime_stub(): - sys.modules['onnxruntime'] = types.SimpleNamespace( - get_available_providers=lambda: ['CUDAExecutionProvider'] - ) - - -def _make_safe_ocr() -> SimpleNamespace: - """Return an instantiated SafeRapidOcrModel with stubbed dependencies.""" - rapid_opts = sys.modules['docling.datamodel.pipeline_options'].RapidOcrOptions() - accel_opts = 
sys.modules['docling.datamodel.pipeline_options'].AcceleratorOptions(device='cuda:0') - from glossapi.ocr.rapidocr.safe import SafeRapidOcrModel - - return SafeRapidOcrModel(enabled=True, artifacts_path=None, options=rapid_opts, accelerator_options=accel_opts) - - -@pytest.fixture(autouse=True) -def _cleanup_modules(): - yield - for name in [n for n in list(sys.modules) if n.startswith('glossapi') and '_rapidocr_paths' not in n]: - if name.startswith('glossapi_rs_'): - continue - sys.modules.pop(name, None) - _clear_modules('docling') - _clear_modules('docling_core') - sys.modules.pop('onnxruntime', None) - - -def test_patch_runs_on_import(): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - rapid_module = sys.modules['docling.models.rapid_ocr_model'] - from glossapi.ocr.rapidocr.safe import SafeRapidOcrModel, patch_docling_rapidocr - - assert rapid_module.RapidOcrModel is SafeRapidOcrModel - - patch_docling_rapidocr() - assert rapid_module.RapidOcrModel is SafeRapidOcrModel - - -def test_build_rapidocr_pipeline_injects_when_supported(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - glossapi_mod = importlib.import_module('glossapi') - pipeline = importlib.reload(importlib.import_module('glossapi.ocr.rapidocr.pipeline')) - - monkeypatch.setattr( - pipeline, - 'resolve_packaged_onnx_and_keys', - lambda: SimpleNamespace(det='det', rec='rec', cls='cls', keys='keys'), - ) - - captured = {} - - def fake_pool_get(device, opts, factory, expected_type): - model = factory() - assert isinstance(model, pipeline.SafeRapidOcrModel) - assert expected_type is pipeline.SafeRapidOcrModel - captured['device'] = device - captured['opts'] = opts - return SimpleNamespace() - - monkeypatch.setattr(pipeline, 'GLOBAL_RAPID_OCR_POOL', SimpleNamespace(get=fake_pool_get)) - - engine, opts = pipeline.build_rapidocr_pipeline(device='cuda:0') - assert hasattr(engine, 
'ocr_model') - assert captured['device'] == 'cuda:0' - assert opts.do_ocr is True - - -def test_build_rapidocr_pipeline_falls_back_without_injection(monkeypatch): - _install_docling_stub(supports_injection=False) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - pipeline = importlib.reload(importlib.import_module('glossapi.ocr.rapidocr.pipeline')) - - monkeypatch.setattr( - pipeline, - 'resolve_packaged_onnx_and_keys', - lambda: SimpleNamespace(det='det', rec='rec', cls='cls', keys='keys'), - ) - - def fail_pool(*_args, **_kwargs): - raise AssertionError('Pool should not be used when injection unsupported') - - monkeypatch.setattr(pipeline, 'GLOBAL_RAPID_OCR_POOL', SimpleNamespace(get=fail_pool)) - - engine, opts = pipeline.build_rapidocr_pipeline(device='cuda:0') - converter_mod = importlib.import_module('docling.document_converter') - assert isinstance(engine, converter_mod.DocumentConverter) - assert opts.do_ocr is True - - -def test_safe_rapidocr_normalises_none(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - model = _make_safe_ocr() - - assert model._normalise_result(None) == [] - - -def test_safe_rapidocr_normalises_incomplete_and_valid_data(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - model = _make_safe_ocr() - - class IncompleteResult: - boxes = None - txts = ['foo'] - scores = [0.9] - - assert model._normalise_result(IncompleteResult()) == [] - - box = np.array([ - [[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]], - ]) - - class FullResult: - boxes = box - txts = ['foo'] - scores = [0.9] - - output = model._normalise_result(FullResult()) - assert output == [ - (box[0].tolist(), 'foo', 0.9) - ] diff --git a/tests/test_streaming_garbage_detector.py b/tests/test_streaming_garbage_detector.py new file mode 100644 index 0000000..0d12fdd --- /dev/null +++ 
"""Streaming garbage-detector tests (tests/test_streaming_garbage_detector.py).

The synthetic cases need no fixtures. The ``*_from_downloads`` cases read a
local, non-versioned OCR export under ``~/Downloads`` and skip themselves when
that export (or a suitable sample inside it) is missing.
"""

from pathlib import Path

import pytest

from glossapi.ocr.utils.cleaning import StreamingGarbageDetector


# Local OCR export consulted by the "real page" tests below.
DOWNLOAD_EXPORT = (
    Path.home()
    / "Downloads"
    / "deepseek_ocr_43pdfs_allpages_20260331"
)

# Written as an escape on purpose: a literal private-use character is
# invisible in most editors and is easily stripped by tooling.
_SYMBOL_GLYPH = "\uf0b7"


def _has_private_use_glyph(text: str) -> bool:
    """Return True when *text* contains a BMP private-use code point."""
    # U+E000..U+F8FF is the Basic Multilingual Plane private-use area, which
    # includes the "\uf0b7" marker this module already looked for.
    return any("\ue000" <= char <= "\uf8ff" for char in text)


def _stream_detect(text: str, *, chunk_size: int) -> tuple[bool, str | None]:
    """Feed *text* to a fresh detector in fixed-size chunks.

    Args:
        text: Full text to stream through the detector.
        chunk_size: Desired chunk width; coerced with ``int`` and clamped to
            a minimum of 1.

    Returns:
        ``(True, reason)`` as soon as ``feed`` returns a truthy value,
        otherwise ``(False, reason)`` after the whole text was fed, where
        ``reason`` is the detector's ``triggered_reason`` at that moment.
    """
    # Use the same normalised width for the stride and the slice. Previously
    # the stride was clamped but the slice used the raw value, so a
    # chunk_size of 0 fed nothing but empty strings.
    step = max(1, int(chunk_size))
    detector = StreamingGarbageDetector()
    for start in range(0, len(text), step):
        if detector.feed(text[start : start + step]):
            return True, detector.triggered_reason
    return False, detector.triggered_reason


def _load_real_markdown_garbage() -> str:
    """Return the first local markdown export that contains symbol glyphs.

    Skips the calling test when the export folder is missing or when no file
    in it contains a private-use glyph.
    """
    root = DOWNLOAD_EXPORT / "corrections_markdown_garbage"
    if not root.exists():
        pytest.skip(f"missing local export: {root}")
    for path in sorted(root.glob("*__markdown_original.md")):
        text = path.read_text(encoding="utf-8", errors="ignore")
        # NOTE(review): besides "\uf0b7" the previous condition tested two
        # literals that are empty strings in this copy, which made the check
        # always true. Presumably they were other private-use glyphs, so the
        # whole private-use range is accepted here - confirm the intended
        # code points.
        if _has_private_use_glyph(text):
            return text
    pytest.skip("no local symbol-garbage sample found")


def _load_real_empty_page_numeric_garbage() -> str:
    """Return a local markdown export that contains the "1. 2. 3." pattern.

    A specific known page is used when present; otherwise every
    ``*__markdown_original.md`` file in the export is scanned in sorted
    order. Skips the calling test when nothing suitable exists.
    """
    if not DOWNLOAD_EXPORT.exists():
        pytest.skip(f"missing local export: {DOWNLOAD_EXPORT}")
    preferred = DOWNLOAD_EXPORT / (
        "000008__04afb897cb954a76fe378b2ca22f2f059097876fa60a57666de75e37319e5968__p0008__markdown_original.md"
    )
    if preferred.exists():
        candidates = [preferred]
    else:
        candidates = sorted(DOWNLOAD_EXPORT.glob("*__markdown_original.md"))
    for path in candidates:
        text = path.read_text(encoding="utf-8", errors="ignore")
        if "1. 2. 3." in text:
            return text
    pytest.skip("no local numeric-list garbage sample found")


@pytest.mark.parametrize("chunk_size", [1, 2, 5, 17])
def test_streaming_detector_catches_symbol_garbage_across_chunks(chunk_size):
    """Symbol garbage after normal Greek text is flagged for any chunking."""
    # NOTE(review): the repeated unit was a bare space in this copy, which
    # looks like a stripped private-use glyph; the explicit escape is used
    # instead - confirm against the intended fixture.
    text = "Κανονικό κείμενο\n" + ((_SYMBOL_GLYPH + " ") * 20)
    triggered, reason = _stream_detect(text, chunk_size=chunk_size)
    assert triggered is True
    assert reason == "symbol_garbage"


@pytest.mark.parametrize("chunk_size", [1, 2, 4, 11])
def test_streaming_detector_catches_numeric_list_garbage_across_chunks(chunk_size):
    """A run of "1. 2. 3. ..." items is flagged for any chunking."""
    text = " ".join(f"{idx}." for idx in range(1, 25))
    triggered, reason = _stream_detect(text, chunk_size=chunk_size)
    assert triggered is True
    assert reason == "numeric_list_garbage"


def test_streaming_detector_ignores_non_ascii_digit_glyphs():
    """Superscript digits must not trigger the detector."""
    triggered, reason = _stream_detect("x³ y² z¹", chunk_size=1)
    assert triggered is False
    assert reason is None


@pytest.mark.parametrize("chunk_size", [1, 3, 9, 23])
def test_streaming_detector_real_faulty_page_from_downloads(chunk_size):
    """A real exported page with symbol glyphs is flagged as symbol garbage."""
    text = _load_real_markdown_garbage()
    triggered, reason = _stream_detect(text, chunk_size=chunk_size)
    assert triggered is True
    assert reason == "symbol_garbage"


@pytest.mark.parametrize("chunk_size", [1, 3, 8, 21])
def test_streaming_detector_real_empty_page_generation_from_downloads(chunk_size):
    """A real exported page with a numeric-list run is flagged accordingly."""
    text = _load_real_empty_page_numeric_garbage()
    triggered, reason = _stream_detect(text, chunk_size=chunk_size)
    assert triggered is True
    assert reason == "numeric_list_garbage"