From 00e8a2c449c808394751602a56c6461c2869781f Mon Sep 17 00:00:00 2001 From: fffoivos Date: Sun, 30 Nov 2025 10:04:35 +0200 Subject: [PATCH 01/26] chore: remove GitHub workflows --- .github/workflows/docs-selfhost.yml | 51 ---------------------------- .github/workflows/docs.yml | 33 ------------------ .github/workflows/python-publish.yml | 40 ---------------------- 3 files changed, 124 deletions(-) delete mode 100644 .github/workflows/docs-selfhost.yml delete mode 100644 .github/workflows/docs.yml delete mode 100644 .github/workflows/python-publish.yml diff --git a/.github/workflows/docs-selfhost.yml b/.github/workflows/docs-selfhost.yml deleted file mode 100644 index 57c67b2..0000000 --- a/.github/workflows/docs-selfhost.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Build Docs (Self-Host Deploy) - -on: - workflow_dispatch: - -jobs: - build-and-deploy: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.x' - - - name: Install MkDocs - run: | - python -m pip install --upgrade pip - pip install mkdocs mkdocs-material - - - name: Build site - run: | - mkdocs build --strict - - - name: Install rsync and ssh - run: sudo apt-get update -y && sudo apt-get install -y rsync openssh-client - - - name: Setup SSH - uses: webfactory/ssh-agent@v0.9.0 - with: - ssh-private-key: ${{ secrets.SSH_KEY }} - - - name: Add known hosts - run: | - mkdir -p ~/.ssh - echo "${{ secrets.SSH_KNOWN_HOSTS }}" >> ~/.ssh/known_hosts - chmod 644 ~/.ssh/known_hosts - - - name: Deploy via rsync - env: - SSH_USER: ${{ secrets.SSH_USER }} - SSH_HOST: ${{ secrets.SSH_HOST }} - SSH_TARGET: ${{ secrets.SSH_TARGET }} - run: | - if [ -z "$SSH_USER" ] || [ -z "$SSH_HOST" ] || [ -z "$SSH_TARGET" ]; then - echo "Missing SSH_USER/SSH_HOST/SSH_TARGET secrets." 
&& exit 1 - fi - rsync -az --delete site/ "$SSH_USER@$SSH_HOST:$SSH_TARGET" - diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 30bbcd8..0000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Build and Deploy Docs - -on: - push: - branches: [ main, master ] - workflow_dispatch: - -jobs: - docs: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.x' - - - name: Install MkDocs - run: | - python -m pip install --upgrade pip - pip install mkdocs mkdocs-material - - - name: Build site - run: | - mkdocs build --strict - - - name: Deploy to GitHub Pages - uses: peaceiris/actions-gh-pages@v3 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./site diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml deleted file mode 100644 index 757eb51..0000000 --- a/.github/workflows/python-publish.yml +++ /dev/null @@ -1,40 +0,0 @@ -# This workflow will upload a Python Package using GitHub Actions when a release is created -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-repositories - -name: Upload Python Package - -on: - workflow_dispatch: - release: - types: [published] - -jobs: - deploy: - runs-on: ubuntu-latest - permissions: - # IMPORTANT: this permission is mandatory for trusted publishing - id-token: write - contents: read - - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - name: Copy README to pipeline directory - run: | - cp README.md pipeline/ - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build package - run: | - cd pipeline - python -m build - - name: Publish package - uses: 
pypa/gh-action-pypi-publish@release/v1 - with: - packages-dir: pipeline/dist/ - password: ${{ secrets.PYPI_API_TOKEN }} From 269cabf8c2f72f36c0238dfa16ca173b94bc593d Mon Sep 17 00:00:00 2001 From: fffoivos Date: Sun, 30 Nov 2025 10:17:45 +0200 Subject: [PATCH 02/26] docs: remove PyPI badge --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index d37c347..ebc6baf 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ # GlossAPI -[![PyPI Status](https://img.shields.io/pypi/v/glossapi?logo=pypi)](https://pypi.org/project/glossapi/) - GlossAPI is a GPU-ready document processing pipeline from [GFOSS](https://gfoss.eu/) that turns academic PDFs into structured Markdown, cleans noisy text with Rust extensions, and optionally enriches math/code content. ## Why GlossAPI From b82d04e4da77193e7bcb6df4b9772fa897499111 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Sun, 25 Jan 2026 12:50:56 +0200 Subject: [PATCH 03/26] fix chunk merging --- CONTRIBUTING.md | 20 +++++++++++++++++ src/glossapi/corpus/phase_export.py | 14 +++--------- src/glossapi/gloss_extract.py | 21 +++++++++++------- tests/test_jsonl_export.py | 34 +++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..979e757 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,20 @@ +# Contributing to GlossAPI + +## Working branches and PR flow +- Open PRs are pushed against the `development` branch. +- Development is merged with master when a) everything has been effectively used a few times and b) we reach a clear checkpoint. + +## Some design principles +- Corpus methods should be easy to use and descriptive. +- Python files should be readable and well organized (check folder structure). +- Metadata should be written to two distinct parquet files depending on their relevance to the end user ("metadata") or debugging during pipeline runs. 
The principle of reading/writing to these parquet files should be maintained throughout. The rest of the metadata is implicitly encoded in the output folders at each stage of the pipeline. + +## Pipeline awareness and folder layout +- Tie any pipeline change to the artifacts it produces. Common touchpoints: + - `Corpus.extract()` writes source PDFs under `downloads/` and a manifest at `download_results/download_results.parquet` (fields like `needs_ocr`). + - `Corpus.clean()` emits `markdown/` and `clean_markdown/`, keeping `.processing_state.pkl` plus `problematic_files/` and `timeout_files/` subfolders. + - `Corpus.ocr()` and `Corpus.section()` populate `json/` (Docling JSON, formula index, metrics) and `sections/sections_for_annotation.parquet`. +- When relocating outputs or adding new ones, update assertions in `tests/test_pipeline_smoke.py` and the folder references in `docs/pipeline.md` so the layout stays discoverable. + +## Keep changes small +- Avoid large refactors or sweeping interface changes; aim for narrowly scoped PRs and discuss big shifts before starting. 
diff --git a/src/glossapi/corpus/phase_export.py b/src/glossapi/corpus/phase_export.py index 26a6a82..4bcc6a8 100644 --- a/src/glossapi/corpus/phase_export.py +++ b/src/glossapi/corpus/phase_export.py @@ -471,8 +471,6 @@ def _normalize_value(value: Any) -> Any: chunk_paths: List[Path] = entry.get("chunk_paths", []) or [] base_path: Optional[Path] = entry.get("base_path") representative_path: Optional[Path] = base_path - if representative_path is None and chunk_paths: - representative_path = sorted(chunk_paths, key=_chunk_sort_key)[0] base_metadata = metadata_by_stem.get(stem) chunk_metadata = metadata_chunks_by_stem.get(stem, []) if base_metadata is None and not chunk_metadata: @@ -480,17 +478,11 @@ def _normalize_value(value: Any) -> Any: metadata = _aggregate_metadata(stem, base_metadata, chunk_metadata) metadata = {k: _normalize_value(v) for k, v in metadata.items()} original_filename_value = metadata.get("filename") - if chunk_paths: - ordered_chunks = sorted(chunk_paths, key=_chunk_sort_key) - parts: List[str] = [] - for path in ordered_chunks: - parts.append(path.read_text(encoding="utf-8")) - document_text = "\n".join(parts) - elif representative_path is not None: - document_text = representative_path.read_text(encoding="utf-8") - else: + if base_path is None or not base_path.exists(): continue + document_text = base_path.read_text(encoding="utf-8") + filetype = metadata.get("filetype") or metadata.get("file_ext") if not filetype: filename_candidate = original_filename_value or metadata.get("filename") diff --git a/src/glossapi/gloss_extract.py b/src/glossapi/gloss_extract.py index 4a2477c..44dbcfa 100644 --- a/src/glossapi/gloss_extract.py +++ b/src/glossapi/gloss_extract.py @@ -914,6 +914,17 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: except Exception as e: self._log.error(f"Failed to write chunk manifest for {file_path.name}: {e}") + # Always attempt to assemble whatever chunks succeeded (best-effort) + out_md_path = 
output_dir / f"{stem}.md" + final_md_written = False + if all_segments: + try: + final_md = "\n\n".join(all_segments) + out_md_path.write_text(final_md, encoding="utf-8") + final_md_written = True + except Exception as e: + self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}") + if not completed: # Record failure/timeout provenance in parquet try: @@ -928,6 +939,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: chunk_size=self.chunk_size, chunk_count=len(manifest.get("entries", [])), chunk_manifest_path=manifest_path, + no_partial_output=not final_md_written, ) except Exception as e: self._log.warning(f"Failed to record chunked extraction metadata for {file_path.name}: {e}") @@ -939,14 +951,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: self._log.error(f"Failed to copy timeout/failed file {file_path.name}: {e}") return False - # Assemble final markdown - try: - final_md = "\n\n".join(all_segments) - out_md_path = output_dir / f"{stem}.md" - with out_md_path.open("w", encoding="utf-8") as fp: - fp.write(final_md) - except Exception as e: - self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}") + if not final_md_written: return False # Record success provenance in parquet try: diff --git a/tests/test_jsonl_export.py b/tests/test_jsonl_export.py index e05caa0..aecd7a3 100644 --- a/tests/test_jsonl_export.py +++ b/tests/test_jsonl_export.py @@ -458,6 +458,39 @@ def test_jsonl_export_sharded(tmp_path): assert len(seen_doc_ids) == len(texts) +def test_jsonl_prefers_base_markdown_when_chunks_exist(tmp_path): + corpus = Corpus(input_dir=tmp_path / "in_chunks", output_dir=tmp_path / "out_chunks") + + base_text = "## Base Title\n\nMerged body from extraction." 
+ base_path = corpus.cleaned_markdown_dir / "chunked.md" + base_path.parent.mkdir(parents=True, exist_ok=True) + base_path.write_text(base_text, encoding="utf-8") + + chunk_dir = corpus.cleaned_markdown_dir / "chunks" / "chunked" + chunk_dir.mkdir(parents=True, exist_ok=True) + (chunk_dir / "chunked__p0001-0002.md").write_text("chunk-one", encoding="utf-8") + (chunk_dir / "chunked__p0003-0004.md").write_text("chunk-two", encoding="utf-8") + + _write_download_results( + corpus.output_dir / "download_results" / "download_results.parquet", + [ + { + "filename": "chunked.pdf", + "filter": "ok", + "needs_ocr": False, + "is_empty": False, + "char_count_no_comments": 10, + } + ], + ) + + out_path = corpus.output_dir / "chunked.jsonl" + corpus.jsonl(out_path) + + record = json.loads(out_path.read_text(encoding="utf-8").strip()) + assert record["document"] == base_text + + @pytest.mark.skipif(not _HAS_DATASETS, reason="datasets package is not installed") def test_hf_streaming_loader_example(tmp_path): corpus = Corpus(input_dir=tmp_path / "in7", output_dir=tmp_path / "out7") @@ -531,5 +564,6 @@ def test_pyarrow_filter_example(tmp_path): table = dataset.to_table(filter=(ds.field("lang") == "el") & (ds.field("year") >= 2019)) assert set(table.column("doc_id").to_pylist()) == {"a"} + def _expected_doc_id(filename: str) -> str: return hashlib.sha256(filename.encode("utf-8")).hexdigest() From 6bcde8b283ab8e517d01787e8c4ba48768656432 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Wed, 4 Mar 2026 21:29:44 +0200 Subject: [PATCH 04/26] Fix editable install by switching root build backend to setuptools --- pyproject.toml | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6e6672c..c7cf7c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["maturin>=1.5,<2.0"] -build-backend = "maturin" +requires = ["setuptools>=69", "wheel"] +build-backend = 
"setuptools.build_meta" [project] name = "glossapi" @@ -34,7 +34,6 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Rust", - "License :: OSI Approved :: European Union Public Licence 1.2 (EUPL 1.2)", ] [project.optional-dependencies] @@ -66,17 +65,16 @@ docs = [ "mkdocs-material>=9.5", ] -[tool.maturin] -bindings = "pyo3" -# The crate is located under rust/glossapi_rs_noise -module-name = "glossapi_rs_noise" -python-source = "src" +[tool.setuptools] +package-dir = {"" = "src"} +include-package-data = true -manifest-path = "rust/glossapi_rs_noise/Cargo.toml" -include = ["src/**"] -python-packages = [ - "glossapi" -] +[tool.setuptools.packages.find] +where = ["src"] +include = ["glossapi", "glossapi.*"] + +[tool.setuptools.package-data] +glossapi = ["models/**/*"] [tool.pytest.ini_options] markers = [ From ab87731241d266e37235c738be36f6a0d2fc0737 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 9 Mar 2026 00:26:16 +0000 Subject: [PATCH 05/26] Simplify OCR stack around DeepSeek --- .gitignore | 4 + README.md | 41 +- dependency_setup/deepseek_gpu_smoke.py | 33 +- dependency_setup/deepseek_uv/pyproject.toml | 28 + dependency_setup/deepseek_uv/uv.lock | 2605 +++++++++++++++++ dependency_setup/dependency_notes.md | 66 +- .../requirements-glossapi-deepseek.txt | 29 +- .../requirements-glossapi-docling.txt | 38 + .../requirements-glossapi-rapidocr.txt | 4 - dependency_setup/setup_deepseek_uv.sh | 138 + dependency_setup/setup_glossapi.sh | 115 +- dependency_setup/setup_glossapi_deepseek.sh | 2 +- .../deepseek_only_upgrade_roadmap.md | 262 ++ docs/architecture/index.md | 2 +- docs/configuration.md | 26 +- docs/getting_started.md | 58 +- docs/index.md | 4 +- docs/math_enrichment_runtime.md | 5 +- docs/ocr_and_math_enhancement.md | 39 +- docs/quickstart.md | 18 +- docs/stages/ocr.md | 9 +- docs/testing/compatibility_matrix.md | 276 ++ docs/troubleshooting.md | 10 +- mkdocs.yml | 4 +- pyproject.toml | 25 +- 
src/glossapi/__init__.py | 51 +- src/glossapi/_pipeline.py | 4 +- src/glossapi/corpus/phase_clean.py | 2 + src/glossapi/corpus/phase_extract.py | 36 +- src/glossapi/corpus/phase_ocr_math.py | 40 +- src/glossapi/gloss_extract.py | 221 +- src/glossapi/ocr/__init__.py | 7 +- src/glossapi/ocr/deepseek/__init__.py | 2 +- src/glossapi/ocr/deepseek/preflight.py | 70 +- .../ocr/deepseek/run_pdf_ocr_transformers.py | 188 ++ src/glossapi/ocr/deepseek/runner.py | 185 +- src/glossapi/ocr/docling/__init__.py | 5 + src/glossapi/ocr/docling/pipeline.py | 95 + src/glossapi/ocr/docling_pipeline.py | 82 + src/glossapi/ocr/rapidocr/__init__.py | 26 - src/glossapi/ocr/rapidocr/__init__.py.backup | 6 - src/glossapi/ocr/rapidocr/_paths.py | 114 - src/glossapi/ocr/rapidocr/dispatch.py | 33 - src/glossapi/ocr/rapidocr/docling_pipeline.py | 501 ---- .../ocr/rapidocr/docling_pipeline.py.backup | 501 ---- src/glossapi/ocr/rapidocr/onnx.py | 105 - src/glossapi/ocr/rapidocr/pipeline.py | 229 -- src/glossapi/ocr/rapidocr/pool.py | 72 - src/glossapi/ocr/rapidocr/safe.py | 301 -- tests/test_corpus_guards.py | 19 +- tests/test_deepseek_preflight.py | 30 +- tests/test_deepseek_runner_contract.py | 62 + tests/test_deepseek_runner_stub.py | 59 - tests/test_ocr_backends_smoke.py | 8 +- tests/test_ocr_dispatch_backends.py | 28 +- tests/test_ocr_imports.py | 13 - tests/test_pipeline_smoke.py | 98 +- tests/test_rapidocr_patch.py | 368 --- 58 files changed, 4241 insertions(+), 3161 deletions(-) create mode 100644 dependency_setup/deepseek_uv/pyproject.toml create mode 100644 dependency_setup/deepseek_uv/uv.lock create mode 100644 dependency_setup/requirements-glossapi-docling.txt delete mode 100644 dependency_setup/requirements-glossapi-rapidocr.txt create mode 100755 dependency_setup/setup_deepseek_uv.sh create mode 100644 docs/architecture/deepseek_only_upgrade_roadmap.md create mode 100644 docs/testing/compatibility_matrix.md create mode 100644 src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py 
create mode 100644 src/glossapi/ocr/docling/__init__.py create mode 100644 src/glossapi/ocr/docling/pipeline.py create mode 100644 src/glossapi/ocr/docling_pipeline.py delete mode 100644 src/glossapi/ocr/rapidocr/__init__.py delete mode 100644 src/glossapi/ocr/rapidocr/__init__.py.backup delete mode 100644 src/glossapi/ocr/rapidocr/_paths.py delete mode 100644 src/glossapi/ocr/rapidocr/dispatch.py delete mode 100644 src/glossapi/ocr/rapidocr/docling_pipeline.py delete mode 100644 src/glossapi/ocr/rapidocr/docling_pipeline.py.backup delete mode 100644 src/glossapi/ocr/rapidocr/onnx.py delete mode 100644 src/glossapi/ocr/rapidocr/pipeline.py delete mode 100644 src/glossapi/ocr/rapidocr/pool.py delete mode 100644 src/glossapi/ocr/rapidocr/safe.py create mode 100644 tests/test_deepseek_runner_contract.py delete mode 100644 tests/test_deepseek_runner_stub.py delete mode 100644 tests/test_rapidocr_patch.py diff --git a/.gitignore b/.gitignore index 8c98a88..929a8c5 100644 --- a/.gitignore +++ b/.gitignore @@ -58,10 +58,13 @@ htmlcov/ # OCR test outputs test_ocr_*_output/ *_demo_output/ +artifacts/ # OCR model weights (if downloaded locally) nanonets/ ocr_models/ +deepseek-ocr-2-model/ +models/ # Noise analysis reports glossapi_noise_analysis_report.md @@ -78,4 +81,5 @@ dependency_setup/.venvs/ deepseek-ocr/DeepSeek-OCR-empty/ # Local DeepSeek checkout and repro scripts (keep out of master) deepseek-ocr/ +deepseek-ocr-2/ repro_rapidocr_onnx/ diff --git a/README.md b/README.md index ebc6baf..e581361 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ GlossAPI is a GPU-ready document processing pipeline from [GFOSS](https://gfoss. ## Why GlossAPI - Handles download → extraction → cleaning → sectioning in one pipeline. -- Ships safe PyPDFium extraction plus Docling/RapidOCR for high-throughput OCR. +- Ships safe PyPDFium extraction plus Docling for structured extraction and DeepSeek-OCR-2 for OCR remediation. 
- Rust-powered cleaner/noise metrics keep Markdown quality predictable. - Greek-first metadata and section classification tuned for academic corpora. - Modular Corpus API lets you resume from any stage or plug into existing flows. @@ -40,45 +40,40 @@ PY ## Automated Environment Profiles -Use `dependency_setup/setup_glossapi.sh` to provision a virtualenv with the right dependency stack for the three supported modes: +Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `dependency_setup/setup_deepseek_uv.sh` for the dedicated DeepSeek OCR runtime: ```bash -# Vanilla pipeline (no GPU OCR extras) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests +# Docling / main GlossAPI environment +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests -# Docling + RapidOCR mode -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests - -# DeepSeek OCR mode (requires weights under /path/to/deepseek-ocr/DeepSeek-OCR) -./dependency_setup/setup_glossapi.sh \ - --mode deepseek \ +# DeepSeek OCR runtime (uv-managed) +./dependency_setup/setup_deepseek_uv.sh \ --venv dependency_setup/.venvs/deepseek \ - --weights-dir /path/to/deepseek-ocr \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ --run-tests --smoke-test ``` -Pass `--download-deepseek` if you need the script to fetch weights automatically; otherwise it looks for `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Check `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation history. The script also installs the Rust extensions in editable mode so local changes are picked up immediately. +`setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. 
`setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`. **DeepSeek runtime checklist** -- Run `python -m glossapi.ocr.deepseek.preflight` (from your DeepSeek venv) to fail fast if the CLI would fall back to the stub. -- Export these to force the real CLI and avoid silent stub output: +- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR. +- Export these to force the real runtime and avoid silent stub output: - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - - `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py` - - `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python` - - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR` - - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib` -- CUDA toolkit with `nvcc` available (FlashInfer/vLLM JIT falls back poorly without it); set `CUDA_HOME` and prepend `$CUDA_HOME/bin` to `PATH`. -- If FlashInfer is problematic, disable with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`. -- To avoid FP8 KV cache issues, export `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1` (propagates `--no-fp8-kv`). -- Tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`. + - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` + - `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py` + - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- The default fallback locations already point at the in-repo Transformers runner and `${REPO_ROOT}/deepseek-ocr-2-model/DeepSeek-OCR-2`. +- `flash-attn` is optional. The runner uses `flash_attention_2` when available and falls back to `eager` otherwise. 
## Choose Your Install Path | Scenario | Commands | Notes | | --- | --- | --- | | Pip users | `pip install glossapi` | Fast vanilla evaluation with minimal dependencies. | -| Mode automation (recommended) | `./dependency_setup/setup_glossapi.sh --mode {vanilla\|rapidocr\|deepseek}` | Creates an isolated venv per mode, installs Rust crates, and can run the relevant pytest subset. | +| Docling environment | `./dependency_setup/setup_glossapi.sh --mode docling` | Creates the main GlossAPI venv for extraction, cleaning, sectioning, and enrichment. | +| DeepSeek environment | `./dependency_setup/setup_deepseek_uv.sh` | Creates a separate uv-managed OCR runtime pinned to the tested Transformers/Torch stack. | | Manual editable install | `pip install -e .` after cloning | Keep this if you prefer to manage dependencies by hand. | | Conda-based stacks | `scripts/setup_conda.sh` | Provisions Python 3.10 env + Rust + editable install for Amazon Linux/SageMaker. | diff --git a/dependency_setup/deepseek_gpu_smoke.py b/dependency_setup/deepseek_gpu_smoke.py index e85d202..ddfb314 100644 --- a/dependency_setup/deepseek_gpu_smoke.py +++ b/dependency_setup/deepseek_gpu_smoke.py @@ -3,9 +3,9 @@ Minimal DeepSeek OCR integration smoke test. This script runs the GlossAPI DeepSeek backend on a tiny sample PDF and -verifies that real Markdown output is produced. It requires the DeepSeek-OCR -weights to be available under ``../deepseek-ocr/DeepSeek-OCR`` relative to -the repository root (override via ``DEEPSEEK_MODEL_DIR``). +verifies that real Markdown output is produced. It requires the DeepSeek-OCR-2 +weights to be available under ``../deepseek-ocr-2-model/DeepSeek-OCR-2`` relative to the +repository root (override via ``DEEPSEEK_MODEL_DIR``). """ from __future__ import annotations @@ -20,15 +20,16 @@ REPO_ROOT = Path(__file__).resolve().parents[1] SAMPLES_DIR = REPO_ROOT / "samples" / "lightweight_pdf_corpus" / "pdfs" -DEFAULT_MODEL_ROOT = (REPO_ROOT / ".." 
/ "deepseek-ocr").resolve() +DEFAULT_MODEL_ROOT = (REPO_ROOT / "deepseek-ocr-2-model").resolve() def ensure_model_available(model_root: Path) -> None: - expected = model_root / "DeepSeek-OCR" / "model-00001-of-000001.safetensors" + direct_root = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2") + expected = direct_root / "model-00001-of-000001.safetensors" if not expected.exists() or expected.stat().st_size < 1_000_000: raise FileNotFoundError( - f"Expected DeepSeek-OCR weights at {expected}. " - "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR) " + f"Expected DeepSeek-OCR-2 weights at {expected}. " + "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR-2) " "or set DEEPSEEK_MODEL_DIR to the directory that contains them." ) @@ -37,7 +38,8 @@ def run_smoke(model_root: Path) -> None: from glossapi import Corpus ensure_model_available(model_root) - sample_pdf = SAMPLES_DIR / "sample01_plain.pdf" + model_dir = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2") + sample_pdf = SAMPLES_DIR / "alpha.pdf" if not sample_pdf.exists(): raise FileNotFoundError(f"Sample PDF not found: {sample_pdf}") @@ -67,22 +69,17 @@ def run_smoke(model_root: Path) -> None: parquet_path = dl_dir / "download_results.parquet" df.to_parquet(parquet_path, index=False) + os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") os.environ.setdefault( - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", - str(model_root / "run_pdf_ocr_vllm.py"), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + str(REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py"), ) os.environ.setdefault( "GLOSSAPI_DEEPSEEK_PYTHON", sys.executable, ) - ld_extra = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") or str( - model_root / "libjpeg-turbo" / "lib" - ) - os.environ["GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH"] = ld_extra - os.environ["LD_LIBRARY_PATH"] = ( - 
f"{ld_extra}:{os.environ.get('LD_LIBRARY_PATH','')}".rstrip(":") - ) + os.environ.setdefault("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir)) corpus = Corpus(input_dir=input_dir, output_dir=output_dir) corpus.ocr( @@ -100,7 +97,7 @@ def run_smoke(model_root: Path) -> None: def main() -> None: - model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") + model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR") if model_dir_env: model_root = Path(model_dir_env).expanduser().resolve() else: diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml new file mode 100644 index 0000000..809b499 --- /dev/null +++ b/dependency_setup/deepseek_uv/pyproject.toml @@ -0,0 +1,28 @@ +[project] +name = "glossapi-deepseek-runtime" +version = "0.1.0" +description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution" +requires-python = ">=3.11,<3.13" +dependencies = [ + "glossapi[docling,deepseek]", + "torch==2.6.0", + "torchvision==0.21.0", + "torchaudio==2.6.0", +] + +[dependency-groups] +test = [ + "pytest", + "fpdf2", +] + +[tool.uv.sources] +glossapi = { path = "../..", editable = true } +torch = { index = "pytorch-cu118" } +torchvision = { index = "pytorch-cu118" } +torchaudio = { index = "pytorch-cu118" } + +[[tool.uv.index]] +name = "pytorch-cu118" +url = "https://download.pytorch.org/whl/cu118" +explicit = true diff --git a/dependency_setup/deepseek_uv/uv.lock b/dependency_setup/deepseek_uv/uv.lock new file mode 100644 index 0000000..f5eefaa --- /dev/null +++ b/dependency_setup/deepseek_uv/uv.lock @@ -0,0 +1,2605 @@ +version = 1 +revision = 3 +requires-python = ">=3.11, <3.13" +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' 
and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] + +[[package]] +name = "accelerate" +version = "1.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ca/14/787e5498cd062640f0f3d92ef4ae4063174f76f9afd29d13fc52a319daae/accelerate-1.13.0.tar.gz", hash = "sha256:d631b4e0f5b3de4aff2d7e9e6857d164810dfc3237d54d017f075122d057b236", size = 402835, upload-time = "2026-03-04T19:34:12.359Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/46/02ac5e262d4af18054b3e922b2baedbb2a03289ee792162de60a865defc5/accelerate-1.13.0-py3-none-any.whl", hash = "sha256:cf1a3efb96c18f7b152eb0fa7490f3710b19c3f395699358f08decca2b8b62e0", size = 383744, upload-time = "2026-03-04T19:34:10.313Z" }, +] + +[[package]] +name = "addict" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/ef/fd7649da8af11d93979831e8f1f8097e85e82d5bfeabc8c68b39175d8e75/addict-2.4.0.tar.gz", hash = "sha256:b3b2210e0e067a281f5646c8c5db92e99b7231ea8b0eb5f74dbdf9e259d4e494", size = 9186, upload-time = "2020-11-21T16:21:31.416Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl", hash = "sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc", size = 3832, upload-time = 
"2020-11-21T16:21:29.588Z" }, +] + +[[package]] +name = "aiofiles" +version = "25.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/c3/534eac40372d8ee36ef40df62ec129bee4fdb5ad9706e58a29be53b2c970/aiofiles-25.1.0.tar.gz", hash = "sha256:a8d728f0a29de45dc521f18f07297428d56992a742f0cd2701ba86e44d23d5b2", size = 46354, upload-time = "2025-10-09T20:51:04.358Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/8a/340a1555ae33d7354dbca4faa54948d76d89a27ceef032c8c3bc661d003e/aiofiles-25.1.0-py3-none-any.whl", hash = "sha256:abe311e527c862958650f9438e859c1fa7568a141b22abcd015e120e86a85695", size = 14668, upload-time = "2025-10-09T20:51:03.174Z" }, +] + +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.13.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/50/42/32cf8e7704ceb4481406eb87161349abb46a57fee3f008ba9cb610968646/aiohttp-3.13.3.tar.gz", hash = "sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88", size 
= 7844556, upload-time = "2026-01-03T17:33:05.204Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/4c/a164164834f03924d9a29dc3acd9e7ee58f95857e0b467f6d04298594ebb/aiohttp-3.13.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5b6073099fb654e0a068ae678b10feff95c5cae95bbfcbfa7af669d361a8aa6b", size = 746051, upload-time = "2026-01-03T17:29:43.287Z" }, + { url = "https://files.pythonhosted.org/packages/82/71/d5c31390d18d4f58115037c432b7e0348c60f6f53b727cad33172144a112/aiohttp-3.13.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cb93e166e6c28716c8c6aeb5f99dfb6d5ccf482d29fe9bf9a794110e6d0ab64", size = 499234, upload-time = "2026-01-03T17:29:44.822Z" }, + { url = "https://files.pythonhosted.org/packages/0e/c9/741f8ac91e14b1d2e7100690425a5b2b919a87a5075406582991fb7de920/aiohttp-3.13.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28e027cf2f6b641693a09f631759b4d9ce9165099d2b5d92af9bd4e197690eea", size = 494979, upload-time = "2026-01-03T17:29:46.405Z" }, + { url = "https://files.pythonhosted.org/packages/75/b5/31d4d2e802dfd59f74ed47eba48869c1c21552c586d5e81a9d0d5c2ad640/aiohttp-3.13.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b61b7169ababd7802f9568ed96142616a9118dd2be0d1866e920e77ec8fa92a", size = 1748297, upload-time = "2026-01-03T17:29:48.083Z" }, + { url = "https://files.pythonhosted.org/packages/1a/3e/eefad0ad42959f226bb79664826883f2687d602a9ae2941a18e0484a74d3/aiohttp-3.13.3-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:80dd4c21b0f6237676449c6baaa1039abae86b91636b6c91a7f8e61c87f89540", size = 1707172, upload-time = "2026-01-03T17:29:49.648Z" }, + { url = "https://files.pythonhosted.org/packages/c5/3a/54a64299fac2891c346cdcf2aa6803f994a2e4beeaf2e5a09dcc54acc842/aiohttp-3.13.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:65d2ccb7eabee90ce0503c17716fc77226be026dcc3e65cce859a30db715025b", size = 1805405, upload-time = "2026-01-03T17:29:51.244Z" }, + { url = "https://files.pythonhosted.org/packages/6c/70/ddc1b7169cf64075e864f64595a14b147a895a868394a48f6a8031979038/aiohttp-3.13.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5b179331a481cb5529fca8b432d8d3c7001cb217513c94cd72d668d1248688a3", size = 1899449, upload-time = "2026-01-03T17:29:53.938Z" }, + { url = "https://files.pythonhosted.org/packages/a1/7e/6815aab7d3a56610891c76ef79095677b8b5be6646aaf00f69b221765021/aiohttp-3.13.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d4c940f02f49483b18b079d1c27ab948721852b281f8b015c058100e9421dd1", size = 1748444, upload-time = "2026-01-03T17:29:55.484Z" }, + { url = "https://files.pythonhosted.org/packages/6b/f2/073b145c4100da5511f457dc0f7558e99b2987cf72600d42b559db856fbc/aiohttp-3.13.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f9444f105664c4ce47a2a7171a2418bce5b7bae45fb610f4e2c36045d85911d3", size = 1606038, upload-time = "2026-01-03T17:29:57.179Z" }, + { url = "https://files.pythonhosted.org/packages/0a/c1/778d011920cae03ae01424ec202c513dc69243cf2db303965615b81deeea/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:694976222c711d1d00ba131904beb60534f93966562f64440d0c9d41b8cdb440", size = 1724156, upload-time = "2026-01-03T17:29:58.914Z" }, + { url = "https://files.pythonhosted.org/packages/0e/cb/3419eabf4ec1e9ec6f242c32b689248365a1cf621891f6f0386632525494/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f33ed1a2bf1997a36661874b017f5c4b760f41266341af36febaf271d179f6d7", size = 1722340, upload-time = "2026-01-03T17:30:01.962Z" }, + { url = "https://files.pythonhosted.org/packages/7a/e5/76cf77bdbc435bf233c1f114edad39ed4177ccbfab7c329482b179cff4f4/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = 
"sha256:e636b3c5f61da31a92bf0d91da83e58fdfa96f178ba682f11d24f31944cdd28c", size = 1783041, upload-time = "2026-01-03T17:30:03.609Z" }, + { url = "https://files.pythonhosted.org/packages/9d/d4/dd1ca234c794fd29c057ce8c0566b8ef7fd6a51069de5f06fa84b9a1971c/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5d2d94f1f5fcbe40838ac51a6ab5704a6f9ea42e72ceda48de5e6b898521da51", size = 1596024, upload-time = "2026-01-03T17:30:05.132Z" }, + { url = "https://files.pythonhosted.org/packages/55/58/4345b5f26661a6180afa686c473620c30a66afdf120ed3dd545bbc809e85/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2be0e9ccf23e8a94f6f0650ce06042cefc6ac703d0d7ab6c7a917289f2539ad4", size = 1804590, upload-time = "2026-01-03T17:30:07.135Z" }, + { url = "https://files.pythonhosted.org/packages/7b/06/05950619af6c2df7e0a431d889ba2813c9f0129cec76f663e547a5ad56f2/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9af5e68ee47d6534d36791bbe9b646d2a7c7deb6fc24d7943628edfbb3581f29", size = 1740355, upload-time = "2026-01-03T17:30:09.083Z" }, + { url = "https://files.pythonhosted.org/packages/3e/80/958f16de79ba0422d7c1e284b2abd0c84bc03394fbe631d0a39ffa10e1eb/aiohttp-3.13.3-cp311-cp311-win32.whl", hash = "sha256:a2212ad43c0833a873d0fb3c63fa1bacedd4cf6af2fee62bf4b739ceec3ab239", size = 433701, upload-time = "2026-01-03T17:30:10.869Z" }, + { url = "https://files.pythonhosted.org/packages/dc/f2/27cdf04c9851712d6c1b99df6821a6623c3c9e55956d4b1e318c337b5a48/aiohttp-3.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:642f752c3eb117b105acbd87e2c143de710987e09860d674e068c4c2c441034f", size = 457678, upload-time = "2026-01-03T17:30:12.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/be/4fc11f202955a69e0db803a12a062b8379c970c7c84f4882b6da17337cc1/aiohttp-3.13.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c", size = 739732, upload-time = "2026-01-03T17:30:14.23Z" }, + { url 
= "https://files.pythonhosted.org/packages/97/2c/621d5b851f94fa0bb7430d6089b3aa970a9d9b75196bc93bb624b0db237a/aiohttp-3.13.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168", size = 494293, upload-time = "2026-01-03T17:30:15.96Z" }, + { url = "https://files.pythonhosted.org/packages/5d/43/4be01406b78e1be8320bb8316dc9c42dbab553d281c40364e0f862d5661c/aiohttp-3.13.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d", size = 493533, upload-time = "2026-01-03T17:30:17.431Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a8/5a35dc56a06a2c90d4742cbf35294396907027f80eea696637945a106f25/aiohttp-3.13.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29", size = 1737839, upload-time = "2026-01-03T17:30:19.422Z" }, + { url = "https://files.pythonhosted.org/packages/bf/62/4b9eeb331da56530bf2e198a297e5303e1c1ebdceeb00fe9b568a65c5a0c/aiohttp-3.13.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3", size = 1703932, upload-time = "2026-01-03T17:30:21.756Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f6/af16887b5d419e6a367095994c0b1332d154f647e7dc2bd50e61876e8e3d/aiohttp-3.13.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d", size = 1771906, upload-time = "2026-01-03T17:30:23.932Z" }, + { url = "https://files.pythonhosted.org/packages/ce/83/397c634b1bcc24292fa1e0c7822800f9f6569e32934bdeef09dae7992dfb/aiohttp-3.13.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463", size = 
1871020, upload-time = "2026-01-03T17:30:26Z" }, + { url = "https://files.pythonhosted.org/packages/86/f6/a62cbbf13f0ac80a70f71b1672feba90fdb21fd7abd8dbf25c0105fb6fa3/aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc", size = 1755181, upload-time = "2026-01-03T17:30:27.554Z" }, + { url = "https://files.pythonhosted.org/packages/0a/87/20a35ad487efdd3fba93d5843efdfaa62d2f1479eaafa7453398a44faf13/aiohttp-3.13.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf", size = 1561794, upload-time = "2026-01-03T17:30:29.254Z" }, + { url = "https://files.pythonhosted.org/packages/de/95/8fd69a66682012f6716e1bc09ef8a1a2a91922c5725cb904689f112309c4/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033", size = 1697900, upload-time = "2026-01-03T17:30:31.033Z" }, + { url = "https://files.pythonhosted.org/packages/e5/66/7b94b3b5ba70e955ff597672dad1691333080e37f50280178967aff68657/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f", size = 1728239, upload-time = "2026-01-03T17:30:32.703Z" }, + { url = "https://files.pythonhosted.org/packages/47/71/6f72f77f9f7d74719692ab65a2a0252584bf8d5f301e2ecb4c0da734530a/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679", size = 1740527, upload-time = "2026-01-03T17:30:34.695Z" }, + { url = "https://files.pythonhosted.org/packages/fa/b4/75ec16cbbd5c01bdaf4a05b19e103e78d7ce1ef7c80867eb0ace42ff4488/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423", size = 1554489, upload-time = 
"2026-01-03T17:30:36.864Z" }, + { url = "https://files.pythonhosted.org/packages/52/8f/bc518c0eea29f8406dcf7ed1f96c9b48e3bc3995a96159b3fc11f9e08321/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce", size = 1767852, upload-time = "2026-01-03T17:30:39.433Z" }, + { url = "https://files.pythonhosted.org/packages/9d/f2/a07a75173124f31f11ea6f863dc44e6f09afe2bca45dd4e64979490deab1/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a", size = 1722379, upload-time = "2026-01-03T17:30:41.081Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4a/1a3fee7c21350cac78e5c5cef711bac1b94feca07399f3d406972e2d8fcd/aiohttp-3.13.3-cp312-cp312-win32.whl", hash = "sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046", size = 428253, upload-time = "2026-01-03T17:30:42.644Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b7/76175c7cb4eb73d91ad63c34e29fc4f77c9386bba4a65b53ba8e05ee3c39/aiohttp-3.13.3-cp312-cp312-win_amd64.whl", hash = "sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57", size = 455407, upload-time = "2026-01-03T17:30:44.195Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, 
upload-time = "2025-07-03T22:54:42.156Z" }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "attrs" +version = "25.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6b/5c/685e6633917e101e5dcb62b9dd76946cbb57c26e133bae9e0cd36033c0a9/attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11", size = 934251, upload-time = "2025-10-06T13:54:44.725Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, +] + +[[package]] +name = "beautifulsoup4" +version = "4.14.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, +] + +[[package]] +name = "certifi" +version = "2026.2.25" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/35/02daf95b9cd686320bb622eb148792655c9412dbb9b67abb5694e5910a24/charset_normalizer-3.4.5.tar.gz", hash = "sha256:95adae7b6c42a6c5b5b559b1a99149f090a57128155daeea91732c8d970d8644", size = 134804, upload-time = "2026-03-06T06:03:19.46Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/9e/bcec3b22c64ecec47d39bf5167c2613efd41898c019dccd4183f6aa5d6a7/charset_normalizer-3.4.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:610f72c0ee565dfb8ae1241b666119582fdbfe7c0975c175be719f940e110694", size = 279531, upload-time = "2026-03-06T06:00:52.252Z" }, + { url = "https://files.pythonhosted.org/packages/58/12/81fd25f7e7078ab5d1eedbb0fac44be4904ae3370a3bf4533c8f2d159acd/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:60d68e820af339df4ae8358c7a2e7596badeb61e544438e489035f9fbf3246a5", size = 188006, upload-time = "2026-03-06T06:00:53.8Z" }, + { url = "https://files.pythonhosted.org/packages/ae/6e/f2d30e8c27c1b0736a6520311982cf5286cfc7f6cac77d7bc1325e3a23f2/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:10b473fc8dca1c3ad8559985794815f06ca3fc71942c969129070f2c3cdf7281", size = 205085, upload-time = "2026-03-06T06:00:55.311Z" }, + { url = "https://files.pythonhosted.org/packages/d0/90/d12cefcb53b5931e2cf792a33718d7126efb116a320eaa0742c7059a95e4/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d4eb8ac7469b2a5d64b5b8c04f84d8bf3ad340f4514b98523805cbf46e3b3923", size = 200545, upload-time = "2026-03-06T06:00:56.532Z" }, + { url = "https://files.pythonhosted.org/packages/03/f4/44d3b830a20e89ff82a3134912d9a1cf6084d64f3b95dcad40f74449a654/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bcb3227c3d9aaf73eaaab1db7ccd80a8995c509ee9941e2aae060ca6e4e5d81", size = 193863, upload-time = "2026-03-06T06:00:57.823Z" }, + { url = "https://files.pythonhosted.org/packages/25/4b/f212119c18a6320a9d4a730d1b4057875cdeabf21b3614f76549042ef8a8/charset_normalizer-3.4.5-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:75ee9c1cce2911581a70a3c0919d8bccf5b1cbc9b0e5171400ec736b4b569497", size = 181827, upload-time = "2026-03-06T06:00:59.323Z" }, + { url = "https://files.pythonhosted.org/packages/74/00/b26158e48b425a202a92965f8069e8a63d9af1481dfa206825d7f74d2a3c/charset_normalizer-3.4.5-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1d1401945cb77787dbd3af2446ff2d75912327c4c3a1526ab7955ecf8600687c", size = 191085, upload-time = "2026-03-06T06:01:00.546Z" }, + { url = 
"https://files.pythonhosted.org/packages/c4/c2/1c1737bf6fd40335fe53d28fe49afd99ee4143cc57a845e99635ce0b9b6d/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a45e504f5e1be0bd385935a8e1507c442349ca36f511a47057a71c9d1d6ea9e", size = 190688, upload-time = "2026-03-06T06:01:02.479Z" }, + { url = "https://files.pythonhosted.org/packages/5a/3d/abb5c22dc2ef493cd56522f811246a63c5427c08f3e3e50ab663de27fcf4/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:e09f671a54ce70b79a1fc1dc6da3072b7ef7251fadb894ed92d9aa8218465a5f", size = 183077, upload-time = "2026-03-06T06:01:04.231Z" }, + { url = "https://files.pythonhosted.org/packages/44/33/5298ad4d419a58e25b3508e87f2758d1442ff00c2471f8e0403dab8edad5/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d01de5e768328646e6a3fa9e562706f8f6641708c115c62588aef2b941a4f88e", size = 206706, upload-time = "2026-03-06T06:01:05.773Z" }, + { url = "https://files.pythonhosted.org/packages/7b/17/51e7895ac0f87c3b91d276a449ef09f5532a7529818f59646d7a55089432/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:131716d6786ad5e3dc542f5cc6f397ba3339dc0fb87f87ac30e550e8987756af", size = 191665, upload-time = "2026-03-06T06:01:07.473Z" }, + { url = "https://files.pythonhosted.org/packages/90/8f/cce9adf1883e98906dbae380d769b4852bb0fa0004bc7d7a2243418d3ea8/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a374cc0b88aa710e8865dc1bd6edb3743c59f27830f0293ab101e4cf3ce9f85", size = 201950, upload-time = "2026-03-06T06:01:08.973Z" }, + { url = "https://files.pythonhosted.org/packages/08/ca/bce99cd5c397a52919e2769d126723f27a4c037130374c051c00470bcd38/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d31f0d1671e1534e395f9eb84a68e0fb670e1edb1fe819a9d7f564ae3bc4e53f", size = 195830, upload-time = "2026-03-06T06:01:10.155Z" }, + { url = 
"https://files.pythonhosted.org/packages/87/4f/2e3d023a06911f1281f97b8f036edc9872167036ca6f55cc874a0be6c12c/charset_normalizer-3.4.5-cp311-cp311-win32.whl", hash = "sha256:cace89841c0599d736d3d74a27bc5821288bb47c5441923277afc6059d7fbcb4", size = 132029, upload-time = "2026-03-06T06:01:11.706Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1f/a853b73d386521fd44b7f67ded6b17b7b2367067d9106a5c4b44f9a34274/charset_normalizer-3.4.5-cp311-cp311-win_amd64.whl", hash = "sha256:f8102ae93c0bc863b1d41ea0f4499c20a83229f52ed870850892df555187154a", size = 142404, upload-time = "2026-03-06T06:01:12.865Z" }, + { url = "https://files.pythonhosted.org/packages/b4/10/dba36f76b71c38e9d391abe0fd8a5b818790e053c431adecfc98c35cd2a9/charset_normalizer-3.4.5-cp311-cp311-win_arm64.whl", hash = "sha256:ed98364e1c262cf5f9363c3eca8c2df37024f52a8fa1180a3610014f26eac51c", size = 132796, upload-time = "2026-03-06T06:01:14.106Z" }, + { url = "https://files.pythonhosted.org/packages/9c/b6/9ee9c1a608916ca5feae81a344dffbaa53b26b90be58cc2159e3332d44ec/charset_normalizer-3.4.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ed97c282ee4f994ef814042423a529df9497e3c666dca19be1d4cd1129dc7ade", size = 280976, upload-time = "2026-03-06T06:01:15.276Z" }, + { url = "https://files.pythonhosted.org/packages/f8/d8/a54f7c0b96f1df3563e9190f04daf981e365a9b397eedfdfb5dbef7e5c6c/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0294916d6ccf2d069727d65973c3a1ca477d68708db25fd758dd28b0827cff54", size = 189356, upload-time = "2026-03-06T06:01:16.511Z" }, + { url = "https://files.pythonhosted.org/packages/42/69/2bf7f76ce1446759a5787cb87d38f6a61eb47dbbdf035cfebf6347292a65/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:dc57a0baa3eeedd99fafaef7511b5a6ef4581494e8168ee086031744e2679467", size = 206369, upload-time = "2026-03-06T06:01:17.853Z" }, + { url = 
"https://files.pythonhosted.org/packages/10/9c/949d1a46dab56b959d9a87272482195f1840b515a3380e39986989a893ae/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ed1a9a204f317ef879b32f9af507d47e49cd5e7f8e8d5d96358c98373314fc60", size = 203285, upload-time = "2026-03-06T06:01:19.473Z" }, + { url = "https://files.pythonhosted.org/packages/67/5c/ae30362a88b4da237d71ea214a8c7eb915db3eec941adda511729ac25fa2/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ad83b8f9379176c841f8865884f3514d905bcd2a9a3b210eaa446e7d2223e4d", size = 196274, upload-time = "2026-03-06T06:01:20.728Z" }, + { url = "https://files.pythonhosted.org/packages/b2/07/c9f2cb0e46cb6d64fdcc4f95953747b843bb2181bda678dc4e699b8f0f9a/charset_normalizer-3.4.5-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:a118e2e0b5ae6b0120d5efa5f866e58f2bb826067a646431da4d6a2bdae7950e", size = 184715, upload-time = "2026-03-06T06:01:22.194Z" }, + { url = "https://files.pythonhosted.org/packages/36/64/6b0ca95c44fddf692cd06d642b28f63009d0ce325fad6e9b2b4d0ef86a52/charset_normalizer-3.4.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:754f96058e61a5e22e91483f823e07df16416ce76afa4ebf306f8e1d1296d43f", size = 193426, upload-time = "2026-03-06T06:01:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/50/bc/a730690d726403743795ca3f5bb2baf67838c5fea78236098f324b965e40/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0c300cefd9b0970381a46394902cd18eaf2aa00163f999590ace991989dcd0fc", size = 191780, upload-time = "2026-03-06T06:01:25.053Z" }, + { url = "https://files.pythonhosted.org/packages/97/4f/6c0bc9af68222b22951552d73df4532b5be6447cee32d58e7e8c74ecbb7b/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c108f8619e504140569ee7de3f97d234f0fbae338a7f9f360455071ef9855a95", size = 185805, 
upload-time = "2026-03-06T06:01:26.294Z" }, + { url = "https://files.pythonhosted.org/packages/dd/b9/a523fb9b0ee90814b503452b2600e4cbc118cd68714d57041564886e7325/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d1028de43596a315e2720a9849ee79007ab742c06ad8b45a50db8cdb7ed4a82a", size = 208342, upload-time = "2026-03-06T06:01:27.55Z" }, + { url = "https://files.pythonhosted.org/packages/4d/61/c59e761dee4464050713e50e27b58266cc8e209e518c0b378c1580c959ba/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:19092dde50335accf365cce21998a1c6dd8eafd42c7b226eb54b2747cdce2fac", size = 193661, upload-time = "2026-03-06T06:01:29.051Z" }, + { url = "https://files.pythonhosted.org/packages/1c/43/729fa30aad69783f755c5ad8649da17ee095311ca42024742701e202dc59/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4354e401eb6dab9aed3c7b4030514328a6c748d05e1c3e19175008ca7de84fb1", size = 204819, upload-time = "2026-03-06T06:01:30.298Z" }, + { url = "https://files.pythonhosted.org/packages/87/33/d9b442ce5a91b96fc0840455a9e49a611bbadae6122778d0a6a79683dd31/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a68766a3c58fde7f9aaa22b3786276f62ab2f594efb02d0a1421b6282e852e98", size = 198080, upload-time = "2026-03-06T06:01:31.478Z" }, + { url = "https://files.pythonhosted.org/packages/56/5a/b8b5a23134978ee9885cee2d6995f4c27cc41f9baded0a9685eabc5338f0/charset_normalizer-3.4.5-cp312-cp312-win32.whl", hash = "sha256:1827734a5b308b65ac54e86a618de66f935a4f63a8a462ff1e19a6788d6c2262", size = 132630, upload-time = "2026-03-06T06:01:33.056Z" }, + { url = "https://files.pythonhosted.org/packages/70/53/e44a4c07e8904500aec95865dc3f6464dc3586a039ef0df606eb3ac38e35/charset_normalizer-3.4.5-cp312-cp312-win_amd64.whl", hash = "sha256:728c6a963dfab66ef865f49286e45239384249672cd598576765acc2a640a636", size = 142856, upload-time = "2026-03-06T06:01:34.489Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/aa/c5628f7cad591b1cf45790b7a61483c3e36cf41349c98af7813c483fd6e8/charset_normalizer-3.4.5-cp312-cp312-win_arm64.whl", hash = "sha256:75dfd1afe0b1647449e852f4fb428195a7ed0588947218f7ba929f6538487f02", size = 132982, upload-time = "2026-03-06T06:01:35.641Z" }, + { url = "https://files.pythonhosted.org/packages/c5/60/3a621758945513adfd4db86827a5bafcc615f913dbd0b4c2ed64a65731be/charset_normalizer-3.4.5-py3-none-any.whl", hash = "sha256:9db5e3fcdcee89a78c04dffb3fe33c79f77bd741a624946db2591c81b2fc85b0", size = 55455, upload-time = "2026-03-06T06:03:17.827Z" }, +] + +[[package]] +name = "click" +version = "8.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, +] + +[[package]] +name = "cloudpickle" +version = "3.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = 
"sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "dask" +version = "2026.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "cloudpickle" }, + { name = "fsspec" }, + { name = "importlib-metadata", marker = "python_full_version < '3.12'" }, + { name = "packaging" }, + { name = "partd" }, + { name = "pyyaml" }, + { name = "toolz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bd/52/b0f9172b22778def907db1ff173249e4eb41f054b46a9c83b1528aaf811f/dask-2026.1.2.tar.gz", hash = "sha256:1136683de2750d98ea792670f7434e6c1cfce90cab2cc2f2495a9e60fd25a4fc", size = 10997838, upload-time = "2026-01-30T21:04:20.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/23/d39ccc4ed76222db31530b0a7d38876fdb7673e23f838e8d8f0ed4651a4f/dask-2026.1.2-py3-none-any.whl", hash = "sha256:46a0cf3b8d87f78a3d2e6b145aea4418a6d6d606fe6a16c79bd8ca2bb862bc91", size = 1482084, upload-time = "2026-01-30T21:04:18.363Z" }, +] + +[[package]] +name = "defusedxml" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" }, +] + +[[package]] +name = "deprecated" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, +] + +[[package]] +name = "dill" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315, upload-time = "2026-01-19T02:36:56.85Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = 
"2026-01-19T02:36:55.663Z" }, +] + +[[package]] +name = "docling" +version = "2.48.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "accelerate" }, + { name = "beautifulsoup4" }, + { name = "certifi" }, + { name = "docling-core", extra = ["chunking"] }, + { name = "docling-ibm-models" }, + { name = "docling-parse" }, + { name = "easyocr" }, + { name = "filetype" }, + { name = "huggingface-hub" }, + { name = "lxml" }, + { name = "marko" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "pillow" }, + { name = "pluggy" }, + { name = "pydantic" }, + { name = "pydantic-settings" }, + { name = "pylatexenc" }, + { name = "pypdfium2" }, + { name = "python-docx" }, + { name = "python-pptx" }, + { name = "requests" }, + { name = "rtree" }, + { name = "scipy" }, + { name = "tqdm" }, + { name = "typer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/be/32/e117cb0dcc76c93828d2cd9b45c3f8ccf6c86314a60e9c65f16067d3df26/docling-2.48.0.tar.gz", hash = "sha256:e94a5f75c544ec1bbb9169d2f4da72e1f497fd2fcda57cfacc454c93b1c92a8e", size = 189422, upload-time = "2025-08-26T05:31:02.666Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/32/a9c6677c66178a397b89b5b6fe1e7b3d3de98ddc2b331fbcd7440419b9f0/docling-2.48.0-py3-none-any.whl", hash = "sha256:8a1c1dfd5ed84cadb0f81fcb1464e5d501c4bfaa121e15306e09e3c0c983cc3e", size = 212266, upload-time = "2025-08-26T05:31:00.779Z" }, +] + +[[package]] +name = "docling-core" +version = "2.68.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "defusedxml" }, + { name = "jsonref" }, + { name = "jsonschema" }, + { name = "latex2mathml" }, + { name = "pandas" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "tabulate" }, + { name = "typer" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/5e/b7/95e329d143528decd8f6af5d4db6c2d6bc3dc40f9d53ee5b7d5b901dfe11/docling_core-2.68.0.tar.gz", hash = "sha256:261ecb6281d45fcf0559640297eda728f8f7dd4fe8c8bf7ced42dbf9b4e46223", size = 267551, upload-time = "2026-03-07T12:20:24.523Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/66/d8bbe25dec2bb91d9090b939349b1c9b94c307edceada46c5bc6f213a569/docling_core-2.68.0-py3-none-any.whl", hash = "sha256:175145398c005399819a7cfe7b634257caaaecfbb4451840b8ddb31fc2f5ac12", size = 247092, upload-time = "2026-03-07T12:20:23.172Z" }, +] + +[package.optional-dependencies] +chunking = [ + { name = "semchunk" }, + { name = "transformers" }, + { name = "tree-sitter" }, + { name = "tree-sitter-c" }, + { name = "tree-sitter-javascript" }, + { name = "tree-sitter-python" }, + { name = "tree-sitter-typescript" }, +] + +[[package]] +name = "docling-ibm-models" +version = "3.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "accelerate" }, + { name = "docling-core" }, + { name = "huggingface-hub" }, + { name = "jsonlines" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "rtree" }, + { name = "safetensors", extra = ["torch"] }, + { name = "torch" }, + { name = "torchvision" }, + { name = "tqdm" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/91/f883e0a2b3466e1126dfd4463f386c70f5b90d271c27b6f5a97d2f8312e6/docling_ibm_models-3.11.0.tar.gz", hash = "sha256:454401563a8e79cb33b718bc559d9bacca8a0183583e48f8e616c9184c1f5eb1", size = 87721, upload-time = "2026-01-23T12:29:35.384Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/5d/97e9c2e10fbd3ee1723ac82c335f8211a9633c0397cc11ed057c3ba4006e/docling_ibm_models-3.11.0-py3-none-any.whl", hash = "sha256:68f7961069d643bfdab21b1c9ef24a979db293496f4c2283d95b1025a9ac5347", size = 87352, upload-time = "2026-01-23T12:29:34.045Z" }, +] + +[[package]] +name = 
"docling-parse" +version = "4.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docling-core" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "tabulate" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bb/7a/653c3b11920113217724fab9b4740f9f8964864f92a2a27590accecec5ac/docling_parse-4.7.3.tar.gz", hash = "sha256:5936e6bcb7969c2a13f38ecc75cada3b0919422dc845e96da4b0b7b3bbc394ce", size = 67646746, upload-time = "2026-01-14T14:18:19.376Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/81/dd317e0bce475153dc08a60a9a8615b1a04d4d3c9803175e6cb7b7e9b49b/docling_parse-4.7.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:66896bbe925073e4d48f18ec29dcd611a390d6b2378fae72125e77b020cd5664", size = 14615974, upload-time = "2026-01-14T14:17:30.246Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b5/088590e0b32fd0a393ca419c644d1435a1c99fa6b2a87888eef4d0fdea33/docling_parse-4.7.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:281347b3e937c1a5ffa6f8774ee603b64a0899fe8a6885573dec7eb48a3421d8", size = 14981051, upload-time = "2026-01-14T14:17:32.426Z" }, + { url = "https://files.pythonhosted.org/packages/b7/63/2b6c9127924487573d5419d58ec77955f0b7c0a923c8232ad461d71039aa/docling_parse-4.7.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3d86c51f9ce35a1b40b2f410f7271d9bd5fc58e7240f4cae7fdd2cef757e671", size = 15092586, upload-time = "2026-01-14T14:17:34.634Z" }, + { url = "https://files.pythonhosted.org/packages/af/89/ed27a83eb113bdf0b0f82f3c30a0db3c005df58b236f6487b232dacdb57a/docling_parse-4.7.3-cp311-cp311-win_amd64.whl", hash = "sha256:3b04459cc97a8a4929622e341b9981e23987a63af07db599afc5e1c4d389060b", size = 16144866, upload-time = "2026-01-14T14:17:36.742Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/26/9d86ae12699a25b7233f76ce062253e9c14e57781e00166b792b3a9d56db/docling_parse-4.7.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:d89231aa4fba3e38b80c11beb8edc07569e934c1f3935b51f57904fefe958ba5", size = 14616739, upload-time = "2026-01-14T14:17:38.567Z" }, + { url = "https://files.pythonhosted.org/packages/f2/fd/1aebb8a7f15d658f3be858ddbbc4ef7206089d540a7df0dcd4b846b99901/docling_parse-4.7.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dffd19ed373b0da5cea124606b183489a8686c3d18643e94485be1bdda5713ea", size = 14980782, upload-time = "2026-01-14T14:17:40.659Z" }, + { url = "https://files.pythonhosted.org/packages/3e/47/a722527c9f89c65f69f8a463be4f12ad73bae18132f29d8de8b2d9f6f082/docling_parse-4.7.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc32b6f25a673e41b9a8112b6b841284f60dbac9427b7848a03b435460f74aee", size = 15092450, upload-time = "2026-01-14T14:17:42.838Z" }, + { url = "https://files.pythonhosted.org/packages/91/c7/316373a92ba42c2aeaee128fc77a34333449fe3e820b9d524e0ee396ea35/docling_parse-4.7.3-cp312-cp312-win_amd64.whl", hash = "sha256:ef691045623863624f2cb7347572d0262a53cb84940ef7dd851d9f13a2eb8833", size = 16147359, upload-time = "2026-01-14T14:17:44.906Z" }, +] + +[[package]] +name = "easydict" +version = "1.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/9f/d18d6b5e19244788a6d09c14a8406376b4f4bfcc008e6d17a4f4c15362e8/easydict-1.13.tar.gz", hash = "sha256:b1135dedbc41c8010e2bc1f77ec9744c7faa42bce1a1c87416791449d6c87780", size = 6809, upload-time = "2024-03-04T12:04:41.251Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/ec/fa6963f1198172c2b75c9ab6ecefb3045991f92f75f5eb41b6621b198123/easydict-1.13-py3-none-any.whl", hash = "sha256:6b787daf4dcaf6377b4ad9403a5cee5a86adbc0ca9a5bcf5410e9902002aeac2", size = 6804, upload-time = "2024-03-04T12:04:39.508Z" }, 
+] + +[[package]] +name = "easyocr" +version = "1.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ninja" }, + { name = "numpy" }, + { name = "opencv-python-headless" }, + { name = "pillow" }, + { name = "pyclipper" }, + { name = "python-bidi" }, + { name = "pyyaml" }, + { name = "scikit-image" }, + { name = "scipy" }, + { name = "shapely" }, + { name = "torch" }, + { name = "torchvision" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/84/4a2cab0e6adde6a85e7ba543862e5fc0250c51f3ac721a078a55cdcff250/easyocr-1.7.2-py3-none-any.whl", hash = "sha256:5be12f9b0e595d443c9c3d10b0542074b50f0ec2d98b141a109cd961fd1c177c", size = 2870178, upload-time = "2024-09-24T11:34:43.554Z" }, +] + +[[package]] +name = "einops" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, +] + +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", 
hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, +] + +[[package]] +name = "filelock" +version = "3.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/77/18/a1fd2231c679dcb9726204645721b12498aeac28e1ad0601038f94b42556/filelock-3.25.0.tar.gz", hash = "sha256:8f00faf3abf9dc730a1ffe9c354ae5c04e079ab7d3a683b7c32da5dd05f26af3", size = 40158, upload-time = "2026-03-01T15:08:45.916Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, +] + +[[package]] +name = "filetype" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/29/745f7d30d47fe0f251d3ad3dc2978a23141917661998763bebb6da007eb1/filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb", size = 998020, upload-time = "2022-11-02T17:34:04.141Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" }, +] + +[[package]] +name = "fonttools" +version = "4.61.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/ca/cf17b88a8df95691275a3d77dc0a5ad9907f328ae53acbe6795da1b2f5ed/fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69", size = 3565756, upload-time = "2025-12-12T17:31:24.246Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/69/12/bf9f4eaa2fad039356cc627587e30ed008c03f1cebd3034376b5ee8d1d44/fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09", size = 2852213, upload-time = "2025-12-12T17:29:46.675Z" }, + { url = "https://files.pythonhosted.org/packages/ac/49/4138d1acb6261499bedde1c07f8c2605d1d8f9d77a151e5507fd3ef084b6/fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37", size = 2401689, upload-time = "2025-12-12T17:29:48.769Z" }, + { url = "https://files.pythonhosted.org/packages/e5/fe/e6ce0fe20a40e03aef906af60aa87668696f9e4802fa283627d0b5ed777f/fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb", size = 5058809, upload-time = "2025-12-12T17:29:51.701Z" }, + { url = "https://files.pythonhosted.org/packages/79/61/1ca198af22f7dd22c17ab86e9024ed3c06299cfdb08170640e9996d501a0/fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9", size = 5036039, upload-time = "2025-12-12T17:29:53.659Z" }, + { url = "https://files.pythonhosted.org/packages/99/cc/fa1801e408586b5fce4da9f5455af8d770f4fc57391cd5da7256bb364d38/fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87", size = 5034714, upload-time = "2025-12-12T17:29:55.592Z" }, + { url = "https://files.pythonhosted.org/packages/bf/aa/b7aeafe65adb1b0a925f8f25725e09f078c635bc22754f3fecb7456955b0/fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56", size = 5158648, upload-time = "2025-12-12T17:29:57.861Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/f9/08ea7a38663328881384c6e7777bbefc46fd7d282adfd87a7d2b84ec9d50/fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a", size = 2280681, upload-time = "2025-12-12T17:29:59.943Z" }, + { url = "https://files.pythonhosted.org/packages/07/ad/37dd1ae5fa6e01612a1fbb954f0927681f282925a86e86198ccd7b15d515/fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7", size = 2331951, upload-time = "2025-12-12T17:30:02.254Z" }, + { url = "https://files.pythonhosted.org/packages/6f/16/7decaa24a1bd3a70c607b2e29f0adc6159f36a7e40eaba59846414765fd4/fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e", size = 2851593, upload-time = "2025-12-12T17:30:04.225Z" }, + { url = "https://files.pythonhosted.org/packages/94/98/3c4cb97c64713a8cf499b3245c3bf9a2b8fd16a3e375feff2aed78f96259/fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2", size = 2400231, upload-time = "2025-12-12T17:30:06.47Z" }, + { url = "https://files.pythonhosted.org/packages/b7/37/82dbef0f6342eb01f54bca073ac1498433d6ce71e50c3c3282b655733b31/fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796", size = 4954103, upload-time = "2025-12-12T17:30:08.432Z" }, + { url = "https://files.pythonhosted.org/packages/6c/44/f3aeac0fa98e7ad527f479e161aca6c3a1e47bb6996b053d45226fe37bf2/fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d", size = 5004295, upload-time = "2025-12-12T17:30:10.56Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/e8/7424ced75473983b964d09f6747fa09f054a6d656f60e9ac9324cf40c743/fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8", size = 4944109, upload-time = "2025-12-12T17:30:12.874Z" }, + { url = "https://files.pythonhosted.org/packages/c8/8b/6391b257fa3d0b553d73e778f953a2f0154292a7a7a085e2374b111e5410/fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0", size = 5093598, upload-time = "2025-12-12T17:30:15.79Z" }, + { url = "https://files.pythonhosted.org/packages/d9/71/fd2ea96cdc512d92da5678a1c98c267ddd4d8c5130b76d0f7a80f9a9fde8/fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261", size = 2269060, upload-time = "2025-12-12T17:30:18.058Z" }, + { url = "https://files.pythonhosted.org/packages/80/3b/a3e81b71aed5a688e89dfe0e2694b26b78c7d7f39a5ffd8a7d75f54a12a8/fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9", size = 2319078, upload-time = "2025-12-12T17:30:22.862Z" }, + { url = "https://files.pythonhosted.org/packages/c7/4e/ce75a57ff3aebf6fc1f4e9d508b8e5810618a33d900ad6c19eb30b290b97/fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371", size = 1148996, upload-time = "2025-12-12T17:31:21.03Z" }, +] + +[[package]] +name = "fpdf2" +version = "2.8.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "defusedxml" }, + { name = "fonttools" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/f2/72feae0b2827ed38013e4307b14f95bf0b3d124adfef4d38a7d57533f7be/fpdf2-2.8.7.tar.gz", hash = "sha256:7060ccee5a9c7ab0a271fb765a36a23639f83ef8996c34e3d46af0a17ede57f9", size = 362351, upload-time = 
"2026-02-28T05:39:16.456Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/66/0a/cf50ecffa1e3747ed9380a3adfc829259f1f86b3fdbd9e505af789003141/fpdf2-2.8.7-py3-none-any.whl", hash = "sha256:d391fc508a3ce02fc43a577c830cda4fe6f37646f2d143d489839940932fbc19", size = 327056, upload-time = "2026-02-28T05:39:14.619Z" }, +] + +[[package]] +name = "frozenlist" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/03/077f869d540370db12165c0aa51640a873fb661d8b315d1d4d67b284d7ac/frozenlist-1.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:09474e9831bc2b2199fad6da3c14c7b0fbdd377cce9d3d77131be28906cb7d84", size = 86912, upload-time = "2025-10-06T05:35:45.98Z" }, + { url = "https://files.pythonhosted.org/packages/df/b5/7610b6bd13e4ae77b96ba85abea1c8cb249683217ef09ac9e0ae93f25a91/frozenlist-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:17c883ab0ab67200b5f964d2b9ed6b00971917d5d8a92df149dc2c9779208ee9", size = 50046, upload-time = "2025-10-06T05:35:47.009Z" }, + { url = "https://files.pythonhosted.org/packages/6e/ef/0e8f1fe32f8a53dd26bdd1f9347efe0778b0fddf62789ea683f4cc7d787d/frozenlist-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fa47e444b8ba08fffd1c18e8cdb9a75db1b6a27f17507522834ad13ed5922b93", size = 50119, upload-time = "2025-10-06T05:35:48.38Z" }, + { url = "https://files.pythonhosted.org/packages/11/b1/71a477adc7c36e5fb628245dfbdea2166feae310757dea848d02bd0689fd/frozenlist-1.8.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2552f44204b744fba866e573be4c1f9048d6a324dfe14475103fd51613eb1d1f", size = 231067, 
upload-time = "2025-10-06T05:35:49.97Z" }, + { url = "https://files.pythonhosted.org/packages/45/7e/afe40eca3a2dc19b9904c0f5d7edfe82b5304cb831391edec0ac04af94c2/frozenlist-1.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:957e7c38f250991e48a9a73e6423db1bb9dd14e722a10f6b8bb8e16a0f55f695", size = 233160, upload-time = "2025-10-06T05:35:51.729Z" }, + { url = "https://files.pythonhosted.org/packages/a6/aa/7416eac95603ce428679d273255ffc7c998d4132cfae200103f164b108aa/frozenlist-1.8.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8585e3bb2cdea02fc88ffa245069c36555557ad3609e83be0ec71f54fd4abb52", size = 228544, upload-time = "2025-10-06T05:35:53.246Z" }, + { url = "https://files.pythonhosted.org/packages/8b/3d/2a2d1f683d55ac7e3875e4263d28410063e738384d3adc294f5ff3d7105e/frozenlist-1.8.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:edee74874ce20a373d62dc28b0b18b93f645633c2943fd90ee9d898550770581", size = 243797, upload-time = "2025-10-06T05:35:54.497Z" }, + { url = "https://files.pythonhosted.org/packages/78/1e/2d5565b589e580c296d3bb54da08d206e797d941a83a6fdea42af23be79c/frozenlist-1.8.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c9a63152fe95756b85f31186bddf42e4c02c6321207fd6601a1c89ebac4fe567", size = 247923, upload-time = "2025-10-06T05:35:55.861Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c3/65872fcf1d326a7f101ad4d86285c403c87be7d832b7470b77f6d2ed5ddc/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6db2185db9be0a04fecf2f241c70b63b1a242e2805be291855078f2b404dd6b", size = 230886, upload-time = "2025-10-06T05:35:57.399Z" }, + { url = "https://files.pythonhosted.org/packages/a0/76/ac9ced601d62f6956f03cc794f9e04c81719509f85255abf96e2510f4265/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = 
"sha256:f4be2e3d8bc8aabd566f8d5b8ba7ecc09249d74ba3c9ed52e54dc23a293f0b92", size = 245731, upload-time = "2025-10-06T05:35:58.563Z" }, + { url = "https://files.pythonhosted.org/packages/b9/49/ecccb5f2598daf0b4a1415497eba4c33c1e8ce07495eb07d2860c731b8d5/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c8d1634419f39ea6f5c427ea2f90ca85126b54b50837f31497f3bf38266e853d", size = 241544, upload-time = "2025-10-06T05:35:59.719Z" }, + { url = "https://files.pythonhosted.org/packages/53/4b/ddf24113323c0bbcc54cb38c8b8916f1da7165e07b8e24a717b4a12cbf10/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a7fa382a4a223773ed64242dbe1c9c326ec09457e6b8428efb4118c685c3dfd", size = 241806, upload-time = "2025-10-06T05:36:00.959Z" }, + { url = "https://files.pythonhosted.org/packages/a7/fb/9b9a084d73c67175484ba2789a59f8eebebd0827d186a8102005ce41e1ba/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:11847b53d722050808926e785df837353bd4d75f1d494377e59b23594d834967", size = 229382, upload-time = "2025-10-06T05:36:02.22Z" }, + { url = "https://files.pythonhosted.org/packages/95/a3/c8fb25aac55bf5e12dae5c5aa6a98f85d436c1dc658f21c3ac73f9fa95e5/frozenlist-1.8.0-cp311-cp311-win32.whl", hash = "sha256:27c6e8077956cf73eadd514be8fb04d77fc946a7fe9f7fe167648b0b9085cc25", size = 39647, upload-time = "2025-10-06T05:36:03.409Z" }, + { url = "https://files.pythonhosted.org/packages/0a/f5/603d0d6a02cfd4c8f2a095a54672b3cf967ad688a60fb9faf04fc4887f65/frozenlist-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac913f8403b36a2c8610bbfd25b8013488533e71e62b4b4adce9c86c8cea905b", size = 44064, upload-time = "2025-10-06T05:36:04.368Z" }, + { url = "https://files.pythonhosted.org/packages/5d/16/c2c9ab44e181f043a86f9a8f84d5124b62dbcb3a02c0977ec72b9ac1d3e0/frozenlist-1.8.0-cp311-cp311-win_arm64.whl", hash = "sha256:d4d3214a0f8394edfa3e303136d0575eece0745ff2b47bd2cb2e66dd92d4351a", size = 39937, upload-time = "2025-10-06T05:36:05.669Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782, upload-time = "2025-10-06T05:36:06.649Z" }, + { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594, upload-time = "2025-10-06T05:36:07.69Z" }, + { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448, upload-time = "2025-10-06T05:36:08.78Z" }, + { url = "https://files.pythonhosted.org/packages/6a/bd/d91c5e39f490a49df14320f4e8c80161cfcce09f1e2cde1edd16a551abb3/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383", size = 242411, upload-time = "2025-10-06T05:36:09.801Z" }, + { url = "https://files.pythonhosted.org/packages/8f/83/f61505a05109ef3293dfb1ff594d13d64a2324ac3482be2cedc2be818256/frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4", size = 243014, upload-time = "2025-10-06T05:36:11.394Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cb/cb6c7b0f7d4023ddda30cf56b8b17494eb3a79e3fda666bf735f63118b35/frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8", size = 234909, upload-time = 
"2025-10-06T05:36:12.598Z" }, + { url = "https://files.pythonhosted.org/packages/31/c5/cd7a1f3b8b34af009fb17d4123c5a778b44ae2804e3ad6b86204255f9ec5/frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b", size = 250049, upload-time = "2025-10-06T05:36:14.065Z" }, + { url = "https://files.pythonhosted.org/packages/c0/01/2f95d3b416c584a1e7f0e1d6d31998c4a795f7544069ee2e0962a4b60740/frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52", size = 256485, upload-time = "2025-10-06T05:36:15.39Z" }, + { url = "https://files.pythonhosted.org/packages/ce/03/024bf7720b3abaebcff6d0793d73c154237b85bdf67b7ed55e5e9596dc9a/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29", size = 237619, upload-time = "2025-10-06T05:36:16.558Z" }, + { url = "https://files.pythonhosted.org/packages/69/fa/f8abdfe7d76b731f5d8bd217827cf6764d4f1d9763407e42717b4bed50a0/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3", size = 250320, upload-time = "2025-10-06T05:36:17.821Z" }, + { url = "https://files.pythonhosted.org/packages/f5/3c/b051329f718b463b22613e269ad72138cc256c540f78a6de89452803a47d/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143", size = 246820, upload-time = "2025-10-06T05:36:19.046Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ae/58282e8f98e444b3f4dd42448ff36fa38bef29e40d40f330b22e7108f565/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608", size = 250518, upload-time = 
"2025-10-06T05:36:20.763Z" }, + { url = "https://files.pythonhosted.org/packages/8f/96/007e5944694d66123183845a106547a15944fbbb7154788cbf7272789536/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa", size = 239096, upload-time = "2025-10-06T05:36:22.129Z" }, + { url = "https://files.pythonhosted.org/packages/66/bb/852b9d6db2fa40be96f29c0d1205c306288f0684df8fd26ca1951d461a56/frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf", size = 39985, upload-time = "2025-10-06T05:36:23.661Z" }, + { url = "https://files.pythonhosted.org/packages/b8/af/38e51a553dd66eb064cdf193841f16f077585d4d28394c2fa6235cb41765/frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746", size = 44591, upload-time = "2025-10-06T05:36:24.958Z" }, + { url = "https://files.pythonhosted.org/packages/a7/06/1dc65480ab147339fecc70797e9c2f69d9cea9cf38934ce08df070fdb9cb/frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd", size = 40102, upload-time = "2025-10-06T05:36:26.333Z" }, + { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409, upload-time = "2025-10-06T05:38:16.721Z" }, +] + +[[package]] +name = "fsspec" +version = "2026.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, +] + +[[package]] +name = "ftfy" +version = "6.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a5/d3/8650919bc3c7c6e90ee3fa7fd618bf373cbbe55dff043bd67353dbb20cd8/ftfy-6.3.1.tar.gz", hash = "sha256:9b3c3d90f84fb267fe64d375a07b7f8912d817cf86009ae134aa03e1819506ec", size = 308927, upload-time = "2024-10-26T00:50:35.149Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl", hash = "sha256:7c70eb532015cd2f9adb53f101fb6c7945988d023a085d127d1573dc49dd0083", size = 44821, upload-time = "2024-10-26T00:50:33.425Z" }, +] + +[[package]] +name = "glossapi" +version = "0.1.3" +source = { editable = "../../" } +dependencies = [ + { name = "aiofiles" }, + { name = "aiohttp" }, + { name = "dask" }, + { name = "ftfy" }, + { name = "joblib" }, + { name = "numpy" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pypdfium2" }, + { name = "pyyaml" }, + { name = "scikit-learn" }, + { name = "tenacity" }, + { name = "tqdm" }, + { name = "zstandard" }, +] + +[package.optional-dependencies] +deepseek = [ + { name = "accelerate" }, + { name = "addict" }, + { name = "easydict" }, + { name = "einops" }, + { name = "img2pdf" }, + { name = "pillow" }, + { name = "pymupdf" }, + { name = "tokenizers" }, + { name = "transformers" }, +] +docling = [ + { name = "docling" }, +] + +[package.metadata] +requires-dist = [ + { name = "accelerate", marker = "extra == 'deepseek'", specifier = ">=1.2.1,<2" }, + { name = "addict", marker = "extra == 'deepseek'" }, + { name = "aiofiles", specifier = 
">=23.0.0" }, + { name = "aiohttp", specifier = ">=3.8.0" }, + { name = "dask", specifier = ">=2022.1.0" }, + { name = "docling", marker = "extra == 'docling'", specifier = "==2.48.0" }, + { name = "easydict", marker = "extra == 'deepseek'" }, + { name = "einops", marker = "extra == 'deepseek'" }, + { name = "ftfy", specifier = ">=6.0.0" }, + { name = "img2pdf", marker = "extra == 'deepseek'", specifier = ">=0.5.1" }, + { name = "joblib", specifier = ">=1.0.0" }, + { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.5" }, + { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5" }, + { name = "numpy", specifier = "<2" }, + { name = "pandas", specifier = ">=1.3.0" }, + { name = "pillow", marker = "extra == 'deepseek'", specifier = "==10.4.0" }, + { name = "pyarrow", specifier = ">=7.0.0" }, + { name = "pymupdf", marker = "extra == 'deepseek'", specifier = "==1.24.10" }, + { name = "pypdfium2", specifier = ">=4.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, + { name = "scikit-learn", specifier = "==1.6.1" }, + { name = "tenacity", specifier = ">=8.0.0" }, + { name = "tokenizers", marker = "extra == 'deepseek'", specifier = "==0.20.3" }, + { name = "torch", marker = "extra == 'cuda'", specifier = "==2.5.1" }, + { name = "torchvision", marker = "extra == 'cuda'", specifier = "==0.20.1" }, + { name = "tqdm", specifier = ">=4.67.0" }, + { name = "transformers", marker = "extra == 'deepseek'", specifier = "==4.46.3" }, + { name = "zstandard", specifier = ">=0.22.0" }, +] +provides-extras = ["docling", "cuda", "deepseek", "docs"] + +[[package]] +name = "glossapi-deepseek-runtime" +version = "0.1.0" +source = { virtual = "." 
} +dependencies = [ + { name = "glossapi", extra = ["deepseek", "docling"] }, + { name = "torch" }, + { name = "torchaudio" }, + { name = "torchvision" }, +] + +[package.dev-dependencies] +test = [ + { name = "fpdf2" }, + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [ + { name = "glossapi", extras = ["docling", "deepseek"], editable = "../../" }, + { name = "torch", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, + { name = "torchaudio", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, + { name = "torchvision", specifier = "==0.21.0", index = "https://download.pytorch.org/whl/cu118" }, +] + +[package.metadata.requires-dev] +test = [ + { name = "fpdf2" }, + { name = "pytest" }, +] + +[[package]] +name = "hf-xet" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/cb/9bb543bd987ffa1ee48202cc96a756951b734b79a542335c566148ade36c/hf_xet-1.3.2.tar.gz", hash = "sha256:e130ee08984783d12717444e538587fa2119385e5bd8fc2bb9f930419b73a7af", size = 643646, upload-time = "2026-02-27T17:26:08.051Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/28/dbb024e2e3907f6f3052847ca7d1a2f7a3972fafcd53ff79018977fcb3e4/hf_xet-1.3.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f93b7595f1d8fefddfede775c18b5c9256757824f7f6832930b49858483cd56f", size = 3763961, upload-time = "2026-02-27T17:25:52.537Z" }, + { url = "https://files.pythonhosted.org/packages/e4/71/b99aed3823c9d1795e4865cf437d651097356a3f38c7d5877e4ac544b8e4/hf_xet-1.3.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:a85d3d43743174393afe27835bde0cd146e652b5fcfdbcd624602daef2ef3259", size = 3526171, upload-time = "2026-02-27T17:25:50.968Z" }, + { url = "https://files.pythonhosted.org/packages/9d/ca/907890ce6ef5598b5920514f255ed0a65f558f820515b18db75a51b2f878/hf_xet-1.3.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:7c2a054a97c44e136b1f7f5a78f12b3efffdf2eed3abc6746fc5ea4b39511633", size = 4180750, upload-time = "2026-02-27T17:25:43.125Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ad/bc7f41f87173d51d0bce497b171c4ee0cbde1eed2d7b4216db5d0ada9f50/hf_xet-1.3.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:06b724a361f670ae557836e57801b82c75b534812e351a87a2c739f77d1e0635", size = 3961035, upload-time = "2026-02-27T17:25:41.837Z" }, + { url = "https://files.pythonhosted.org/packages/73/38/600f4dda40c4a33133404d9fe644f1d35ff2d9babb4d0435c646c63dd107/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:305f5489d7241a47e0458ef49334be02411d1d0f480846363c1c8084ed9916f7", size = 4161378, upload-time = "2026-02-27T17:26:00.365Z" }, + { url = "https://files.pythonhosted.org/packages/00/b3/7bc1ff91d1ac18420b7ad1e169b618b27c00001b96310a89f8a9294fe509/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:06cdbde243c85f39a63b28e9034321399c507bcd5e7befdd17ed2ccc06dfe14e", size = 4398020, upload-time = "2026-02-27T17:26:03.977Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0b/99bfd948a3ed3620ab709276df3ad3710dcea61976918cce8706502927af/hf_xet-1.3.2-cp37-abi3-win_amd64.whl", hash = "sha256:9298b47cce6037b7045ae41482e703c471ce36b52e73e49f71226d2e8e5685a1", size = 3641624, upload-time = "2026-02-27T17:26:13.542Z" }, + { url = "https://files.pythonhosted.org/packages/cc/02/9a6e4ca1f3f73a164c0cd48e41b3cc56585dcc37e809250de443d673266f/hf_xet-1.3.2-cp37-abi3-win_arm64.whl", hash = "sha256:83d8ec273136171431833a6957e8f3af496bee227a0fe47c7b8b39c106d1749a", size = 3503976, upload-time = "2026-02-27T17:26:12.123Z" }, +] + +[[package]] +name = "huggingface-hub" +version = "0.36.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 
'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, +] + +[[package]] +name = "idna" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, +] + +[[package]] +name = "imageio" +version = "2.37.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/6f/606be632e37bf8d05b253e8626c2291d74c691ddc7bcdf7d6aaf33b32f6a/imageio-2.37.2.tar.gz", hash = "sha256:0212ef2727ac9caa5ca4b2c75ae89454312f440a756fcfc8ef1993e718f50f8a", size = 389600, upload-time = "2025-11-04T14:29:39.898Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/fb/fe/301e0936b79bcab4cacc7548bf2853fc28dced0a578bab1f7ef53c9aa75b/imageio-2.37.2-py3-none-any.whl", hash = "sha256:ad9adfb20335d718c03de457358ed69f141021a333c40a53e57273d8a5bd0b9b", size = 317646, upload-time = "2025-11-04T14:29:37.948Z" }, +] + +[[package]] +name = "img2pdf" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pikepdf" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8e/97/ca44c467131b93fda82d2a2f21b738c8bcf63b5259e3b8250e928b8dd52a/img2pdf-0.6.3.tar.gz", hash = "sha256:219518020f5bd242bdc46493941ea3f756f664c2e86f2454721e74353f58cd95", size = 120350, upload-time = "2025-11-05T20:51:57.558Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/dc/91e3a4a11c25ae183bd5a71b84ecb298db76405ff70013f76b10877bdfe3/img2pdf-0.6.3-py3-none-any.whl", hash = "sha256:44d12d235752edd17c43c04ff39952cdc5dd4c6aba90569c4902bd445085266b", size = 49701, upload-time = "2025-11-05T20:51:55.469Z" }, +] + +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp", marker = "python_full_version < '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "joblib" +version = "1.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = 
"2025-12-15T08:41:44.973Z" }, +] + +[[package]] +name = "jsonlines" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/35/87/bcda8e46c88d0e34cad2f09ee2d0c7f5957bccdb9791b0b934ec84d84be4/jsonlines-4.0.0.tar.gz", hash = "sha256:0c6d2c09117550c089995247f605ae4cf77dd1533041d366351f6f298822ea74", size = 11359, upload-time = "2023-09-01T12:34:44.187Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/62/d9ba6323b9202dd2fe166beab8a86d29465c41a0288cbe229fac60c1ab8d/jsonlines-4.0.0-py3-none-any.whl", hash = "sha256:185b334ff2ca5a91362993f42e83588a360cf95ce4b71a73548502bda52a7c55", size = 8701, upload-time = "2023-09-01T12:34:42.563Z" }, +] + +[[package]] +name = "jsonref" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814, upload-time = "2023-01-16T16:10:04.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425, upload-time = "2023-01-16T16:10:02.255Z" }, +] + +[[package]] +name = "jsonschema" +version = "4.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = 
"2026-01-07T13:41:07.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, +] + +[[package]] +name = "latex2mathml" +version = "3.78.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/26/57b1034c08922d0aefea79430a5e0006ffaee4f0ec59d566613f667ab2f7/latex2mathml-3.78.1.tar.gz", hash = "sha256:f941db80bf41db33f31df87b304e8b588f8166b813b0257c11c98f7a9d0aac71", size = 74030, upload-time = "2025-08-29T23:34:23.178Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3e/76/d661ea2e529c3d464f9efd73f9ac31626b45279eb4306e684054ea20e3d4/latex2mathml-3.78.1-py3-none-any.whl", hash = "sha256:f089b6d75e85b937f99693c93e8c16c0804008672c3dd2a3d25affd36f238100", size = 73892, upload-time = "2025-08-29T23:34:21.98Z" }, +] + +[[package]] +name = "lazy-loader" +version = "0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = 
"packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/ac/21a1f8aa3777f5658576777ea76bfb124b702c520bbe90edf4ae9915eafa/lazy_loader-0.5.tar.gz", hash = "sha256:717f9179a0dbed357012ddad50a5ad3d5e4d9a0b8712680d4e687f5e6e6ed9b3", size = 15294, upload-time = "2026-03-06T15:45:09.054Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" }, +] + +[[package]] +name = "locket" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2f/83/97b29fe05cb6ae28d2dbd30b81e2e402a3eed5f460c26e9eaa5895ceacf5/locket-1.0.0.tar.gz", hash = "sha256:5c0d4c052a8bbbf750e056a8e65ccd309086f4f0f18a2eac306a8dfa4112a632", size = 4350, upload-time = "2022-04-20T22:04:44.312Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/bc/83e112abc66cd466c6b83f99118035867cecd41802f8d044638aa78a106e/locket-1.0.0-py2.py3-none-any.whl", hash = "sha256:b6c819a722f7b6bd955b80781788e4a66a55628b858d347536b7e81325a3a5e3", size = 4398, upload-time = "2022-04-20T22:04:42.23Z" }, +] + +[[package]] +name = "lxml" +version = "5.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/76/3d/14e82fc7c8fb1b7761f7e748fd47e2ec8276d137b6acfe5a4bb73853e08f/lxml-5.4.0.tar.gz", hash = "sha256:d12832e1dbea4be280b22fd0ea7c9b87f0d8fc51ba06e92dc62d52f804f78ebd", size = 3679479, upload-time = "2025-04-23T01:50:29.322Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/2d/67693cc8a605a12e5975380d7ff83020dcc759351b5a066e1cced04f797b/lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:98a3912194c079ef37e716ed228ae0dcb960992100461b704aea4e93af6b0bb9", size = 8083240, upload-time = 
"2025-04-23T01:45:18.566Z" }, + { url = "https://files.pythonhosted.org/packages/73/53/b5a05ab300a808b72e848efd152fe9c022c0181b0a70b8bca1199f1bed26/lxml-5.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ea0252b51d296a75f6118ed0d8696888e7403408ad42345d7dfd0d1e93309a7", size = 4387685, upload-time = "2025-04-23T01:45:21.387Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cb/1a3879c5f512bdcd32995c301886fe082b2edd83c87d41b6d42d89b4ea4d/lxml-5.4.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b92b69441d1bd39f4940f9eadfa417a25862242ca2c396b406f9272ef09cdcaa", size = 4991164, upload-time = "2025-04-23T01:45:23.849Z" }, + { url = "https://files.pythonhosted.org/packages/f9/94/bbc66e42559f9d04857071e3b3d0c9abd88579367fd2588a4042f641f57e/lxml-5.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20e16c08254b9b6466526bc1828d9370ee6c0d60a4b64836bc3ac2917d1e16df", size = 4746206, upload-time = "2025-04-23T01:45:26.361Z" }, + { url = "https://files.pythonhosted.org/packages/66/95/34b0679bee435da2d7cae895731700e519a8dfcab499c21662ebe671603e/lxml-5.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7605c1c32c3d6e8c990dd28a0970a3cbbf1429d5b92279e37fda05fb0c92190e", size = 5342144, upload-time = "2025-04-23T01:45:28.939Z" }, + { url = "https://files.pythonhosted.org/packages/e0/5d/abfcc6ab2fa0be72b2ba938abdae1f7cad4c632f8d552683ea295d55adfb/lxml-5.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ecf4c4b83f1ab3d5a7ace10bafcb6f11df6156857a3c418244cef41ca9fa3e44", size = 4825124, upload-time = "2025-04-23T01:45:31.361Z" }, + { url = "https://files.pythonhosted.org/packages/5a/78/6bd33186c8863b36e084f294fc0a5e5eefe77af95f0663ef33809cc1c8aa/lxml-5.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cef4feae82709eed352cd7e97ae062ef6ae9c7b5dbe3663f104cd2c0e8d94ba", size = 4876520, 
upload-time = "2025-04-23T01:45:34.191Z" }, + { url = "https://files.pythonhosted.org/packages/3b/74/4d7ad4839bd0fc64e3d12da74fc9a193febb0fae0ba6ebd5149d4c23176a/lxml-5.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:df53330a3bff250f10472ce96a9af28628ff1f4efc51ccba351a8820bca2a8ba", size = 4765016, upload-time = "2025-04-23T01:45:36.7Z" }, + { url = "https://files.pythonhosted.org/packages/24/0d/0a98ed1f2471911dadfc541003ac6dd6879fc87b15e1143743ca20f3e973/lxml-5.4.0-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:aefe1a7cb852fa61150fcb21a8c8fcea7b58c4cb11fbe59c97a0a4b31cae3c8c", size = 5362884, upload-time = "2025-04-23T01:45:39.291Z" }, + { url = "https://files.pythonhosted.org/packages/48/de/d4f7e4c39740a6610f0f6959052b547478107967362e8424e1163ec37ae8/lxml-5.4.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ef5a7178fcc73b7d8c07229e89f8eb45b2908a9238eb90dcfc46571ccf0383b8", size = 4902690, upload-time = "2025-04-23T01:45:42.386Z" }, + { url = "https://files.pythonhosted.org/packages/07/8c/61763abd242af84f355ca4ef1ee096d3c1b7514819564cce70fd18c22e9a/lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d2ed1b3cb9ff1c10e6e8b00941bb2e5bb568b307bfc6b17dffbbe8be5eecba86", size = 4944418, upload-time = "2025-04-23T01:45:46.051Z" }, + { url = "https://files.pythonhosted.org/packages/f9/c5/6d7e3b63e7e282619193961a570c0a4c8a57fe820f07ca3fe2f6bd86608a/lxml-5.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:72ac9762a9f8ce74c9eed4a4e74306f2f18613a6b71fa065495a67ac227b3056", size = 4827092, upload-time = "2025-04-23T01:45:48.943Z" }, + { url = "https://files.pythonhosted.org/packages/71/4a/e60a306df54680b103348545706a98a7514a42c8b4fbfdcaa608567bb065/lxml-5.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f5cb182f6396706dc6cc1896dd02b1c889d644c081b0cdec38747573db88a7d7", size = 5418231, upload-time = "2025-04-23T01:45:51.481Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/f2/9754aacd6016c930875854f08ac4b192a47fe19565f776a64004aa167521/lxml-5.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:3a3178b4873df8ef9457a4875703488eb1622632a9cee6d76464b60e90adbfcd", size = 5261798, upload-time = "2025-04-23T01:45:54.146Z" }, + { url = "https://files.pythonhosted.org/packages/38/a2/0c49ec6941428b1bd4f280650d7b11a0f91ace9db7de32eb7aa23bcb39ff/lxml-5.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e094ec83694b59d263802ed03a8384594fcce477ce484b0cbcd0008a211ca751", size = 4988195, upload-time = "2025-04-23T01:45:56.685Z" }, + { url = "https://files.pythonhosted.org/packages/7a/75/87a3963a08eafc46a86c1131c6e28a4de103ba30b5ae903114177352a3d7/lxml-5.4.0-cp311-cp311-win32.whl", hash = "sha256:4329422de653cdb2b72afa39b0aa04252fca9071550044904b2e7036d9d97fe4", size = 3474243, upload-time = "2025-04-23T01:45:58.863Z" }, + { url = "https://files.pythonhosted.org/packages/fa/f9/1f0964c4f6c2be861c50db380c554fb8befbea98c6404744ce243a3c87ef/lxml-5.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:fd3be6481ef54b8cfd0e1e953323b7aa9d9789b94842d0e5b142ef4bb7999539", size = 3815197, upload-time = "2025-04-23T01:46:01.096Z" }, + { url = "https://files.pythonhosted.org/packages/f8/4c/d101ace719ca6a4ec043eb516fcfcb1b396a9fccc4fcd9ef593df34ba0d5/lxml-5.4.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b5aff6f3e818e6bdbbb38e5967520f174b18f539c2b9de867b1e7fde6f8d95a4", size = 8127392, upload-time = "2025-04-23T01:46:04.09Z" }, + { url = "https://files.pythonhosted.org/packages/11/84/beddae0cec4dd9ddf46abf156f0af451c13019a0fa25d7445b655ba5ccb7/lxml-5.4.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942a5d73f739ad7c452bf739a62a0f83e2578afd6b8e5406308731f4ce78b16d", size = 4415103, upload-time = "2025-04-23T01:46:07.227Z" }, + { url = 
"https://files.pythonhosted.org/packages/d0/25/d0d93a4e763f0462cccd2b8a665bf1e4343dd788c76dcfefa289d46a38a9/lxml-5.4.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:460508a4b07364d6abf53acaa0a90b6d370fafde5693ef37602566613a9b0779", size = 5024224, upload-time = "2025-04-23T01:46:10.237Z" }, + { url = "https://files.pythonhosted.org/packages/31/ce/1df18fb8f7946e7f3388af378b1f34fcf253b94b9feedb2cec5969da8012/lxml-5.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:529024ab3a505fed78fe3cc5ddc079464e709f6c892733e3f5842007cec8ac6e", size = 4769913, upload-time = "2025-04-23T01:46:12.757Z" }, + { url = "https://files.pythonhosted.org/packages/4e/62/f4a6c60ae7c40d43657f552f3045df05118636be1165b906d3423790447f/lxml-5.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ca56ebc2c474e8f3d5761debfd9283b8b18c76c4fc0967b74aeafba1f5647f9", size = 5290441, upload-time = "2025-04-23T01:46:16.037Z" }, + { url = "https://files.pythonhosted.org/packages/9e/aa/04f00009e1e3a77838c7fc948f161b5d2d5de1136b2b81c712a263829ea4/lxml-5.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a81e1196f0a5b4167a8dafe3a66aa67c4addac1b22dc47947abd5d5c7a3f24b5", size = 4820165, upload-time = "2025-04-23T01:46:19.137Z" }, + { url = "https://files.pythonhosted.org/packages/c9/1f/e0b2f61fa2404bf0f1fdf1898377e5bd1b74cc9b2cf2c6ba8509b8f27990/lxml-5.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00b8686694423ddae324cf614e1b9659c2edb754de617703c3d29ff568448df5", size = 4932580, upload-time = "2025-04-23T01:46:21.963Z" }, + { url = "https://files.pythonhosted.org/packages/24/a2/8263f351b4ffe0ed3e32ea7b7830f845c795349034f912f490180d88a877/lxml-5.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:c5681160758d3f6ac5b4fea370495c48aac0989d6a0f01bb9a72ad8ef5ab75c4", size = 4759493, upload-time = "2025-04-23T01:46:24.316Z" }, 
+ { url = "https://files.pythonhosted.org/packages/05/00/41db052f279995c0e35c79d0f0fc9f8122d5b5e9630139c592a0b58c71b4/lxml-5.4.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:2dc191e60425ad70e75a68c9fd90ab284df64d9cd410ba8d2b641c0c45bc006e", size = 5324679, upload-time = "2025-04-23T01:46:27.097Z" }, + { url = "https://files.pythonhosted.org/packages/1d/be/ee99e6314cdef4587617d3b3b745f9356d9b7dd12a9663c5f3b5734b64ba/lxml-5.4.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:67f779374c6b9753ae0a0195a892a1c234ce8416e4448fe1e9f34746482070a7", size = 4890691, upload-time = "2025-04-23T01:46:30.009Z" }, + { url = "https://files.pythonhosted.org/packages/ad/36/239820114bf1d71f38f12208b9c58dec033cbcf80101cde006b9bde5cffd/lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:79d5bfa9c1b455336f52343130b2067164040604e41f6dc4d8313867ed540079", size = 4955075, upload-time = "2025-04-23T01:46:32.33Z" }, + { url = "https://files.pythonhosted.org/packages/d4/e1/1b795cc0b174efc9e13dbd078a9ff79a58728a033142bc6d70a1ee8fc34d/lxml-5.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3d3c30ba1c9b48c68489dc1829a6eede9873f52edca1dda900066542528d6b20", size = 4838680, upload-time = "2025-04-23T01:46:34.852Z" }, + { url = "https://files.pythonhosted.org/packages/72/48/3c198455ca108cec5ae3662ae8acd7fd99476812fd712bb17f1b39a0b589/lxml-5.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1af80c6316ae68aded77e91cd9d80648f7dd40406cef73df841aa3c36f6907c8", size = 5391253, upload-time = "2025-04-23T01:46:37.608Z" }, + { url = "https://files.pythonhosted.org/packages/d6/10/5bf51858971c51ec96cfc13e800a9951f3fd501686f4c18d7d84fe2d6352/lxml-5.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4d885698f5019abe0de3d352caf9466d5de2baded00a06ef3f1216c1a58ae78f", size = 5261651, upload-time = "2025-04-23T01:46:40.183Z" }, + { url = 
"https://files.pythonhosted.org/packages/2b/11/06710dd809205377da380546f91d2ac94bad9ff735a72b64ec029f706c85/lxml-5.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea53d51859b6c64e7c51d522c03cc2c48b9b5d6172126854cc7f01aa11f52bc", size = 5024315, upload-time = "2025-04-23T01:46:43.333Z" }, + { url = "https://files.pythonhosted.org/packages/f5/b0/15b6217834b5e3a59ebf7f53125e08e318030e8cc0d7310355e6edac98ef/lxml-5.4.0-cp312-cp312-win32.whl", hash = "sha256:d90b729fd2732df28130c064aac9bb8aff14ba20baa4aee7bd0795ff1187545f", size = 3486149, upload-time = "2025-04-23T01:46:45.684Z" }, + { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095, upload-time = "2025-04-23T01:46:48.521Z" }, +] + +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + +[[package]] +name = "marko" +version = "2.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/2f/050b6d485f052ddf17d76a41f9334d6fb2a8a85df35347a12d97ed3bc5c1/marko-2.2.2.tar.gz", hash = "sha256:6940308e655f63733ca518c47a68ec9510279dbb916c83616e4c4b5829f052e8", size = 
143641, upload-time = "2026-01-05T11:04:41.935Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/f8/36d79bac5701e6786f9880c61bbe57574760a13c1af84ab71e5ed21faecc/marko-2.2.2-py3-none-any.whl", hash = "sha256:f064ae8c10416285ad1d96048dc11e98ef04e662d3342ae416f662b70aa7959e", size = 42701, upload-time = "2026-01-05T11:04:40.75Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" }, + { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" }, + { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" }, + { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" }, + { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" }, + { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" }, + { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" }, + { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" }, + { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" }, + { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = 
"2025-09-27T18:36:29.025Z" }, + { url = "https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" }, + { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" }, + { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" }, + { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" }, + { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = 
"2025-09-27T18:36:35.099Z" }, + { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" }, + { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" }, + { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" }, + { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" }, + { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" }, + { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + +[[package]] +name = "mpire" +version = "2.10.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/93/80ac75c20ce54c785648b4ed363c88f148bf22637e10c9863db4fbe73e74/mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97", size = 271270, upload-time = "2024-05-07T14:00:31.815Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/14/1db1729ad6db4999c3a16c47937d601fcb909aaa4224f5eca5a2f145a605/mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb", size = 272756, upload-time = "2024-05-07T14:00:29.633Z" }, +] + +[package.optional-dependencies] +dill = [ + { name = "multiprocess" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + +[[package]] +name = "multidict" +version = "6.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/f1/a90635c4f88fb913fbf4ce660b83b7445b7a02615bda034b2f8eb38fd597/multidict-6.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7ff981b266af91d7b4b3793ca3382e53229088d193a85dfad6f5f4c27fc73e5d", size = 76626, upload-time = "2026-01-26T02:43:26.485Z" }, + { url = "https://files.pythonhosted.org/packages/a6/9b/267e64eaf6fc637a15b35f5de31a566634a2740f97d8d094a69d34f524a4/multidict-6.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:844c5bca0b5444adb44a623fb0a1310c2f4cd41f402126bb269cd44c9b3f3e1e", size = 44706, upload-time = "2026-01-26T02:43:27.607Z" }, + { url = "https://files.pythonhosted.org/packages/dd/a4/d45caf2b97b035c57267791ecfaafbd59c68212004b3842830954bb4b02e/multidict-6.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f2a0a924d4c2e9afcd7ec64f9de35fcd96915149b2216e1cb2c10a56df483855", size = 44356, upload-time = "2026-01-26T02:43:28.661Z" }, + { url = "https://files.pythonhosted.org/packages/fd/d2/0a36c8473f0cbaeadd5db6c8b72d15bbceeec275807772bfcd059bef487d/multidict-6.7.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8be1802715a8e892c784c0197c2ace276ea52702a0ede98b6310c8f255a5afb3", size = 244355, upload-time = "2026-01-26T02:43:31.165Z" }, + { url = 
"https://files.pythonhosted.org/packages/5d/16/8c65be997fd7dd311b7d39c7b6e71a0cb449bad093761481eccbbe4b42a2/multidict-6.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2d2ed645ea29f31c4c7ea1552fcfd7cb7ba656e1eafd4134a6620c9f5fdd9e", size = 246433, upload-time = "2026-01-26T02:43:32.581Z" }, + { url = "https://files.pythonhosted.org/packages/01/fb/4dbd7e848d2799c6a026ec88ad39cf2b8416aa167fcc903baa55ecaa045c/multidict-6.7.1-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:95922cee9a778659e91db6497596435777bd25ed116701a4c034f8e46544955a", size = 225376, upload-time = "2026-01-26T02:43:34.417Z" }, + { url = "https://files.pythonhosted.org/packages/b6/8a/4a3a6341eac3830f6053062f8fbc9a9e54407c80755b3f05bc427295c2d0/multidict-6.7.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6b83cabdc375ffaaa15edd97eb7c0c672ad788e2687004990074d7d6c9b140c8", size = 257365, upload-time = "2026-01-26T02:43:35.741Z" }, + { url = "https://files.pythonhosted.org/packages/f7/a2/dd575a69c1aa206e12d27d0770cdf9b92434b48a9ef0cd0d1afdecaa93c4/multidict-6.7.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:38fb49540705369bab8484db0689d86c0a33a0a9f2c1b197f506b71b4b6c19b0", size = 254747, upload-time = "2026-01-26T02:43:36.976Z" }, + { url = "https://files.pythonhosted.org/packages/5a/56/21b27c560c13822ed93133f08aa6372c53a8e067f11fbed37b4adcdac922/multidict-6.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:439cbebd499f92e9aa6793016a8acaa161dfa749ae86d20960189f5398a19144", size = 246293, upload-time = "2026-01-26T02:43:38.258Z" }, + { url = "https://files.pythonhosted.org/packages/5a/a4/23466059dc3854763423d0ad6c0f3683a379d97673b1b89ec33826e46728/multidict-6.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:6d3bc717b6fe763b8be3f2bee2701d3c8eb1b2a8ae9f60910f1b2860c82b6c49", size = 242962, upload-time = "2026-01-26T02:43:40.034Z" }, + { url = "https://files.pythonhosted.org/packages/1f/67/51dd754a3524d685958001e8fa20a0f5f90a6a856e0a9dcabff69be3dbb7/multidict-6.7.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:619e5a1ac57986dbfec9f0b301d865dddf763696435e2962f6d9cf2fdff2bb71", size = 237360, upload-time = "2026-01-26T02:43:41.752Z" }, + { url = "https://files.pythonhosted.org/packages/64/3f/036dfc8c174934d4b55d86ff4f978e558b0e585cef70cfc1ad01adc6bf18/multidict-6.7.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0b38ebffd9be37c1170d33bc0f36f4f262e0a09bc1aac1c34c7aa51a7293f0b3", size = 245940, upload-time = "2026-01-26T02:43:43.042Z" }, + { url = "https://files.pythonhosted.org/packages/3d/20/6214d3c105928ebc353a1c644a6ef1408bc5794fcb4f170bb524a3c16311/multidict-6.7.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:10ae39c9cfe6adedcdb764f5e8411d4a92b055e35573a2eaa88d3323289ef93c", size = 253502, upload-time = "2026-01-26T02:43:44.371Z" }, + { url = "https://files.pythonhosted.org/packages/b1/e2/c653bc4ae1be70a0f836b82172d643fcf1dade042ba2676ab08ec08bff0f/multidict-6.7.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:25167cc263257660290fba06b9318d2026e3c910be240a146e1f66dd114af2b0", size = 247065, upload-time = "2026-01-26T02:43:45.745Z" }, + { url = "https://files.pythonhosted.org/packages/c8/11/a854b4154cd3bd8b1fd375e8a8ca9d73be37610c361543d56f764109509b/multidict-6.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:128441d052254f42989ef98b7b6a6ecb1e6f708aa962c7984235316db59f50fa", size = 241870, upload-time = "2026-01-26T02:43:47.054Z" }, + { url = "https://files.pythonhosted.org/packages/13/bf/9676c0392309b5fdae322333d22a829715b570edb9baa8016a517b55b558/multidict-6.7.1-cp311-cp311-win32.whl", hash = "sha256:d62b7f64ffde3b99d06b707a280db04fb3855b55f5a06df387236051d0668f4a", size = 41302, upload-time = "2026-01-26T02:43:48.753Z" }, 
+ { url = "https://files.pythonhosted.org/packages/c9/68/f16a3a8ba6f7b6dc92a1f19669c0810bd2c43fc5a02da13b1cbf8e253845/multidict-6.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:bdbf9f3b332abd0cdb306e7c2113818ab1e922dc84b8f8fd06ec89ed2a19ab8b", size = 45981, upload-time = "2026-01-26T02:43:49.921Z" }, + { url = "https://files.pythonhosted.org/packages/ac/ad/9dd5305253fa00cd3c7555dbef69d5bf4133debc53b87ab8d6a44d411665/multidict-6.7.1-cp311-cp311-win_arm64.whl", hash = "sha256:b8c990b037d2fff2f4e33d3f21b9b531c5745b33a49a7d6dbe7a177266af44f6", size = 43159, upload-time = "2026-01-26T02:43:51.635Z" }, + { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" }, + { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" }, + { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" }, + { url = "https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018, upload-time = "2026-01-26T02:43:56.198Z" }, + { url = 
"https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883, upload-time = "2026-01-26T02:43:57.499Z" }, + { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413, upload-time = "2026-01-26T02:43:58.755Z" }, + { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404, upload-time = "2026-01-26T02:44:00.216Z" }, + { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456, upload-time = "2026-01-26T02:44:02.202Z" }, + { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322, upload-time = "2026-01-26T02:44:03.56Z" }, + { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955, upload-time = "2026-01-26T02:44:04.845Z" }, + { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254, upload-time = "2026-01-26T02:44:06.133Z" }, + { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059, upload-time = "2026-01-26T02:44:07.518Z" }, + { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588, upload-time = "2026-01-26T02:44:09.382Z" }, + { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642, upload-time = "2026-01-26T02:44:10.73Z" }, + { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377, upload-time = "2026-01-26T02:44:12.042Z" }, + { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887, upload-time = "2026-01-26T02:44:14.245Z" }, 
+ { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053, upload-time = "2026-01-26T02:44:15.371Z" }, + { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307, upload-time = "2026-01-26T02:44:16.852Z" }, + { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, +] + +[[package]] +name = "multiprocess" +version = "0.70.19" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989, upload-time = "2026-01-19T06:47:39.744Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/aa/714635c727dbfc251139226fa4eaf1b07f00dc12d9cd2eb25f931adaf873/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1bbf1b69af1cf64cd05f65337d9215b88079ec819cd0ea7bac4dab84e162efe7", size = 144743, upload-time = "2026-01-19T06:47:24.562Z" }, + { url = "https://files.pythonhosted.org/packages/0f/e1/155f6abf5e6b5d9cef29b6d0167c180846157a4aca9b9bee1a217f67c959/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:5be9ec7f0c1c49a4f4a6fd20d5dda4aeabc2d39a50f4ad53720f1cd02b3a7c2e", size = 144738, upload-time = 
"2026-01-19T06:47:26.636Z" }, + { url = "https://files.pythonhosted.org/packages/af/cb/f421c2869d75750a4f32301cc20c4b63fab6376e9a75c8e5e655bdeb3d9b/multiprocess-0.70.19-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1c3dce098845a0db43b32a0b76a228ca059a668071cfeaa0f40c36c0b1585d45", size = 144741, upload-time = "2026-01-19T06:47:27.985Z" }, + { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948, upload-time = "2026-01-19T06:47:32.325Z" }, + { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457, upload-time = "2026-01-19T06:47:33.711Z" }, + { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281, upload-time = "2026-01-19T06:47:35.037Z" }, + { url = "https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477, upload-time = "2026-01-19T06:47:38.619Z" }, +] + +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, +] + +[[package]] +name = "ninja" +version = "1.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/73/79a0b22fc731989c708068427579e840a6cf4e937fe7ae5c5d0b7356ac22/ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978", size = 242558, upload-time = "2025-08-11T15:10:19.421Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/74/d02409ed2aa865e051b7edda22ad416a39d81a84980f544f8de717cab133/ninja-1.13.0-py3-none-macosx_10_9_universal2.whl", hash = "sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1", size = 310125, upload-time = "2025-08-11T15:09:50.971Z" }, + { url = "https://files.pythonhosted.org/packages/8e/de/6e1cd6b84b412ac1ef327b76f0641aeb5dcc01e9d3f9eee0286d0c34fd93/ninja-1.13.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630", size = 177467, upload-time = "2025-08-11T15:09:52.767Z" }, + { url = "https://files.pythonhosted.org/packages/c8/83/49320fb6e58ae3c079381e333575fdbcf1cca3506ee160a2dcce775046fa/ninja-1.13.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c", size = 187834, upload-time = "2025-08-11T15:09:54.115Z" }, + { url = "https://files.pythonhosted.org/packages/56/c7/ba22748fb59f7f896b609cd3e568d28a0a367a6d953c24c461fe04fc4433/ninja-1.13.0-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e", size = 202736, upload-time = "2025-08-11T15:09:55.745Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/22/d1de07632b78ac8e6b785f41fa9aad7a978ec8c0a1bf15772def36d77aac/ninja-1.13.0-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988", size = 179034, upload-time = "2025-08-11T15:09:57.394Z" }, + { url = "https://files.pythonhosted.org/packages/ed/de/0e6edf44d6a04dabd0318a519125ed0415ce437ad5a1ec9b9be03d9048cf/ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa", size = 180716, upload-time = "2025-08-11T15:09:58.696Z" }, + { url = "https://files.pythonhosted.org/packages/54/28/938b562f9057aaa4d6bfbeaa05e81899a47aebb3ba6751e36c027a7f5ff7/ninja-1.13.0-py3-none-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1", size = 146843, upload-time = "2025-08-11T15:10:00.046Z" }, + { url = "https://files.pythonhosted.org/packages/2a/fb/d06a3838de4f8ab866e44ee52a797b5491df823901c54943b2adb0389fbb/ninja-1.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2", size = 154402, upload-time = "2025-08-11T15:10:01.657Z" }, + { url = "https://files.pythonhosted.org/packages/31/bf/0d7808af695ceddc763cf251b84a9892cd7f51622dc8b4c89d5012779f06/ninja-1.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f", size = 552388, upload-time = "2025-08-11T15:10:03.349Z" }, + { url = "https://files.pythonhosted.org/packages/9d/70/c99d0c2c809f992752453cce312848abb3b1607e56d4cd1b6cded317351a/ninja-1.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714", size = 472501, upload-time = "2025-08-11T15:10:04.735Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/43/c217b1153f0e499652f5e0766da8523ce3480f0a951039c7af115e224d55/ninja-1.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72", size = 638280, upload-time = "2025-08-11T15:10:06.512Z" }, + { url = "https://files.pythonhosted.org/packages/8c/45/9151bba2c8d0ae2b6260f71696330590de5850e5574b7b5694dce6023e20/ninja-1.13.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db", size = 642420, upload-time = "2025-08-11T15:10:08.35Z" }, + { url = "https://files.pythonhosted.org/packages/3c/fb/95752eb635bb8ad27d101d71bef15bc63049de23f299e312878fc21cb2da/ninja-1.13.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5", size = 585106, upload-time = "2025-08-11T15:10:09.818Z" }, + { url = "https://files.pythonhosted.org/packages/c1/31/aa56a1a286703800c0cbe39fb4e82811c277772dc8cd084f442dd8e2938a/ninja-1.13.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96", size = 707138, upload-time = "2025-08-11T15:10:11.366Z" }, + { url = "https://files.pythonhosted.org/packages/34/6f/5f5a54a1041af945130abdb2b8529cbef0cdcbbf9bcf3f4195378319d29a/ninja-1.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200", size = 581758, upload-time = "2025-08-11T15:10:13.295Z" }, + { url = "https://files.pythonhosted.org/packages/95/97/51359c77527d45943fe7a94d00a3843b81162e6c4244b3579fe8fc54cb9c/ninja-1.13.0-py3-none-win32.whl", hash = "sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9", size = 267201, upload-time = "2025-08-11T15:10:15.158Z" }, + { url = "https://files.pythonhosted.org/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl", hash = 
"sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e", size = 309975, upload-time = "2025-08-11T15:10:16.697Z" }, + { url = "https://files.pythonhosted.org/packages/df/93/a7b983643d1253bb223234b5b226e69de6cda02b76cdca7770f684b795f5/ninja-1.13.0-py3-none-win_arm64.whl", hash = "sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9", size = 290806, upload-time = "2025-08-11T15:10:18.018Z" }, +] + +[[package]] +name = "numpy" +version = "1.26.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" }, + { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" }, + { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" }, + { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" }, + { url = "https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" }, + { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload-time = "2024-02-05T23:55:32.801Z" }, + { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload-time = "2024-02-05T23:55:56.28Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload-time = "2024-02-05T23:56:20.368Z" }, + { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload-time = "2024-02-05T23:56:56.054Z" }, + { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload-time = "2024-02-05T23:57:21.56Z" }, + { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" }, + { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" }, + { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" }, +] + +[[package]] +name = "nvidia-cublas-cu11" +version = "11.11.3.6" +source = { registry = 
"https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/be/c222e33e60d28ecd496a46fc4d78ccae0ee28e1fd7dc705b6288b4cad27e/nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux1_x86_64.whl", hash = "sha256:39fb40e8f486dd8a2ddb8fdeefe1d5b28f5b99df01c87ab3676f057a74a5a6f3", size = 417870452, upload-time = "2022-10-18T21:17:48.638Z" }, + { url = "https://files.pythonhosted.org/packages/ea/2e/9d99c60771d275ecf6c914a612e9a577f740a615bc826bec132368e1d3ae/nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl", hash = "sha256:60252822adea5d0b10cd990a7dc7bedf7435f30ae40083c7a624a85a43225abc", size = 417870460, upload-time = "2024-08-17T00:00:26.889Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu11" +version = "11.8.87" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/c9/b4b15f709a694ea9f84871c6c4fbeeb54bab225962d852665a2c6f77f90d/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux1_x86_64.whl", hash = "sha256:0e50c707df56c75a2c0703dc6b886f3c97a22f37d6f63839f75b7418ba672a8d", size = 13093657, upload-time = "2022-10-03T21:46:12.544Z" }, + { url = "https://files.pythonhosted.org/packages/74/42/9f5c5cc084ce6f3073048c4f6806f45ba4c8c73f227c9587215d9c372e05/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux2014_x86_64.whl", hash = "sha256:4191a17913a706b5098681280cd089cd7d8d3df209a6f5cb79384974a96d24f2", size = 13093662, upload-time = "2024-08-16T23:56:38.082Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu11" +version = "11.8.89" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/08/a9833e4e9f9165bedb7f36033b47aa399b053b9cb2eaf7b84d1e28705cf7/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl", hash = "sha256:1f27d67b0f72902e9065ae568b4f6268dfe49ba3ed269c9a3da99bb86d1d2008", size = 23173264, upload-time = "2022-10-03T21:47:00.705Z" }, + { url = 
"https://files.pythonhosted.org/packages/60/44/202e027c224c26e15a53f01c5c7604c7f6b4fd368882d3164ea08fead207/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a8d02f3cba345be56b1ffc3e74d8f61f02bb758dd31b0f20e12277a5a244f756", size = 23173745, upload-time = "2024-08-16T23:58:16.539Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu11" +version = "11.8.89" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/3e/84db02be49fe6d6df6e42f69fd64501c22d0f9ada9c9877f885612085d20/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl", hash = "sha256:f587bd726eb2f7612cf77ce38a2c1e65cf23251ff49437f6161ce0d647f64f7c", size = 875585, upload-time = "2022-10-03T21:46:03.05Z" }, + { url = "https://files.pythonhosted.org/packages/a6/ec/a540f28b31de7bc1ed49eecc72035d4cb77db88ead1d42f7bfa5ae407ac6/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl", hash = "sha256:92d04069a987e1fbc9213f8376d265df0f7bb42617d44f5eda1f496acea7f2d1", size = 875592, upload-time = "2024-08-16T23:56:18.774Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu11" +version = "9.1.0.70" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu11", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/3b/0b776f04e364cd99e4cf152c2a9eadb5934c67c9a91429da55169a9447fd/nvidia_cudnn_cu11-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e6135ac63fe9d5b0b89cfb35c3fc1c1349f2b995becadf2e9dc21bca89d9633d", size = 663919573, upload-time = "2024-04-22T15:20:24.839Z" }, +] + +[[package]] +name = "nvidia-cufft-cu11" +version = "10.9.0.58" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/74/79/b912a77e38e41f15a0581a59f5c3548d1ddfdda3225936fb67c342719e7a/nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl", hash = "sha256:222f9da70c80384632fd6035e4c3f16762d64ea7a843829cb278f98b3cb7dd81", size = 168405414, upload-time = "2022-10-03T23:29:47.505Z" }, + { url = "https://files.pythonhosted.org/packages/64/c8/133717b43182ba063803e983e7680a94826a9f4ff5734af0ca315803f1b3/nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e21037259995243cc370dd63c430d77ae9280bedb68d5b5a18226bfc92e5d748", size = 168405419, upload-time = "2024-08-17T00:02:03.562Z" }, +] + +[[package]] +name = "nvidia-curand-cu11" +version = "10.3.0.86" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/28/c47f8e2439ddbcbeae3cf74d43ed572b651d630ea72863d5357f3759eb66/nvidia_curand_cu11-10.3.0.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:ac439548c88580269a1eb6aeb602a5aed32f0dbb20809a31d9ed7d01d77f6bf5", size = 58124493, upload-time = "2022-10-03T23:30:05.413Z" }, + { url = "https://files.pythonhosted.org/packages/58/e5/ce5806afc48a6e4e0dddd25316ac60b6fa94fd1791bdbf4ca17bf52696ea/nvidia_curand_cu11-10.3.0.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:cd4cffbf78bb06580206b4814d5dc696d1161c902aae37b2bba00056832379e6", size = 58124497, upload-time = "2024-08-17T00:03:01.833Z" }, +] + +[[package]] +name = "nvidia-cusolver-cu11" +version = "11.4.1.48" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu11", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/ee/939ff0104991dd7bdabb4c9767994c612ba0e1c9a55672a1ddd42f5e5b16/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux1_x86_64.whl", hash = 
"sha256:ca538f545645b7e6629140786d3127fe067b3d5a085bd794cde5bfe877c8926f", size = 128240842, upload-time = "2022-10-03T23:30:24.348Z" }, + { url = "https://files.pythonhosted.org/packages/52/fe/866e87e6e6a1b0a5fcf8524a058042656702f2057e22bfdb8899a7c38e10/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea9fb1ad8c644ca9ed55af13cc39af3b7ba4c3eb5aef18471fe1fe77d94383cb", size = 128246438, upload-time = "2024-08-17T00:03:52.432Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu11" +version = "11.7.5.86" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/e0/21b829c535d569831835a4ca5d049a19ba00d3e91f3e12ab4ad27bd7385f/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:4ae709fe78d3f23f60acaba8c54b8ad556cf16ca486e0cc1aa92dca7555d2d2b", size = 204126221, upload-time = "2022-10-18T21:19:28.04Z" }, + { url = "https://files.pythonhosted.org/packages/ed/5c/b0333b07c51ced77397c2fb0d9826072cea0da9d421aa7e792aa0f8ecc72/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8d7cf1628fd8d462b5d2ba6678fae34733a48ecb80495b9c68672ec6a6dde5ef", size = 204126227, upload-time = "2024-08-17T00:05:20.798Z" }, +] + +[[package]] +name = "nvidia-nccl-cu11" +version = "2.21.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ac/9a/8b6a28b3b87d5fddab0e92cd835339eb8fbddaa71ae67518c8c1b3d05bae/nvidia_nccl_cu11-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:49d8350629c7888701d1fd200934942671cb5c728f49acc5a0b3a768820bed29", size = 147811630, upload-time = "2024-04-03T15:33:12.879Z" }, +] + +[[package]] +name = "nvidia-nvtx-cu11" +version = "11.8.86" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d5/a2/23214c23118784dc2189ac2d2e48190df3e4206e2f73eb17d47140797a2b/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:890656d8bd9b4e280231c832e1f0d03459200ba4824ddda3dcb59b1e1989b9f5", size = 99125, upload-time = "2022-10-03T21:47:19.565Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ad/973a187b137a3d45dc3faac421ef1275fb41fc169fd3889e2d5ceb0daa54/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:979f5b2aef5da164c5c53c64c85c3dfa61b8b4704f4f963bb568bf98fa8472e8", size = 99130, upload-time = "2024-08-16T23:58:33.479Z" }, +] + +[[package]] +name = "opencv-python-headless" +version = "4.11.0.86" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/36/2f/5b2b3ba52c864848885ba988f24b7f105052f68da9ab0e693cc7c25b0b30/opencv-python-headless-4.11.0.86.tar.gz", hash = "sha256:996eb282ca4b43ec6a3972414de0e2331f5d9cda2b41091a49739c19fb843798", size = 95177929, upload-time = "2025-01-16T13:53:40.22Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/53/2c50afa0b1e05ecdb4603818e85f7d174e683d874ef63a6abe3ac92220c8/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:48128188ade4a7e517237c8e1e11a9cdf5c282761473383e77beb875bb1e61ca", size = 37326460, upload-time = "2025-01-16T13:52:57.015Z" }, + { url = "https://files.pythonhosted.org/packages/3b/43/68555327df94bb9b59a1fd645f63fafb0762515344d2046698762fc19d58/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81", size = 56723330, upload-time = "2025-01-16T13:55:45.731Z" }, + { url = "https://files.pythonhosted.org/packages/45/be/1438ce43ebe65317344a87e4b150865c5585f4c0db880a34cdae5ac46881/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6efabcaa9df731f29e5ea9051776715b1bdd1845d7c9530065c7951d2a2899eb", size = 29487060, upload-time = "2025-01-16T13:51:59.625Z" }, + { url = "https://files.pythonhosted.org/packages/dd/5c/c139a7876099916879609372bfa513b7f1257f7f1a908b0bdc1c2328241b/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b", size = 49969856, upload-time = "2025-01-16T13:53:29.654Z" }, + { url = "https://files.pythonhosted.org/packages/95/dd/ed1191c9dc91abcc9f752b499b7928aacabf10567bb2c2535944d848af18/opencv_python_headless-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b", size = 29324425, upload-time = "2025-01-16T13:52:49.048Z" }, + { url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" }, +] + +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url 
= "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "pandas" +version = "2.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" }, + { url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", 
size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" }, + { url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" }, + { url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" }, + { url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" }, + { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" }, + { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" }, + { url = 
"https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" }, + { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" }, + { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" }, + { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" }, + { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" }, +] + +[[package]] +name = "partd" +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "locket" }, + { name = "toolz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/3a/3f06f34820a31257ddcabdfafc2672c5816be79c7e353b02c1f318daa7d4/partd-1.4.2.tar.gz", hash = "sha256:d022c33afbdc8405c226621b015e8067888173d85f7f5ecebb3cafed9a20f02c", size = 
21029, upload-time = "2024-05-06T19:51:41.945Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl", hash = "sha256:978e4ac767ec4ba5b86c6eaa52e5a2a3bc748a2ca839e8cc798f1cc6ce6efb0f", size = 18905, upload-time = "2024-05-06T19:51:39.271Z" }, +] + +[[package]] +name = "pikepdf" +version = "10.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "lxml" }, + { name = "packaging" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/ba/7635a5f4259a2a91ed4f094e358dec3068ecedc891d70b8e76a02904ca0c/pikepdf-10.3.0.tar.gz", hash = "sha256:e2a64a5f1ebf8c411193126b9eeff7faf5739a40bce7441e579531422469fbb1", size = 4575749, upload-time = "2026-01-30T07:33:53.317Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/a9/0d2107a3c796ab2fa7d379ee801190c95c4132f0bb5cfc1fd8d2e3ac74af/pikepdf-10.3.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:99fb21d20dc02f9828d477d2c549ee3f6e191801f84a2a2505d21baacb731745", size = 4753016, upload-time = "2026-01-30T07:32:51.999Z" }, + { url = "https://files.pythonhosted.org/packages/a9/2b/f634a0956aa15074db6c62309ec3d08bd158ddbdea8bd2081cea8b6eb3ed/pikepdf-10.3.0-cp311-cp311-macosx_15_0_x86_64.whl", hash = "sha256:c8a4b6862d7e0e69dd3f57efd362826966d1f341e0d052f7f23f0fe3a2375a36", size = 5063869, upload-time = "2026-01-30T07:32:54.418Z" }, + { url = "https://files.pythonhosted.org/packages/25/8e/d5ba1febacde805e7ec75a3df0888e53212f8e5f82fa1fc09c0fa981c7f9/pikepdf-10.3.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b86d42e66004ffaf5284aae0d9814bb3d19f048a45943479db5ca3d02d46bfb", size = 2445530, upload-time = "2026-01-30T07:32:56.117Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/ba/196351a049a7a9d255140a414f586779b3ad77f0d09091e639d9f85c4131/pikepdf-10.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7021b31eddd5aa611f6941a2c171b7ce321c7763263ff658368f5f40bda1d4", size = 2673622, upload-time = "2026-01-30T07:32:57.85Z" }, + { url = "https://files.pythonhosted.org/packages/7c/cf/1315759de9dc66f769f84067da2127046e46489100f6e2be614fcb6c8394/pikepdf-10.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b653b1d0c5f17efb080ef68b65d3fcc8909f22128b75e0479775a35cd8d9fe6e", size = 3644910, upload-time = "2026-01-30T07:33:00.182Z" }, + { url = "https://files.pythonhosted.org/packages/80/6f/578ee7b53d06267f6c489fb7734792f6fa670a3a7d0b55db20b084e0957d/pikepdf-10.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:fa3e4b32a2c1d15bb57e91ee3896c19b3c8145d46c26fbac8747efe7cb5ce3bd", size = 3835871, upload-time = "2026-01-30T07:33:02.804Z" }, + { url = "https://files.pythonhosted.org/packages/d7/0f/980dbfb5ab9231d30e44d9285e8a7509f0871fc6fe438559e1eed16e683d/pikepdf-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:3233da668d665d301a4a4fd1481867e688336fdb410e9bc9d4e5b0cd62e334eb", size = 3756976, upload-time = "2026-01-30T07:33:05.596Z" }, + { url = "https://files.pythonhosted.org/packages/f9/22/d6ca7f6066d7f3b61b56bffeca1069c0ded635ba316aa1df54fcc0e2104f/pikepdf-10.3.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:d1a6646def3fc47f763eab0dcb11341a7205cef1b7dc5c62f1dee435a89472b9", size = 4762039, upload-time = "2026-01-30T07:33:08.626Z" }, + { url = "https://files.pythonhosted.org/packages/9c/dc/d0db713a34a493eedf4eded566668762aee5acfad958bdf374a450df931c/pikepdf-10.3.0-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:e968e4e81d6c05d8e4b24594b27a64cb9be3c7a4371bf0635f6b669559171e6b", size = 5078640, upload-time = "2026-01-30T07:33:10.478Z" }, + { url = 
"https://files.pythonhosted.org/packages/21/c0/e0a1f1afb99ecac5f7f21313b47c174178f85df0f1ec7080e0d431324099/pikepdf-10.3.0-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dfad0e4e6bc268ca041d639b232d76c25c9ad7023b7189d14869ef4446cabda2", size = 2450284, upload-time = "2026-01-30T07:33:12.215Z" }, + { url = "https://files.pythonhosted.org/packages/db/3a/2f0e8bd70cf57896a85b1d7f7ca3ce79d91a17222e1b23b607860ea52a5d/pikepdf-10.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7cf7ab25f1e9063de320d2edecb2cd2960329cc25bac645c7938390f6538d9bf", size = 2699411, upload-time = "2026-01-30T07:33:13.878Z" }, + { url = "https://files.pythonhosted.org/packages/fd/10/da5f244aa14b845cd835f34b6a7a217493952f2532d2e00957ed3bd79aea/pikepdf-10.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3904353137e5b0cb2a316d84057e1e5301a65e6b1810d4763348ae8919ba20f4", size = 3649524, upload-time = "2026-01-30T07:33:15.641Z" }, + { url = "https://files.pythonhosted.org/packages/c1/ef/3efb78a16d9c702dfd64fdeaee6a1ac6af95c41d4ec60b784e9171f20753/pikepdf-10.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4335ec70a659b5be1dfc7094a67db7f9c017c9c1cf9049b56d0e35ad24a46ff0", size = 3861320, upload-time = "2026-01-30T07:33:17.466Z" }, + { url = "https://files.pythonhosted.org/packages/8d/63/b0243fe62cf5d4d9da49010a15e0177b9629b8183092b3bd804f59a1529a/pikepdf-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:ac5befc1e991e28b16be104c219bdb1f6cf62a8371f4019ce7bab64ec5ec5745", size = 3763570, upload-time = "2026-01-30T07:33:19.863Z" }, +] + +[[package]] +name = "pillow" +version = "10.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/74/ad3d526f3bf7b6d3f408b73fde271ec69dfac8b81341a318ce825f2b3812/pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06", size = 46555059, upload-time = "2024-07-01T09:48:43.583Z" } 
+wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/62/c9449f9c3043c37f73e7487ec4ef0c03eb9c9afc91a92b977a67b3c0bbc5/pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c", size = 3509265, upload-time = "2024-07-01T09:45:49.812Z" }, + { url = "https://files.pythonhosted.org/packages/f4/5f/491dafc7bbf5a3cc1845dc0430872e8096eb9e2b6f8161509d124594ec2d/pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be", size = 3375655, upload-time = "2024-07-01T09:45:52.462Z" }, + { url = "https://files.pythonhosted.org/packages/73/d5/c4011a76f4207a3c151134cd22a1415741e42fa5ddecec7c0182887deb3d/pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3", size = 4340304, upload-time = "2024-07-01T09:45:55.006Z" }, + { url = "https://files.pythonhosted.org/packages/ac/10/c67e20445a707f7a610699bba4fe050583b688d8cd2d202572b257f46600/pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6", size = 4452804, upload-time = "2024-07-01T09:45:58.437Z" }, + { url = "https://files.pythonhosted.org/packages/a9/83/6523837906d1da2b269dee787e31df3b0acb12e3d08f024965a3e7f64665/pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe", size = 4365126, upload-time = "2024-07-01T09:46:00.713Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e5/8c68ff608a4203085158cff5cc2a3c534ec384536d9438c405ed6370d080/pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319", size = 4533541, upload-time = "2024-07-01T09:46:03.235Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/7c/01b8dbdca5bc6785573f4cee96e2358b0918b7b2c7b60d8b6f3abf87a070/pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d", size = 4471616, upload-time = "2024-07-01T09:46:05.356Z" }, + { url = "https://files.pythonhosted.org/packages/c8/57/2899b82394a35a0fbfd352e290945440e3b3785655a03365c0ca8279f351/pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696", size = 4600802, upload-time = "2024-07-01T09:46:08.145Z" }, + { url = "https://files.pythonhosted.org/packages/4d/d7/a44f193d4c26e58ee5d2d9db3d4854b2cfb5b5e08d360a5e03fe987c0086/pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496", size = 2235213, upload-time = "2024-07-01T09:46:10.211Z" }, + { url = "https://files.pythonhosted.org/packages/c1/d0/5866318eec2b801cdb8c82abf190c8343d8a1cd8bf5a0c17444a6f268291/pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91", size = 2554498, upload-time = "2024-07-01T09:46:12.685Z" }, + { url = "https://files.pythonhosted.org/packages/d4/c8/310ac16ac2b97e902d9eb438688de0d961660a87703ad1561fd3dfbd2aa0/pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22", size = 2243219, upload-time = "2024-07-01T09:46:14.83Z" }, + { url = "https://files.pythonhosted.org/packages/05/cb/0353013dc30c02a8be34eb91d25e4e4cf594b59e5a55ea1128fde1e5f8ea/pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94", size = 3509350, upload-time = "2024-07-01T09:46:17.177Z" }, + { url = "https://files.pythonhosted.org/packages/e7/cf/5c558a0f247e0bf9cec92bff9b46ae6474dd736f6d906315e60e4075f737/pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", 
hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597", size = 3374980, upload-time = "2024-07-01T09:46:19.169Z" }, + { url = "https://files.pythonhosted.org/packages/84/48/6e394b86369a4eb68b8a1382c78dc092245af517385c086c5094e3b34428/pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80", size = 4343799, upload-time = "2024-07-01T09:46:21.883Z" }, + { url = "https://files.pythonhosted.org/packages/3b/f3/a8c6c11fa84b59b9df0cd5694492da8c039a24cd159f0f6918690105c3be/pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca", size = 4459973, upload-time = "2024-07-01T09:46:24.321Z" }, + { url = "https://files.pythonhosted.org/packages/7d/1b/c14b4197b80150fb64453585247e6fb2e1d93761fa0fa9cf63b102fde822/pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef", size = 4370054, upload-time = "2024-07-01T09:46:26.825Z" }, + { url = "https://files.pythonhosted.org/packages/55/77/40daddf677897a923d5d33329acd52a2144d54a9644f2a5422c028c6bf2d/pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a", size = 4539484, upload-time = "2024-07-01T09:46:29.355Z" }, + { url = "https://files.pythonhosted.org/packages/40/54/90de3e4256b1207300fb2b1d7168dd912a2fb4b2401e439ba23c2b2cabde/pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b", size = 4477375, upload-time = "2024-07-01T09:46:31.756Z" }, + { url = "https://files.pythonhosted.org/packages/13/24/1bfba52f44193860918ff7c93d03d95e3f8748ca1de3ceaf11157a14cf16/pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9", size = 4608773, upload-time = "2024-07-01T09:46:33.73Z" }, + { url = "https://files.pythonhosted.org/packages/55/04/5e6de6e6120451ec0c24516c41dbaf80cce1b6451f96561235ef2429da2e/pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42", size = 2235690, upload-time = "2024-07-01T09:46:36.587Z" }, + { url = "https://files.pythonhosted.org/packages/74/0a/d4ce3c44bca8635bd29a2eab5aa181b654a734a29b263ca8efe013beea98/pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a", size = 2554951, upload-time = "2024-07-01T09:46:38.777Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ca/184349ee40f2e92439be9b3502ae6cfc43ac4b50bc4fc6b3de7957563894/pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9", size = 2243427, upload-time = "2024-07-01T09:46:43.15Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "propcache" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = 
"sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/d4/4e2c9aaf7ac2242b9358f98dccd8f90f2605402f5afeff6c578682c2c491/propcache-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:60a8fda9644b7dfd5dece8c61d8a85e271cb958075bfc4e01083c148b61a7caf", size = 80208, upload-time = "2025-10-08T19:46:24.597Z" }, + { url = "https://files.pythonhosted.org/packages/c2/21/d7b68e911f9c8e18e4ae43bdbc1e1e9bbd971f8866eb81608947b6f585ff/propcache-0.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c30b53e7e6bda1d547cabb47c825f3843a0a1a42b0496087bb58d8fedf9f41b5", size = 45777, upload-time = "2025-10-08T19:46:25.733Z" }, + { url = "https://files.pythonhosted.org/packages/d3/1d/11605e99ac8ea9435651ee71ab4cb4bf03f0949586246476a25aadfec54a/propcache-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6918ecbd897443087a3b7cd978d56546a812517dcaaca51b49526720571fa93e", size = 47647, upload-time = "2025-10-08T19:46:27.304Z" }, + { url = "https://files.pythonhosted.org/packages/58/1a/3c62c127a8466c9c843bccb503d40a273e5cc69838805f322e2826509e0d/propcache-0.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d902a36df4e5989763425a8ab9e98cd8ad5c52c823b34ee7ef307fd50582566", size = 214929, upload-time = "2025-10-08T19:46:28.62Z" }, + { url = "https://files.pythonhosted.org/packages/56/b9/8fa98f850960b367c4b8fe0592e7fc341daa7a9462e925228f10a60cf74f/propcache-0.4.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a9695397f85973bb40427dedddf70d8dc4a44b22f1650dd4af9eedf443d45165", size = 221778, upload-time = "2025-10-08T19:46:30.358Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/a6/0ab4f660eb59649d14b3d3d65c439421cf2f87fe5dd68591cbe3c1e78a89/propcache-0.4.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2bb07ffd7eaad486576430c89f9b215f9e4be68c4866a96e97db9e97fead85dc", size = 228144, upload-time = "2025-10-08T19:46:32.607Z" }, + { url = "https://files.pythonhosted.org/packages/52/6a/57f43e054fb3d3a56ac9fc532bc684fc6169a26c75c353e65425b3e56eef/propcache-0.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd6f30fdcf9ae2a70abd34da54f18da086160e4d7d9251f81f3da0ff84fc5a48", size = 210030, upload-time = "2025-10-08T19:46:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/40/e2/27e6feebb5f6b8408fa29f5efbb765cd54c153ac77314d27e457a3e993b7/propcache-0.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fc38cba02d1acba4e2869eef1a57a43dfbd3d49a59bf90dda7444ec2be6a5570", size = 208252, upload-time = "2025-10-08T19:46:35.309Z" }, + { url = "https://files.pythonhosted.org/packages/9e/f8/91c27b22ccda1dbc7967f921c42825564fa5336a01ecd72eb78a9f4f53c2/propcache-0.4.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:67fad6162281e80e882fb3ec355398cf72864a54069d060321f6cd0ade95fe85", size = 202064, upload-time = "2025-10-08T19:46:36.993Z" }, + { url = "https://files.pythonhosted.org/packages/f2/26/7f00bd6bd1adba5aafe5f4a66390f243acab58eab24ff1a08bebb2ef9d40/propcache-0.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f10207adf04d08bec185bae14d9606a1444715bc99180f9331c9c02093e1959e", size = 212429, upload-time = "2025-10-08T19:46:38.398Z" }, + { url = "https://files.pythonhosted.org/packages/84/89/fd108ba7815c1117ddca79c228f3f8a15fc82a73bca8b142eb5de13b2785/propcache-0.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e9b0d8d0845bbc4cfcdcbcdbf5086886bc8157aa963c31c777ceff7846c77757", size = 216727, upload-time = "2025-10-08T19:46:39.732Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/37/3ec3f7e3173e73f1d600495d8b545b53802cbf35506e5732dd8578db3724/propcache-0.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:981333cb2f4c1896a12f4ab92a9cc8f09ea664e9b7dbdc4eff74627af3a11c0f", size = 205097, upload-time = "2025-10-08T19:46:41.025Z" }, + { url = "https://files.pythonhosted.org/packages/61/b0/b2631c19793f869d35f47d5a3a56fb19e9160d3c119f15ac7344fc3ccae7/propcache-0.4.1-cp311-cp311-win32.whl", hash = "sha256:f1d2f90aeec838a52f1c1a32fe9a619fefd5e411721a9117fbf82aea638fe8a1", size = 38084, upload-time = "2025-10-08T19:46:42.693Z" }, + { url = "https://files.pythonhosted.org/packages/f4/78/6cce448e2098e9f3bfc91bb877f06aa24b6ccace872e39c53b2f707c4648/propcache-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:364426a62660f3f699949ac8c621aad6977be7126c5807ce48c0aeb8e7333ea6", size = 41637, upload-time = "2025-10-08T19:46:43.778Z" }, + { url = "https://files.pythonhosted.org/packages/9c/e9/754f180cccd7f51a39913782c74717c581b9cc8177ad0e949f4d51812383/propcache-0.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:e53f3a38d3510c11953f3e6a33f205c6d1b001129f972805ca9b42fc308bc239", size = 38064, upload-time = "2025-10-08T19:46:44.872Z" }, + { url = "https://files.pythonhosted.org/packages/a2/0f/f17b1b2b221d5ca28b4b876e8bb046ac40466513960646bda8e1853cdfa2/propcache-0.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2", size = 80061, upload-time = "2025-10-08T19:46:46.075Z" }, + { url = "https://files.pythonhosted.org/packages/76/47/8ccf75935f51448ba9a16a71b783eb7ef6b9ee60f5d14c7f8a8a79fbeed7/propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403", size = 46037, upload-time = "2025-10-08T19:46:47.23Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/b6/5c9a0e42df4d00bfb4a3cbbe5cf9f54260300c88a0e9af1f47ca5ce17ac0/propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207", size = 47324, upload-time = "2025-10-08T19:46:48.384Z" }, + { url = "https://files.pythonhosted.org/packages/9e/d3/6c7ee328b39a81ee877c962469f1e795f9db87f925251efeb0545e0020d0/propcache-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec17c65562a827bba85e3872ead335f95405ea1674860d96483a02f5c698fa72", size = 225505, upload-time = "2025-10-08T19:46:50.055Z" }, + { url = "https://files.pythonhosted.org/packages/01/5d/1c53f4563490b1d06a684742cc6076ef944bc6457df6051b7d1a877c057b/propcache-0.4.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:405aac25c6394ef275dee4c709be43745d36674b223ba4eb7144bf4d691b7367", size = 230242, upload-time = "2025-10-08T19:46:51.815Z" }, + { url = "https://files.pythonhosted.org/packages/20/e1/ce4620633b0e2422207c3cb774a0ee61cac13abc6217763a7b9e2e3f4a12/propcache-0.4.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0013cb6f8dde4b2a2f66903b8ba740bdfe378c943c4377a200551ceb27f379e4", size = 238474, upload-time = "2025-10-08T19:46:53.208Z" }, + { url = "https://files.pythonhosted.org/packages/46/4b/3aae6835b8e5f44ea6a68348ad90f78134047b503765087be2f9912140ea/propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15932ab57837c3368b024473a525e25d316d8353016e7cc0e5ba9eb343fbb1cf", size = 221575, upload-time = "2025-10-08T19:46:54.511Z" }, + { url = "https://files.pythonhosted.org/packages/6e/a5/8a5e8678bcc9d3a1a15b9a29165640d64762d424a16af543f00629c87338/propcache-0.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:031dce78b9dc099f4c29785d9cf5577a3faf9ebf74ecbd3c856a7b92768c3df3", size = 
216736, upload-time = "2025-10-08T19:46:56.212Z" }, + { url = "https://files.pythonhosted.org/packages/f1/63/b7b215eddeac83ca1c6b934f89d09a625aa9ee4ba158338854c87210cc36/propcache-0.4.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ab08df6c9a035bee56e31af99be621526bd237bea9f32def431c656b29e41778", size = 213019, upload-time = "2025-10-08T19:46:57.595Z" }, + { url = "https://files.pythonhosted.org/packages/57/74/f580099a58c8af587cac7ba19ee7cb418506342fbbe2d4a4401661cca886/propcache-0.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4d7af63f9f93fe593afbf104c21b3b15868efb2c21d07d8732c0c4287e66b6a6", size = 220376, upload-time = "2025-10-08T19:46:59.067Z" }, + { url = "https://files.pythonhosted.org/packages/c4/ee/542f1313aff7eaf19c2bb758c5d0560d2683dac001a1c96d0774af799843/propcache-0.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:cfc27c945f422e8b5071b6e93169679e4eb5bf73bbcbf1ba3ae3a83d2f78ebd9", size = 226988, upload-time = "2025-10-08T19:47:00.544Z" }, + { url = "https://files.pythonhosted.org/packages/8f/18/9c6b015dd9c6930f6ce2229e1f02fb35298b847f2087ea2b436a5bfa7287/propcache-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:35c3277624a080cc6ec6f847cbbbb5b49affa3598c4535a0a4682a697aaa5c75", size = 215615, upload-time = "2025-10-08T19:47:01.968Z" }, + { url = "https://files.pythonhosted.org/packages/80/9e/e7b85720b98c45a45e1fca6a177024934dc9bc5f4d5dd04207f216fc33ed/propcache-0.4.1-cp312-cp312-win32.whl", hash = "sha256:671538c2262dadb5ba6395e26c1731e1d52534bfe9ae56d0b5573ce539266aa8", size = 38066, upload-time = "2025-10-08T19:47:03.503Z" }, + { url = "https://files.pythonhosted.org/packages/54/09/d19cff2a5aaac632ec8fc03737b223597b1e347416934c1b3a7df079784c/propcache-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:cb2d222e72399fcf5890d1d5cc1060857b9b236adff2792ff48ca2dfd46c81db", size = 41655, upload-time = "2025-10-08T19:47:04.973Z" }, + { url = 
"https://files.pythonhosted.org/packages/68/ab/6b5c191bb5de08036a8c697b265d4ca76148efb10fa162f14af14fb5f076/propcache-0.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:204483131fb222bdaaeeea9f9e6c6ed0cac32731f75dfc1d4a567fc1926477c1", size = 37789, upload-time = "2025-10-08T19:47:06.077Z" }, + { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" }, +] + +[[package]] +name = "psutil" +version = "7.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" }, + { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" }, + { url = 
"https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" }, + { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" }, + { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" }, + { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, +] + +[[package]] +name = "pyarrow" +version = "23.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" }, + { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" }, + { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" }, + { url = "https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" }, + { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" }, + { url = 
"https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" }, + { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, + { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, + { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, + { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" }, + { url = 
"https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, + { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, +] + +[[package]] +name = "pyclipper" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/21/3c06205bb407e1f79b73b7b4dfb3950bd9537c4f625a68ab5cc41177f5bc/pyclipper-1.4.0.tar.gz", hash = "sha256:9882bd889f27da78add4dd6f881d25697efc740bf840274e749988d25496c8e1", size = 54489, upload-time = "2025-12-01T13:15:35.015Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/e3/64cf7794319b088c288706087141e53ac259c7959728303276d18adc665d/pyclipper-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:adcb7ca33c5bdc33cd775e8b3eadad54873c802a6d909067a57348bcb96e7a2d", size = 264281, upload-time = "2025-12-01T13:14:55.47Z" }, + { url = "https://files.pythonhosted.org/packages/34/cd/44ec0da0306fa4231e76f1c2cb1fa394d7bde8db490a2b24d55b39865f69/pyclipper-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fd24849d2b94ec749ceac7c34c9f01010d23b6e9d9216cf2238b8481160e703d", size = 139426, upload-time = "2025-12-01T13:14:56.683Z" }, + { url = "https://files.pythonhosted.org/packages/ad/88/d8f6c6763ea622fe35e19c75d8b39ed6c55191ddc82d65e06bc46b26cb8e/pyclipper-1.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1b6c8d75ba20c6433c9ea8f1a0feb7e4d3ac06a09ad1fd6d571afc1ddf89b869", size = 989649, upload-time = "2025-12-01T13:14:58.28Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/e9/ea7d68c8c4af3842d6515bedcf06418610ad75f111e64c92c1d4785a1513/pyclipper-1.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58e29d7443d7cc0e83ee9daf43927730386629786d00c63b04fe3b53ac01462c", size = 962842, upload-time = "2025-12-01T13:15:00.044Z" }, + { url = "https://files.pythonhosted.org/packages/4e/b7/0b4a272d8726e51ab05e2b933d8cc47f29757fb8212e38b619e170e6015c/pyclipper-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a8d2b5fb75ebe57e21ce61e79a9131edec2622ff23cc665e4d1d1f201bc1a801", size = 95098, upload-time = "2025-12-01T13:15:01.359Z" }, + { url = "https://files.pythonhosted.org/packages/3a/76/4901de2919198bb2bd3d989f86d4a1dff363962425bb2d63e24e6c990042/pyclipper-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:e9b973467d9c5fa9bc30bb6ac95f9f4d7c3d9fc25f6cf2d1cc972088e5955c01", size = 104362, upload-time = "2025-12-01T13:15:02.439Z" }, + { url = "https://files.pythonhosted.org/packages/90/1b/7a07b68e0842324d46c03e512d8eefa9cb92ba2a792b3b4ebf939dafcac3/pyclipper-1.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:222ac96c8b8281b53d695b9c4fedc674f56d6d4320ad23f1bdbd168f4e316140", size = 265676, upload-time = "2025-12-01T13:15:04.15Z" }, + { url = "https://files.pythonhosted.org/packages/6b/dd/8bd622521c05d04963420ae6664093f154343ed044c53ea260a310c8bb4d/pyclipper-1.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f3672dbafbb458f1b96e1ee3e610d174acb5ace5bd2ed5d1252603bb797f2fc6", size = 140458, upload-time = "2025-12-01T13:15:05.76Z" }, + { url = "https://files.pythonhosted.org/packages/7a/06/6e3e241882bf7d6ab23d9c69ba4e85f1ec47397cbbeee948a16cf75e21ed/pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d1f807e2b4760a8e5c6d6b4e8c1d71ef52b7fe1946ff088f4fa41e16a881a5ca", size = 978235, upload-time = "2025-12-01T13:15:06.993Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/f4/3418c1cd5eea640a9fa2501d4bc0b3655fa8d40145d1a4f484b987990a75/pyclipper-1.4.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ce1f83c9a4e10ea3de1959f0ae79e9a5bd41346dff648fee6228ba9eaf8b3872", size = 961388, upload-time = "2025-12-01T13:15:08.467Z" }, + { url = "https://files.pythonhosted.org/packages/ac/94/c85401d24be634af529c962dd5d781f3cb62a67cd769534df2cb3feee97a/pyclipper-1.4.0-cp312-cp312-win32.whl", hash = "sha256:3ef44b64666ebf1cb521a08a60c3e639d21b8c50bfbe846ba7c52a0415e936f4", size = 95169, upload-time = "2025-12-01T13:15:10.098Z" }, + { url = "https://files.pythonhosted.org/packages/97/77/dfea08e3b230b82ee22543c30c35d33d42f846a77f96caf7c504dd54fab1/pyclipper-1.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:d1e5498d883b706a4ce636247f0d830c6eb34a25b843a1b78e2c969754ca9037", size = 104619, upload-time = "2025-12-01T13:15:11.592Z" }, + { url = "https://files.pythonhosted.org/packages/18/59/81050abdc9e5b90ffc2c765738c5e40e9abd8e44864aaa737b600f16c562/pyclipper-1.4.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:98b2a40f98e1fc1b29e8a6094072e7e0c7dfe901e573bf6cfc6eb7ce84a7ae87", size = 126495, upload-time = "2025-12-01T13:15:33.743Z" }, +] + +[[package]] +name = "pydantic" +version = "2.12.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = 
"sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.41.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = "2025-11-04T13:39:31.373Z" }, + { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, + { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, + { url = "https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, + { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, + { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, + { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = "2025-11-04T13:39:46.238Z" }, + { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, + { url = 
"https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, + { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, + { url = "https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, + { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, + { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, + { url = 
"https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, + { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, + { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, + { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, + { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" }, + { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, 
upload-time = "2025-11-04T13:40:12.004Z" }, + { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, + { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, + { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, + { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, + { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, + { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, + { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, + { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, + { url = "https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, + { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, + { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = 
"sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, + { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, + { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, + { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, + { url = 
"https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, + { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, + { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" }, + { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, +] + +[[package]] +name = "pydantic-settings" +version = "2.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "typing-inspection" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/52/6d/fffca34caecc4a3f97bda81b2098da5e8ab7efc9a66e819074a11955d87e/pydantic_settings-2.13.1.tar.gz", hash = "sha256:b4c11847b15237fb0171e1462bf540e294affb9b86db4d9aa5c01730bdbe4025", size = 223826, upload-time = "2026-02-19T13:45:08.055Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/4b/ccc026168948fec4f7555b9164c724cf4125eac006e176541483d2c959be/pydantic_settings-2.13.1-py3-none-any.whl", hash = "sha256:d56fd801823dbeae7f0975e1f8c8e25c258eb75d278ea7abb5d9cebb01b56237", size = 58929, upload-time = "2026-02-19T13:45:06.034Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pylatexenc" +version = "2.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" } + +[[package]] +name = "pymupdf" +version = "1.24.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pymupdfb" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/83/57/da06ca4886afc71a624e4b463d05f45c8a822596ede939957295e229eb4e/PyMuPDF-1.24.10.tar.gz", hash = "sha256:bd3ebd6d3fb8a845582098362f885bfb0a31ae4272587efc2c55c5e29fe7327a", size = 46988085, upload-time = "2024-09-02T16:28:45.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/35/6af0bb4bafe9d54893a04d9639f73b1b754efe0235997052d75fb6b7edc1/PyMuPDF-1.24.10-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:5fbd67cce759fc0126902137409cf9da6313b776c4d5ff0d5200f336350f86a3", size = 3194012, upload-time = "2024-09-02T16:27:14.019Z" }, + { url = "https://files.pythonhosted.org/packages/bf/2b/c254cf49dfcf2469a674407a680f5b2b174b866e84d322f5767baf4d3ad3/PyMuPDF-1.24.10-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:2b14dbdf7c415bb0fa849527abbe7b4f1f55ae23b9355d132951f634438c59ac", size = 2974781, upload-time = "2024-09-02T16:27:17.213Z" }, + { url = "https://files.pythonhosted.org/packages/1c/77/78800d3a711f92060f8e338a5df9330ffb5950f4fb3beeba01e15c03c4c6/PyMuPDF-1.24.10-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:1a87440a6cbc0d5ad513425baa0f4747841898fca6e37350ca3e6b29e5f40c01", size = 3210393, upload-time = "2024-09-02T22:17:05.788Z" }, + { url = "https://files.pythonhosted.org/packages/c5/39/3aaa1e8822c55c71bb37911b5b1c3157ef38d731581224b29a682d80a17b/PyMuPDF-1.24.10-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:c0d1ccdc062ea9961063790831e838bc43fcf9a8436a8b9f55898addf97c0f86", size = 3482650, upload-time = "2024-09-02T16:27:21.101Z" }, + { url = "https://files.pythonhosted.org/packages/5b/73/6b5c2dc59539b79cb9430ff946d7dff308af146f7c8bc7b96c963e12970d/PyMuPDF-1.24.10-cp311-none-musllinux_1_2_x86_64.whl", hash = "sha256:f68671363be5a2ba104ab7d3bad821d2994cbe3f3408538bbc27d32e6dc9f923", size = 3600588, upload-time = "2024-09-02T16:27:25.022Z" }, + { url = 
"https://files.pythonhosted.org/packages/71/e9/d3bf062325b4821726a2f9ce9d75b63f594ae24bc38c31f55b4285f1f5e1/PyMuPDF-1.24.10-cp311-none-win32.whl", hash = "sha256:49f83556cd1a7d05b36a54ccc01fce324da8a4e6854e36cc5cd94d321e428565", size = 2694768, upload-time = "2024-09-02T16:27:33.318Z" }, + { url = "https://files.pythonhosted.org/packages/30/3f/356a70c105d4410c29529f1ca8c53b5d176b448a4409238b4dcd133507a4/PyMuPDF-1.24.10-cp311-none-win_amd64.whl", hash = "sha256:05b8d360766b87f4abd186eba16a56b92bae513b2361b13f633fe6256329292e", size = 3214889, upload-time = "2024-09-02T16:27:28.174Z" }, + { url = "https://files.pythonhosted.org/packages/75/84/7231344d98355a40fb57c4025391dfb4116e2c3e9d98d5cc83f80c5ea942/PyMuPDF-1.24.10-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:f323aa7bb55e0214e632bfe24fa140bd5dcfeac2d3977bdce46e760385140513", size = 3230169, upload-time = "2024-09-02T16:27:37.842Z" }, + { url = "https://files.pythonhosted.org/packages/b2/bc/975b4fe4400b00c912dad1874c43d31486150e6f39d7dae758751c27e2dd/PyMuPDF-1.24.10-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:50d2972558d25ce46a8634b58787b28dbeff9b3fe4299530fc9c8c9921061e83", size = 2980118, upload-time = "2024-09-02T16:27:41.534Z" }, + { url = "https://files.pythonhosted.org/packages/5b/dc/0f22c77ac4f8e6b8316072519513d5f0111fffe96d357051db0ddf043032/PyMuPDF-1.24.10-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:0e3969c2fdff682b3b2c6a2b463adde068d6d8e20e2133ef6c8503469259646a", size = 3216830, upload-time = "2024-09-02T22:17:09.193Z" }, + { url = "https://files.pythonhosted.org/packages/a3/1b/1b41b27aab571b835f8d983492b80ed64548e3b5c4d169e23c639727d43b/PyMuPDF-1.24.10-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:cd78ee1ebefdfe72bc36fd4b731cc8c694eb8ef5337d8ea956b0e94cd88751fc", size = 3491118, upload-time = "2024-09-02T16:27:50.098Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/3c/f1ffbc6e13ab37900c2aa71e434bbba922770091242e2b059acdb14f779e/PyMuPDF-1.24.10-cp312-none-musllinux_1_2_x86_64.whl", hash = "sha256:696eed91d2ee44e76277dfeb6bd904c84ae005378588949df6ed9be9e03b9817", size = 3612589, upload-time = "2024-09-02T16:27:54.185Z" }, + { url = "https://files.pythonhosted.org/packages/53/fb/158909af75c84968ea7e6659a75fd67bd462103c599033b23ffd6bc173be/PyMuPDF-1.24.10-cp312-none-win32.whl", hash = "sha256:1e5413e1aeab2f18e1ca1b3ff17057a4a7c5cbf4ff14abc93203da88fc1a1dd8", size = 2701190, upload-time = "2024-09-02T16:27:57.74Z" }, + { url = "https://files.pythonhosted.org/packages/91/4a/4a54d3f6a779ac5eed92e82fe3c1bb426bc40f9ea57c8656839198944a82/PyMuPDF-1.24.10-cp312-none-win_amd64.whl", hash = "sha256:227a4473fce8fa32b9268da68781048795503b67dc045867fc201e1334204bf1", size = 3228084, upload-time = "2024-09-02T16:27:45.749Z" }, +] + +[[package]] +name = "pymupdfb" +version = "1.24.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c9/ff/ecfcb41414b51976974d74c8e35fef0a0e5b47c7046a11c860553f5dccf0/PyMuPDFb-1.24.10.tar.gz", hash = "sha256:007b91fa9b528c5c0eecea2e49c486ac02e878274f9e31522bdd948adc5f8327", size = 37502, upload-time = "2024-09-02T16:28:48.343Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/94/b217dc987b4ac0e3793984427112d6032563b741e27763f7761c2231d022/PyMuPDFb-1.24.10-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:cd6b24630d90dce9ab3e59d06c5e616686f8d7ec626be1311721fcb062aa0078", size = 15536229, upload-time = "2024-09-02T16:25:19.4Z" }, + { url = "https://files.pythonhosted.org/packages/16/7a/f634c76d8331cb8dedcfaced17424cc469ee20b7f53cf29c9ef17a01b461/PyMuPDFb-1.24.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fda2c34b206f724b1b5685b67188e2a57bcaa5c99bc40a0a5bc62057514c5cdf", size = 15149482, upload-time = "2024-09-02T16:25:34.352Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/97/67b5da2edd034e66dadd0ec530e277afb14fe866a3b3b01d9fad154bc6f8/PyMuPDFb-1.24.10-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4f50a7472f9bb10cbc7a1cd589ee4626ca030b8a4a02749f9a29eb6f00c0e0db", size = 15711338, upload-time = "2024-09-02T22:17:01.592Z" }, + { url = "https://files.pythonhosted.org/packages/62/b9/ad3f076e86328880797fe7e98c43b2879df56cf6cb75ac3058da06d6e6cb/PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:409f1270ef2e70d845e80149ff3db9cfed578274042316cba55cc3e3882421ea", size = 15921939, upload-time = "2024-09-02T16:26:00.118Z" }, + { url = "https://files.pythonhosted.org/packages/15/e7/02160ea905a7ba16d6e1ca51759ae1c1045785ebebae57ba30e82617f934/PyMuPDFb-1.24.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:aca96b6e9ee3096a26810592f4d899f4d3cf3cf0c902ae7e8cca09bce4d946c4", size = 17076991, upload-time = "2024-09-02T16:25:46.703Z" }, + { url = "https://files.pythonhosted.org/packages/d3/c0/e1ed840440131f71b068cdb3b620a69ec27543b1012a6bd855d8d05f1629/PyMuPDFb-1.24.10-py3-none-win32.whl", hash = "sha256:2d231b42fe3bf79837df235e7fbdf7ff8b46bf4ca1346d0f0124fb1cdd343ce8", size = 11731706, upload-time = "2024-09-02T16:26:19.131Z" }, + { url = "https://files.pythonhosted.org/packages/70/cb/8459d6c179befd7c6eee555334f054e9a6dcdd9f8671891e1da19e0ce526/PyMuPDFb-1.24.10-py3-none-win_amd64.whl", hash = "sha256:27ea65c701608b6b7632703339ca33ea6d513843b26dbe9bdefb2f56f7b9b196", size = 13186168, upload-time = "2024-09-02T16:26:10.503Z" }, +] + +[[package]] +name = "pypdfium2" +version = "4.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239, upload-time = "2024-05-09T18:33:17.552Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254, upload-time = "2024-05-09T18:32:48.653Z" }, + { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624, upload-time = "2024-05-09T18:32:51.458Z" }, + { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126, upload-time = "2024-05-09T18:32:53.581Z" }, + { url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077, upload-time = "2024-05-09T18:32:55.99Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431, upload-time = "2024-05-09T18:32:57.911Z" }, + { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008, upload-time = "2024-05-09T18:32:59.886Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543, upload-time = "2024-05-09T18:33:02.597Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911, upload-time = "2024-05-09T18:33:05.376Z" }, + { url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430, upload-time = "2024-05-09T18:33:08.067Z" }, + { url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951, upload-time = "2024-05-09T18:33:10.567Z" }, + { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098, upload-time = "2024-05-09T18:33:13.107Z" }, + { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118, upload-time = "2024-05-09T18:33:15.489Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform 
== 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + +[[package]] +name = "python-bidi" +version = "0.6.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ed/e3/c0c8bf6fca79ac946a28d57f116e3b9e5b10a4469b6f70bf73f3744c49bf/python_bidi-0.6.7.tar.gz", hash = "sha256:c10065081c0e137975de5d9ba2ff2306286dbf5e0c586d4d5aec87c856239b41", size = 45503, upload-time = "2025-10-22T09:52:49.624Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/de/c30a13ad95239507af472a5fc2cadd2e5e172055068f12ac39b37922c7f8/python_bidi-0.6.7-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a8892a7da0f617135fe9c92dc7070d13a0f96ab3081f9db7ff5b172a3905bd78", size = 274420, upload-time = "2025-10-22T09:51:58.262Z" }, + { url = "https://files.pythonhosted.org/packages/ad/9f/be5efef7eea5f1e2a6415c4052a988f594dcf5a11a15103f2718d324a35b/python_bidi-0.6.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:06650a164e63e94dc8a291cc9d415b4027cb1cce125bc9b02dac0f34d535ed47", size = 264586, upload-time = "2025-10-22T09:51:49.255Z" }, + { url = "https://files.pythonhosted.org/packages/87/ec/2c374b6de35870817ffb3512c0666ea8c3794ef923b5586c69451e0e5395/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6df7be07af867ec1d121c92ea827efad4d77b25457c06eeab477b601e82b2340", size = 293672, upload-time = "2025-10-22T09:50:58.504Z" }, + { url = "https://files.pythonhosted.org/packages/29/1a/722d7d7128bdc9a530351a0d2fdf2ff5f4af66a865a6bca925f99832e2cc/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:73a88dc333efc42281bd800d5182c8625c6e11d109fc183fe3d7a11d48ab1150", size = 302643, upload-time = "2025-10-22T09:51:06.419Z" }, + { url = "https://files.pythonhosted.org/packages/24/d7/5b9b593dd58fc745233d8476e9f4e0edd437547c78c58340619868470349/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f24189dc3aea3a0a94391a047076e1014306b39ba17d7a38ebab510553cd1a97", size = 441692, upload-time = "2025-10-22T09:51:15.39Z" }, + { url = "https://files.pythonhosted.org/packages/08/b9/16e7a1db5f022da6654e89875d231ec2e044d42ef7b635feeff61cee564c/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a507fe6928a27a308e04ebf2065719b7850d1bf9ff1924f4e601ef77758812bd", size = 326933, upload-time = "2025-10-22T09:51:23.631Z" }, + { url = "https://files.pythonhosted.org/packages/e0/a6/45aaec301292c6a07a9cc3168f5d1a92c8adc2ef36a3cd1f227b9caa980c/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbbffb948a32f9783d1a28bc0c53616f0a76736ed1e7c1d62e3e99a8dfaab869", size = 302034, upload-time = "2025-10-22T09:51:41.347Z" }, + { url = "https://files.pythonhosted.org/packages/71/a3/7e42cce6e153c21b4e5cc96d429a5910909823f6fedd174b64ff67bc76a7/python_bidi-0.6.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f7e507e1e798ebca77ddc9774fd405107833315ad802cfdaa1ab07b6d9154fc8", size = 315738, upload-time = "2025-10-22T09:51:33.409Z" }, + { url = "https://files.pythonhosted.org/packages/43/7c/a5e4c0acc8e6ca61953b4add0576f0483f63b809b5389154e5da13927b0b/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:849a57d39feaf897955d0b19bbf4796bea53d1bcdf83b82e0a7b059167eb2049", size = 473968, upload-time = "2025-10-22T09:52:07.624Z" }, + { url = "https://files.pythonhosted.org/packages/b1/aa/a18bc3cbab7a0e598cbe7b89f2c0913aedcc66dcafce9a4c357465c87859/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:5ebc19f24e65a1f5c472e26d88e78b9d316e293bc6f205f32de4c4e99276336e", size = 567038, upload-time = "2025-10-22T09:52:18.594Z" }, + { url = "https://files.pythonhosted.org/packages/92/46/fc6c54a8b5bfbee50e650f885ddef4f8c4f92880467ea0bc2bf133747048/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:24388c77cb00b8aa0f9c84beb7e3e523a3dac4f786ece64a1d8175a07b24da72", size = 493970, upload-time = "2025-10-22T09:52:29.815Z" }, + { url = "https://files.pythonhosted.org/packages/e3/f1/2c15f5b938b2e087e4e950cc14dcead5bedbaabfc6c576dac15739bc0c91/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:19737d217088ef27014f98eac1827c5913e6fb1dea96332ed84ede61791070d9", size = 465161, upload-time = "2025-10-22T09:52:40.517Z" }, + { url = "https://files.pythonhosted.org/packages/56/d7/73a70a1fb819152485521b8dfe627e14ba9d3d5a65213244ab099adf3600/python_bidi-0.6.7-cp311-cp311-win32.whl", hash = "sha256:95c9de7ebc55ffb777548f2ecaf4b96b0fa0c92f42bf4d897b9f4cd164ec7394", size = 157033, upload-time = "2025-10-22T09:52:59.228Z" }, + { url = "https://files.pythonhosted.org/packages/68/84/06999dc54ea047fe33209af7150df4202ab7ad52deeb66b2c2040ac07884/python_bidi-0.6.7-cp311-cp311-win_amd64.whl", hash = "sha256:898db0ea3e4aaa95b7fecba02a7560dfbf368f9d85053f2875f6d610c4d4ec2c", size = 161282, upload-time = "2025-10-22T09:52:51.467Z" }, + { url = "https://files.pythonhosted.org/packages/e5/03/5b2f3e73501d0f41ebc2b075b49473047c6cdfc3465cf890263fc69e3915/python_bidi-0.6.7-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:11c51579e01f768446a7e13a0059fea1530936a707abcbeaad9467a55cb16073", size = 272536, upload-time = "2025-10-22T09:51:59.721Z" 
}, + { url = "https://files.pythonhosted.org/packages/31/77/c6048e938a73e5a7c6fa3d5e3627a5961109daa728c2e7d050567cecdc26/python_bidi-0.6.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47deaada8949af3a790f2cd73b613f9bfa153b4c9450f91c44a60c3109a81f73", size = 263258, upload-time = "2025-10-22T09:51:50.328Z" }, + { url = "https://files.pythonhosted.org/packages/57/56/ed4dc501cab7de70ce35cd435c86278e4eb1caf238c80bc72297767c9219/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b38ddfab41d10e780edb431edc30aec89bee4ce43d718e3896e99f33dae5c1d3", size = 292700, upload-time = "2025-10-22T09:50:59.628Z" }, + { url = "https://files.pythonhosted.org/packages/77/6a/1bf06d7544c940ffddd97cd0e02c55348a92163c5495fa18e34217dfbebe/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2a93b0394cc684d64356b0475858c116f1e335ffbaba388db93bf47307deadfa", size = 300881, upload-time = "2025-10-22T09:51:07.507Z" }, + { url = "https://files.pythonhosted.org/packages/22/1d/ce7577a8f50291c06e94f651ac5de0d1678fc2642af26a5dad9901a0244f/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ec1694134961b71ac05241ac989b49ccf08e232b5834d5fc46f8a7c3bb1c13a9", size = 439125, upload-time = "2025-10-22T09:51:16.559Z" }, + { url = "https://files.pythonhosted.org/packages/a3/87/4cf6dcd58e22f0fd904e7a161c6b73a5f9d17d4d49073fcb089ba62f1469/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8047c33b85f7790474a1f488bef95689f049976a4e1c6f213a8d075d180a93e4", size = 325816, upload-time = "2025-10-22T09:51:25.12Z" }, + { url = "https://files.pythonhosted.org/packages/2a/0a/4028a088e29ce8f1673e85ec9f64204fc368355c3207e6a71619c2b4579a/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d9de35eb5987da27dd81e371c52142dd8e924bd61c1006003071ea05a735587", size = 300550, upload-time = 
"2025-10-22T09:51:42.739Z" }, + { url = "https://files.pythonhosted.org/packages/1f/05/cac15eba462d5a2407ac4ef1c792c45a948652b00c6bd81eaab3834a62d2/python_bidi-0.6.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a99d898ad1a399d9c8cab5561b3667fd24f4385820ac90c3340aa637aa5adfc9", size = 313017, upload-time = "2025-10-22T09:51:34.905Z" }, + { url = "https://files.pythonhosted.org/packages/4b/b1/3ba91b9ea60fa54a9aa730a5fe432bd73095d55be371244584fc6818eae1/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5debaab33562fdfc79ffdbd8d9c51cf07b8529de0e889d8cd145d78137aab21e", size = 472798, upload-time = "2025-10-22T09:52:09.079Z" }, + { url = "https://files.pythonhosted.org/packages/50/40/4bf5fb7255e35c218174f322a4d4c80b63b2604d73adc6e32f843e700824/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c11c62a3cdb9d1426b1536de9e3446cb09c7d025bd4df125275cae221f214899", size = 565234, upload-time = "2025-10-22T09:52:19.703Z" }, + { url = "https://files.pythonhosted.org/packages/bd/81/ad23fb85bff69d0a25729cd3834254b87c3c7caa93d657c8f8edcbed08f6/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6c051f2d28ca542092d01da8b5fe110fb6191ff58d298a54a93dc183bece63bf", size = 491844, upload-time = "2025-10-22T09:52:31.216Z" }, + { url = "https://files.pythonhosted.org/packages/65/85/103baaf142b2838f583b71904a2454fa31bd2a912ff505c25874f45d6c3e/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:95867a07c5dee0ea2340fe1d0e4f6d9f5c5687d473193b6ee6f86fa44aac45d1", size = 463753, upload-time = "2025-10-22T09:52:41.943Z" }, + { url = "https://files.pythonhosted.org/packages/54/c3/6a5c3b9f42a6b188430c83a7e70a76bc7c0db3354302fce7c8ed94a0c062/python_bidi-0.6.7-cp312-cp312-win32.whl", hash = "sha256:4c73cd980d45bb967799c7f0fc98ea93ae3d65b21ef2ba6abef6a057720bf483", size = 155820, upload-time = "2025-10-22T09:53:00.254Z" }, + { url = 
"https://files.pythonhosted.org/packages/45/c4/683216398ee3abf6b9bb0f26ae15c696fabbe36468ba26d5271f0c11b343/python_bidi-0.6.7-cp312-cp312-win_amd64.whl", hash = "sha256:d524a4ba765bae9b950706472a77a887a525ed21144fe4b41f6190f6e57caa2c", size = 159966, upload-time = "2025-10-22T09:52:52.547Z" }, + { url = "https://files.pythonhosted.org/packages/b8/4e/6135798d84b62eea70c0f9435301c2a4ba854e87be93a3fcd1d935266d24/python_bidi-0.6.7-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c9a679b24f5c6f366a0dec75745e1abeae2f597f033d0d54c74cbe62e7e6ae28", size = 276275, upload-time = "2025-10-22T09:52:05.078Z" }, + { url = "https://files.pythonhosted.org/packages/74/83/2123596d43e552af9e2806e361646fa579f34a1d1e9e2c1707a0ab6a02dd/python_bidi-0.6.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:05fe5971110013610f0db40505d0b204edc756e92eafac1372a464f8b9162b11", size = 266951, upload-time = "2025-10-22T09:51:56.216Z" }, + { url = "https://files.pythonhosted.org/packages/5c/8c/8d1e1501717227a6d52fc7b9c47a3de61486b024fbdd4821bfad724c0699/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17572944e6d8fb616d111fc702c759da2bf7cedab85a3e4fa2af0c9eb95ed438", size = 295745, upload-time = "2025-10-22T09:51:04.438Z" }, + { url = "https://files.pythonhosted.org/packages/fd/ff/ef04e7f9067c2c5d862b9f8d9a192486c500c8aa295f0fb756c25ab47fc8/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3b63d19f3f56ff7f99bce5ca9ef8c811dbf0f509d8e84c1bc06105ed26a49528", size = 304123, upload-time = "2025-10-22T09:51:12.559Z" }, + { url = "https://files.pythonhosted.org/packages/be/72/b973895e257a7d4cc8365ab094612f6ee885df863a4964d8865b9f534b67/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1350033431d75be749273236dcfc808e54404cd6ece6204cdb1bc4ccc163455", size = 442484, upload-time = "2025-10-22T09:51:21.575Z" }, + { url = 
"https://files.pythonhosted.org/packages/c1/1a/68ca9d10bc309828e8cdb2d57a30dd7e5753ac8520c8d7a0322daeb9eef7/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c5fb99f774748de283fadf915106f130b74be1bade934b7f73a7a8488b95da1", size = 329149, upload-time = "2025-10-22T09:51:31.232Z" }, + { url = "https://files.pythonhosted.org/packages/03/40/ab450c06167a7de596d99b1ba5cee2c605b3ff184baccf08210ede706b1b/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d28e2bdcadf5b6161bb4ee9313ce41eac746ba57e744168bf723a415a11af05", size = 303529, upload-time = "2025-10-22T09:51:46.997Z" }, + { url = "https://files.pythonhosted.org/packages/ec/c5/585b5c413e3b77a32500fb877ea30aa23c45a6064dbd7fe77d87b72cd90b/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3777ae3e088e94df854fbcbd8d59f9239b74aac036cb6bbd19f8035c8e42478", size = 317753, upload-time = "2025-10-22T09:51:39.272Z" }, + { url = "https://files.pythonhosted.org/packages/f9/05/b7b4b447890d614ccb40633f4d65f334bcf9fe3ad13be33aaa54dcbc34f3/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:77bb4cbadf4121db395189065c58c9dd5d1950257cc1983004e6df4a3e2f97ad", size = 476054, upload-time = "2025-10-22T09:52:15.856Z" }, + { url = "https://files.pythonhosted.org/packages/ca/94/64f6d2c09c4426918345b54ca8902f94b663eadd744c9dd89070f546c9bc/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:f1fe71c203f66bc169a393964d5702f9251cfd4d70279cb6453fdd42bd2e675f", size = 568365, upload-time = "2025-10-22T09:52:27.556Z" }, + { url = "https://files.pythonhosted.org/packages/fc/d2/c39a6b82aa0fcedac7cbe6078b78bb9089b43d903f8e00859e42b504bb8e/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:d87ed09e5c9b6d2648e8856a4e556147b9d3cd4d63905fa664dd6706bc414256", size = 495292, upload-time = "2025-10-22T09:52:38.306Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/8d/a80f37ab92118e305d7b574306553599f81534c50b4eb23ef34ebe09c09c/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:766d5f5a686eb99b53168a7bdfb338035931a609bdbbcb537cef9e050a86f359", size = 467159, upload-time = "2025-10-22T09:52:48.603Z" }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + +[[package]] +name = "python-docx" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/f7/eddfe33871520adab45aaa1a71f0402a2252050c14c7e3009446c8f4701c/python_docx-1.2.0.tar.gz", hash = "sha256:7bc9d7b7d8a69c9c02ca09216118c86552704edc23bac179283f2e38f86220ce", size = 5723256, upload-time = "2025-06-16T20:46:27.921Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" }, +] + +[[package]] +name = "python-dotenv" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135, upload-time = "2026-03-01T16:00:26.196Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, +] + +[[package]] +name = "python-pptx" +version = "1.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, + { name = "pillow" }, + { name = "typing-extensions" }, + { name = "xlsxwriter" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/a9/0c0db8d37b2b8a645666f7fd8accea4c6224e013c42b1d5c17c93590cd06/python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095", size = 10109297, upload-time = "2024-08-07T17:33:37.772Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, +] + +[[package]] +name = "pytz" +version = "2026.1.post1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/db/b8721d71d945e6a8ac63c0fc900b2067181dbb50805958d4d4661cf7d277/pytz-2026.1.post1.tar.gz", hash = "sha256:3378dde6a0c3d26719182142c56e60c7f9af7e968076f31aae569d72a0358ee1", size = 321088, upload-time = "2026-03-03T07:47:50.683Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/10/99/781fe0c827be2742bcc775efefccb3b048a3a9c6ce9aec0cbf4a101677e5/pytz-2026.1.post1-py2.py3-none-any.whl", hash = "sha256:f2fd16142fda348286a75e1a524be810bb05d444e5a081f37f7affc635035f7a", size = 510489, upload-time = "2026-03-03T07:47:49.167Z" }, +] + +[[package]] +name = "pywin32" +version = "311" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/af/449a6a91e5d6db51420875c54f6aff7c97a86a3b13a0b4f1a5c13b988de3/pywin32-311-cp311-cp311-win32.whl", hash = "sha256:184eb5e436dea364dcd3d2316d577d625c0351bf237c4e9a5fabbcfa5a58b151", size = 8697031, upload-time = "2025-07-14T20:13:13.266Z" }, + { url = "https://files.pythonhosted.org/packages/51/8f/9bb81dd5bb77d22243d33c8397f09377056d5c687aa6d4042bea7fbf8364/pywin32-311-cp311-cp311-win_amd64.whl", hash = "sha256:3ce80b34b22b17ccbd937a6e78e7225d80c52f5ab9940fe0506a1a16f3dab503", size = 9508308, upload-time = "2025-07-14T20:13:15.147Z" }, + { url = "https://files.pythonhosted.org/packages/44/7b/9c2ab54f74a138c491aba1b1cd0795ba61f144c711daea84a88b63dc0f6c/pywin32-311-cp311-cp311-win_arm64.whl", hash = "sha256:a733f1388e1a842abb67ffa8e7aad0e70ac519e09b0f6a784e65a136ec7cefd2", size = 8703930, upload-time = "2025-07-14T20:13:16.945Z" }, + { url = "https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" }, + { url = 
"https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" }, + { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" }, + { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" }, + { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = 
"2025-09-25T21:32:03.376Z" }, + { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" }, + { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" }, + { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" }, + { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" }, + { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" }, + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, +] + +[[package]] +name = "referencing" +version = "0.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, +] + +[[package]] +name = "regex" +version = "2026.2.28" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/71/41455aa99a5a5ac1eaf311f5d8efd9ce6433c03ac1e0962de163350d0d97/regex-2026.2.28.tar.gz", hash = 
"sha256:a729e47d418ea11d03469f321aaf67cdee8954cde3ff2cf8403ab87951ad10f2", size = 415184, upload-time = "2026-02-28T02:19:42.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/db/8cbfd0ba3f302f2d09dd0019a9fcab74b63fee77a76c937d0e33161fb8c1/regex-2026.2.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e621fb7c8dc147419b28e1702f58a0177ff8308a76fa295c71f3e7827849f5d9", size = 488462, upload-time = "2026-02-28T02:16:22.616Z" }, + { url = "https://files.pythonhosted.org/packages/5d/10/ccc22c52802223f2368731964ddd117799e1390ffc39dbb31634a83022ee/regex-2026.2.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0d5bef2031cbf38757a0b0bc4298bb4824b6332d28edc16b39247228fbdbad97", size = 290774, upload-time = "2026-02-28T02:16:23.993Z" }, + { url = "https://files.pythonhosted.org/packages/62/b9/6796b3bf3101e64117201aaa3a5a030ec677ecf34b3cd6141b5d5c6c67d5/regex-2026.2.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bcb399ed84eabf4282587ba151f2732ad8168e66f1d3f85b1d038868fe547703", size = 288724, upload-time = "2026-02-28T02:16:25.403Z" }, + { url = "https://files.pythonhosted.org/packages/9c/02/291c0ae3f3a10cea941d0f5366da1843d8d1fa8a25b0671e20a0e454bb38/regex-2026.2.28-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7c1b34dfa72f826f535b20712afa9bb3ba580020e834f3c69866c5bddbf10098", size = 791924, upload-time = "2026-02-28T02:16:26.863Z" }, + { url = "https://files.pythonhosted.org/packages/0f/57/f0235cc520d9672742196c5c15098f8f703f2758d48d5a7465a56333e496/regex-2026.2.28-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:851fa70df44325e1e4cdb79c5e676e91a78147b1b543db2aec8734d2add30ec2", size = 860095, upload-time = "2026-02-28T02:16:28.772Z" }, + { url = 
"https://files.pythonhosted.org/packages/b3/7c/393c94cbedda79a0f5f2435ebd01644aba0b338d327eb24b4aa5b8d6c07f/regex-2026.2.28-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:516604edd17b1c2c3e579cf4e9b25a53bf8fa6e7cedddf1127804d3e0140ca64", size = 906583, upload-time = "2026-02-28T02:16:30.977Z" }, + { url = "https://files.pythonhosted.org/packages/2c/73/a72820f47ca5abf2b5d911d0407ba5178fc52cf9780191ed3a54f5f419a2/regex-2026.2.28-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e7ce83654d1ab701cb619285a18a8e5a889c1216d746ddc710c914ca5fd71022", size = 800234, upload-time = "2026-02-28T02:16:32.55Z" }, + { url = "https://files.pythonhosted.org/packages/34/b3/6e6a4b7b31fa998c4cf159a12cbeaf356386fbd1a8be743b1e80a3da51e4/regex-2026.2.28-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2791948f7c70bb9335a9102df45e93d428f4b8128020d85920223925d73b9e1", size = 772803, upload-time = "2026-02-28T02:16:34.029Z" }, + { url = "https://files.pythonhosted.org/packages/10/e7/5da0280c765d5a92af5e1cd324b3fe8464303189cbaa449de9a71910e273/regex-2026.2.28-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:03a83cc26aa2acda6b8b9dfe748cf9e84cbd390c424a1de34fdcef58961a297a", size = 781117, upload-time = "2026-02-28T02:16:36.253Z" }, + { url = "https://files.pythonhosted.org/packages/76/39/0b8d7efb256ae34e1b8157acc1afd8758048a1cf0196e1aec2e71fd99f4b/regex-2026.2.28-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ec6f5674c5dc836994f50f1186dd1fafde4be0666aae201ae2fcc3d29d8adf27", size = 854224, upload-time = "2026-02-28T02:16:38.119Z" }, + { url = "https://files.pythonhosted.org/packages/21/ff/a96d483ebe8fe6d1c67907729202313895d8de8495569ec319c6f29d0438/regex-2026.2.28-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:50c2fc924749543e0eacc93ada6aeeb3ea5f6715825624baa0dccaec771668ae", size = 761898, upload-time = "2026-02-28T02:16:40.333Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/bd/d4f2e75cb4a54b484e796017e37c0d09d8a0a837de43d17e238adf163f4e/regex-2026.2.28-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ba55c50f408fb5c346a3a02d2ce0ebc839784e24f7c9684fde328ff063c3cdea", size = 844832, upload-time = "2026-02-28T02:16:41.875Z" }, + { url = "https://files.pythonhosted.org/packages/8a/a7/428a135cf5e15e4e11d1e696eb2bf968362f8ea8a5f237122e96bc2ae950/regex-2026.2.28-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:edb1b1b3a5576c56f08ac46f108c40333f222ebfd5cf63afdfa3aab0791ebe5b", size = 788347, upload-time = "2026-02-28T02:16:43.472Z" }, + { url = "https://files.pythonhosted.org/packages/a9/59/68691428851cf9c9c3707217ab1d9b47cfeec9d153a49919e6c368b9e926/regex-2026.2.28-cp311-cp311-win32.whl", hash = "sha256:948c12ef30ecedb128903c2c2678b339746eb7c689c5c21957c4a23950c96d15", size = 266033, upload-time = "2026-02-28T02:16:45.094Z" }, + { url = "https://files.pythonhosted.org/packages/42/8b/1483de1c57024e89296cbcceb9cccb3f625d416ddb46e570be185c9b05a9/regex-2026.2.28-cp311-cp311-win_amd64.whl", hash = "sha256:fd63453f10d29097cc3dc62d070746523973fb5aa1c66d25f8558bebd47fed61", size = 277978, upload-time = "2026-02-28T02:16:46.75Z" }, + { url = "https://files.pythonhosted.org/packages/a4/36/abec45dc6e7252e3dbc797120496e43bb5730a7abf0d9cb69340696a2f2d/regex-2026.2.28-cp311-cp311-win_arm64.whl", hash = "sha256:00f2b8d9615aa165fdff0a13f1a92049bfad555ee91e20d246a51aa0b556c60a", size = 270340, upload-time = "2026-02-28T02:16:48.626Z" }, + { url = "https://files.pythonhosted.org/packages/07/42/9061b03cf0fc4b5fa2c3984cbbaed54324377e440a5c5a29d29a72518d62/regex-2026.2.28-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fcf26c3c6d0da98fada8ae4ef0aa1c3405a431c0a77eb17306d38a89b02adcd7", size = 489574, upload-time = "2026-02-28T02:16:50.455Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/83/0c8a5623a233015595e3da499c5a1c13720ac63c107897a6037bb97af248/regex-2026.2.28-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:02473c954af35dd2defeb07e44182f5705b30ea3f351a7cbffa9177beb14da5d", size = 291426, upload-time = "2026-02-28T02:16:52.52Z" }, + { url = "https://files.pythonhosted.org/packages/9e/06/3ef1ac6910dc3295ebd71b1f9bfa737e82cfead211a18b319d45f85ddd09/regex-2026.2.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9b65d33a17101569f86d9c5966a8b1d7fbf8afdda5a8aa219301b0a80f58cf7d", size = 289200, upload-time = "2026-02-28T02:16:54.08Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c9/8cc8d850b35ab5650ff6756a1cb85286e2000b66c97520b29c1587455344/regex-2026.2.28-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e71dcecaa113eebcc96622c17692672c2d104b1d71ddf7adeda90da7ddeb26fc", size = 796765, upload-time = "2026-02-28T02:16:55.905Z" }, + { url = "https://files.pythonhosted.org/packages/e9/5d/57702597627fc23278ebf36fbb497ac91c0ce7fec89ac6c81e420ca3e38c/regex-2026.2.28-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:481df4623fa4969c8b11f3433ed7d5e3dc9cec0f008356c3212b3933fb77e3d8", size = 863093, upload-time = "2026-02-28T02:16:58.094Z" }, + { url = "https://files.pythonhosted.org/packages/02/6d/f3ecad537ca2811b4d26b54ca848cf70e04fcfc138667c146a9f3157779c/regex-2026.2.28-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:64e7c6ad614573e0640f271e811a408d79a9e1fe62a46adb602f598df42a818d", size = 909455, upload-time = "2026-02-28T02:17:00.918Z" }, + { url = "https://files.pythonhosted.org/packages/9e/40/bb226f203caa22c1043c1ca79b36340156eca0f6a6742b46c3bb222a3a57/regex-2026.2.28-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6b08a06976ff4fb0d83077022fde3eca06c55432bb997d8c0495b9a4e9872f4", size = 
802037, upload-time = "2026-02-28T02:17:02.842Z" }, + { url = "https://files.pythonhosted.org/packages/44/7c/c6d91d8911ac6803b45ca968e8e500c46934e58c0903cbc6d760ee817a0a/regex-2026.2.28-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:864cdd1a2ef5716b0ab468af40139e62ede1b3a53386b375ec0786bb6783fc05", size = 775113, upload-time = "2026-02-28T02:17:04.506Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8d/4a9368d168d47abd4158580b8c848709667b1cd293ff0c0c277279543bd0/regex-2026.2.28-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:511f7419f7afab475fd4d639d4aedfc54205bcb0800066753ef68a59f0f330b5", size = 784194, upload-time = "2026-02-28T02:17:06.888Z" }, + { url = "https://files.pythonhosted.org/packages/cc/bf/2c72ab5d8b7be462cb1651b5cc333da1d0068740342f350fcca3bca31947/regex-2026.2.28-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:b42f7466e32bf15a961cf09f35fa6323cc72e64d3d2c990b10de1274a5da0a59", size = 856846, upload-time = "2026-02-28T02:17:09.11Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f4/6b65c979bb6d09f51bb2d2a7bc85de73c01ec73335d7ddd202dcb8cd1c8f/regex-2026.2.28-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8710d61737b0c0ce6836b1da7109f20d495e49b3809f30e27e9560be67a257bf", size = 763516, upload-time = "2026-02-28T02:17:11.004Z" }, + { url = "https://files.pythonhosted.org/packages/8e/32/29ea5e27400ee86d2cc2b4e80aa059df04eaf78b4f0c18576ae077aeff68/regex-2026.2.28-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4390c365fd2d45278f45afd4673cb90f7285f5701607e3ad4274df08e36140ae", size = 849278, upload-time = "2026-02-28T02:17:12.693Z" }, + { url = "https://files.pythonhosted.org/packages/1d/91/3233d03b5f865111cd517e1c95ee8b43e8b428d61fa73764a80c9bb6f537/regex-2026.2.28-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cb3b1db8ff6c7b8bf838ab05583ea15230cb2f678e569ab0e3a24d1e8320940b", size = 790068, upload-time = "2026-02-28T02:17:14.9Z" }, + { url = 
"https://files.pythonhosted.org/packages/76/92/abc706c1fb03b4580a09645b206a3fc032f5a9f457bc1a8038ac555658ab/regex-2026.2.28-cp312-cp312-win32.whl", hash = "sha256:f8ed9a5d4612df9d4de15878f0bc6aa7a268afbe5af21a3fdd97fa19516e978c", size = 266416, upload-time = "2026-02-28T02:17:17.15Z" }, + { url = "https://files.pythonhosted.org/packages/fa/06/2a6f7dff190e5fa9df9fb4acf2fdf17a1aa0f7f54596cba8de608db56b3a/regex-2026.2.28-cp312-cp312-win_amd64.whl", hash = "sha256:01d65fd24206c8e1e97e2e31b286c59009636c022eb5d003f52760b0f42155d4", size = 277297, upload-time = "2026-02-28T02:17:18.723Z" }, + { url = "https://files.pythonhosted.org/packages/b7/f0/58a2484851fadf284458fdbd728f580d55c1abac059ae9f048c63b92f427/regex-2026.2.28-cp312-cp312-win_arm64.whl", hash = "sha256:c0b5ccbb8ffb433939d248707d4a8b31993cb76ab1a0187ca886bf50e96df952", size = 270408, upload-time = "2026-02-28T02:17:20.328Z" }, +] + +[[package]] +name = "requests" +version = "2.32.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, +] + +[[package]] +name = "rich" +version = "14.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, +] + +[[package]] +name = "rpds-py" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/6e/f964e88b3d2abee2a82c1ac8366da848fce1c6d834dc2132c3fda3970290/rpds_py-0.30.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425", size = 370157, upload-time = "2025-11-30T20:21:53.789Z" }, + { url = "https://files.pythonhosted.org/packages/94/ba/24e5ebb7c1c82e74c4e4f33b2112a5573ddc703915b13a073737b59b86e0/rpds_py-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d", size = 359676, upload-time = "2025-11-30T20:21:55.475Z" }, + { url = "https://files.pythonhosted.org/packages/84/86/04dbba1b087227747d64d80c3b74df946b986c57af0a9f0c98726d4d7a3b/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4", size = 389938, upload-time = "2025-11-30T20:21:57.079Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/bb/1463f0b1722b7f45431bdd468301991d1328b16cffe0b1c2918eba2c4eee/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f", size = 402932, upload-time = "2025-11-30T20:21:58.47Z" }, + { url = "https://files.pythonhosted.org/packages/99/ee/2520700a5c1f2d76631f948b0736cdf9b0acb25abd0ca8e889b5c62ac2e3/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4", size = 525830, upload-time = "2025-11-30T20:21:59.699Z" }, + { url = "https://files.pythonhosted.org/packages/e0/ad/bd0331f740f5705cc555a5e17fdf334671262160270962e69a2bdef3bf76/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97", size = 412033, upload-time = "2025-11-30T20:22:00.991Z" }, + { url = "https://files.pythonhosted.org/packages/f8/1e/372195d326549bb51f0ba0f2ecb9874579906b97e08880e7a65c3bef1a99/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89", size = 390828, upload-time = "2025-11-30T20:22:02.723Z" }, + { url = "https://files.pythonhosted.org/packages/ab/2b/d88bb33294e3e0c76bc8f351a3721212713629ffca1700fa94979cb3eae8/rpds_py-0.30.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d", size = 404683, upload-time = "2025-11-30T20:22:04.367Z" }, + { url = "https://files.pythonhosted.org/packages/50/32/c759a8d42bcb5289c1fac697cd92f6fe01a018dd937e62ae77e0e7f15702/rpds_py-0.30.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038", size = 421583, upload-time = "2025-11-30T20:22:05.814Z" }, + { url = 
"https://files.pythonhosted.org/packages/2b/81/e729761dbd55ddf5d84ec4ff1f47857f4374b0f19bdabfcf929164da3e24/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7", size = 572496, upload-time = "2025-11-30T20:22:07.713Z" }, + { url = "https://files.pythonhosted.org/packages/14/f6/69066a924c3557c9c30baa6ec3a0aa07526305684c6f86c696b08860726c/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed", size = 598669, upload-time = "2025-11-30T20:22:09.312Z" }, + { url = "https://files.pythonhosted.org/packages/5f/48/905896b1eb8a05630d20333d1d8ffd162394127b74ce0b0784ae04498d32/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85", size = 561011, upload-time = "2025-11-30T20:22:11.309Z" }, + { url = "https://files.pythonhosted.org/packages/22/16/cd3027c7e279d22e5eb431dd3c0fbc677bed58797fe7581e148f3f68818b/rpds_py-0.30.0-cp311-cp311-win32.whl", hash = "sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c", size = 221406, upload-time = "2025-11-30T20:22:13.101Z" }, + { url = "https://files.pythonhosted.org/packages/fa/5b/e7b7aa136f28462b344e652ee010d4de26ee9fd16f1bfd5811f5153ccf89/rpds_py-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825", size = 236024, upload-time = "2025-11-30T20:22:14.853Z" }, + { url = "https://files.pythonhosted.org/packages/14/a6/364bba985e4c13658edb156640608f2c9e1d3ea3c81b27aa9d889fff0e31/rpds_py-0.30.0-cp311-cp311-win_arm64.whl", hash = "sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229", size = 229069, upload-time = "2025-11-30T20:22:16.577Z" }, + { url = 
"https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" }, + { url = "https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" }, + { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" }, + { url = "https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" }, + { url = "https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" }, + { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" }, + { url = 
"https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" }, + { url = "https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" }, + { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" }, + { url = "https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" }, + { url = "https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" }, + { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/ab/d5d5e3bcedb0a77f4f613706b750e50a5a3ba1c15ccd3665ecc636c968fd/rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf", size = 223782, upload-time = "2025-11-30T20:22:37.271Z" }, + { url = "https://files.pythonhosted.org/packages/39/3b/f786af9957306fdc38a74cef405b7b93180f481fb48453a114bb6465744a/rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b", size = 240463, upload-time = "2025-11-30T20:22:39.021Z" }, + { url = "https://files.pythonhosted.org/packages/f3/d2/b91dc748126c1559042cfe41990deb92c4ee3e2b415f6b5234969ffaf0cc/rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e", size = 230868, upload-time = "2025-11-30T20:22:40.493Z" }, + { url = "https://files.pythonhosted.org/packages/69/71/3f34339ee70521864411f8b6992e7ab13ac30d8e4e3309e07c7361767d91/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58", size = 372292, upload-time = "2025-11-30T20:24:16.537Z" }, + { url = "https://files.pythonhosted.org/packages/57/09/f183df9b8f2d66720d2ef71075c59f7e1b336bec7ee4c48f0a2b06857653/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a", size = 362128, upload-time = "2025-11-30T20:24:18.086Z" }, + { url = "https://files.pythonhosted.org/packages/7a/68/5c2594e937253457342e078f0cc1ded3dd7b2ad59afdbf2d354869110a02/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb", size = 391542, upload-time = "2025-11-30T20:24:20.092Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/5c/31ef1afd70b4b4fbdb2800249f34c57c64beb687495b10aec0365f53dfc4/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c", size = 404004, upload-time = "2025-11-30T20:24:22.231Z" }, + { url = "https://files.pythonhosted.org/packages/e3/63/0cfbea38d05756f3440ce6534d51a491d26176ac045e2707adc99bb6e60a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3", size = 527063, upload-time = "2025-11-30T20:24:24.302Z" }, + { url = "https://files.pythonhosted.org/packages/42/e6/01e1f72a2456678b0f618fc9a1a13f882061690893c192fcad9f2926553a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5", size = 413099, upload-time = "2025-11-30T20:24:25.916Z" }, + { url = "https://files.pythonhosted.org/packages/b8/25/8df56677f209003dcbb180765520c544525e3ef21ea72279c98b9aa7c7fb/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738", size = 392177, upload-time = "2025-11-30T20:24:27.834Z" }, + { url = "https://files.pythonhosted.org/packages/4a/b4/0a771378c5f16f8115f796d1f437950158679bcd2a7c68cf251cfb00ed5b/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f", size = 406015, upload-time = "2025-11-30T20:24:29.457Z" }, + { url = "https://files.pythonhosted.org/packages/36/d8/456dbba0af75049dc6f63ff295a2f92766b9d521fa00de67a2bd6427d57a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877", size = 423736, upload-time = 
"2025-11-30T20:24:31.22Z" }, + { url = "https://files.pythonhosted.org/packages/13/64/b4d76f227d5c45a7e0b796c674fd81b0a6c4fbd48dc29271857d8219571c/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a", size = 573981, upload-time = "2025-11-30T20:24:32.934Z" }, + { url = "https://files.pythonhosted.org/packages/20/91/092bacadeda3edf92bf743cc96a7be133e13a39cdbfd7b5082e7ab638406/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4", size = 599782, upload-time = "2025-11-30T20:24:35.169Z" }, + { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" }, +] + +[[package]] +name = "rtree" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/09/7302695875a019514de9a5dd17b8320e7a19d6e7bc8f85dcfb79a4ce2da3/rtree-1.4.1.tar.gz", hash = "sha256:c6b1b3550881e57ebe530cc6cffefc87cd9bf49c30b37b894065a9f810875e46", size = 52425, upload-time = "2025-08-13T19:32:01.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/d9/108cd989a4c0954e60b3cdc86fd2826407702b5375f6dfdab2802e5fed98/rtree-1.4.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d672184298527522d4914d8ae53bf76982b86ca420b0acde9298a7a87d81d4a4", size = 468484, upload-time = "2025-08-13T19:31:50.593Z" }, + { url = "https://files.pythonhosted.org/packages/f3/cf/2710b6fd6b07ea0aef317b29f335790ba6adf06a28ac236078ed9bd8a91d/rtree-1.4.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a7e48d805e12011c2cf739a29d6a60ae852fb1de9fc84220bbcef67e6e595d7d", size = 436325, upload-time = "2025-08-13T19:31:52.367Z" }, + 
{ url = "https://files.pythonhosted.org/packages/55/e1/4d075268a46e68db3cac51846eb6a3ab96ed481c585c5a1ad411b3c23aad/rtree-1.4.1-py3-none-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:efa8c4496e31e9ad58ff6c7df89abceac7022d906cb64a3e18e4fceae6b77f65", size = 459789, upload-time = "2025-08-13T19:31:53.926Z" }, + { url = "https://files.pythonhosted.org/packages/d1/75/e5d44be90525cd28503e7f836d077ae6663ec0687a13ba7810b4114b3668/rtree-1.4.1-py3-none-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12de4578f1b3381a93a655846900be4e3d5f4cd5e306b8b00aa77c1121dc7e8c", size = 507644, upload-time = "2025-08-13T19:31:55.164Z" }, + { url = "https://files.pythonhosted.org/packages/fd/85/b8684f769a142163b52859a38a486493b05bafb4f2fb71d4f945de28ebf9/rtree-1.4.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b558edda52eca3e6d1ee629042192c65e6b7f2c150d6d6cd207ce82f85be3967", size = 1454478, upload-time = "2025-08-13T19:31:56.808Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a4/c2292b95246b9165cc43a0c3757e80995d58bc9b43da5cb47ad6e3535213/rtree-1.4.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f155bc8d6bac9dcd383481dee8c130947a4866db1d16cb6dff442329a038a0dc", size = 1555140, upload-time = "2025-08-13T19:31:58.031Z" }, + { url = "https://files.pythonhosted.org/packages/74/25/5282c8270bfcd620d3e73beb35b40ac4ab00f0a898d98ebeb41ef0989ec8/rtree-1.4.1-py3-none-win_amd64.whl", hash = "sha256:efe125f416fd27150197ab8521158662943a40f87acab8028a1aac4ad667a489", size = 389358, upload-time = "2025-08-13T19:31:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/3f/50/0a9e7e7afe7339bd5e36911f0ceb15fed51945836ed803ae5afd661057fd/rtree-1.4.1-py3-none-win_arm64.whl", hash = "sha256:3d46f55729b28138e897ffef32f7ce93ac335cb67f9120125ad3742a220800f0", size = 355253, upload-time = "2025-08-13T19:32:00.296Z" }, +] + +[[package]] +name = "safetensors" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/29/9c/6e74567782559a63bd040a236edca26fd71bc7ba88de2ef35d75df3bca5e/safetensors-0.7.0.tar.gz", hash = "sha256:07663963b67e8bd9f0b8ad15bb9163606cd27cc5a1b96235a50d8369803b96b0", size = 200878, upload-time = "2025-11-19T15:18:43.199Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/47/aef6c06649039accf914afef490268e1067ed82be62bcfa5b7e886ad15e8/safetensors-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c82f4d474cf725255d9e6acf17252991c3c8aac038d6ef363a4bf8be2f6db517", size = 467781, upload-time = "2025-11-19T15:18:35.84Z" }, + { url = "https://files.pythonhosted.org/packages/e8/00/374c0c068e30cd31f1e1b46b4b5738168ec79e7689ca82ee93ddfea05109/safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94fd4858284736bb67a897a41608b5b0c2496c9bdb3bf2af1fa3409127f20d57", size = 447058, upload-time = "2025-11-19T15:18:34.416Z" }, + { url = "https://files.pythonhosted.org/packages/f1/06/578ffed52c2296f93d7fd2d844cabfa92be51a587c38c8afbb8ae449ca89/safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e07d91d0c92a31200f25351f4acb2bc6aff7f48094e13ebb1d0fb995b54b6542", size = 491748, upload-time = "2025-11-19T15:18:09.79Z" }, + { url = "https://files.pythonhosted.org/packages/ae/33/1debbbb70e4791dde185edb9413d1fe01619255abb64b300157d7f15dddd/safetensors-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8469155f4cb518bafb4acf4865e8bb9d6804110d2d9bdcaa78564b9fd841e104", size = 503881, upload-time = "2025-11-19T15:18:16.145Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1c/40c2ca924d60792c3be509833df711b553c60effbd91da6f5284a83f7122/safetensors-0.7.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54bef08bf00a2bff599982f6b08e8770e09cc012d7bba00783fc7ea38f1fb37d", size = 623463, upload-time = "2025-11-19T15:18:21.11Z" }, + { url = 
"https://files.pythonhosted.org/packages/9b/3a/13784a9364bd43b0d61eef4bea2845039bc2030458b16594a1bd787ae26e/safetensors-0.7.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42cb091236206bb2016d245c377ed383aa7f78691748f3bb6ee1bfa51ae2ce6a", size = 532855, upload-time = "2025-11-19T15:18:25.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/60/429e9b1cb3fc651937727befe258ea24122d9663e4d5709a48c9cbfceecb/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac7252938f0696ddea46f5e855dd3138444e82236e3be475f54929f0c510d48", size = 507152, upload-time = "2025-11-19T15:18:33.023Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a8/4b45e4e059270d17af60359713ffd83f97900d45a6afa73aaa0d737d48b6/safetensors-0.7.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d060c70284127fa805085d8f10fbd0962792aed71879d00864acda69dbab981", size = 541856, upload-time = "2025-11-19T15:18:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/06/87/d26d8407c44175d8ae164a95b5a62707fcc445f3c0c56108e37d98070a3d/safetensors-0.7.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:cdab83a366799fa730f90a4ebb563e494f28e9e92c4819e556152ad55e43591b", size = 674060, upload-time = "2025-11-19T15:18:37.211Z" }, + { url = "https://files.pythonhosted.org/packages/11/f5/57644a2ff08dc6325816ba7217e5095f17269dada2554b658442c66aed51/safetensors-0.7.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:672132907fcad9f2aedcb705b2d7b3b93354a2aec1b2f706c4db852abe338f85", size = 771715, upload-time = "2025-11-19T15:18:38.689Z" }, + { url = "https://files.pythonhosted.org/packages/86/31/17883e13a814bd278ae6e266b13282a01049b0c81341da7fd0e3e71a80a3/safetensors-0.7.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5d72abdb8a4d56d4020713724ba81dac065fedb7f3667151c4a637f1d3fb26c0", size = 714377, upload-time = "2025-11-19T15:18:40.162Z" }, + { url = 
"https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, +] + +[package.optional-dependencies] +torch = [ + { name = "numpy" }, + { name = "packaging" }, + { name = "torch" }, +] + +[[package]] +name = "scikit-image" +version = "0.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "imageio" }, + { name = "lazy-loader" }, + { name = "networkx" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "scipy" }, + { name = "tifffile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/b4/2528bb43c67d48053a7a649a9666432dc307d66ba02e3a6d5c40f46655df/scikit_image-0.26.0.tar.gz", hash = "sha256:f5f970ab04efad85c24714321fcc91613fcb64ef2a892a13167df2f3e59199fa", size = 22729739, upload-time = "2025-12-20T17:12:21.824Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/16/8a407688b607f86f81f8c649bf0d68a2a6d67375f18c2d660aba20f5b648/scikit_image-0.26.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b1ede33a0fb3731457eaf53af6361e73dd510f449dac437ab54573b26788baf0", size = 12355510, upload-time = "2025-12-20T17:10:31.628Z" }, + { url = 
"https://files.pythonhosted.org/packages/6b/f9/7efc088ececb6f6868fd4475e16cfafc11f242ce9ab5fc3557d78b5da0d4/scikit_image-0.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7af7aa331c6846bd03fa28b164c18d0c3fd419dbb888fb05e958ac4257a78fdd", size = 12056334, upload-time = "2025-12-20T17:10:34.559Z" }, + { url = "https://files.pythonhosted.org/packages/9f/1e/bc7fb91fb5ff65ef42346c8b7ee8b09b04eabf89235ab7dbfdfd96cbd1ea/scikit_image-0.26.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ea6207d9e9d21c3f464efe733121c0504e494dbdc7728649ff3e23c3c5a4953", size = 13297768, upload-time = "2025-12-20T17:10:37.733Z" }, + { url = "https://files.pythonhosted.org/packages/a5/2a/e71c1a7d90e70da67b88ccc609bd6ae54798d5847369b15d3a8052232f9d/scikit_image-0.26.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74aa5518ccea28121f57a95374581d3b979839adc25bb03f289b1bc9b99c58af", size = 13711217, upload-time = "2025-12-20T17:10:40.935Z" }, + { url = "https://files.pythonhosted.org/packages/d4/59/9637ee12c23726266b91296791465218973ce1ad3e4c56fc81e4d8e7d6e1/scikit_image-0.26.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d5c244656de905e195a904e36dbc18585e06ecf67d90f0482cbde63d7f9ad59d", size = 14337782, upload-time = "2025-12-20T17:10:43.452Z" }, + { url = "https://files.pythonhosted.org/packages/e7/5c/a3e1e0860f9294663f540c117e4bf83d55e5b47c281d475cc06227e88411/scikit_image-0.26.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:21a818ee6ca2f2131b9e04d8eb7637b5c18773ebe7b399ad23dcc5afaa226d2d", size = 14805997, upload-time = "2025-12-20T17:10:45.93Z" }, + { url = "https://files.pythonhosted.org/packages/d3/c6/2eeacf173da041a9e388975f54e5c49df750757fcfc3ee293cdbbae1ea0a/scikit_image-0.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:9490360c8d3f9a7e85c8de87daf7c0c66507960cf4947bb9610d1751928721c7", size = 11878486, upload-time = "2025-12-20T17:10:48.246Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/a4/a852c4949b9058d585e762a66bf7e9a2cd3be4795cd940413dfbfbb0ce79/scikit_image-0.26.0-cp311-cp311-win_arm64.whl", hash = "sha256:0baa0108d2d027f34d748e84e592b78acc23e965a5de0e4bb03cf371de5c0581", size = 11346518, upload-time = "2025-12-20T17:10:50.575Z" }, + { url = "https://files.pythonhosted.org/packages/99/e8/e13757982264b33a1621628f86b587e9a73a13f5256dad49b19ba7dc9083/scikit_image-0.26.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d454b93a6fa770ac5ae2d33570f8e7a321bb80d29511ce4b6b78058ebe176e8c", size = 12376452, upload-time = "2025-12-20T17:10:52.796Z" }, + { url = "https://files.pythonhosted.org/packages/e3/be/f8dd17d0510f9911f9f17ba301f7455328bf13dae416560126d428de9568/scikit_image-0.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3409e89d66eff5734cd2b672d1c48d2759360057e714e1d92a11df82c87cba37", size = 12061567, upload-time = "2025-12-20T17:10:55.207Z" }, + { url = "https://files.pythonhosted.org/packages/b3/2b/c70120a6880579fb42b91567ad79feb4772f7be72e8d52fec403a3dde0c6/scikit_image-0.26.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c717490cec9e276afb0438dd165b7c3072d6c416709cc0f9f5a4c1070d23a44", size = 13084214, upload-time = "2025-12-20T17:10:57.468Z" }, + { url = "https://files.pythonhosted.org/packages/f4/a2/70401a107d6d7466d64b466927e6b96fcefa99d57494b972608e2f8be50f/scikit_image-0.26.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7df650e79031634ac90b11e64a9eedaf5a5e06fcd09bcd03a34be01745744466", size = 13561683, upload-time = "2025-12-20T17:10:59.49Z" }, + { url = "https://files.pythonhosted.org/packages/13/a5/48bdfd92794c5002d664e0910a349d0a1504671ef5ad358150f21643c79a/scikit_image-0.26.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:cefd85033e66d4ea35b525bb0937d7f42d4cdcfed2d1888e1570d5ce450d3932", size = 14112147, upload-time = "2025-12-20T17:11:02.083Z" }, + { url = 
"https://files.pythonhosted.org/packages/ee/b5/ac71694da92f5def5953ca99f18a10fe98eac2dd0a34079389b70b4d0394/scikit_image-0.26.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3f5bf622d7c0435884e1e141ebbe4b2804e16b2dd23ae4c6183e2ea99233be70", size = 14661625, upload-time = "2025-12-20T17:11:04.528Z" }, + { url = "https://files.pythonhosted.org/packages/23/4d/a3cc1e96f080e253dad2251bfae7587cf2b7912bcd76fd43fd366ff35a87/scikit_image-0.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:abed017474593cd3056ae0fe948d07d0747b27a085e92df5474f4955dd65aec0", size = 11911059, upload-time = "2025-12-20T17:11:06.61Z" }, + { url = "https://files.pythonhosted.org/packages/35/8a/d1b8055f584acc937478abf4550d122936f420352422a1a625eef2c605d8/scikit_image-0.26.0-cp312-cp312-win_arm64.whl", hash = "sha256:4d57e39ef67a95d26860c8caf9b14b8fb130f83b34c6656a77f191fa6d1d04d8", size = 11348740, upload-time = "2025-12-20T17:11:09.118Z" }, +] + +[[package]] +name = "scikit-learn" +version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz", hash = "sha256:b4fc2525eca2c69a59260f583c56a7557c6ccdf8deafdba6e060f94c1c59738e", size = 7068312, upload-time = "2025-01-10T08:07:55.348Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/2a/e291c29670795406a824567d1dfc91db7b699799a002fdaa452bceea8f6e/scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:72abc587c75234935e97d09aa4913a82f7b03ee0b74111dcc2881cba3c5a7b33", size = 12102620, upload-time = "2025-01-10T08:06:16.675Z" }, + { url = "https://files.pythonhosted.org/packages/25/92/ee1d7a00bb6b8c55755d4984fd82608603a3cc59959245068ce32e7fb808/scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl", hash = 
"sha256:b3b00cdc8f1317b5f33191df1386c0befd16625f49d979fe77a8d44cae82410d", size = 11116234, upload-time = "2025-01-10T08:06:21.83Z" }, + { url = "https://files.pythonhosted.org/packages/30/cd/ed4399485ef364bb25f388ab438e3724e60dc218c547a407b6e90ccccaef/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc4765af3386811c3ca21638f63b9cf5ecf66261cc4815c1db3f1e7dc7b79db2", size = 12592155, upload-time = "2025-01-10T08:06:27.309Z" }, + { url = "https://files.pythonhosted.org/packages/a8/f3/62fc9a5a659bb58a03cdd7e258956a5824bdc9b4bb3c5d932f55880be569/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25fc636bdaf1cc2f4a124a116312d837148b5e10872147bdaf4887926b8c03d8", size = 13497069, upload-time = "2025-01-10T08:06:32.515Z" }, + { url = "https://files.pythonhosted.org/packages/a1/a6/c5b78606743a1f28eae8f11973de6613a5ee87366796583fb74c67d54939/scikit_learn-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:fa909b1a36e000a03c382aade0bd2063fd5680ff8b8e501660c0f59f021a6415", size = 11139809, upload-time = "2025-01-10T08:06:35.514Z" }, + { url = "https://files.pythonhosted.org/packages/0a/18/c797c9b8c10380d05616db3bfb48e2a3358c767affd0857d56c2eb501caa/scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:926f207c804104677af4857b2c609940b743d04c4c35ce0ddc8ff4f053cddc1b", size = 12104516, upload-time = "2025-01-10T08:06:40.009Z" }, + { url = "https://files.pythonhosted.org/packages/c4/b7/2e35f8e289ab70108f8cbb2e7a2208f0575dc704749721286519dcf35f6f/scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c2cae262064e6a9b77eee1c8e768fc46aa0b8338c6a8297b9b6759720ec0ff2", size = 11167837, upload-time = "2025-01-10T08:06:43.305Z" }, + { url = "https://files.pythonhosted.org/packages/a4/f6/ff7beaeb644bcad72bcfd5a03ff36d32ee4e53a8b29a639f11bcb65d06cd/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1061b7c028a8663fb9a1a1baf9317b64a257fcb036dae5c8752b2abef31d136f", size = 12253728, upload-time = "2025-01-10T08:06:47.618Z" }, + { url = "https://files.pythonhosted.org/packages/29/7a/8bce8968883e9465de20be15542f4c7e221952441727c4dad24d534c6d99/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e69fab4ebfc9c9b580a7a80111b43d214ab06250f8a7ef590a4edf72464dd86", size = 13147700, upload-time = "2025-01-10T08:06:50.888Z" }, + { url = "https://files.pythonhosted.org/packages/62/27/585859e72e117fe861c2079bcba35591a84f801e21bc1ab85bce6ce60305/scikit_learn-1.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:70b1d7e85b1c96383f872a519b3375f92f14731e279a7b4c6cfd650cf5dffc52", size = 11110613, upload-time = "2025-01-10T08:06:54.115Z" }, +] + +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/75/b4ce781849931fef6fd529afa6b63711d5a733065722d0c3e2724af9e40a/scipy-1.17.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:1f95b894f13729334fb990162e911c9e5dc1ab390c58aa6cbecb389c5b5e28ec", size = 31613675, upload-time = "2026-02-23T00:16:00.13Z" }, + { url = "https://files.pythonhosted.org/packages/f7/58/bccc2861b305abdd1b8663d6130c0b3d7cc22e8d86663edbc8401bfd40d4/scipy-1.17.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:e18f12c6b0bc5a592ed23d3f7b891f68fd7f8241d69b7883769eb5d5dfb52696", size = 28162057, upload-time = "2026-02-23T00:16:09.456Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/ee/18146b7757ed4976276b9c9819108adbc73c5aad636e5353e20746b73069/scipy-1.17.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a3472cfbca0a54177d0faa68f697d8ba4c80bbdc19908c3465556d9f7efce9ee", size = 20334032, upload-time = "2026-02-23T00:16:17.358Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e6/cef1cf3557f0c54954198554a10016b6a03b2ec9e22a4e1df734936bd99c/scipy-1.17.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:766e0dc5a616d026a3a1cffa379af959671729083882f50307e18175797b3dfd", size = 22709533, upload-time = "2026-02-23T00:16:25.791Z" }, + { url = "https://files.pythonhosted.org/packages/4d/60/8804678875fc59362b0fb759ab3ecce1f09c10a735680318ac30da8cd76b/scipy-1.17.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:744b2bf3640d907b79f3fd7874efe432d1cf171ee721243e350f55234b4cec4c", size = 33062057, upload-time = "2026-02-23T00:16:36.931Z" }, + { url = "https://files.pythonhosted.org/packages/09/7d/af933f0f6e0767995b4e2d705a0665e454d1c19402aa7e895de3951ebb04/scipy-1.17.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43af8d1f3bea642559019edfe64e9b11192a8978efbd1539d7bc2aaa23d92de4", size = 35349300, upload-time = "2026-02-23T00:16:49.108Z" }, + { url = "https://files.pythonhosted.org/packages/b4/3d/7ccbbdcbb54c8fdc20d3b6930137c782a163fa626f0aef920349873421ba/scipy-1.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd96a1898c0a47be4520327e01f874acfd61fb48a9420f8aa9f6483412ffa444", size = 35127333, upload-time = "2026-02-23T00:17:01.293Z" }, + { url = "https://files.pythonhosted.org/packages/e8/19/f926cb11c42b15ba08e3a71e376d816ac08614f769b4f47e06c3580c836a/scipy-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4eb6c25dd62ee8d5edf68a8e1c171dd71c292fdae95d8aeb3dd7d7de4c364082", size = 37741314, upload-time = "2026-02-23T00:17:12.576Z" }, + { url = 
"https://files.pythonhosted.org/packages/95/da/0d1df507cf574b3f224ccc3d45244c9a1d732c81dcb26b1e8a766ae271a8/scipy-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:d30e57c72013c2a4fe441c2fcb8e77b14e152ad48b5464858e07e2ad9fbfceff", size = 36607512, upload-time = "2026-02-23T00:17:23.424Z" }, + { url = "https://files.pythonhosted.org/packages/68/7f/bdd79ceaad24b671543ffe0ef61ed8e659440eb683b66f033454dcee90eb/scipy-1.17.1-cp311-cp311-win_arm64.whl", hash = "sha256:9ecb4efb1cd6e8c4afea0daa91a87fbddbce1b99d2895d151596716c0b2e859d", size = 24599248, upload-time = "2026-02-23T00:17:34.561Z" }, + { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954, upload-time = "2026-02-23T00:17:49.855Z" }, + { url = "https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662, upload-time = "2026-02-23T00:18:01.64Z" }, + { url = "https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366, upload-time = "2026-02-23T00:18:12.015Z" }, + { url = "https://files.pythonhosted.org/packages/35/f5/906eda513271c8deb5af284e5ef0206d17a96239af79f9fa0aebfe0e36b4/scipy-1.17.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c80be5ede8f3f8eded4eff73cc99a25c388ce98e555b17d31da05287015ffa5b", size = 22704017, upload-time = "2026-02-23T00:18:21.502Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/34/16f10e3042d2f1d6b66e0428308ab52224b6a23049cb2f5c1756f713815f/scipy-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e19ebea31758fac5893a2ac360fedd00116cbb7628e650842a6691ba7ca28a21", size = 32927842, upload-time = "2026-02-23T00:18:35.367Z" }, + { url = "https://files.pythonhosted.org/packages/01/8e/1e35281b8ab6d5d72ebe9911edcdffa3f36b04ed9d51dec6dd140396e220/scipy-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02ae3b274fde71c5e92ac4d54bc06c42d80e399fec704383dcd99b301df37458", size = 35235890, upload-time = "2026-02-23T00:18:49.188Z" }, + { url = "https://files.pythonhosted.org/packages/c5/5c/9d7f4c88bea6e0d5a4f1bc0506a53a00e9fcb198de372bfe4d3652cef482/scipy-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8a604bae87c6195d8b1045eddece0514d041604b14f2727bbc2b3020172045eb", size = 35003557, upload-time = "2026-02-23T00:18:54.74Z" }, + { url = "https://files.pythonhosted.org/packages/65/94/7698add8f276dbab7a9de9fb6b0e02fc13ee61d51c7c3f85ac28b65e1239/scipy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f590cd684941912d10becc07325a3eeb77886fe981415660d9265c4c418d0bea", size = 37625856, upload-time = "2026-02-23T00:19:00.307Z" }, + { url = "https://files.pythonhosted.org/packages/a2/84/dc08d77fbf3d87d3ee27f6a0c6dcce1de5829a64f2eae85a0ecc1f0daa73/scipy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:41b71f4a3a4cab9d366cd9065b288efc4d4f3c0b37a91a8e0947fb5bd7f31d87", size = 36549682, upload-time = "2026-02-23T00:19:07.67Z" }, + { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340, upload-time = "2026-02-23T00:19:12.024Z" }, +] + +[[package]] +name = "semchunk" +version = "2.2.2" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "mpire", extra = ["dill"] }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/62/96/c418c322730b385e81d4ab462e68dd48bb2dbda4d8efa17cad2ca468d9ac/semchunk-2.2.2.tar.gz", hash = "sha256:940e89896e64eeb01de97ba60f51c8c7b96c6a3951dfcf574f25ce2146752f52", size = 12271, upload-time = "2024-12-17T22:54:30.332Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/84/94ca7896c7df20032bcb09973e9a4d14c222507c0aadf22e89fa76bb0a04/semchunk-2.2.2-py3-none-any.whl", hash = "sha256:94ca19020c013c073abdfd06d79a7c13637b91738335f3b8cdb5655ee7cc94d2", size = 10271, upload-time = "2024-12-17T22:54:27.689Z" }, +] + +[[package]] +name = "setuptools" +version = "82.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/82/f3/748f4d6f65d1756b9ae577f329c951cda23fb900e4de9f70900ced962085/setuptools-82.0.0.tar.gz", hash = "sha256:22e0a2d69474c6ae4feb01951cb69d515ed23728cf96d05513d36e42b62b37cb", size = 1144893, upload-time = "2026-02-08T15:08:40.206Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/c6/76dc613121b793286a3f91621d7b75a2b493e0390ddca50f11993eadf192/setuptools-82.0.0-py3-none-any.whl", hash = "sha256:70b18734b607bd1da571d097d236cfcfacaf01de45717d59e6e04b96877532e0", size = 1003468, upload-time = "2026-02-08T15:08:38.723Z" }, +] + +[[package]] +name = "shapely" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/bc/0989043118a27cccb4e906a46b7565ce36ca7b57f5a18b78f4f1b0f72d9d/shapely-2.1.2.tar.gz", hash = "sha256:2ed4ecb28320a433db18a5bf029986aa8afcfd740745e78847e330d5d94922a9", size = 315489, upload-time = "2025-09-24T13:51:41.432Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/8f/8d/1ff672dea9ec6a7b5d422eb6d095ed886e2e523733329f75fdcb14ee1149/shapely-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91121757b0a36c9aac3427a651a7e6567110a4a67c97edf04f8d55d4765f6618", size = 1820038, upload-time = "2025-09-24T13:50:15.628Z" }, + { url = "https://files.pythonhosted.org/packages/4f/ce/28fab8c772ce5db23a0d86bf0adaee0c4c79d5ad1db766055fa3dab442e2/shapely-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:16a9c722ba774cf50b5d4541242b4cce05aafd44a015290c82ba8a16931ff63d", size = 1626039, upload-time = "2025-09-24T13:50:16.881Z" }, + { url = "https://files.pythonhosted.org/packages/70/8b/868b7e3f4982f5006e9395c1e12343c66a8155c0374fdc07c0e6a1ab547d/shapely-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cc4f7397459b12c0b196c9efe1f9d7e92463cbba142632b4cc6d8bbbbd3e2b09", size = 3001519, upload-time = "2025-09-24T13:50:18.606Z" }, + { url = "https://files.pythonhosted.org/packages/13/02/58b0b8d9c17c93ab6340edd8b7308c0c5a5b81f94ce65705819b7416dba5/shapely-2.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:136ab87b17e733e22f0961504d05e77e7be8c9b5a8184f685b4a91a84efe3c26", size = 3110842, upload-time = "2025-09-24T13:50:21.77Z" }, + { url = "https://files.pythonhosted.org/packages/af/61/8e389c97994d5f331dcffb25e2fa761aeedfb52b3ad9bcdd7b8671f4810a/shapely-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:16c5d0fc45d3aa0a69074979f4f1928ca2734fb2e0dde8af9611e134e46774e7", size = 4021316, upload-time = "2025-09-24T13:50:23.626Z" }, + { url = "https://files.pythonhosted.org/packages/d3/d4/9b2a9fe6039f9e42ccf2cb3e84f219fd8364b0c3b8e7bbc857b5fbe9c14c/shapely-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ddc759f72b5b2b0f54a7e7cde44acef680a55019eb52ac63a7af2cf17cb9cd2", size = 4178586, upload-time = "2025-09-24T13:50:25.443Z" }, + { url = 
"https://files.pythonhosted.org/packages/16/f6/9840f6963ed4decf76b08fd6d7fed14f8779fb7a62cb45c5617fa8ac6eab/shapely-2.1.2-cp311-cp311-win32.whl", hash = "sha256:2fa78b49485391224755a856ed3b3bd91c8455f6121fee0db0e71cefb07d0ef6", size = 1543961, upload-time = "2025-09-24T13:50:26.968Z" }, + { url = "https://files.pythonhosted.org/packages/38/1e/3f8ea46353c2a33c1669eb7327f9665103aa3a8dfe7f2e4ef714c210b2c2/shapely-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:c64d5c97b2f47e3cd9b712eaced3b061f2b71234b3fc263e0fcf7d889c6559dc", size = 1722856, upload-time = "2025-09-24T13:50:28.497Z" }, + { url = "https://files.pythonhosted.org/packages/24/c0/f3b6453cf2dfa99adc0ba6675f9aaff9e526d2224cbd7ff9c1a879238693/shapely-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fe2533caae6a91a543dec62e8360fe86ffcdc42a7c55f9dfd0128a977a896b94", size = 1833550, upload-time = "2025-09-24T13:50:30.019Z" }, + { url = "https://files.pythonhosted.org/packages/86/07/59dee0bc4b913b7ab59ab1086225baca5b8f19865e6101db9ebb7243e132/shapely-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ba4d1333cc0bc94381d6d4308d2e4e008e0bd128bdcff5573199742ee3634359", size = 1643556, upload-time = "2025-09-24T13:50:32.291Z" }, + { url = "https://files.pythonhosted.org/packages/26/29/a5397e75b435b9895cd53e165083faed5d12fd9626eadec15a83a2411f0f/shapely-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0bd308103340030feef6c111d3eb98d50dc13feea33affc8a6f9fa549e9458a3", size = 2988308, upload-time = "2025-09-24T13:50:33.862Z" }, + { url = "https://files.pythonhosted.org/packages/b9/37/e781683abac55dde9771e086b790e554811a71ed0b2b8a1e789b7430dd44/shapely-2.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1e7d4d7ad262a48bb44277ca12c7c78cb1b0f56b32c10734ec9a1d30c0b0c54b", size = 3099844, upload-time = "2025-09-24T13:50:35.459Z" }, + { url = 
"https://files.pythonhosted.org/packages/d8/f3/9876b64d4a5a321b9dc482c92bb6f061f2fa42131cba643c699f39317cb9/shapely-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e9eddfe513096a71896441a7c37db72da0687b34752c4e193577a145c71736fc", size = 3988842, upload-time = "2025-09-24T13:50:37.478Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a0/704c7292f7014c7e74ec84eddb7b109e1fbae74a16deae9c1504b1d15565/shapely-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:980c777c612514c0cf99bc8a9de6d286f5e186dcaf9091252fcd444e5638193d", size = 4152714, upload-time = "2025-09-24T13:50:39.9Z" }, + { url = "https://files.pythonhosted.org/packages/53/46/319c9dc788884ad0785242543cdffac0e6530e4d0deb6c4862bc4143dcf3/shapely-2.1.2-cp312-cp312-win32.whl", hash = "sha256:9111274b88e4d7b54a95218e243282709b330ef52b7b86bc6aaf4f805306f454", size = 1542745, upload-time = "2025-09-24T13:50:41.414Z" }, + { url = "https://files.pythonhosted.org/packages/ec/bf/cb6c1c505cb31e818e900b9312d514f381fbfa5c4363edfce0fcc4f8c1a4/shapely-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:743044b4cfb34f9a67205cee9279feaf60ba7d02e69febc2afc609047cb49179", size = 1722861, upload-time = "2025-09-24T13:50:43.35Z" }, +] + +[[package]] +name = "shellingham" +version = "1.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + +[[package]] +name = "soupsieve" +version = "2.8.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" }, +] + +[[package]] +name = "sympy" +version = "1.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ca/99/5a5b6f19ff9f083671ddf7b9632028436167cd3d33e11015754e41b249a4/sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f", size = 7533040, upload-time = "2024-07-19T09:26:51.238Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", 
size = 6189177, upload-time = "2024-07-19T09:26:48.863Z" }, +] + +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + +[[package]] +name = "tenacity" +version = "9.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, +] + +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + +[[package]] +name = "tifffile" +version = "2026.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c5/cb/2f6d79c7576e22c116352a801f4c3c8ace5957e9aced862012430b62e14f/tifffile-2026.3.3.tar.gz", hash = "sha256:d9a1266bed6f2ee1dd0abde2018a38b4f8b2935cb843df381d70ac4eac5458b7", size = 388745, upload-time = "2026-03-03T19:14:38.134Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1a/e4/e804505f87627cd8cdae9c010c47c4485fd8c1ce31a7dd0ab7fcc4707377/tifffile-2026.3.3-py3-none-any.whl", hash = "sha256:e8be15c94273113d31ecb7aa3a39822189dd11c4967e3cc88c178f1ad2fd1170", size = 243960, upload-time = "2026-03-03T19:14:35.808Z" }, +] + +[[package]] +name = "tokenizers" +version = "0.20.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/25/b1681c1c30ea3ea6e584ae3fffd552430b12faa599b558c4c4783f56d7ff/tokenizers-0.20.3.tar.gz", hash = "sha256:2278b34c5d0dd78e087e1ca7f9b1dcbf129d80211afa645f214bd6e051037539", size = 340513, upload-time = "2024-11-05T17:34:10.403Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/93/6742ef9206409d5ce1fdf44d5ca1687cdc3847ba0485424e2c731e6bcf67/tokenizers-0.20.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:585b51e06ca1f4839ce7759941e66766d7b060dccfdc57c4ca1e5b9a33013a90", size = 2674224, upload-time = "2024-11-05T17:30:49.972Z" }, + { url = "https://files.pythonhosted.org/packages/aa/14/e75ece72e99f6ef9ae07777ca9fdd78608f69466a5cecf636e9bd2f25d5c/tokenizers-0.20.3-cp311-cp311-macosx_11_0_arm64.whl", hash 
= "sha256:61cbf11954f3b481d08723ebd048ba4b11e582986f9be74d2c3bdd9293a4538d", size = 2558991, upload-time = "2024-11-05T17:30:51.666Z" }, + { url = "https://files.pythonhosted.org/packages/46/54/033b5b2ba0c3ae01e026c6f7ced147d41a2fa1c573d00a66cb97f6d7f9b3/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef820880d5e4e8484e2fa54ff8d297bb32519eaa7815694dc835ace9130a3eea", size = 2892476, upload-time = "2024-11-05T17:30:53.505Z" }, + { url = "https://files.pythonhosted.org/packages/e6/b0/cc369fb3297d61f3311cab523d16d48c869dc2f0ba32985dbf03ff811041/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:67ef4dcb8841a4988cd00dd288fb95dfc8e22ed021f01f37348fd51c2b055ba9", size = 2802775, upload-time = "2024-11-05T17:30:55.229Z" }, + { url = "https://files.pythonhosted.org/packages/1a/74/62ad983e8ea6a63e04ed9c5be0b605056bf8aac2f0125f9b5e0b3e2b89fa/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff1ef8bd47a02b0dc191688ccb4da53600df5d4c9a05a4b68e1e3de4823e78eb", size = 3086138, upload-time = "2024-11-05T17:30:57.332Z" }, + { url = "https://files.pythonhosted.org/packages/6b/ac/4637ba619db25094998523f9e6f5b456e1db1f8faa770a3d925d436db0c3/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:444d188186eab3148baf0615b522461b41b1f0cd58cd57b862ec94b6ac9780f1", size = 3098076, upload-time = "2024-11-05T17:30:59.455Z" }, + { url = "https://files.pythonhosted.org/packages/58/ce/9793f2dc2ce529369807c9c74e42722b05034af411d60f5730b720388c7d/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37c04c032c1442740b2c2d925f1857885c07619224a533123ac7ea71ca5713da", size = 3379650, upload-time = "2024-11-05T17:31:01.264Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/f6/2841de926bc4118af996eaf0bdf0ea5b012245044766ffc0347e6c968e63/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453c7769d22231960ee0e883d1005c93c68015025a5e4ae56275406d94a3c907", size = 2994005, upload-time = "2024-11-05T17:31:02.985Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b2/00915c4fed08e9505d37cf6eaab45b12b4bff8f6719d459abcb9ead86a4b/tokenizers-0.20.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4bb31f7b2847e439766aaa9cc7bccf7ac7088052deccdb2275c952d96f691c6a", size = 8977488, upload-time = "2024-11-05T17:31:04.424Z" }, + { url = "https://files.pythonhosted.org/packages/e9/ac/1c069e7808181ff57bcf2d39e9b6fbee9133a55410e6ebdaa89f67c32e83/tokenizers-0.20.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:843729bf0f991b29655a069a2ff58a4c24375a553c70955e15e37a90dd4e045c", size = 9294935, upload-time = "2024-11-05T17:31:06.882Z" }, + { url = "https://files.pythonhosted.org/packages/50/47/722feb70ee68d1c4412b12d0ea4acc2713179fd63f054913990f9e259492/tokenizers-0.20.3-cp311-none-win32.whl", hash = "sha256:efcce3a927b1e20ca694ba13f7a68c59b0bd859ef71e441db68ee42cf20c2442", size = 2197175, upload-time = "2024-11-05T17:31:09.385Z" }, + { url = "https://files.pythonhosted.org/packages/75/68/1b4f928b15a36ed278332ac75d66d7eb65d865bf344d049c452c18447bf9/tokenizers-0.20.3-cp311-none-win_amd64.whl", hash = "sha256:88301aa0801f225725b6df5dea3d77c80365ff2362ca7e252583f2b4809c4cc0", size = 2381616, upload-time = "2024-11-05T17:31:10.685Z" }, + { url = "https://files.pythonhosted.org/packages/07/00/92a08af2a6b0c88c50f1ab47d7189e695722ad9714b0ee78ea5e1e2e1def/tokenizers-0.20.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:49d12a32e190fad0e79e5bdb788d05da2f20d8e006b13a70859ac47fecf6ab2f", size = 2667951, upload-time = "2024-11-05T17:31:12.356Z" }, + { url = 
"https://files.pythonhosted.org/packages/ec/9a/e17a352f0bffbf415cf7d73756f5c73a3219225fc5957bc2f39d52c61684/tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:282848cacfb9c06d5e51489f38ec5aa0b3cd1e247a023061945f71f41d949d73", size = 2555167, upload-time = "2024-11-05T17:31:13.839Z" }, + { url = "https://files.pythonhosted.org/packages/27/37/d108df55daf4f0fcf1f58554692ff71687c273d870a34693066f0847be96/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abe4e08c7d0cd6154c795deb5bf81d2122f36daf075e0c12a8b050d824ef0a64", size = 2898389, upload-time = "2024-11-05T17:31:15.12Z" }, + { url = "https://files.pythonhosted.org/packages/b2/27/32f29da16d28f59472fa7fb38e7782069748c7e9ab9854522db20341624c/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca94fc1b73b3883c98f0c88c77700b13d55b49f1071dfd57df2b06f3ff7afd64", size = 2795866, upload-time = "2024-11-05T17:31:16.857Z" }, + { url = "https://files.pythonhosted.org/packages/29/4e/8a9a3c89e128c4a40f247b501c10279d2d7ade685953407c4d94c8c0f7a7/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef279c7e239f95c8bdd6ff319d9870f30f0d24915b04895f55b1adcf96d6c60d", size = 3085446, upload-time = "2024-11-05T17:31:18.392Z" }, + { url = "https://files.pythonhosted.org/packages/b4/3b/a2a7962c496ebcd95860ca99e423254f760f382cd4bd376f8895783afaf5/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16384073973f6ccbde9852157a4fdfe632bb65208139c9d0c0bd0176a71fd67f", size = 3094378, upload-time = "2024-11-05T17:31:20.329Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f4/a8a33f0192a1629a3bd0afcad17d4d221bbf9276da4b95d226364208d5eb/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:312d522caeb8a1a42ebdec87118d99b22667782b67898a76c963c058a7e41d4f", size = 3385755, upload-time = "2024-11-05T17:31:21.778Z" }, + { 
url = "https://files.pythonhosted.org/packages/9e/65/c83cb3545a65a9eaa2e13b22c93d5e00bd7624b354a44adbdc93d5d9bd91/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2b7cb962564785a83dafbba0144ecb7f579f1d57d8c406cdaa7f32fe32f18ad", size = 2997679, upload-time = "2024-11-05T17:31:23.134Z" }, + { url = "https://files.pythonhosted.org/packages/55/e9/a80d4e592307688a67c7c59ab77e03687b6a8bd92eb5db763a2c80f93f57/tokenizers-0.20.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:124c5882ebb88dadae1fc788a582299fcd3a8bd84fc3e260b9918cf28b8751f5", size = 8989296, upload-time = "2024-11-05T17:31:24.953Z" }, + { url = "https://files.pythonhosted.org/packages/90/af/60c957af8d2244321124e893828f1a4817cde1a2d08d09d423b73f19bd2f/tokenizers-0.20.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2b6e54e71f84c4202111a489879005cb14b92616a87417f6c102c833af961ea2", size = 9303621, upload-time = "2024-11-05T17:31:27.341Z" }, + { url = "https://files.pythonhosted.org/packages/be/a9/96172310ee141009646d63a1ca267c099c462d747fe5ef7e33f74e27a683/tokenizers-0.20.3-cp312-none-win32.whl", hash = "sha256:83d9bfbe9af86f2d9df4833c22e94d94750f1d0cd9bfb22a7bb90a86f61cdb1c", size = 2188979, upload-time = "2024-11-05T17:31:29.483Z" }, + { url = "https://files.pythonhosted.org/packages/bd/68/61d85ae7ae96dde7d0974ff3538db75d5cdc29be2e4329cd7fc51a283e22/tokenizers-0.20.3-cp312-none-win_amd64.whl", hash = "sha256:44def74cee574d609a36e17c8914311d1b5dbcfe37c55fd29369d42591b91cf2", size = 2380725, upload-time = "2024-11-05T17:31:31.315Z" }, +] + +[[package]] +name = "toolz" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/d6/114b492226588d6ff54579d95847662fc69196bdeec318eb45393b24c192/toolz-1.1.0.tar.gz", hash = "sha256:27a5c770d068c110d9ed9323f24f1543e83b2f300a687b7891c1a6d56b697b5b", size = 52613, upload-time = "2025-10-17T04:03:21.661Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/fb/12/5911ae3eeec47800503a238d971e51722ccea5feb8569b735184d5fcdbc0/toolz-1.1.0-py3-none-any.whl", hash = "sha256:15ccc861ac51c53696de0a5d6d4607f99c210739caf987b5d2054f3efed429d8", size = 58093, upload-time = "2025-10-17T04:03:20.435Z" }, +] + +[[package]] +name = "torch" +version = "2.6.0+cu118" +source = { registry = "https://download.pytorch.org/whl/cu118" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx" }, + { name = "nvidia-cublas-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "python_full_version >= '3.12'" }, + { name = "sympy" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = 
"https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:3e73419aab6dbcd888a3cc6a00d1f52f5950d918d7289ea6aeae751346613edc" }, + { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:6ab0417ce9b78ab0a34721a99734b5fd4cc3d7b62ff1c068a7d636fd829772db" }, + { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:9f7d170d6c78726945d95fcc3a3d7601f36aed0e6e0dc9ca377a64d6a8fd7b3a" }, + { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:6c040e4181c5dae73b965b61394ec431c93b2018165e2be8f15fc68d44444cb3" }, +] + +[[package]] +name = "torchaudio" +version = "2.6.0+cu118" +source = { registry = "https://download.pytorch.org/whl/cu118" } +dependencies = [ + { name = "torch" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:089b54fb6d4f8348a07d4c460cf2da4da2de57f068154c1401b385626917d434" }, + { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:065ea2e015ef6d02ec289e0a5ecc4c8e7acd4b30a8612879637395e7e16217e4" }, + { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:e77fe770130b54fdbcecda829024fbd4235075e905f5c6019c19664577c70e1d" }, + { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:885bdd94f19f0dbad81e08c54f85ffbf10f00af8452c25d2b3b533cf2884d6b8" }, +] + +[[package]] +name = "torchvision" +version = "0.21.0+cu118" +source = { registry = "https://download.pytorch.org/whl/cu118" } +dependencies = [ + { name = "numpy" }, + { name = "pillow" }, + { name = "torch" }, +] +wheels = [ + { url = 
"https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:5ebe0267c872ac55b387008f772052bbf1f2fdfdd8afb011d4751e124759295e" }, + { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:4e1325aa1189f97c89ae008cf645b7de8f283853193bf68ea7750856c194b6cc" }, + { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:5d3679e0df9ab1725eaa7300d550cf8fe0a477119483bef12673957f30c768dc" }, + { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:301eefd1d4df6619fab94cae539cb0cdcb029cc992e4686ef97c8366f77cf6a4" }, +] + +[[package]] +name = "tqdm" +version = "4.67.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + +[[package]] +name = "transformers" +version = "4.46.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/37/5a/58f96c83e566f907ae39f16d4401bbefd8bb85c60bd1e6a95c419752ab90/transformers-4.46.3.tar.gz", hash = "sha256:8ee4b3ae943fe33e82afff8e837f4b052058b07ca9be3cb5b729ed31295f72cc", size = 8627944, upload-time = "2024-11-18T22:13:01.012Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl", hash = "sha256:a12ef6f52841fd190a3e5602145b542d03507222f2c64ebb7ee92e8788093aef", size = 10034536, upload-time = "2024-11-18T22:12:57.024Z" }, +] + +[[package]] +name = "tree-sitter" +version = "0.25.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = "sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/22/88a1e00b906d26fa8a075dd19c6c3116997cb884bf1b3c023deb065a344d/tree_sitter-0.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8ca72d841215b6573ed0655b3a5cd1133f9b69a6fa561aecad40dca9029d75b", size = 146752, upload-time = "2025-09-25T17:37:24.775Z" }, + { url = "https://files.pythonhosted.org/packages/57/1c/22cc14f3910017b7a76d7358df5cd315a84fe0c7f6f7b443b49db2e2790d/tree_sitter-0.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc0351cfe5022cec5a77645f647f92a936b38850346ed3f6d6babfbeeeca4d26", size = 137765, upload-time = "2025-09-25T17:37:26.103Z" }, + { url = "https://files.pythonhosted.org/packages/1c/0c/d0de46ded7d5b34631e0f630d9866dab22d3183195bf0f3b81de406d6622/tree_sitter-0.25.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1799609636c0193e16c38f366bda5af15b1ce476df79ddaae7dd274df9e44266", size = 604643, upload-time = "2025-09-25T17:37:27.398Z" }, 
+ { url = "https://files.pythonhosted.org/packages/34/38/b735a58c1c2f60a168a678ca27b4c1a9df725d0bf2d1a8a1c571c033111e/tree_sitter-0.25.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e65ae456ad0d210ee71a89ee112ac7e72e6c2e5aac1b95846ecc7afa68a194c", size = 632229, upload-time = "2025-09-25T17:37:28.463Z" }, + { url = "https://files.pythonhosted.org/packages/32/f6/cda1e1e6cbff5e28d8433578e2556d7ba0b0209d95a796128155b97e7693/tree_sitter-0.25.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:49ee3c348caa459244ec437ccc7ff3831f35977d143f65311572b8ba0a5f265f", size = 629861, upload-time = "2025-09-25T17:37:29.593Z" }, + { url = "https://files.pythonhosted.org/packages/f9/19/427e5943b276a0dd74c2a1f1d7a7393443f13d1ee47dedb3f8127903c080/tree_sitter-0.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:56ac6602c7d09c2c507c55e58dc7026b8988e0475bd0002f8a386cce5e8e8adc", size = 127304, upload-time = "2025-09-25T17:37:30.549Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d9/eef856dc15f784d85d1397a17f3ee0f82df7778efce9e1961203abfe376a/tree_sitter-0.25.2-cp311-cp311-win_arm64.whl", hash = "sha256:b3d11a3a3ac89bb8a2543d75597f905a9926f9c806f40fcca8242922d1cc6ad5", size = 113990, upload-time = "2025-09-25T17:37:31.852Z" }, + { url = "https://files.pythonhosted.org/packages/3c/9e/20c2a00a862f1c2897a436b17edb774e831b22218083b459d0d081c9db33/tree_sitter-0.25.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ddabfff809ffc983fc9963455ba1cecc90295803e06e140a4c83e94c1fa3d960", size = 146941, upload-time = "2025-09-25T17:37:34.813Z" }, + { url = "https://files.pythonhosted.org/packages/ef/04/8512e2062e652a1016e840ce36ba1cc33258b0dcc4e500d8089b4054afec/tree_sitter-0.25.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c0c0ab5f94938a23fe81928a21cc0fac44143133ccc4eb7eeb1b92f84748331c", size = 137699, upload-time = "2025-09-25T17:37:36.349Z" }, + { url = 
"https://files.pythonhosted.org/packages/47/8a/d48c0414db19307b0fb3bb10d76a3a0cbe275bb293f145ee7fba2abd668e/tree_sitter-0.25.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd12d80d91d4114ca097626eb82714618dcdfacd6a5e0955216c6485c350ef99", size = 607125, upload-time = "2025-09-25T17:37:37.725Z" }, + { url = "https://files.pythonhosted.org/packages/39/d1/b95f545e9fc5001b8a78636ef942a4e4e536580caa6a99e73dd0a02e87aa/tree_sitter-0.25.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b43a9e4c89d4d0839de27cd4d6902d33396de700e9ff4c5ab7631f277a85ead9", size = 635418, upload-time = "2025-09-25T17:37:38.922Z" }, + { url = "https://files.pythonhosted.org/packages/de/4d/b734bde3fb6f3513a010fa91f1f2875442cdc0382d6a949005cd84563d8f/tree_sitter-0.25.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbb1706407c0e451c4f8cc016fec27d72d4b211fdd3173320b1ada7a6c74c3ac", size = 631250, upload-time = "2025-09-25T17:37:40.039Z" }, + { url = "https://files.pythonhosted.org/packages/46/f2/5f654994f36d10c64d50a192239599fcae46677491c8dd53e7579c35a3e3/tree_sitter-0.25.2-cp312-cp312-win_amd64.whl", hash = "sha256:6d0302550bbe4620a5dc7649517c4409d74ef18558276ce758419cf09e578897", size = 127156, upload-time = "2025-09-25T17:37:41.132Z" }, + { url = "https://files.pythonhosted.org/packages/67/23/148c468d410efcf0a9535272d81c258d840c27b34781d625f1f627e2e27d/tree_sitter-0.25.2-cp312-cp312-win_arm64.whl", hash = "sha256:0c8b6682cac77e37cfe5cf7ec388844957f48b7bd8d6321d0ca2d852994e10d5", size = 113984, upload-time = "2025-09-25T17:37:42.074Z" }, +] + +[[package]] +name = "tree-sitter-c" +version = "0.24.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/f5/ba8cd08d717277551ade8537d3aa2a94b907c6c6e0fbcf4e4d8b1c747fa3/tree_sitter_c-0.24.1.tar.gz", hash = "sha256:7d2d0cda0b8dda428c81440c1e94367f9f13548eedca3f49768bde66b1422ad6", size 
= 228014, upload-time = "2025-05-24T17:32:58.384Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/c7/c817be36306e457c2d36cc324789046390d9d8c555c38772429ffdb7d361/tree_sitter_c-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9c06ac26a1efdcc8b26a8a6970fbc6997c4071857359e5837d4c42892d45fe1e", size = 80940, upload-time = "2025-05-24T17:32:49.967Z" }, + { url = "https://files.pythonhosted.org/packages/7a/42/283909467290b24fdbc29bb32ee20e409a19a55002b43175d66d091ca1a4/tree_sitter_c-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:942bcd7cbecd810dcf7ca6f8f834391ebf0771a89479646d891ba4ca2fdfdc88", size = 86304, upload-time = "2025-05-24T17:32:51.271Z" }, + { url = "https://files.pythonhosted.org/packages/94/53/fb4f61d4e5f15ec3da85774a4df8e58d3b5b73036cf167f0203b4dd9d158/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a74cfd7a11ca5a961fafd4d751892ee65acae667d2818968a6f079397d8d28c", size = 109996, upload-time = "2025-05-24T17:32:52.119Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e8/fc541d34ee81c386c5453c2596c1763e8e9cd7cb0725f39d7dfa2276afa4/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6a807705a3978911dc7ee26a7ad36dcfacb6adfc13c190d496660ec9bd66707", size = 98137, upload-time = "2025-05-24T17:32:53.361Z" }, + { url = "https://files.pythonhosted.org/packages/32/c6/d0563319cae0d5b5780a92e2806074b24afea2a07aa4c10599b899bda3ec/tree_sitter_c-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:789781afcb710df34144f7e2a20cd80e325114b9119e3956c6bd1dd2d365df98", size = 94148, upload-time = "2025-05-24T17:32:54.855Z" }, + { url = "https://files.pythonhosted.org/packages/50/5a/6361df7f3fa2310c53a0d26b4702a261c332da16fa9d801e381e3a86e25f/tree_sitter_c-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:290bff0f9c79c966496ebae45042f77543e6e4aea725f40587a8611d566231a8", size = 84703, 
upload-time = "2025-05-24T17:32:56.084Z" }, + { url = "https://files.pythonhosted.org/packages/22/6a/210a302e8025ac492cbaea58d3720d66b7d8034c5d747ac5e4d2d235aa25/tree_sitter_c-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:d46bbda06f838c2dcb91daf767813671fd366b49ad84ff37db702129267b46e1", size = 82715, upload-time = "2025-05-24T17:32:57.248Z" }, +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/59/e0/e63103c72a9d3dfd89a31e02e660263ad84b7438e5f44ee82e443e65bbde/tree_sitter_javascript-0.25.0.tar.gz", hash = "sha256:329b5414874f0588a98f1c291f1b28138286617aa907746ffe55adfdcf963f38", size = 132338, upload-time = "2025-09-01T07:13:44.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/df/5106ac250cd03661ebc3cc75da6b3d9f6800a3606393a0122eca58038104/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b70f887fb269d6e58c349d683f59fa647140c410cfe2bee44a883b20ec92e3dc", size = 64052, upload-time = "2025-09-01T07:13:36.865Z" }, + { url = "https://files.pythonhosted.org/packages/b1/8f/6b4b2bc90d8ab3955856ce852cc9d1e82c81d7ab9646385f0e75ffd5b5d3/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:8264a996b8845cfce06965152a013b5d9cbb7d199bc3503e12b5682e62bb1de1", size = 66440, upload-time = "2025-09-01T07:13:37.962Z" }, + { url = "https://files.pythonhosted.org/packages/5f/c4/7da74ecdcd8a398f88bd003a87c65403b5fe0e958cdd43fbd5fd4a398fcf/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9dc04ba91fc8583344e57c1f1ed5b2c97ecaaf47480011b92fbeab8dda96db75", size = 99728, upload-time = "2025-09-01T07:13:38.755Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/c8/97da3af4796495e46421e9344738addb3602fa6426ea695be3fcbadbee37/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:199d09985190852e0912da2b8d26c932159be314bc04952cf917ed0e4c633e6b", size = 106072, upload-time = "2025-09-01T07:13:39.798Z" }, + { url = "https://files.pythonhosted.org/packages/13/be/c964e8130be08cc9bd6627d845f0e4460945b158429d39510953bbcb8fcc/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dfcf789064c58dc13c0a4edb550acacfc6f0f280577f1e7a00de3e89fc7f8ddc", size = 104388, upload-time = "2025-09-01T07:13:40.866Z" }, + { url = "https://files.pythonhosted.org/packages/ee/89/9b773dee0f8961d1bb8d7baf0a204ab587618df19897c1ef260916f318ec/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1b852d3aee8a36186dbcc32c798b11b4869f9b5041743b63b65c2ef793db7a54", size = 98377, upload-time = "2025-09-01T07:13:41.838Z" }, + { url = "https://files.pythonhosted.org/packages/3b/dc/d90cb1790f8cec9b4878d278ad9faf7c8f893189ce0f855304fd704fc274/tree_sitter_javascript-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:e5ed840f5bd4a3f0272e441d19429b26eedc257abe5574c8546da6b556865e3c", size = 62975, upload-time = "2025-09-01T07:13:42.828Z" }, + { url = "https://files.pythonhosted.org/packages/2e/1f/f9eba1038b7d4394410f3c0a6ec2122b590cd7acb03f196e52fa57ebbe72/tree_sitter_javascript-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:622a69d677aa7f6ee2931d8c77c981a33f0ebb6d275aa9d43d3397c879a9bb0b", size = 61668, upload-time = "2025-09-01T07:13:43.803Z" }, +] + +[[package]] +name = "tree-sitter-python" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b8/8b/c992ff0e768cb6768d5c96234579bf8842b3a633db641455d86dd30d5dac/tree_sitter_python-0.25.0.tar.gz", hash = 
"sha256:b13e090f725f5b9c86aa455a268553c65cadf325471ad5b65cd29cac8a1a68ac", size = 159845, upload-time = "2025-09-11T06:47:58.159Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cf/64/a4e503c78a4eb3ac46d8e72a29c1b1237fa85238d8e972b063e0751f5a94/tree_sitter_python-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:14a79a47ddef72f987d5a2c122d148a812169d7484ff5c75a3db9609d419f361", size = 73790, upload-time = "2025-09-11T06:47:47.652Z" }, + { url = "https://files.pythonhosted.org/packages/e6/1d/60d8c2a0cc63d6ec4ba4e99ce61b802d2e39ef9db799bdf2a8f932a6cd4b/tree_sitter_python-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:480c21dbd995b7fe44813e741d71fed10ba695e7caab627fb034e3828469d762", size = 76691, upload-time = "2025-09-11T06:47:49.038Z" }, + { url = "https://files.pythonhosted.org/packages/aa/cb/d9b0b67d037922d60cbe0359e0c86457c2da721bc714381a63e2c8e35eba/tree_sitter_python-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:86f118e5eecad616ecdb81d171a36dde9bef5a0b21ed71ea9c3e390813c3baf5", size = 108133, upload-time = "2025-09-11T06:47:50.499Z" }, + { url = "https://files.pythonhosted.org/packages/40/bd/bf4787f57e6b2860f3f1c8c62f045b39fb32d6bac4b53d7a9e66de968440/tree_sitter_python-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be71650ca2b93b6e9649e5d65c6811aad87a7614c8c1003246b303f6b150f61b", size = 110603, upload-time = "2025-09-11T06:47:51.985Z" }, + { url = "https://files.pythonhosted.org/packages/5d/25/feff09f5c2f32484fbce15db8b49455c7572346ce61a699a41972dea7318/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e6d5b5799628cc0f24691ab2a172a8e676f668fe90dc60468bee14084a35c16d", size = 108998, upload-time = "2025-09-11T06:47:53.046Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/69/4946da3d6c0df316ccb938316ce007fb565d08f89d02d854f2d308f0309f/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:71959832fc5d9642e52c11f2f7d79ae520b461e63334927e93ca46cd61cd9683", size = 107268, upload-time = "2025-09-11T06:47:54.388Z" }, + { url = "https://files.pythonhosted.org/packages/ed/a2/996fc2dfa1076dc460d3e2f3c75974ea4b8f02f6bc925383aaae519920e8/tree_sitter_python-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:9bcde33f18792de54ee579b00e1b4fe186b7926825444766f849bf7181793a76", size = 76073, upload-time = "2025-09-11T06:47:55.773Z" }, + { url = "https://files.pythonhosted.org/packages/07/19/4b5569d9b1ebebb5907d11554a96ef3fa09364a30fcfabeff587495b512f/tree_sitter_python-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:0fbf6a3774ad7e89ee891851204c2e2c47e12b63a5edbe2e9156997731c128bb", size = 74169, upload-time = "2025-09-11T06:47:56.747Z" }, +] + +[[package]] +name = "tree-sitter-typescript" +version = "0.23.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1e/fc/bb52958f7e399250aee093751e9373a6311cadbe76b6e0d109b853757f35/tree_sitter_typescript-0.23.2.tar.gz", hash = "sha256:7b167b5827c882261cb7a50dfa0fb567975f9b315e87ed87ad0a0a3aedb3834d", size = 773053, upload-time = "2024-11-11T02:36:11.396Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/28/95/4c00680866280e008e81dd621fd4d3f54aa3dad1b76b857a19da1b2cc426/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3cd752d70d8e5371fdac6a9a4df9d8924b63b6998d268586f7d374c9fba2a478", size = 286677, upload-time = "2024-11-11T02:35:58.839Z" }, + { url = "https://files.pythonhosted.org/packages/8f/2f/1f36fda564518d84593f2740d5905ac127d590baf5c5753cef2a88a89c15/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c7cc1b0ff5d91bac863b0e38b1578d5505e718156c9db577c8baea2557f66de8", size = 302008, upload-time = 
"2024-11-11T02:36:00.733Z" }, + { url = "https://files.pythonhosted.org/packages/96/2d/975c2dad292aa9994f982eb0b69cc6fda0223e4b6c4ea714550477d8ec3a/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b1eed5b0b3a8134e86126b00b743d667ec27c63fc9de1b7bb23168803879e31", size = 351987, upload-time = "2024-11-11T02:36:02.669Z" }, + { url = "https://files.pythonhosted.org/packages/49/d1/a71c36da6e2b8a4ed5e2970819b86ef13ba77ac40d9e333cb17df6a2c5db/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e96d36b85bcacdeb8ff5c2618d75593ef12ebaf1b4eace3477e2bdb2abb1752c", size = 344960, upload-time = "2024-11-11T02:36:04.443Z" }, + { url = "https://files.pythonhosted.org/packages/7f/cb/f57b149d7beed1a85b8266d0c60ebe4c46e79c9ba56bc17b898e17daf88e/tree_sitter_typescript-0.23.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8d4f0f9bcb61ad7b7509d49a1565ff2cc363863644a234e1e0fe10960e55aea0", size = 340245, upload-time = "2024-11-11T02:36:06.473Z" }, + { url = "https://files.pythonhosted.org/packages/8b/ab/dd84f0e2337296a5f09749f7b5483215d75c8fa9e33738522e5ed81f7254/tree_sitter_typescript-0.23.2-cp39-abi3-win_amd64.whl", hash = "sha256:3f730b66396bc3e11811e4465c41ee45d9e9edd6de355a58bbbc49fa770da8f9", size = 278015, upload-time = "2024-11-11T02:36:07.631Z" }, + { url = "https://files.pythonhosted.org/packages/9f/e4/81f9a935789233cf412a0ed5fe04c883841d2c8fb0b7e075958a35c65032/tree_sitter_typescript-0.23.2-cp39-abi3-win_arm64.whl", hash = "sha256:05db58f70b95ef0ea126db5560f3775692f609589ed6f8dd0af84b7f19f1cbb7", size = 274052, upload-time = "2024-11-11T02:36:09.514Z" }, +] + +[[package]] +name = "triton" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a7/2e/757d2280d4fefe7d33af7615124e7e298ae7b8e3bc4446cdb8e88b0f9bab/triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220", size = 253157636, upload-time = "2025-01-22T19:12:51.322Z" }, + { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365, upload-time = "2025-01-22T19:13:24.648Z" }, +] + +[[package]] +name = "typer" +version = "0.16.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "rich" }, + { name = "shellingham" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/78/d90f616bf5f88f8710ad067c1f8705bf7618059836ca084e5bb2a0855d75/typer-0.16.1.tar.gz", hash = "sha256:d358c65a464a7a90f338e3bb7ff0c74ac081449e53884b12ba658cbd72990614", size = 102836, upload-time = "2025-08-18T19:18:22.898Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/76/06dbe78f39b2203d2a47d5facc5df5102d0561e2807396471b5f7c5a30a1/typer-0.16.1-py3-none-any.whl", hash = "sha256:90ee01cb02d9b8395ae21ee3368421faf21fa138cb2a541ed369c08cec5237c9", size = 46397, upload-time = "2025-08-18T19:18:21.663Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, +] + +[[package]] +name = "urllib3" +version = "2.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = "2026-02-06T19:19:40.919Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, +] + +[[package]] +name = "wrapt" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678, upload-time = "2026-03-06T02:53:25.134Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/81/60c4471fce95afa5922ca09b88a25f03c93343f759aae0f31fb4412a85c7/wrapt-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:96159a0ee2b0277d44201c3b5be479a9979cf154e8c82fa5df49586a8e7679bb", size = 60666, upload-time = "2026-03-06T02:52:58.934Z" }, + { url = 
"https://files.pythonhosted.org/packages/6b/be/80e80e39e7cb90b006a0eaf11c73ac3a62bbfb3068469aec15cc0bc795de/wrapt-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98ba61833a77b747901e9012072f038795de7fc77849f1faa965464f3f87ff2d", size = 61601, upload-time = "2026-03-06T02:53:00.487Z" }, + { url = "https://files.pythonhosted.org/packages/b0/be/d7c88cd9293c859fc74b232abdc65a229bb953997995d6912fc85af18323/wrapt-2.1.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:767c0dbbe76cae2a60dd2b235ac0c87c9cccf4898aef8062e57bead46b5f6894", size = 114057, upload-time = "2026-03-06T02:52:44.08Z" }, + { url = "https://files.pythonhosted.org/packages/ea/25/36c04602831a4d685d45a93b3abea61eca7fe35dab6c842d6f5d570ef94a/wrapt-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c691a6bc752c0cc4711cc0c00896fcd0f116abc253609ef64ef930032821842", size = 116099, upload-time = "2026-03-06T02:54:56.74Z" }, + { url = "https://files.pythonhosted.org/packages/5c/4e/98a6eb417ef551dc277bec1253d5246b25003cf36fdf3913b65cb7657a56/wrapt-2.1.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f3b7d73012ea75aee5844de58c88f44cf62d0d62711e39da5a82824a7c4626a8", size = 112457, upload-time = "2026-03-06T02:53:52.842Z" }, + { url = "https://files.pythonhosted.org/packages/cb/a6/a6f7186a5297cad8ec53fd7578533b28f795fdf5372368c74bd7e6e9841c/wrapt-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:577dff354e7acd9d411eaf4bfe76b724c89c89c8fc9b7e127ee28c5f7bcb25b6", size = 115351, upload-time = "2026-03-06T02:53:32.684Z" }, + { url = "https://files.pythonhosted.org/packages/97/6f/06e66189e721dbebd5cf20e138acc4d1150288ce118462f2fcbff92d38db/wrapt-2.1.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d7b6fd105f8b24e5bd23ccf41cb1d1099796524bcc6f7fbb8fe576c44befbc9", size = 111748, upload-time = "2026-03-06T02:53:08.455Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/43/4808b86f499a51370fbdbdfa6cb91e9b9169e762716456471b619fca7a70/wrapt-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:866abdbf4612e0b34764922ef8b1c5668867610a718d3053d59e24a5e5fcfc15", size = 113783, upload-time = "2026-03-06T02:53:02.02Z" }, + { url = "https://files.pythonhosted.org/packages/91/2c/a3f28b8fa7ac2cefa01cfcaca3471f9b0460608d012b693998cd61ef43df/wrapt-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5a0a0a3a882393095573344075189eb2d566e0fd205a2b6414e9997b1b800a8b", size = 57977, upload-time = "2026-03-06T02:53:27.844Z" }, + { url = "https://files.pythonhosted.org/packages/3f/c3/2b1c7bd07a27b1db885a2fab469b707bdd35bddf30a113b4917a7e2139d2/wrapt-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:64a07a71d2730ba56f11d1a4b91f7817dc79bc134c11516b75d1921a7c6fcda1", size = 60336, upload-time = "2026-03-06T02:54:28.104Z" }, + { url = "https://files.pythonhosted.org/packages/ec/5c/76ece7b401b088daa6503d6264dd80f9a727df3e6042802de9a223084ea2/wrapt-2.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:b89f095fe98bc12107f82a9f7d570dc83a0870291aeb6b1d7a7d35575f55d98a", size = 58756, upload-time = "2026-03-06T02:53:16.319Z" }, + { url = "https://files.pythonhosted.org/packages/4c/b6/1db817582c49c7fcbb7df6809d0f515af29d7c2fbf57eb44c36e98fb1492/wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9", size = 61255, upload-time = "2026-03-06T02:52:45.663Z" }, + { url = "https://files.pythonhosted.org/packages/a2/16/9b02a6b99c09227c93cd4b73acc3678114154ec38da53043c0ddc1fba0dc/wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748", size = 61848, upload-time = "2026-03-06T02:53:48.728Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/aa/ead46a88f9ec3a432a4832dfedb84092fc35af2d0ba40cd04aea3889f247/wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e", size = 121433, upload-time = "2026-03-06T02:54:40.328Z" }, + { url = "https://files.pythonhosted.org/packages/3a/9f/742c7c7cdf58b59085a1ee4b6c37b013f66ac33673a7ef4aaed5e992bc33/wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8", size = 123013, upload-time = "2026-03-06T02:53:26.58Z" }, + { url = "https://files.pythonhosted.org/packages/e8/44/2c3dd45d53236b7ed7c646fcf212251dc19e48e599debd3926b52310fafb/wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c", size = 117326, upload-time = "2026-03-06T02:53:11.547Z" }, + { url = "https://files.pythonhosted.org/packages/74/e2/b17d66abc26bd96f89dec0ecd0ef03da4a1286e6ff793839ec431b9fae57/wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c", size = 121444, upload-time = "2026-03-06T02:54:09.5Z" }, + { url = "https://files.pythonhosted.org/packages/3c/62/e2977843fdf9f03daf1586a0ff49060b1b2fc7ff85a7ea82b6217c1ae36e/wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1", size = 116237, upload-time = "2026-03-06T02:54:03.884Z" }, + { url = "https://files.pythonhosted.org/packages/88/dd/27fc67914e68d740bce512f11734aec08696e6b17641fef8867c00c949fc/wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2", size = 120563, upload-time = "2026-03-06T02:53:20.412Z" }, + { url = 
"https://files.pythonhosted.org/packages/ec/9f/b750b3692ed2ef4705cb305bd68858e73010492b80e43d2a4faa5573cbe7/wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0", size = 58198, upload-time = "2026-03-06T02:53:37.732Z" }, + { url = "https://files.pythonhosted.org/packages/8e/b2/feecfe29f28483d888d76a48f03c4c4d8afea944dbee2b0cd3380f9df032/wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63", size = 60441, upload-time = "2026-03-06T02:52:47.138Z" }, + { url = "https://files.pythonhosted.org/packages/44/e1/e328f605d6e208547ea9fd120804fcdec68536ac748987a68c47c606eea8/wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf", size = 58836, upload-time = "2026-03-06T02:53:22.053Z" }, + { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993, upload-time = "2026-03-06T02:53:12.905Z" }, +] + +[[package]] +name = "xlsxwriter" +version = "3.2.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/46/2c/c06ef49dc36e7954e55b802a8b231770d286a9758b3d936bd1e04ce5ba88/xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c", size = 215940, upload-time = "2025-09-16T00:16:21.63Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" }, +] + +[[package]] +name = "yarl" +version = "1.23.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + 
{ name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676, upload-time = "2026-03-01T22:07:53.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/aa/60da938b8f0997ba3a911263c40d82b6f645a67902a490b46f3355e10fae/yarl-1.23.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b35d13d549077713e4414f927cdc388d62e543987c572baee613bf82f11a4b99", size = 123641, upload-time = "2026-03-01T22:04:42.841Z" }, + { url = "https://files.pythonhosted.org/packages/24/84/e237607faf4e099dbb8a4f511cfd5efcb5f75918baad200ff7380635631b/yarl-1.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cbb0fef01f0c6b38cb0f39b1f78fc90b807e0e3c86a7ff3ce74ad77ce5c7880c", size = 86248, upload-time = "2026-03-01T22:04:44.757Z" }, + { url = "https://files.pythonhosted.org/packages/b2/0d/71ceabc14c146ba8ee3804ca7b3d42b1664c8440439de5214d366fec7d3a/yarl-1.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc52310451fc7c629e13c4e061cbe2dd01684d91f2f8ee2821b083c58bd72432", size = 85988, upload-time = "2026-03-01T22:04:46.365Z" }, + { url = "https://files.pythonhosted.org/packages/8c/6c/4a90d59c572e46b270ca132aca66954f1175abd691f74c1ef4c6711828e2/yarl-1.23.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b2c6b50c7b0464165472b56b42d4c76a7b864597007d9c085e8b63e185cf4a7a", size = 100566, upload-time = "2026-03-01T22:04:47.639Z" }, + { url = "https://files.pythonhosted.org/packages/49/fb/c438fb5108047e629f6282a371e6e91cf3f97ee087c4fb748a1f32ceef55/yarl-1.23.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:aafe5dcfda86c8af00386d7781d4c2181b5011b7be3f2add5e99899ea925df05", size = 92079, upload-time = 
"2026-03-01T22:04:48.925Z" }, + { url = "https://files.pythonhosted.org/packages/d9/13/d269aa1aed3e4f50a5a103f96327210cc5fa5dd2d50882778f13c7a14606/yarl-1.23.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9ee33b875f0b390564c1fb7bc528abf18c8ee6073b201c6ae8524aca778e2d83", size = 108741, upload-time = "2026-03-01T22:04:50.838Z" }, + { url = "https://files.pythonhosted.org/packages/85/fb/115b16f22c37ea4437d323e472945bea97301c8ec6089868fa560abab590/yarl-1.23.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4c41e021bc6d7affb3364dc1e1e5fa9582b470f283748784bd6ea0558f87f42c", size = 108099, upload-time = "2026-03-01T22:04:52.499Z" }, + { url = "https://files.pythonhosted.org/packages/9a/64/c53487d9f4968045b8afa51aed7ca44f58b2589e772f32745f3744476c82/yarl-1.23.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:99c8a9ed30f4164bc4c14b37a90208836cbf50d4ce2a57c71d0f52c7fb4f7598", size = 102678, upload-time = "2026-03-01T22:04:55.176Z" }, + { url = "https://files.pythonhosted.org/packages/85/59/cd98e556fbb2bf8fab29c1a722f67ad45c5f3447cac798ab85620d1e70af/yarl-1.23.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2af5c81a1f124609d5f33507082fc3f739959d4719b56877ab1ee7e7b3d602b", size = 100803, upload-time = "2026-03-01T22:04:56.588Z" }, + { url = "https://files.pythonhosted.org/packages/9e/c0/b39770b56d4a9f0bb5f77e2f1763cd2d75cc2f6c0131e3b4c360348fcd65/yarl-1.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6b41389c19b07c760c7e427a3462e8ab83c4bb087d127f0e854c706ce1b9215c", size = 100163, upload-time = "2026-03-01T22:04:58.492Z" }, + { url = "https://files.pythonhosted.org/packages/e7/64/6980f99ab00e1f0ff67cb84766c93d595b067eed07439cfccfc8fb28c1a6/yarl-1.23.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:1dc702e42d0684f42d6519c8d581e49c96cefaaab16691f03566d30658ee8788", size = 
93859, upload-time = "2026-03-01T22:05:00.268Z" }, + { url = "https://files.pythonhosted.org/packages/38/69/912e6c5e146793e5d4b5fe39ff5b00f4d22463dfd5a162bec565ac757673/yarl-1.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0e40111274f340d32ebcc0a5668d54d2b552a6cca84c9475859d364b380e3222", size = 108202, upload-time = "2026-03-01T22:05:02.273Z" }, + { url = "https://files.pythonhosted.org/packages/59/97/35ca6767524687ad64e5f5c31ad54bc76d585585a9fcb40f649e7e82ffed/yarl-1.23.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:4764a6a7588561a9aef92f65bda2c4fb58fe7c675c0883862e6df97559de0bfb", size = 99866, upload-time = "2026-03-01T22:05:03.597Z" }, + { url = "https://files.pythonhosted.org/packages/d3/1c/1a3387ee6d73589f6f2a220ae06f2984f6c20b40c734989b0a44f5987308/yarl-1.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:03214408cfa590df47728b84c679ae4ef00be2428e11630277be0727eba2d7cc", size = 107852, upload-time = "2026-03-01T22:05:04.986Z" }, + { url = "https://files.pythonhosted.org/packages/a4/b8/35c0750fcd5a3f781058bfd954515dd4b1eab45e218cbb85cf11132215f1/yarl-1.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:170e26584b060879e29fac213e4228ef063f39128723807a312e5c7fec28eff2", size = 102919, upload-time = "2026-03-01T22:05:06.397Z" }, + { url = "https://files.pythonhosted.org/packages/e5/1c/9a1979aec4a81896d597bcb2177827f2dbee3f5b7cc48b2d0dadb644b41d/yarl-1.23.0-cp311-cp311-win32.whl", hash = "sha256:51430653db848d258336cfa0244427b17d12db63d42603a55f0d4546f50f25b5", size = 82602, upload-time = "2026-03-01T22:05:08.444Z" }, + { url = "https://files.pythonhosted.org/packages/93/22/b85eca6fa2ad9491af48c973e4c8cf6b103a73dbb271fe3346949449fca0/yarl-1.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:bf49a3ae946a87083ef3a34c8f677ae4243f5b824bfc4c69672e72b3d6719d46", size = 87461, upload-time = "2026-03-01T22:05:10.145Z" }, + { url = 
"https://files.pythonhosted.org/packages/93/95/07e3553fe6f113e6864a20bdc53a78113cda3b9ced8784ee52a52c9f80d8/yarl-1.23.0-cp311-cp311-win_arm64.whl", hash = "sha256:b39cb32a6582750b6cc77bfb3c49c0f8760dc18dc96ec9fb55fbb0f04e08b928", size = 82336, upload-time = "2026-03-01T22:05:11.554Z" }, + { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737, upload-time = "2026-03-01T22:05:12.897Z" }, + { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029, upload-time = "2026-03-01T22:05:14.376Z" }, + { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310, upload-time = "2026-03-01T22:05:15.71Z" }, + { url = "https://files.pythonhosted.org/packages/99/30/58260ed98e6ff7f90ba84442c1ddd758c9170d70327394a6227b310cd60f/yarl-1.23.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cbf44c5cb4a7633d078788e1b56387e3d3cf2b8139a3be38040b22d6c3221c8", size = 97587, upload-time = "2026-03-01T22:05:17.384Z" }, + { url = "https://files.pythonhosted.org/packages/76/0a/8b08aac08b50682e65759f7f8dde98ae8168f72487e7357a5d684c581ef9/yarl-1.23.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53ad387048f6f09a8969631e4de3f1bf70c50e93545d64af4f751b2498755072", size = 92528, upload-time = "2026-03-01T22:05:18.804Z" }, + { url = 
"https://files.pythonhosted.org/packages/52/07/0b7179101fe5f8385ec6c6bb5d0cb9f76bd9fb4a769591ab6fb5cdbfc69a/yarl-1.23.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4a59ba56f340334766f3a4442e0efd0af895fae9e2b204741ef885c446b3a1a8", size = 105339, upload-time = "2026-03-01T22:05:20.235Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8a/36d82869ab5ec829ca8574dfcb92b51286fcfb1e9c7a73659616362dc880/yarl-1.23.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:803a3c3ce4acc62eaf01eaca1208dcf0783025ef27572c3336502b9c232005e7", size = 105061, upload-time = "2026-03-01T22:05:22.268Z" }, + { url = "https://files.pythonhosted.org/packages/66/3e/868e5c3364b6cee19ff3e1a122194fa4ce51def02c61023970442162859e/yarl-1.23.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3d2bff8f37f8d0f96c7ec554d16945050d54462d6e95414babaa18bfafc7f51", size = 100132, upload-time = "2026-03-01T22:05:23.638Z" }, + { url = "https://files.pythonhosted.org/packages/cf/26/9c89acf82f08a52cb52d6d39454f8d18af15f9d386a23795389d1d423823/yarl-1.23.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c75eb09e8d55bceb4367e83496ff8ef2bc7ea6960efb38e978e8073ea59ecb67", size = 99289, upload-time = "2026-03-01T22:05:25.749Z" }, + { url = "https://files.pythonhosted.org/packages/6f/54/5b0db00d2cb056922356104468019c0a132e89c8d3ab67d8ede9f4483d2a/yarl-1.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877b0738624280e34c55680d6054a307aa94f7d52fa0e3034a9cc6e790871da7", size = 96950, upload-time = "2026-03-01T22:05:27.318Z" }, + { url = "https://files.pythonhosted.org/packages/f6/40/10fa93811fd439341fad7e0718a86aca0de9548023bbb403668d6555acab/yarl-1.23.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b5405bb8f0e783a988172993cfc627e4d9d00432d6bbac65a923041edacf997d", size = 93960, upload-time = 
"2026-03-01T22:05:28.738Z" }, + { url = "https://files.pythonhosted.org/packages/bc/d2/8ae2e6cd77d0805f4526e30ec43b6f9a3dfc542d401ac4990d178e4bf0cf/yarl-1.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c3a3598a832590c5a3ce56ab5576361b5688c12cb1d39429cf5dba30b510760", size = 104703, upload-time = "2026-03-01T22:05:30.438Z" }, + { url = "https://files.pythonhosted.org/packages/2f/0c/b3ceacf82c3fe21183ce35fa2acf5320af003d52bc1fcf5915077681142e/yarl-1.23.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8419ebd326430d1cbb7efb5292330a2cf39114e82df5cc3d83c9a0d5ebeaf2f2", size = 98325, upload-time = "2026-03-01T22:05:31.835Z" }, + { url = "https://files.pythonhosted.org/packages/9d/e0/12900edd28bdab91a69bd2554b85ad7b151f64e8b521fe16f9ad2f56477a/yarl-1.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:be61f6fff406ca40e3b1d84716fde398fc08bc63dd96d15f3a14230a0973ed86", size = 105067, upload-time = "2026-03-01T22:05:33.358Z" }, + { url = "https://files.pythonhosted.org/packages/15/61/74bb1182cf79c9bbe4eb6b1f14a57a22d7a0be5e9cedf8e2d5c2086474c3/yarl-1.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ceb13c5c858d01321b5d9bb65e4cf37a92169ea470b70fec6f236b2c9dd7e34", size = 100285, upload-time = "2026-03-01T22:05:35.4Z" }, + { url = "https://files.pythonhosted.org/packages/69/7f/cd5ef733f2550de6241bd8bd8c3febc78158b9d75f197d9c7baa113436af/yarl-1.23.0-cp312-cp312-win32.whl", hash = "sha256:fffc45637bcd6538de8b85f51e3df3223e4ad89bccbfca0481c08c7fc8b7ed7d", size = 82359, upload-time = "2026-03-01T22:05:36.811Z" }, + { url = "https://files.pythonhosted.org/packages/f5/be/25216a49daeeb7af2bec0db22d5e7df08ed1d7c9f65d78b14f3b74fd72fc/yarl-1.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:f69f57305656a4852f2a7203efc661d8c042e6cc67f7acd97d8667fb448a426e", size = 87674, upload-time = "2026-03-01T22:05:38.171Z" }, + { url = 
"https://files.pythonhosted.org/packages/d2/35/aeab955d6c425b227d5b7247eafb24f2653fedc32f95373a001af5dfeb9e/yarl-1.23.0-cp312-cp312-win_arm64.whl", hash = "sha256:6e87a6e8735b44816e7db0b2fbc9686932df473c826b0d9743148432e10bb9b9", size = 81879, upload-time = "2026-03-01T22:05:40.006Z" }, + { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" }, +] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] + +[[package]] +name = "zstandard" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/83/c3ca27c363d104980f1c9cee1101cc8ba724ac8c28a033ede6aab89585b1/zstandard-0.25.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:933b65d7680ea337180733cf9e87293cc5500cc0eb3fc8769f4d3c88d724ec5c", size = 795254, upload-time = "2025-09-14T22:16:26.137Z" }, + { url = 
"https://files.pythonhosted.org/packages/ac/4d/e66465c5411a7cf4866aeadc7d108081d8ceba9bc7abe6b14aa21c671ec3/zstandard-0.25.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3f79487c687b1fc69f19e487cd949bf3aae653d181dfb5fde3bf6d18894706f", size = 640559, upload-time = "2025-09-14T22:16:27.973Z" }, + { url = "https://files.pythonhosted.org/packages/12/56/354fe655905f290d3b147b33fe946b0f27e791e4b50a5f004c802cb3eb7b/zstandard-0.25.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:0bbc9a0c65ce0eea3c34a691e3c4b6889f5f3909ba4822ab385fab9057099431", size = 5348020, upload-time = "2025-09-14T22:16:29.523Z" }, + { url = "https://files.pythonhosted.org/packages/3b/13/2b7ed68bd85e69a2069bcc72141d378f22cae5a0f3b353a2c8f50ef30c1b/zstandard-0.25.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01582723b3ccd6939ab7b3a78622c573799d5d8737b534b86d0e06ac18dbde4a", size = 5058126, upload-time = "2025-09-14T22:16:31.811Z" }, + { url = "https://files.pythonhosted.org/packages/c9/dd/fdaf0674f4b10d92cb120ccff58bbb6626bf8368f00ebfd2a41ba4a0dc99/zstandard-0.25.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5f1ad7bf88535edcf30038f6919abe087f606f62c00a87d7e33e7fc57cb69fcc", size = 5405390, upload-time = "2025-09-14T22:16:33.486Z" }, + { url = "https://files.pythonhosted.org/packages/0f/67/354d1555575bc2490435f90d67ca4dd65238ff2f119f30f72d5cde09c2ad/zstandard-0.25.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:06acb75eebeedb77b69048031282737717a63e71e4ae3f77cc0c3b9508320df6", size = 5452914, upload-time = "2025-09-14T22:16:35.277Z" }, + { url = "https://files.pythonhosted.org/packages/bb/1f/e9cfd801a3f9190bf3e759c422bbfd2247db9d7f3d54a56ecde70137791a/zstandard-0.25.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9300d02ea7c6506f00e627e287e0492a5eb0371ec1670ae852fefffa6164b072", size = 5559635, upload-time = 
"2025-09-14T22:16:37.141Z" }, + { url = "https://files.pythonhosted.org/packages/21/88/5ba550f797ca953a52d708c8e4f380959e7e3280af029e38fbf47b55916e/zstandard-0.25.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bfd06b1c5584b657a2892a6014c2f4c20e0db0208c159148fa78c65f7e0b0277", size = 5048277, upload-time = "2025-09-14T22:16:38.807Z" }, + { url = "https://files.pythonhosted.org/packages/46/c0/ca3e533b4fa03112facbe7fbe7779cb1ebec215688e5df576fe5429172e0/zstandard-0.25.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f373da2c1757bb7f1acaf09369cdc1d51d84131e50d5fa9863982fd626466313", size = 5574377, upload-time = "2025-09-14T22:16:40.523Z" }, + { url = "https://files.pythonhosted.org/packages/12/9b/3fb626390113f272abd0799fd677ea33d5fc3ec185e62e6be534493c4b60/zstandard-0.25.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6c0e5a65158a7946e7a7affa6418878ef97ab66636f13353b8502d7ea03c8097", size = 4961493, upload-time = "2025-09-14T22:16:43.3Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d3/23094a6b6a4b1343b27ae68249daa17ae0651fcfec9ed4de09d14b940285/zstandard-0.25.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c8e167d5adf59476fa3e37bee730890e389410c354771a62e3c076c86f9f7778", size = 5269018, upload-time = "2025-09-14T22:16:45.292Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a7/bb5a0c1c0f3f4b5e9d5b55198e39de91e04ba7c205cc46fcb0f95f0383c1/zstandard-0.25.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:98750a309eb2f020da61e727de7d7ba3c57c97cf6213f6f6277bb7fb42a8e065", size = 5443672, upload-time = "2025-09-14T22:16:47.076Z" }, + { url = "https://files.pythonhosted.org/packages/27/22/503347aa08d073993f25109c36c8d9f029c7d5949198050962cb568dfa5e/zstandard-0.25.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:22a086cff1b6ceca18a8dd6096ec631e430e93a8e70a9ca5efa7561a00f826fa", size = 5822753, upload-time = "2025-09-14T22:16:49.316Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/be/94267dc6ee64f0f8ba2b2ae7c7a2df934a816baaa7291db9e1aa77394c3c/zstandard-0.25.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:72d35d7aa0bba323965da807a462b0966c91608ef3a48ba761678cb20ce5d8b7", size = 5366047, upload-time = "2025-09-14T22:16:51.328Z" }, + { url = "https://files.pythonhosted.org/packages/7b/a3/732893eab0a3a7aecff8b99052fecf9f605cf0fb5fb6d0290e36beee47a4/zstandard-0.25.0-cp311-cp311-win32.whl", hash = "sha256:f5aeea11ded7320a84dcdd62a3d95b5186834224a9e55b92ccae35d21a8b63d4", size = 436484, upload-time = "2025-09-14T22:16:55.005Z" }, + { url = "https://files.pythonhosted.org/packages/43/a3/c6155f5c1cce691cb80dfd38627046e50af3ee9ddc5d0b45b9b063bfb8c9/zstandard-0.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:daab68faadb847063d0c56f361a289c4f268706b598afbf9ad113cbe5c38b6b2", size = 506183, upload-time = "2025-09-14T22:16:52.753Z" }, + { url = "https://files.pythonhosted.org/packages/8c/3e/8945ab86a0820cc0e0cdbf38086a92868a9172020fdab8a03ac19662b0e5/zstandard-0.25.0-cp311-cp311-win_arm64.whl", hash = "sha256:22a06c5df3751bb7dc67406f5374734ccee8ed37fc5981bf1ad7041831fa1137", size = 462533, upload-time = "2025-09-14T22:16:53.878Z" }, + { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738, upload-time = "2025-09-14T22:16:56.237Z" }, + { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436, upload-time = "2025-09-14T22:16:57.774Z" }, + { url = 
"https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019, upload-time = "2025-09-14T22:16:59.302Z" }, + { url = "https://files.pythonhosted.org/packages/1e/15/efef5a2f204a64bdb5571e6161d49f7ef0fffdbca953a615efbec045f60f/zstandard-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dffecc361d079bb48d7caef5d673c88c8988d3d33fb74ab95b7ee6da42652ea", size = 5063012, upload-time = "2025-09-14T22:17:01.156Z" }, + { url = "https://files.pythonhosted.org/packages/b7/37/a6ce629ffdb43959e92e87ebdaeebb5ac81c944b6a75c9c47e300f85abdf/zstandard-0.25.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7149623bba7fdf7e7f24312953bcf73cae103db8cae49f8154dd1eadc8a29ecb", size = 5394148, upload-time = "2025-09-14T22:17:03.091Z" }, + { url = "https://files.pythonhosted.org/packages/e3/79/2bf870b3abeb5c070fe2d670a5a8d1057a8270f125ef7676d29ea900f496/zstandard-0.25.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:6a573a35693e03cf1d67799fd01b50ff578515a8aeadd4595d2a7fa9f3ec002a", size = 5451652, upload-time = "2025-09-14T22:17:04.979Z" }, + { url = "https://files.pythonhosted.org/packages/53/60/7be26e610767316c028a2cbedb9a3beabdbe33e2182c373f71a1c0b88f36/zstandard-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5a56ba0db2d244117ed744dfa8f6f5b366e14148e00de44723413b2f3938a902", size = 5546993, upload-time = "2025-09-14T22:17:06.781Z" }, + { url = "https://files.pythonhosted.org/packages/85/c7/3483ad9ff0662623f3648479b0380d2de5510abf00990468c286c6b04017/zstandard-0.25.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:10ef2a79ab8e2974e2075fb984e5b9806c64134810fac21576f0668e7ea19f8f", size = 5046806, 
upload-time = "2025-09-14T22:17:08.415Z" }, + { url = "https://files.pythonhosted.org/packages/08/b3/206883dd25b8d1591a1caa44b54c2aad84badccf2f1de9e2d60a446f9a25/zstandard-0.25.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aaf21ba8fb76d102b696781bddaa0954b782536446083ae3fdaa6f16b25a1c4b", size = 5576659, upload-time = "2025-09-14T22:17:10.164Z" }, + { url = "https://files.pythonhosted.org/packages/9d/31/76c0779101453e6c117b0ff22565865c54f48f8bd807df2b00c2c404b8e0/zstandard-0.25.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1869da9571d5e94a85a5e8d57e4e8807b175c9e4a6294e3b66fa4efb074d90f6", size = 4953933, upload-time = "2025-09-14T22:17:11.857Z" }, + { url = "https://files.pythonhosted.org/packages/18/e1/97680c664a1bf9a247a280a053d98e251424af51f1b196c6d52f117c9720/zstandard-0.25.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:809c5bcb2c67cd0ed81e9229d227d4ca28f82d0f778fc5fea624a9def3963f91", size = 5268008, upload-time = "2025-09-14T22:17:13.627Z" }, + { url = "https://files.pythonhosted.org/packages/1e/73/316e4010de585ac798e154e88fd81bb16afc5c5cb1a72eeb16dd37e8024a/zstandard-0.25.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f27662e4f7dbf9f9c12391cb37b4c4c3cb90ffbd3b1fb9284dadbbb8935fa708", size = 5433517, upload-time = "2025-09-14T22:17:16.103Z" }, + { url = "https://files.pythonhosted.org/packages/5b/60/dd0f8cfa8129c5a0ce3ea6b7f70be5b33d2618013a161e1ff26c2b39787c/zstandard-0.25.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99c0c846e6e61718715a3c9437ccc625de26593fea60189567f0118dc9db7512", size = 5814292, upload-time = "2025-09-14T22:17:17.827Z" }, + { url = "https://files.pythonhosted.org/packages/fc/5f/75aafd4b9d11b5407b641b8e41a57864097663699f23e9ad4dbb91dc6bfe/zstandard-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:474d2596a2dbc241a556e965fb76002c1ce655445e4e3bf38e5477d413165ffa", size = 5360237, upload-time = "2025-09-14T22:17:19.954Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/8d/0309daffea4fcac7981021dbf21cdb2e3427a9e76bafbcdbdf5392ff99a4/zstandard-0.25.0-cp312-cp312-win32.whl", hash = "sha256:23ebc8f17a03133b4426bcc04aabd68f8236eb78c3760f12783385171b0fd8bd", size = 436922, upload-time = "2025-09-14T22:17:24.398Z" }, + { url = "https://files.pythonhosted.org/packages/79/3b/fa54d9015f945330510cb5d0b0501e8253c127cca7ebe8ba46a965df18c5/zstandard-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffef5a74088f1e09947aecf91011136665152e0b4b359c42be3373897fb39b01", size = 506276, upload-time = "2025-09-14T22:17:21.429Z" }, + { url = "https://files.pythonhosted.org/packages/ea/6b/8b51697e5319b1f9ac71087b0af9a40d8a6288ff8025c36486e0c12abcc4/zstandard-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:181eb40e0b6a29b3cd2849f825e0fa34397f649170673d385f3598ae17cca2e9", size = 462679, upload-time = "2025-09-14T22:17:23.147Z" }, +] diff --git a/dependency_setup/dependency_notes.md b/dependency_setup/dependency_notes.md index b85460e..c7a5b58 100644 --- a/dependency_setup/dependency_notes.md +++ b/dependency_setup/dependency_notes.md @@ -1,67 +1,57 @@ # GlossAPI Dependency Profiles & Test Notes ## Environment Profiles -- **Vanilla** – core GlossAPI pipeline without GPU OCR add-ons. Uses `requirements-glossapi-vanilla.txt`. -- **RapidOCR** – Docling + RapidOCR GPU stack. Builds on vanilla requirements and adds ONNX runtime (`requirements-glossapi-rapidocr.txt`). -- **DeepSeek** – GPU OCR via DeepSeek/vLLM. Extends vanilla requirements with torch/cu128, nightly vLLM and supporting CUDA libs (`requirements-glossapi-deepseek.txt`). `xformers` was dropped because the published wheels still pin Torch 2.8; the rest of the stack now installs cleanly on Torch 2.9. +- **Docling** – main GlossAPI environment for extraction, cleaning, sectioning, annotation, and math/code enrichment. Uses `requirements-glossapi-docling.txt`. +- **DeepSeek** – dedicated OCR runtime managed with `uv`. 
Pins the tested Torch/Transformers stack in `dependency_setup/deepseek_uv/pyproject.toml`. -Each profile is installed through `dependency_setup/setup_glossapi.sh`: +Recommended installation commands: ```bash -# Examples (venv path optional) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests -./dependency_setup/setup_glossapi.sh --mode deepseek --venv dependency_setup/.venvs/deepseek --run-tests +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests +./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --run-tests ``` Key flags: -- `--download-deepseek` optionally fetches DeepSeek weights (skipped by default; set `--weights-dir` if they live elsewhere). +- `--download-model` optionally fetches DeepSeek weights (set `--model-root` if they live elsewhere). - `--smoke-test` (DeepSeek only) runs `dependency_setup/deepseek_gpu_smoke.py`. ## Test Segmentation Pytest markers were added so suites can be run per profile: -- `rapidocr` – GPU Docling/RapidOCR integration tests. - `deepseek` – DeepSeek execution paths. -- Unmarked tests cover the vanilla footprint. +- Unmarked tests cover the Docling/core footprint. 
-`setup_glossapi.sh` now chooses marker expressions automatically: +Suggested commands: -| Mode | Command run by script | -|-----------|---------------------------------------------------------| -| vanilla | `pytest -q -m "not rapidocr and not deepseek" tests` | -| rapidocr | `pytest -q -m "not deepseek" tests` | -| deepseek | `pytest -q -m "not rapidocr" tests` | +| Profile | Command | +|-----------|---------| +| Docling | `pytest -q -m "not deepseek" tests` | +| DeepSeek | `pytest -q -m "deepseek" tests` | -Heavy GPU tests in `tests/test_pipeline_smoke.py` were guarded with `pytest.importorskip("onnxruntime")` so vanilla installs skip them cleanly. Helper PDFs now embed DejaVuSans with Unicode support and insert spacing to keep OCR-friendly glyphs. +## Validation Runs (2026-03-08) +- `./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests` +- `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --run-tests` +- `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --smoke-test` -## Validation Runs (2025-10-30) -- `./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests` -- `./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests` -- `./dependency_setup/setup_glossapi.sh --mode deepseek --venv dependency_setup/.venvs/deepseek --run-tests` - -All three completed successfully after the following adjustments: -1. **Rust extensions** – switched to `pip install -e rust/glossapi_rs_{cleaner,noise}` because `maturin develop` left the wheel unregistered. -2. **Parquet locking** – `_parquet_lock` now creates parent directories before attempting the file lock (fixes `FileNotFoundError` in concurrent metadata tests). -3. 
**RapidOCR pipeline** – fixed `GlossExtract.create_extractor()` to build the Docling converter regardless of import path and added UTF-8 PDF generation improvements; smoke tests now pass on CUDA. -4. **DeepSeek stack** – updated nightly vLLM pin (`0.11.1rc5.dev58+g60f76baa6.cu129`) and removed `xformers` to resolve Torch 2.9 dependency conflicts. +These completed successfully after the following adjustments: +1. **Rust extensions** – use editable installs for `rust/glossapi_rs_{cleaner,noise}` so local changes are picked up immediately. +2. **DeepSeek stack** – moved to a uv-managed runtime pinned to the `transformers`-based OCR-2 path. +3. **Attention fallback** – the DeepSeek runner falls back to `eager` attention if `flash-attn` is unavailable. ## Known Follow-ups -- **DeepSeek weights** – installer warns if weights are absent. Set `--download-deepseek` or populate `${DEEPSEEK_ROOT}/DeepSeek-OCR` before running the real CLI tests (`GLOSSAPI_RUN_DEEPSEEK_CLI=1`). -- **xformers kernels** – removed pending compatible Torch 2.9 wheels. Reintroduce once upstream publishes matching builds. +- **DeepSeek weights** – installer warns if weights are absent. Set `--download-model` or populate `${MODEL_ROOT}/DeepSeek-OCR-2` before running the real CLI tests (`GLOSSAPI_RUN_DEEPSEEK_CLI=1`). +- **flash-attn** – optional. Reintroduce into the pinned flow once wheel availability is stable across target hosts. - **Patchelf warnings** – maturin emits rpath hints if `patchelf` is missing; they are benign but install `patchelf` if cleaner logs are desired. -- **Deprecation noise** – Docling emits future warnings (Pydantic) and RapidOCR font deprecation notices; currently harmless but worth tracking for future upgrades. +- **Deprecation noise** – Docling and Transformers emit some warnings on current pins; currently harmless but worth tracking for future upgrades. 
## Quick Reference -- Activate an environment: `source dependency_setup/.venvs/<profile>/bin/activate` +- Activate an environment: `source dependency_setup/.venvs/<profile>/bin/activate` - Re-run tests manually: - - Vanilla: `pytest -m "not rapidocr and not deepseek" tests` - - RapidOCR: `pytest -m "not deepseek" tests` - - DeepSeek: `pytest -m "not rapidocr" tests` + - Docling: `pytest -m "not deepseek" tests` + - DeepSeek: `pytest -m "deepseek" tests` - DeepSeek runtime exports: ```bash export GLOSSAPI_DEEPSEEK_PYTHON="dependency_setup/.venvs/deepseek/bin/python" - export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT="/mnt/data/glossAPI/deepseek-ocr/run_pdf_ocr_vllm.py" - export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH="/mnt/data/glossAPI/deepseek-ocr/libjpeg-turbo/lib" - export LD_LIBRARY_PATH="$GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH:${LD_LIBRARY_PATH:-}" + export GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT="/mnt/data/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py" + export GLOSSAPI_DEEPSEEK_MODEL_DIR="/mnt/data/glossAPI/deepseek-ocr-2-model/DeepSeek-OCR-2" ``` These notes capture the current dependency state, the rationale behind constraint changes, and the validation steps used to exercise each profile.
diff --git a/dependency_setup/requirements-glossapi-deepseek.txt b/dependency_setup/requirements-glossapi-deepseek.txt index 5cc685a..8185d9c 100644 --- a/dependency_setup/requirements-glossapi-deepseek.txt +++ b/dependency_setup/requirements-glossapi-deepseek.txt @@ -1,16 +1,13 @@ ---extra-index-url https://download.pytorch.org/whl/cu128 ---extra-index-url https://wheels.vllm.ai/nightly --r requirements-glossapi-vanilla.txt -# CUDA Torch stack aligned with NVIDIA L4 (CUDA 12.8 wheels) -torch==2.9.0+cu128 -torchvision==0.24.0+cu128 -torchaudio==2.9.0+cu128 -# DeepSeek via nightly vLLM -vllm==0.11.1rc5.dev58+g60f76baa6.cu129 -flashinfer-python==0.4.1 -compressed-tensors==0.12.2 -depyf==0.20.0 -# Auxiliary CUDA libs -nvidia-nvshmem-cu12==3.3.20 -nvidia-nccl-cu12==2.27.5 -triton==3.5.0 +--extra-index-url https://download.pytorch.org/whl/cu118 +-r requirements-glossapi-docling.txt +torch==2.6.0 +torchvision==0.21.0 +torchaudio==2.6.0 +transformers==4.46.3 +tokenizers==0.20.3 +accelerate>=1.2.1,<2 +pymupdf==1.24.10 +Pillow==10.4.0 +img2pdf>=0.5.1 +easydict +addict diff --git a/dependency_setup/requirements-glossapi-docling.txt b/dependency_setup/requirements-glossapi-docling.txt new file mode 100644 index 0000000..402261a --- /dev/null +++ b/dependency_setup/requirements-glossapi-docling.txt @@ -0,0 +1,38 @@ +# Core GlossAPI runtime (Docling extraction/layout) +maturin>=1.5,<2.0 +numpy<2 +pandas>=1.3.0 +python-dateutil>=2.8.2 +pytz>=2021.1 +scikit-learn==1.6.1 +joblib>=1.0.0 +dask>=2022.1.0 +pyarrow>=7.0.0 +aiohttp>=3.8.0 +aiofiles>=23.0.0 +ftfy>=6.0.0 +tenacity>=8.0.0 +tqdm>=4.67.0 +pyyaml>=6.0 +pypdfium2>=4.0.0 +zstandard>=0.22.0 +docling==2.48.0 +docling-core==2.47.0 +docling-parse==4.4.0 +docling-ibm-models==3.9.1 +msgspec>=0.18.6 +fpdf2>=2.7.0 +cachetools +cbor2 +einops +tiktoken +diskcache==5.6.3 +lark==1.2.2 +numba==0.61.2 +# Tooling / tests +pytest>=8.0 +pytest-mock>=3.14 +psutil>=5.9 +rich>=14.0 +safetensors>=0.4 +huggingface-hub>=0.22 diff --git 
a/dependency_setup/requirements-glossapi-rapidocr.txt b/dependency_setup/requirements-glossapi-rapidocr.txt deleted file mode 100644 index f5c5839..0000000 --- a/dependency_setup/requirements-glossapi-rapidocr.txt +++ /dev/null @@ -1,4 +0,0 @@ --r requirements-glossapi-vanilla.txt -rapidocr>=3.3.0 -opencv-python-headless>=4.8.0 -onnxruntime-gpu==1.18.1 diff --git a/dependency_setup/setup_deepseek_uv.sh b/dependency_setup/setup_deepseek_uv.sh new file mode 100755 index 0000000..04a21ba --- /dev/null +++ b/dependency_setup/setup_deepseek_uv.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +PROJECT_DIR="${SCRIPT_DIR}/deepseek_uv" + +PYTHON_BIN="${PYTHON:-python3}" +VENV_PATH="${GLOSSAPI_DEEPSEEK_VENV:-${REPO_ROOT}/dependency_setup/.venvs/deepseek}" +MODEL_ROOT="${DEEPSEEK_ROOT:-${REPO_ROOT}/deepseek-ocr-2-model}" +DOWNLOAD_MODEL=0 +RUN_SMOKE=0 +RUN_TESTS=0 + +info() { printf "\033[1;32m==>\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } +error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } + +SYNC_ARGS=(--no-dev) + +usage() { + cat <<'EOF' +Usage: setup_deepseek_uv.sh [options] + +Options: + --venv PATH Target virtual environment path + --python PATH Python executable to use for uv venv + --model-root PATH Destination root for the DeepSeek-OCR-2 model + --download-model Download DeepSeek-OCR-2 via huggingface_hub + --run-tests Run the DeepSeek pytest subset after installation + --smoke-test Run dependency_setup/deepseek_gpu_smoke.py + --help Show this help message +EOF +} + +while (( "$#" )); do + case "$1" in + --venv) + shift || { echo "--venv requires a path" >&2; exit 1; } + VENV_PATH="${1:-}" + ;; + --python) + shift || { echo "--python requires a path" >&2; exit 1; } + PYTHON_BIN="${1:-}" + ;; + --model-root|--weights-dir) + shift || { echo "--model-root requires a path" >&2; exit 1; } + 
MODEL_ROOT="${1:-}" + ;; + --download-model|--download-deepseek) + DOWNLOAD_MODEL=1 + ;; + --run-tests) + RUN_TESTS=1 + ;; + --smoke-test) + RUN_SMOKE=1 + ;; + --help|-h) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage >&2 + exit 1 + ;; + esac + shift || true +done + +command -v uv >/dev/null 2>&1 || error "uv is required. Install it first, e.g. 'python3 -m pip install --user uv'." + +MODEL_DIR="${MODEL_ROOT}/DeepSeek-OCR-2" + +if [[ -x "${VENV_PATH}/bin/python" ]]; then + info "Reusing uv environment at ${VENV_PATH}" +else + info "Creating uv environment at ${VENV_PATH}" + uv venv --python "${PYTHON_BIN}" "${VENV_PATH}" +fi + +if [[ "${RUN_TESTS}" -eq 1 ]]; then + SYNC_ARGS+=(--group test) +fi + +info "Syncing DeepSeek runtime from ${PROJECT_DIR}" +UV_PROJECT_ENVIRONMENT="${VENV_PATH}" uv sync --project "${PROJECT_DIR}" --python "${VENV_PATH}/bin/python" "${SYNC_ARGS[@]}" + +info "Installing Rust extensions in editable mode" +uv pip install --python "${VENV_PATH}/bin/python" -e "${REPO_ROOT}/rust/glossapi_rs_cleaner" +uv pip install --python "${VENV_PATH}/bin/python" -e "${REPO_ROOT}/rust/glossapi_rs_noise" + +if [[ "${DOWNLOAD_MODEL}" -eq 1 ]]; then + info "Downloading DeepSeek-OCR-2 model to ${MODEL_DIR}" + HUGGINGFACE_HUB_TOKEN="${HUGGINGFACE_HUB_TOKEN:-${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-${HUGGINGFACE_TOKEN:-}}}}" \ + "${VENV_PATH}/bin/python" - <\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } +error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } + usage() { cat <<'EOF' Usage: setup_glossapi.sh [options] Options: - --mode MODE Environment profile: vanilla, rapidocr, deepseek (default: vanilla) + --mode MODE Environment profile: docling or deepseek (default: docling) --venv PATH Target virtual environment path --python PATH Python executable to use when creating the venv - --download-deepseek Fetch DeepSeek-OCR weights (only meaningful for --mode deepseek) - --weights-dir PATH Destination 
directory for DeepSeek weights (default: $REPO_ROOT/deepseek-ocr) + --download-deepseek Fetch DeepSeek-OCR-2 weights (DeepSeek mode only) + --weights-dir PATH Destination directory root for DeepSeek weights (default: $REPO_ROOT/deepseek-ocr-2-model) --run-tests Run pytest -q after installation --smoke-test Run dependency_setup/deepseek_gpu_smoke.py (deepseek mode only) --help Show this help message @@ -69,13 +73,30 @@ while (( "$#" )); do done case "${MODE}" in - vanilla|rapidocr|deepseek) ;; + vanilla) + warn "Mode 'vanilla' is deprecated; using 'docling' instead." + MODE="docling" + ;; + rapidocr) + error "RapidOCR setup has been removed. Use --mode docling or --mode deepseek." + ;; + docling|deepseek) ;; *) - echo "Invalid mode '${MODE}'. Expected vanilla, rapidocr, or deepseek." >&2 + echo "Invalid mode '${MODE}'. Expected docling or deepseek." >&2 exit 1 ;; esac +if [[ "${MODE}" == "deepseek" ]]; then + exec "${SCRIPT_DIR}/setup_deepseek_uv.sh" \ + --python "${PYTHON_BIN}" \ + --venv "${VENV_PATH:-${REPO_ROOT}/dependency_setup/.venvs/deepseek}" \ + --model-root "${DEEPSEEK_ROOT}" \ + $([[ "${DOWNLOAD_DEEPSEEK}" -eq 1 ]] && printf '%s' "--download-model") \ + $([[ "${RUN_TESTS}" -eq 1 ]] && printf '%s' "--run-tests") \ + $([[ "${RUN_SMOKE}" -eq 1 ]] && printf '%s' "--smoke-test") +fi + if [[ -z "${VENV_PATH}" ]]; then VENV_PATH="${REPO_ROOT}/.venv_glossapi_${MODE}" fi @@ -86,10 +107,6 @@ if [[ ! -f "${REQUIREMENTS_FILE}" ]]; then exit 1 fi -info() { printf "\033[1;32m==>\033[0m %s\n" "$*"; } -warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } -error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } - ensure_venv() { if [[ ! 
-d "${VENV_PATH}" ]]; then info "Creating virtual environment at ${VENV_PATH}" @@ -107,44 +124,6 @@ python_run() { "${VENV_PATH}/bin/python" "$@" } -download_deepseek_weights() { - local root="$1" - local target="${root}/DeepSeek-OCR" - - if [[ -d "${target}" ]]; then - info "DeepSeek-OCR weights already present at ${target}" - return 0 - fi - - mkdir -p "${root}" - if command -v huggingface-cli >/dev/null 2>&1; then - info "Downloading DeepSeek weights with huggingface-cli (this may take a while)" - huggingface-cli download deepseek-ai/DeepSeek-OCR \ - --repo-type model \ - --include "DeepSeek-OCR/*" \ - --local-dir "${target}" \ - --local-dir-use-symlinks False || warn "huggingface-cli download failed; falling back to git-lfs" - fi - - if [[ ! -d "${target}" ]]; then - if command -v git >/dev/null 2>&1; then - if ! command -v git-lfs >/dev/null 2>&1; then - warn "git-lfs not available; install git-lfs to clone DeepSeek weights via git." - else - info "Cloning DeepSeek weights via git-lfs" - git lfs install --skip-repo >/dev/null 2>&1 || true - git clone https://huggingface.co/deepseek-ai/DeepSeek-OCR "${target}" - fi - else - warn "Neither huggingface-cli nor git found; skipping DeepSeek weight download." - fi - fi - - if [[ ! -d "${target}" ]]; then - warn "DeepSeek weights were not downloaded. Set DEEPSEEK_ROOT manually once acquired." 
- fi -} - ensure_venv info "Upgrading pip tooling" pip_run install --upgrade pip wheel setuptools @@ -159,43 +138,18 @@ info "Building Rust extensions via editable installs" pip_run install -e "${REPO_ROOT}/rust/glossapi_rs_cleaner" pip_run install -e "${REPO_ROOT}/rust/glossapi_rs_noise" -if [[ "${MODE}" == "deepseek" ]]; then - export GLOSSAPI_DEEPSEEK_PYTHON="${VENV_PATH}/bin/python" - export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT="${DEEPSEEK_ROOT}/run_pdf_ocr_vllm.py" - export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH="${DEEPSEEK_ROOT}/libjpeg-turbo/lib" - export GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 - export LD_LIBRARY_PATH="${GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH:-}" - - if [[ "${DOWNLOAD_DEEPSEEK}" -eq 1 ]]; then - download_deepseek_weights "${DEEPSEEK_ROOT}" - else - warn "DeepSeek weights not downloaded (use --download-deepseek to fetch automatically)." - fi -fi - if [[ "${RUN_TESTS}" -eq 1 ]]; then pytest_args=("-q") case "${MODE}" in - vanilla) - pytest_args+=("-m" "not rapidocr and not deepseek") - ;; - rapidocr) + docling) pytest_args+=("-m" "not deepseek") ;; - deepseek) - pytest_args+=("-m" "not rapidocr") - ;; esac info "Running pytest ${pytest_args[*]} tests" python_run -m pytest "${pytest_args[@]}" tests fi -if [[ "${MODE}" == "deepseek" && "${RUN_SMOKE}" -eq 1 ]]; then - info "Running DeepSeek smoke test" - python_run "${SCRIPT_DIR}/deepseek_gpu_smoke.py" -fi - cat < clean/evaluate -> OCR -> section validation has been run on capped Pergamos samples +- OCR progress artifacts were moved out of the canonical `markdown/` tree so downstream stages no longer treat them as real documents + +The following work is intentionally not part of the completed set yet: + +- Docling dependency upgrades +- page-level OCR reevaluation experiments +- broader corpus-level comparative benchmarking beyond the capped validation runs + +## Remaining TODO to wrap up the implemented changes + +These are the remaining tasks for closing out the already-implemented migration work: + 
+1. review and curate the final commit contents +2. keep only source, docs, and test changes that belong in the `development` branch +3. exclude local artifacts, downloaded models, disposable environments, and ad hoc validation output from the commit +4. optionally run one more small real-PDF compatibility slice if an extra release-confidence check is desired +5. create or switch to the `development` branch and push the finalized change set there + +This means the migration implementation itself is effectively done; what remains is mainly release hygiene and branch preparation. + +## Target architecture + +The target shape is: + +1. `download()` +2. `extract()` via safe backend or Docling +3. `clean()` and compute Greek-quality routing +4. `ocr()` via DeepSeek only for documents that need remediation +5. `section()` +6. `annotate()` +7. `export()` + +Important boundary: + +- keep `Docling` for extraction, layout, Markdown, JSON artifacts, and optional formula/code enrichment +- remove `RapidOCR` from the OCR path and installation surface +- enforce `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` for production and release validation + +This is a simplification, not a redesign of the entire pipeline contract. 
+ +## Why this direction + +The current mixed OCR surface adds complexity in three places: + +- dependency installation and CUDA compatibility +- runtime branching and operational support burden +- validation burden when one OCR path succeeds and another fails differently + +The simplified design still preserves the important current properties: + +- selective OCR after Greek-quality validation +- Docling-generated layout and JSON artifacts for downstream stages +- explicit operational metadata and rerun semantics + +## Stage 1: DeepSeek-only OCR + +Goal: + +- make DeepSeek the only OCR remediation backend +- remove silent stub fallback from production paths + +Changes: + +- remove `rapidocr` as a supported OCR backend +- route `Corpus.ocr()` to DeepSeek only +- fail hard when DeepSeek runtime, weights, or CLI are unavailable +- keep the current document-level `needs_ocr` selection model + +Do not change in this stage: + +- Docling extraction contract +- sectioning and annotation behavior +- page-level routing policy +- formula/code enrichment policy + +Why this stage exists: + +- it gives the desired simplification without changing the rest of the pipeline contract at the same time +- it isolates OCR-engine risk from Docling-upgrade risk + +Success criteria: + +- no remaining production path imports or dispatches RapidOCR +- no final validation run succeeds via stub output +- documents flagged `needs_ocr=True` can still be remediated through DeepSeek + +Status: + +- completed + +## Stage 2: Installation simplification + +Goal: + +- reduce the environment surface to what the simplified pipeline actually needs + +Changes: + +- remove the `rapidocr` install profile and `onnxruntime-gpu` +- simplify setup profiles around: + - Docling extraction/runtime + - DeepSeek OCR runtime +- remove unused requirement baggage where it is not imported by GlossAPI itself +- make Python version constraints match current upstream reality + +Current constraint to fix: + +- GlossAPI 
currently declares `requires-python = ">=3.8"` while current Docling requires Python `>=3.10` + +Do not change in this stage: + +- pipeline behavior +- artifact layout +- OCR routing logic + +Why this stage exists: + +- environment simplification should follow architectural simplification +- it is easier to reason about required packages once RapidOCR is gone + +Success criteria: + +- setup documentation exposes only the supported environments +- install instructions no longer mention removed OCR components +- Python floor and dependency pins are internally consistent + +Status: + +- completed for the currently supported DeepSeek-only flow +- final branch hygiene and commit curation still remain + +## Stage 3: Docling upgrade + +Goal: + +- upgrade Docling after the OCR surface has already been simplified + +Changes: + +- update `docling` +- update `docling-core` +- update `docling-parse` +- update `docling-ibm-models` +- adapt any compatibility shims required by changed public APIs + +Do not change in this stage: + +- DeepSeek-only OCR decision +- page-level experiment +- formula/code enrichment policy unless explicitly validated + +Why this stage exists: + +- upgrading Docling before removing RapidOCR combines two unrelated breakage sources +- after Stage 1 the Docling integration surface is smaller and easier to validate + +Success criteria: + +- Phase-1 extraction still produces the documented canonical artifacts +- downstream sectioning, annotation, and export still consume the outputs +- metadata and resumability behavior do not regress + +Status: + +- deferred + +## Stage 4: Re-evaluate retained Docling capabilities + +Goal: + +- decide which Docling-powered features remain justified after the simplification + +Features to evaluate: + +- formula enrichment +- code enrichment +- table structure extraction +- any extra model/artifact prefetch currently required for non-default functionality + +Why this stage exists: + +- some capabilities may still be valuable 
for technical corpora +- some may only be increasing runtime and failure surface + +Rule: + +- do not remove formula/code enrichment just because it simplifies the stack +- remove it only if real-corpus evaluation shows little or no value + +Success criteria: + +- every retained capability has a measurable purpose +- every removed capability has an explicit evaluation-based justification + +Status: + +- pending + +## Stage 5: Page-level reevaluation experiment + +Goal: + +- test whether whole-document OCR reruns should be replaced or complemented by page-level escalation + +Experiment shape: + +- baseline branch: current document-level `needs_ocr` routing +- experiment branch: page-level or ROI-level routing + +What stays fixed: + +- DeepSeek remains the only OCR backend +- Docling remains the structured extraction/layout path + +Why this is separate: + +- it is an architectural experiment, not a prerequisite for the OCR simplification +- it should be compared against the stabilized DeepSeek-only baseline + +Primary evaluation questions: + +- does page-level escalation improve quality on long PDFs +- does it reduce OCR runtime and GPU cost +- does it preserve downstream sectioning and annotation quality + +Status: + +- pending + +## Non-goals for the first pass + +These are intentionally out of scope for the initial migration: + +- replacing Docling JSON/layout artifacts with DeepSeek-native structured artifacts +- merging all runtime concerns into one universal environment regardless of ecosystem constraints +- changing artifact layout at the same time as OCR simplification +- treating synthetic, mocked, or stubbed tests as sufficient release validation + +## Release sequence + +The intended order is: + +1. DeepSeek-only OCR and no-stub enforcement +2. installation simplification +3. Docling upgrade +4. retained-capability review +5. page-level experiment + +This order keeps one major architectural assumption changing at a time. 
diff --git a/docs/architecture/index.md b/docs/architecture/index.md index a8d8621..f6e1c85 100644 --- a/docs/architecture/index.md +++ b/docs/architecture/index.md @@ -103,7 +103,7 @@ Purpose: Important characteristics: -- can use RapidOCR via Docling or DeepSeek OCR +- uses DeepSeek OCR for remediation while keeping Docling in the surrounding extraction/layout flow - reads metadata to find OCR candidates - skiplist-aware - designed as a corrective stage, not the default for every document diff --git a/docs/configuration.md b/docs/configuration.md index 659d65c..0810530 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -20,28 +20,22 @@ Regardless of backend, the extractor clamps OMP/OpenBLAS/MKL pools to one thread ### DeepSeek optional dependencies -Install DeepSeek backend extras to enable the DeepSeek OCR path (imports remain lazy, so the package is optional). Use the CUDA 12.1 wheels for both vLLM and Torch: +Install DeepSeek backend extras to enable the DeepSeek OCR path. The recommended path is the dedicated `uv` environment: ```bash -pip install '.[deepseek]' - -# Install Torch CUDA 12.1 wheels (required by the DeepSeek script) -pip install --extra-index-url https://download.pytorch.org/whl/cu121 \ - 'torch==2.5.1+cu121' 'torchvision==0.20.1+cu121' - -# Alternatively, use the requirements file (edit to uncomment torch lines): -pip install -r deepseek-ocr/requirements-deepseek.txt +./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek ``` When using `backend='deepseek'`, equations are included inline in the OCR output; Phase‑2 math flags are accepted but skipped. ### DeepSeek runtime controls -- `GLOSSAPI_DEEPSEEK_ALLOW_STUB` (`1` by default): allow the builtin stub runner for tests and lightweight environments. -- `GLOSSAPI_DEEPSEEK_ALLOW_CLI` (`0` by default): flip to `1` to force the real vLLM CLI even when the stub is allowed. 
-- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs `run_pdf_ocr_vllm.py` (defaults to the current interpreter). -- `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT`: override path to the DeepSeek CLI script (defaults to `deepseek-ocr/run_pdf_ocr_vllm.py` under the repo). -- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths (e.g., for `libjpeg-turbo`) when launching the CLI. +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB`: must remain `0`; stub execution is rejected. +- `GLOSSAPI_DEEPSEEK_ALLOW_CLI`: keep at `1` to require the real runtime. +- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs the DeepSeek OCR runner. +- `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT`: override path to the OCR runner script (defaults to `src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py`). +- `GLOSSAPI_DEEPSEEK_MODEL_DIR`: path to the downloaded `DeepSeek-OCR-2` snapshot. +- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths when launching the OCR runner. ## Math Enrichment (Phase‑2) @@ -71,10 +65,6 @@ All LaTeX policy knobs are loaded via `glossapi.text_sanitize.load_latex_policy( - `GLOSSAPI_WORKER_LOG_DIR`: override the directory used for per-worker logs and `gpu.current` markers (defaults to `logs/ocr_workers/` or `logs/math_workers/` under the output directory). - `GLOSSAPI_WORKER_LOG_VERBOSE` = `1|0` (default `1`): emit (or suppress) the GPU binding banner each worker prints on startup. -## RapidOCR Model Paths - -- `GLOSSAPI_RAPIDOCR_ONNX_DIR`: directory containing `det/rec/cls` ONNX models and keys. 
- ## Triage & Parquet - Triage always writes both: diff --git a/docs/getting_started.md b/docs/getting_started.md index f6bf4ce..94a2325 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -4,46 +4,39 @@ This guide gets a new GlossAPI contributor from clone → first extraction with ## Checklist -- Python 3.8+ (3.10 recommended) +- Python 3.10+ (`3.12` recommended for the DeepSeek runtime) - Recent `pip` (or `uv`) and a C/C++ toolchain for Rust wheels -- Optional: NVIDIA GPU with CUDA 12.x drivers for Docling/RapidOCR acceleration +- Optional: NVIDIA GPU with CUDA drivers for Docling/DeepSeek acceleration ## Install GlossAPI -### Recommended — mode-aware setup script +### Recommended setup -Use `dependency_setup/setup_glossapi.sh` to build an isolated virtualenv with the correct dependency set for vanilla, RapidOCR, or DeepSeek runs. Examples: +Use `dependency_setup/setup_glossapi.sh` for the main Docling environment and `dependency_setup/setup_deepseek_uv.sh` for the OCR runtime. 
Examples: ```bash -# Vanilla pipeline (CPU-only OCR) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests +# Main GlossAPI environment +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests -# RapidOCR GPU stack -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests - -# DeepSeek OCR on GPU (expects weights under /path/to/deepseek-ocr/DeepSeek-OCR) -./dependency_setup/setup_glossapi.sh \ - --mode deepseek \ +# DeepSeek OCR on GPU (uv-managed, downloads DeepSeek-OCR-2 if requested) +./dependency_setup/setup_deepseek_uv.sh \ --venv dependency_setup/.venvs/deepseek \ - --weights-dir /path/to/deepseek-ocr \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ --run-tests --smoke-test ``` -Add `--download-deepseek` if you need the script to fetch weights via Hugging Face; otherwise it searches `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Inspect `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation runs. The script installs GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. +`setup_glossapi.sh --mode deepseek` delegates to the same uv-based installer. Inspect `dependency_setup/dependency_notes.md` for the current pins and validation runs. Both setup paths install GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. **DeepSeek runtime checklist** -- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the CLI can run (env vars, model dir, flashinfer, cc1plus, libjpeg). -- Force the real CLI and avoid stub fallback by setting: +- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the real runtime is reachable. 
+- Force the real runtime and avoid stub fallback by setting: - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - - `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py` - - `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python` - - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR` - - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib` -- Install a CUDA toolkit with `nvcc` and set `CUDA_HOME` / prepend `$CUDA_HOME/bin` to `PATH` (FlashInfer/vLLM JIT expects it). -- If FlashInfer is unstable on your stack, disable it with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`. -- Avoid FP8 KV cache issues by exporting `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1`; tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`. -- Keep `LD_LIBRARY_PATH` pointing at the toolkit lib64 (e.g. `LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH`). + - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` + - `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py` + - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- `flash-attn` is optional. The runner uses it when available and otherwise falls back to the Transformers `eager` attention implementation. ### Option 1 — pip (evaluate quickly) @@ -74,30 +67,19 @@ chmod +x scripts/setup_conda.sh conda activate glossapi ``` -The helper script provisions Python 3.10, installs Rust + `maturin`, performs an editable install, and applies the Docling RapidOCR patch automatically. +The helper script provisions Python 3.10, installs Rust + `maturin`, and performs an editable install. ## GPU prerequisites (optional but recommended) -`setup_glossapi.sh` pulls the right CUDA/Torch/ONNX wheels for the RapidOCR and DeepSeek profiles. 
If you are curating dependencies manually, make sure you: +`setup_glossapi.sh` and `setup_deepseek_uv.sh` pull the required Torch wheels for the supported Docling and DeepSeek flows. If you are curating dependencies manually, make sure you: -- Install the GPU build of ONNX Runtime (`onnxruntime-gpu`) and uninstall the CPU wheel. -- Select the PyTorch build that matches your driver/toolkit (the repository currently targets CUDA 12.8 for DeepSeek). +- Select the PyTorch build that matches your driver/toolkit. - Verify the providers with: ```bash - python -c "import onnxruntime as ort; print(ort.get_available_providers())" python -c "import torch; print(torch.cuda.is_available())" ``` -## RapidOCR models & keys - -GlossAPI ships the required ONNX models and Greek keys under `glossapi/models/rapidocr/{onnx,keys}`. To override them, set `GLOSSAPI_RAPIDOCR_ONNX_DIR` to a directory containing: - -- `det/inference.onnx` -- `rec/inference.onnx` -- `cls/ch_ppocr_mobile_v2.0_cls_infer.onnx` -- `greek_ppocrv5_keys.txt` - ## First run (lightweight corpus) ```bash diff --git a/docs/index.md b/docs/index.md index d696c8d..d8ec279 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,10 +16,11 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Metadata, Artifacts, and Run Diagnostics](architecture/metadata_artifacts_and_run_diagnostics.md) — how provenance and operational state are retained. - [Artifact Layout and Stage Handoffs](architecture/artifact_layout_and_stage_handoffs.md) — how folders, filenames, and metadata glue the stages together. - [Resumability, Recovery, and Retention](architecture/resumability_recovery_and_retention.md) — how the current design supports reruns and where storage pressure appears. +- [DeepSeek-Only Upgrade Roadmap](architecture/deepseek_only_upgrade_roadmap.md) — the staged simplification plan for OCR and dependency upgrades. 
## Learn the pipeline - [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. -- [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers Docling + RapidOCR usage. +- [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. - [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. - [Stage Reference](stages/index.md) breaks down each pipeline stage as a contract. @@ -27,6 +28,7 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Configuration](configuration.md) lists all environment knobs. - [Troubleshooting](troubleshooting.md) captures the most common pitfalls. - [AWS Job Distribution](aws_job_distribution.md) describes large-scale scheduling. +- [Compatibility And Regression Matrix](testing/compatibility_matrix.md) defines the release-validation gates for the migration and upgrades. ## Reference - [Corpus API](api/corpus.md) details public methods and parameters. diff --git a/docs/math_enrichment_runtime.md b/docs/math_enrichment_runtime.md index 21d8617..096209c 100644 --- a/docs/math_enrichment_runtime.md +++ b/docs/math_enrichment_runtime.md @@ -68,9 +68,8 @@ c.ocr(math_targets=targets, math_batch_size=4) ## OCR/Model Constraints (recap) -- ORT GPU only: uninstall `onnxruntime` CPU; use `onnxruntime-gpu`. -- RapidOCR keys: Docling 2.48.0 needs `Rec.rec_keys_path` patch (see README). -- Model discovery: set `GLOSSAPI_RAPIDOCR_ONNX_DIR` or package models under `glossapi/models/rapidocr/`. +- DeepSeek OCR runs in its own pinned runtime; set `GLOSSAPI_DEEPSEEK_PYTHON`, `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT`, and `GLOSSAPI_DEEPSEEK_MODEL_DIR`. +- Keep `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` and `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1`. - Optional Torch CUDA: needed for GPU layout/enrichment; see README for the CUDA wheels. 
## Multi‑GPU diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index 197bb0a..f401829 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -1,15 +1,14 @@ # GPU OCR and Math Enrichment -This document summarizes how GlossAPI uses the GPU for OCR and formula/code enrichment, how to run each phase efficiently, and where artifacts are written. +This document summarizes how GlossAPI uses the GPU for OCR remediation and formula/code enrichment, how to run each phase efficiently, and where artifacts are written. ## Overview -- Phase‑1 (Extract): PDF → Markdown via Docling; optional GPU OCR via RapidOCR (ONNXRuntime). Optionally emit JSON + formula index for Phase‑2. +- Phase‑1 (Extract): PDF → Markdown via Docling or the safe backend. Optionally emit JSON + formula index for Phase‑2. - Phase‑2 (Enrich): From Docling JSON, decode math/code on the GPU (CodeFormula) and re‑emit enriched Markdown. Backends -- `backend='rapidocr'` (default): Docling + RapidOCR; Phase‑2 math runs from Docling JSON. -- `backend='deepseek'`: DeepSeek‑OCR; equations are included inline in OCR output, so Phase‑2 math is not required and is treated as a no‑op. +- `backend='deepseek'`: DeepSeek-OCR-2; equations are included inline in OCR output, so Phase‑2 math is not required and is treated as a no‑op. Policy: never OCR and math on the same file - If a file needs OCR, GlossAPI runs OCR only (no Phase‑2 on that file in the same pass). 
@@ -18,24 +17,20 @@ Policy: never OCR and math on the same file ### Python API layout - DeepSeek entry point: `glossapi.ocr.deepseek.runner.run_for_files(...)` -- RapidOCR dispatcher: `glossapi.ocr.rapidocr.dispatch.run_via_extract(...)` - Math enrichment: `glossapi.ocr.math.enrich.enrich_from_docling_json(...)` - Utility helpers (Docling JSON / cleaning): `glossapi.ocr.utils.*` ## Prerequisites -- RapidOCR/Docling stack: `pip install '.[rapidocr]'` -- DeepSeek CLI stack (in a dedicated venv recommended): `pip install '.[deepseek]'` -- ONNXRuntime GPU installed (no CPU ORT): `onnxruntime-gpu==1.18.1` -- Torch CUDA installed: e.g., `torch==2.5.1+cu121` -- Packaged RapidOCR models/keys found under `glossapi/models/rapidocr/{onnx,keys}` or via `GLOSSAPI_RAPIDOCR_ONNX_DIR`. +- Main GlossAPI stack: `./dependency_setup/setup_glossapi.sh --mode docling` +- DeepSeek runtime: `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek` +- Torch CUDA installed in the DeepSeek env (the uv setup pins the tested stack). - Optional helpers for Phase‑2 JSON: `pypdfium2`, `zstandard`. 
Verify GPU readiness before forcing OCR or math: ```bash python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())" # expects True, >=1 -python -c "import onnxruntime as ort; print(ort.get_available_providers())" # must include CUDAExecutionProvider ``` ## Running Phase‑1 (Extract) @@ -44,17 +39,14 @@ python -c "import onnxruntime as ort; print(ort.get_available_providers())" from glossapi import Corpus c = Corpus('IN','OUT') -# GPU OCR on PDFs; emit JSON + formula index for Phase‑2 +# Emit JSON + formula index for Phase‑2 c.extract( input_format='pdf', - accel_type='CUDA', # or use_gpus='multi' for multi‑GPU - force_ocr=True, # OCR always on for PDFs + accel_type='CUDA', emit_formula_index=True, # request json/.formula_index.jsonl alongside the default JSON ) ``` -When `force_ocr=True` (or when math/code enrichment is enabled), GlossAPI automatically switches to the Docling backend and aborts if CUDA‑enabled torch/ONNXRuntime providers are not available. - Outputs: - `markdown/.md` - `json/.docling.json(.zst)` and `json/.formula_index.jsonl` @@ -88,12 +80,7 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma # → runs OCR only for bad files; equations are included inline; Phase‑2 is skipped ``` -If you need Phase‑2 math on files that do not require OCR, use RapidOCR/Docling and math‑only (expects Docling JSON from Phase‑1): - -```python -c.ocr(backend='rapidocr', fix_bad=False, math_enhance=True, mode='math_only') -# → runs Phase‑2 on non‑OCR files only (requires Docling JSON) -``` +If you need Phase‑2 math on files that do not require OCR, run `math_only` after Docling extraction with JSON enabled. ## Multi‑GPU @@ -101,7 +88,7 @@ Phase‑1 (extract): ```python c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) ``` -Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. OCR uses ORT GPU under the same process. 
+Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. Phase‑2 (enrich): ```python @@ -119,7 +106,7 @@ Spawns math workers; each binds to its GPU using `CUDA_VISIBLE_DEVICES` and runs ## Performance & Tuning - Batch sizes - - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula docling side throughput. + - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula throughput. - Phase‑2: `batch_size` / `math_batch_size` parameter (typ. 8–16) balances VRAM and speed. - Images scale for OCR: `GLOSSAPI_IMAGES_SCALE` (~1.1–1.25) can improve detection on thin glyphs. - CPU threads: cap `OMP_NUM_THREADS` / `MKL_NUM_THREADS` to avoid CPU oversubscription on multi‑GPU nodes. @@ -159,11 +146,7 @@ OUT/ ## Troubleshooting -- Missing CUDAExecutionProvider - - Ensure `onnxruntime-gpu` is installed and `onnxruntime` CPU is uninstalled. - Torch reports no CUDA - Check `nvidia-smi` and match Torch CUDA build to your driver. -- OCR is slow or falls back to CPU - - Confirm ORT providers include CUDAExecutionProvider and that `accel_type='CUDA'` is used. - Out of memory - Lower `batch_size` for Phase‑2, reduce `GLOSSAPI_IMAGES_SCALE`, or split inputs. diff --git a/docs/quickstart.md b/docs/quickstart.md index 4b10685..a498725 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -38,14 +38,13 @@ Workers report per-batch summaries and extraction progress is persisted into `download_results/download_results.parquet`, so you can restart multi-GPU runs without losing progress (no extra checkpoint files required). 
-## GPU OCR (opt-in) +## OCR remediation (opt-in) ```python from glossapi import Corpus c = Corpus('IN', 'OUT') -c.extract(input_format='pdf', accel_type='CUDA', force_ocr=True) -# or reuse multi-GPU batching -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.clean() +c.ocr(backend='deepseek', fix_bad=True, math_enhance=False) ``` ## Phase‑2 Math Enrichment (from JSON) @@ -76,7 +75,7 @@ c.section() # to parquet c.annotate() # classify/annotate sections ``` -See ocr_and_math_enhancement.md for GPU details, batch sizes, and artifact locations. +See `ocr_and_math_enhancement.md` for OCR runtime details, batch sizes, and artifact locations. ### DeepSeek OCR @@ -89,12 +88,11 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma # → OCR only for bad files; math is included inline in the Markdown ``` -To avoid stub output, set `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` and `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`, and ensure the CLI bits are reachable: +To avoid stub output, set `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` and `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`, and ensure the runtime is reachable: ```bash -export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py -export GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek-venv/bin/python -export GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR -export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib +export GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek-venv/bin/python +export GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +export GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2 python -m glossapi.ocr.deepseek.preflight # optional: validates env without running OCR ``` diff --git a/docs/stages/ocr.md b/docs/stages/ocr.md index 3bf8815..3a7e57c 100644 --- a/docs/stages/ocr.md +++ b/docs/stages/ocr.md @@ -25,12 +25,9 @@ The OCR stage repairs documents whose extracted text is considered 
unreliable, a ## Backend choices -The pipeline supports at least two OCR-oriented modes: - -- RapidOCR through the Docling path -- DeepSeek OCR for environments configured for that backend - -These are operationally different and should not be treated as interchangeable implementation details. +The supported OCR remediation backend is DeepSeek OCR. Docling remains part of +the surrounding extraction and layout flow, but OCR reruns themselves are now +expected to use the DeepSeek runtime. ## Selection model diff --git a/docs/testing/compatibility_matrix.md b/docs/testing/compatibility_matrix.md new file mode 100644 index 0000000..0c00d59 --- /dev/null +++ b/docs/testing/compatibility_matrix.md @@ -0,0 +1,276 @@ +# Compatibility And Regression Matrix + +This document defines the release-validation matrix for the DeepSeek-only migration and subsequent Docling upgrades. + +It is not a generic unit-test list. It is a contract-based validation plan tied to the documented pipeline behavior. + +## Scope + +This matrix applies to changes in: + +- DeepSeek-only OCR migration +- no-stub enforcement +- installation simplification +- Docling dependency upgrades +- page-level reevaluation experiments + +## Validation policy + +Release validation for this migration must use: + +- real PDFs +- real Docling +- real DeepSeek +- real GPUs where the code path requires them +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` + +Developer-only tests may still use mocks or lightweight stubs for fast iteration, but those do not satisfy release gates for this migration. 
+ +## Test levels + +### L0: Install and import sanity + +Purpose: + +- prove the supported environments install cleanly and that removed components are truly gone + +Typical inputs: + +- fresh venv +- supported Python version + +### L1: Lightweight smoke corpus + +Purpose: + +- prove the baseline end-to-end flow still works on the small repo corpus + +Typical inputs: + +- `samples/lightweight_pdf_corpus/` + +### L2: Real-PDF contract validation + +Purpose: + +- prove the documented artifacts and metadata contracts still hold on real documents + +Typical inputs: + +- real PDFs from a representative sample + +### L3: Multi-GPU and operational recovery + +Purpose: + +- prove the runtime behavior remains correct under parallel execution and rerun conditions + +Typical inputs: + +- multiple real PDFs +- at least two visible GPUs + +### L4: Comparative corpus evaluation + +Purpose: + +- compare baseline and changed behavior on a real evaluation slice + +Typical inputs: + +- real corpus slice such as the Pergamos sample + +## Mandatory invariants + +The following must remain true unless a change explicitly revises the contract and updates the docs: + +- canonical Markdown is written to `markdown/.md` +- Docling JSON artifacts are emitted when requested +- cleaner output still drives `needs_ocr` +- OCR remains selective rather than defaulting to all documents +- metadata parquet remains the durable operational record +- reruns skip completed work unless forced +- skiplist semantics remain explicit and stable +- no production path silently falls back to stub OCR + +## Release-gate matrix + +| ID | Level | Contract | Input | Run | Pass criteria | Negative assertions | +| --- | --- | --- | --- | --- | --- | --- | +| `ENV-001` | L0 | Python and packaging | Fresh environment | install supported profile(s) | install completes on supported Python floor | no reference to removed RapidOCR profile | +| `ENV-002` | L0 | Dependency simplification | Fresh environment | import 
`glossapi`, `glossapi.ocr.deepseek`, extract-path modules | imports succeed | no runtime import of removed RapidOCR modules | +| `EXT-001` | L1 | Safe Phase-1 extraction | lightweight corpus | `Corpus.extract(input_format="pdf")` | canonical Markdown produced | extraction must not depend on OCR extras | +| `EXT-002` | L2 | Docling Phase-1 extraction | real PDFs | `Corpus.extract(..., phase1_backend="docling", export_doc_json=True)` | Markdown, Docling JSON, metrics written to documented locations | artifact layout must not drift | +| `CLN-001` | L1/L2 | Cleaner metadata contract | extracted docs | `clean(drop_bad=False)` | metadata parquet updated with routing-relevant fields | no collapse of `needs_ocr` behavior | +| `OCR-001` | L2 | DeepSeek-only remediation | docs with `needs_ocr=True` | `ocr(backend="deepseek", fix_bad=True)` | recovered docs updated, metadata marks `ocr_success=True` | no stub output, no silent success | +| `OCR-002` | L2 | No-stub enforcement | broken/missing DeepSeek runtime | run OCR with `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` | run fails explicitly | failure must not produce placeholder success artifacts | +| `MTH-001` | L2 | Formula/code enrichment compatibility | math-heavy real PDF | Docling extract plus Phase-2 enrichment | enriched outputs and metadata remain coherent | no schema drift breaking enrichment | +| `SEC-001` | L2 | Sectioning contract | usable real docs | `section()` | `sections/sections_for_annotation.parquet` produced | no empty-output regression caused by upstream changes | +| `ANN-001` | L2 | Annotation contract | section parquet | `annotate()` | classified outputs produced | model integration must not break on changed upstream text/layout | +| `EXP-001` | L2 | Export contract | processed docs | `jsonl()` / `jsonl_sharded()` | JSONL and metadata outputs match documented layout | no dropped metadata fields without explicit design change | +| `RES-001` | L3 | Resumability | interrupted or partial run | rerun with defaults | 
completed items skipped correctly | no duplicate reprocessing by default | +| `RES-002` | L3 | Force/reprocess semantics | prior successful run | rerun with force/reprocess flag | selected items are reprocessed | no stale completion flags blocking intended rerun | +| `SKP-001` | L3 | Skiplist semantics | run with known problematic items | extract/OCR rerun | skiplist excludes intended stems only | no hidden filtering of healthy items | +| `GPU-001` | L3 | Multi-GPU OCR | real PDF slice on 2 GPUs | DeepSeek OCR in parallel | work is distributed and completes per GPU | no worker success masking failures | +| `CMP-001` | L4 | Baseline quality comparison | Pergamos sample slice | compare pre/post change outputs | no material regression in artifact completeness and downstream usability | runtime improvement alone does not justify quality loss | +| `CMP-002` | L4 | Whole-text vs page-level experiment | long PDFs | compare baseline branch vs page-level branch | quality/runtime tradeoff explicitly measured | experimental branch does not replace baseline without evidence | + +## Detailed test groups + +### Install and runtime compatibility + +What to prove: + +- supported environment installs cleanly +- unsupported/removed OCR components are not required +- Python floor matches actual upstream dependencies + +Critical checks: + +- packaging metadata uses a supported Python minimum +- setup docs expose only supported install paths +- removal of RapidOCR does not leave dead imports or entrypoints + +### Extraction contract + +What to prove: + +- Phase-1 still produces canonical Markdown +- Docling extraction still produces JSON artifacts when requested +- metrics continue to be written where downstream stages expect them + +Artifacts to check: + +- `markdown/.md` +- `json/.docling.json(.zst)` +- `json/.formula_index.jsonl` when requested +- `json/metrics/.metrics.json` +- `json/metrics/.per_page.metrics.json` + +### Cleaning and Greek-quality routing + +What to prove: + +-
cleaner still computes routing decisions required for selective OCR +- Greek-text validation remains first-class rather than incidental cleanup + +Fields to check in metadata parquet: + +- `needs_ocr` +- `filter` +- Greek-quality and badness-related fields currently emitted by the cleaner + +### DeepSeek OCR contract + +What to prove: + +- DeepSeek is the only OCR remediation backend +- no-stub enforcement is real +- recovered documents update metadata correctly + +Required environment behavior: + +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` +- real model weights present +- real CLI/runtime path present + +Negative checks: + +- no markdown contains placeholder stub markers +- no OCR pass succeeds after a DeepSeek CLI failure unless real output exists +- no removed OCR backend is referenced during final validation + +### Formula and code enrichment + +What to prove: + +- if retained, enrichment still works with the upgraded Docling stack +- if later removed, the removal is justified by evaluation rather than convenience + +Checks: + +- enriched Markdown is generated where expected +- `json/.latex_map.jsonl` remains coherent when enrichment is enabled +- metadata updates for math enrichment still work + +### Section, annotate, and export contracts + +What to prove: + +- downstream stages still consume the extraction outputs +- output layout and metadata structure remain compatible with the documented pipeline + +Artifacts to check: + +- `sections/sections_for_annotation.parquet` +- `classified_sections.parquet` +- `fully_annotated_sections.parquet` +- exported JSONL shards and related metadata + +### Resumability and operational recovery + +What to prove: + +- reruns still honor completion state +- skiplist semantics remain intact +- multi-worker failures remain visible and recoverable + +Checks: + +- default rerun skips completed items +- explicit force/reprocess reruns the intended items +- problematic stems are persisted and not silently lost + +## Comparative evaluation set +
+Suggested real-world slice: + +- lightweight corpus for smoke validation +- representative real PDFs spanning: + - short documents + - medium documents + - long documents + - structure-rich documents + - math-heavy documents where applicable + +For current local evaluation work, a Pergamos sample manifest has been prepared outside the repo and can be used as the L3/L4 real-PDF slice. + +## Suggested release sequence + +For the planned migration, run gates in this order: + +1. `ENV-*` +2. `EXT-*` +3. `CLN-*` +4. `OCR-*` +5. `MTH-*` +6. `SEC-*`, `ANN-*`, `EXP-*` +7. `RES-*`, `SKP-*`, `GPU-*` +8. `CMP-*` + +This keeps low-level compatibility failures from being confused with downstream quality regressions. + +## Exit criteria per stage + +### Stage 1 exit criteria + +- DeepSeek-only OCR path works on real PDFs +- no-stub enforcement verified +- no remaining release dependency on RapidOCR + +### Stage 2 exit criteria + +- install paths reduced to supported environments +- packaging/docs no longer reference removed OCR components + +### Stage 3 exit criteria + +- upgraded Docling passes `EXT-*`, `MTH-*`, `SEC-*`, `ANN-*`, and `EXP-*` + +### Stage 4 exit criteria + +- retained or removed Docling capabilities are justified by evaluation evidence + +### Stage 5 exit criteria + +- page-level branch is compared against the stabilized baseline before any adoption decision diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 6691407..24cc470 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -2,19 +2,15 @@ ## OCR runs on CPU -- Verify ONNXRuntime GPU: `python -c "import onnxruntime as ort; print(ort.get_available_providers())"` — must include `CUDAExecutionProvider`. -- Ensure CPU ORT wheel is not installed: `pip uninstall -y onnxruntime`. -- Make sure you pass `accel_type='CUDA'` (or `use_gpus='multi'`). +- Verify Torch CUDA: `python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"`. 
+- Make sure the DeepSeek runtime is the one configured in `GLOSSAPI_DEEPSEEK_PYTHON`. +- Run `python -m glossapi.ocr.deepseek.preflight` in the DeepSeek env before large OCR jobs. ## Torch doesn’t see the GPU - Check `nvidia-smi` and driver installation. - Match Torch CUDA build to your driver; see getting_started.md for the recommended wheel. -## RapidOCR font download failure - -- The first OCR call might download a visualization font. Ensure egress is allowed; the file is cached afterwards. - ## Out of memory - Lower Phase‑2 `batch_size` (e.g., 8) and reduce inline `GLOSSAPI_FORMULA_BATCH`. diff --git a/mkdocs.yml b/mkdocs.yml index ba13512..1776dd5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ site_name: GlossAPI -site_description: Academic document processing pipeline (Docling + RapidOCR + Rust) +site_description: Academic document processing pipeline (Docling + DeepSeek + Rust) repo_url: https://github.com/eellak/glossAPI theme: name: material @@ -22,6 +22,7 @@ nav: - Metadata, Artifacts, and Run Diagnostics: architecture/metadata_artifacts_and_run_diagnostics.md - Artifact Layout and Stage Handoffs: architecture/artifact_layout_and_stage_handoffs.md - Resumability, Recovery, and Retention: architecture/resumability_recovery_and_retention.md + - DeepSeek-Only Upgrade Roadmap: architecture/deepseek_only_upgrade_roadmap.md - Pipeline: - Pipeline Overview: pipeline.md - OCR & Math Enrichment: ocr_and_math_enhancement.md @@ -39,6 +40,7 @@ nav: - Configuration: configuration.md - AWS Job Distribution: aws_job_distribution.md - Troubleshooting: troubleshooting.md + - Compatibility And Regression Matrix: testing/compatibility_matrix.md - Reference: - Corpus API: api/corpus.md - Math Enrichment Runtime: math_enrichment_runtime.md diff --git a/pyproject.toml b/pyproject.toml index 3d0d5fa..60b23f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,11 +10,11 @@ authors = [ {name = "GlossAPI Team", email = "glossapi.team@eellak.gr"} ] readme = 
"README.md" -requires-python = ">=3.8" +requires-python = ">=3.10" dependencies = [ # Core pipeline deps "pandas>=1.3.0", - "numpy<2", # ORT+RapidOCR best compatibility + "numpy<2", "scikit-learn==1.6.1", "joblib>=1.0.0", "dask>=2022.1.0", @@ -37,28 +37,26 @@ classifiers = [ ] [project.optional-dependencies] -# Docling + RapidOCR ONNX stack (kept optional to preserve import-light installs) -rapidocr = [ +# Docling extraction/layout stack +docling = [ "docling==2.48.0", - # Use RapidOCR core package; avoid rapidocr_onnxruntime to prevent pip - # from auto-installing the CPU-only 'onnxruntime' wheel. - "rapidocr>=3.3.0", - "onnxruntime-gpu==1.18.1", ] # Optional CUDA layout acceleration (Docling) cuda = [ "torch==2.5.1", "torchvision==0.20.1", ] -# DeepSeek OCR backend extras (CUDA 12.1 build of vLLM). Torch is not pinned here -# because users should install the CUDA wheel from the PyTorch index -# (see docs: installing torch==2.5.1+cu121 via extra index URL). +# DeepSeek OCR backend extras (Torch should be installed from the PyTorch index). 
deepseek = [ - "vllm>=0.11.0", - "transformers>=4.45,<5", + "transformers==4.46.3", + "tokenizers==0.20.3", "accelerate>=1.2.1,<2", "pymupdf==1.24.10", "Pillow==10.4.0", + "img2pdf>=0.5.1", + "einops", + "easydict", + "addict", ] docs = [ "mkdocs>=1.5", @@ -78,6 +76,5 @@ glossapi = ["models/**/*"] [tool.pytest.ini_options] markers = [ - "rapidocr: requires the RapidOCR/Docling execution stack", "deepseek: exercises the DeepSeek OCR pipeline", ] diff --git a/src/glossapi/__init__.py b/src/glossapi/__init__.py index 4539ead..c92d336 100644 --- a/src/glossapi/__init__.py +++ b/src/glossapi/__init__.py @@ -1,54 +1,7 @@ -""" -GlossAPI Library - -A library for processing academic texts in Greek and other languages: -- Extracting content from PDFs and other formats with Docling -- Robust batch processing with error isolation and automatic resumption -- Clustering documents based on extraction quality -- Extracting and cleaning academic sections -- Classifying sections using machine learning - -This is an open source project that provides tools for linguistic annotations -and text processing, with a special focus on the Greek language. -""" +"""GlossAPI library.""" from __future__ import annotations -import os - -# Keep Docling/RapidOCR bootstrap optional and import‑light by default. -# If the environment requests skipping (common in tests or minimal envs), -# or if Docling is not installed, we avoid importing heavy dependencies here. -_SKIP_DOCLING_BOOT = os.environ.get("GLOSSAPI_SKIP_DOCLING_BOOT") == "1" - -def _attempt_patch_docling() -> bool: - if _SKIP_DOCLING_BOOT: - return False - try: - # Import inside the function to avoid pulling Docling when unused or missing. - from .ocr.rapidocr.safe import patch_docling_rapidocr # type: ignore - - try: - return bool(patch_docling_rapidocr()) - except Exception: - # Swallow any runtime error to keep top‑level import light/safe. - return False - except Exception: - # Docling (or its transitive deps) not available – keep going. 
- return False - - -def patch_docling_rapidocr() -> bool: - """Best‑effort registration of the SafeRapidOcrModel. - - Returns True when the patch was applied; False when unavailable or skipped. - Safe to call multiple times. - """ - return _attempt_patch_docling() - -# Attempt the patch once at import time, but never fail import if it does not apply. -_ = _attempt_patch_docling() - __all__ = [ 'GlossSection', 'GlossSectionClassifier', @@ -56,7 +9,6 @@ def patch_docling_rapidocr() -> bool: 'Sampler', 'Section', 'GlossDownloader', - 'patch_docling_rapidocr', ] def __getattr__(name: str): @@ -81,7 +33,6 @@ def __getattr__(name: str): return GlossDownloader raise AttributeError(name) -# Derive version dynamically from installed package metadata if possible try: from importlib.metadata import version as _pkg_version __version__: str = _pkg_version(__name__) diff --git a/src/glossapi/_pipeline.py b/src/glossapi/_pipeline.py index 73e5ecc..1909b60 100644 --- a/src/glossapi/_pipeline.py +++ b/src/glossapi/_pipeline.py @@ -1,7 +1,7 @@ """Backward-compatible adapter. -Docling pipeline builders moved to `glossapi.ocr.rapidocr.pipeline`. +Docling pipeline builders moved to `glossapi.ocr.docling.pipeline`. This module re-exports the public API to preserve legacy imports. 
""" -from .ocr.rapidocr.pipeline import * # noqa: F401,F403 +from .ocr.docling.pipeline import * # noqa: F401,F403 diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index abdaa5e..e5a4329 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -346,6 +346,8 @@ def finalize(self) -> None: stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, + encoding="utf-8", + errors="replace", bufsize=1, ) try: diff --git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py index a584eaf..a748dcc 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -96,6 +96,12 @@ def prime_extractor( except Exception: images_scale_env = "1.25" + if force_ocr: + self.logger.warning( + "Phase-1 Docling OCR is deprecated and no longer executes OCR. " + "Use Corpus.ocr(backend='deepseek') for OCR remediation." + ) + # Hard GPU preflight before we attempt to build OCR/enrichment pipelines self._gpu_preflight( accel_type=accel_type, @@ -154,12 +160,12 @@ def _gpu_preflight( require_math: bool, require_backend_gpu: bool = False, ) -> None: - """Abort early when GPU OCR/math is requested but CUDA is unavailable.""" + """Abort early when GPU-backed Docling work is requested but CUDA is unavailable.""" if not (require_ocr or require_math or require_backend_gpu): return instructions = ( - "GPU OCR and math enrichment require CUDA-enabled torch and onnxruntime-gpu. " + "GPU-backed Docling extraction and math enrichment require CUDA-enabled torch. " "Install the CUDA wheels and ensure NVIDIA drivers expose the desired devices." ) @@ -167,30 +173,15 @@ def _gpu_preflight( accel_lower = str(accel_type or "").strip().lower() if accel_lower.startswith("cpu"): raise RuntimeError( - "GPU OCR was requested (force_ocr/math) but accel_type='CPU'. " + "GPU-backed Docling extraction was requested but accel_type='CPU'. 
" f"{instructions}" ) - try: - import onnxruntime as _ort # type: ignore - providers = _ort.get_available_providers() - except Exception as exc: - raise RuntimeError( - "onnxruntime not available while attempting GPU OCR. " - "Install onnxruntime-gpu and rerun." - ) from exc - - if "CUDAExecutionProvider" not in providers: - raise RuntimeError( - "CUDAExecutionProvider missing from onnxruntime providers. " - f"Detected providers={providers}. {instructions}" - ) - torch_mod = _maybe_import_torch(force=True) if torch_mod is None or not getattr(torch_mod, "cuda", None) or not torch_mod.cuda.is_available(): raise RuntimeError( - "Torch CUDA is not available but GPU OCR/math was requested. " - "Install the CUDA wheel (e.g. torch==2.5.1+cu121) and ensure CUDA drivers/devices are visible." + "Torch CUDA is not available but GPU-backed Docling extraction/math was requested. " + "Install the CUDA wheel and ensure CUDA drivers/devices are visible." ) device_count = torch_mod.cuda.device_count() @@ -208,13 +199,12 @@ def _gpu_preflight( if not self._gpu_banner_logged: self.logger.info( - "GPU preflight: using torch + onnxruntime GPU backends; ensure CUDA drivers are available." + "GPU preflight: using torch-backed Docling extraction; ensure CUDA drivers are available." 
) self._gpu_banner_logged = True self.logger.info( - "GPU preflight OK: providers=%s torch_devices=%s", - ",".join(providers), + "GPU preflight OK: torch_devices=%s", ", ".join(device_names) or "", ) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 4dec423..80afc7f 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -33,7 +33,7 @@ def ocr( *, fix_bad: bool = True, mode: Optional[str] = None, - backend: str = "rapidocr", + backend: str = "deepseek", device: Optional[str] = None, model_dir: Optional[Union[str, Path]] = None, max_pages: Optional[int] = None, @@ -70,8 +70,8 @@ def ocr( fix_bad only -> 'ocr_bad'; math_enhance only -> 'math_only'; neither -> no‑op. - - backend: 'rapidocr' (default) uses the Docling + RapidOCR path via Phase‑1 extract(). - 'deepseek' uses the DeepSeek‑OCR path (no Docling JSON, math unsupported). + - backend: 'deepseek' (default) uses the DeepSeek OCR remediation path. + Docling layout/json remains Phase-1 infrastructure; OCR remediation itself is DeepSeek-only. - fix_bad: re-run OCR on documents marked bad by the cleaner (default True). - math_enhance: run math/code enrichment after OCR (default True). - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. @@ -82,9 +82,9 @@ def ocr( ``reprocess_completed=False``). Prefer the explicit ``reprocess_completed`` toggle. 
""" # Normalize backend - backend_norm = str(backend or "rapidocr").strip().lower() - if backend_norm not in {"rapidocr", "deepseek"}: - raise ValueError("backend must be 'rapidocr' or 'deepseek'") + backend_norm = str(backend or "deepseek").strip().lower() + if backend_norm != "deepseek": + raise ValueError("backend must be 'deepseek'") # CONTENT_DEBUG override (preferred uppercase alias) # Priority: CONTENT_DEBUG > INTERNAL_DEBUG > content_debug/internal_debug flags @@ -147,13 +147,21 @@ def ocr( reprocess_completed = reprocess_flag # DeepSeek semantics note - if backend_norm == "deepseek": + if backend_norm == "deepseek" and mode_norm in {"ocr_bad", "ocr_bad_then_math"}: try: self.logger.info( "DeepSeek backend: Phase-2 math is not required; equations are included inline via OCR." ) except Exception: pass + if mode_norm == "ocr_bad_then_math": + try: + self.logger.info( + "DeepSeek OCR does not run Phase-2 math; treating mode='ocr_bad_then_math' as 'ocr_bad'." + ) + except Exception: + pass + mode_norm = "ocr_bad" # Identify bad documents from parquet (Rust cleaner output) bad_files: List[str] = [] skipped_completed = 0 @@ -578,24 +586,6 @@ def _run_math(stems: List[str]) -> None: except Exception as _e: self.logger.error("DeepSeek OCR runner failed: %s", _e) raise - else: - # RapidOCR/Docling path via Phase-1 extract - self.extract( - input_format="pdf", - num_threads=os.cpu_count() or 4, - accel_type="CUDA", - force_ocr=True, - formula_enrichment=False, - code_enrichment=False, - filenames=bad_files, - skip_existing=False, - use_gpus=use_gpus, - devices=devices, - # Do not generate Docling JSON for OCR targets; math will skip them - export_doc_json=False, - emit_formula_index=False, - phase1_backend="docling", - ) reran_ocr = True # Update metadata to reflect successful OCR reruns try: diff --git a/src/glossapi/gloss_extract.py b/src/glossapi/gloss_extract.py index 4a2477c..3788d54 100644 --- a/src/glossapi/gloss_extract.py +++ b/src/glossapi/gloss_extract.py 
@@ -10,7 +10,6 @@ AcceleratorDevice, AcceleratorOptions, PdfPipelineOptions, - RapidOcrOptions, LayoutOptions, TableStructureOptions, TableFormerMode, @@ -106,11 +105,8 @@ def _ensure_docling_pipeline_loaded() -> None: from docling.pipeline.simple_pipeline import SimplePipeline -# Ensure RapidOCR plugin is registered for factory-based OCR construction -import docling.models.rapid_ocr_model # noqa: F401 -from .ocr.rapidocr._paths import resolve_packaged_onnx_and_keys -from .ocr.rapidocr.pool import GLOBAL_RAPID_OCR_POOL import inspect +from .ocr.docling_pipeline import build_layout_pipeline import ftfy import logging @@ -328,7 +324,7 @@ def _apply_thread_caps(self) -> None: self._thread_caps_applied = True def release_resources(self) -> None: - """Release Docling converters, pooled RapidOCR engines, and GPU caches.""" + """Release Docling converters and GPU caches.""" try: self.converter = None except Exception: @@ -343,10 +339,6 @@ def release_resources(self) -> None: setattr(self, attr, None) except Exception: pass - try: - GLOBAL_RAPID_OCR_POOL.clear() - except Exception: - pass torch_mod = _maybe_import_torch() if torch_mod is not None and getattr(torch_mod, "cuda", None): try: @@ -553,12 +545,7 @@ def create_extractor( ocr_langs: list[str] | None = None, profile_timings: bool = True, ): - """Create a document converter with configured options using the canonical builder. - - Delegates PDF pipeline construction to `glossapi.ocr.rapidocr.pipeline.build_rapidocr_pipeline` - to avoid duplicated provider checks and option wiring. Falls back to the legacy - inline path if the canonical builder is unavailable. 
- """ + """Create a Docling document converter for Phase-1 extraction.""" _ensure_docling_converter_loaded() _ensure_docling_pipeline_loaded() # Enable/disable Docling pipeline timings collection (for benchmarks) @@ -574,171 +561,83 @@ def create_extractor( # Best-effort Torch preflight only if Phase‑1 is asked to do enrichment try: - if formula_enrichment: + if formula_enrichment or code_enrichment: torch_mod = _maybe_import_torch(force=True) if torch_mod is None: - raise RuntimeError("Torch not available but formula enrichment requested.") + raise RuntimeError("Torch not available but Docling GPU enrichment was requested.") if hasattr(torch_mod, "cuda") and isinstance(getattr(self, "pipeline_options", None), PdfPipelineOptions): dev = getattr(self.pipeline_options, "accelerator_options", None) dv = getattr(dev, "device", None) if (isinstance(dv, str) and dv.lower().startswith("cuda")) and not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") + raise RuntimeError("Torch CUDA not available but Docling GPU enrichment was requested.") except Exception as e: raise RuntimeError(f"Torch CUDA preflight failed: {e}") - # Build PDF pipeline via the canonical builder (preferred) - opts = None - active_backend = DoclingParseV2DocumentBackend - try: - from .ocr.rapidocr.pipeline import build_layout_pipeline, build_rapidocr_pipeline # type: ignore - except Exception: # pragma: no cover - adapter fallback - from ._pipeline import build_layout_pipeline, build_rapidocr_pipeline # type: ignore - - device_str = self._current_device_str() or "cuda:0" - builder = build_rapidocr_pipeline if enable_ocr else build_layout_pipeline - - try: - _, opts = builder( - device=device_str, - images_scale=float(images_scale), - formula_enrichment=bool(formula_enrichment), - code_enrichment=bool(code_enrichment), - **({"text_score": float(text_score)} if enable_ocr else {}), - ) - - if enable_ocr and hasattr(opts, "ocr_options") and 
getattr(opts, "ocr_options", None) is not None: - if use_cls is not None: - setattr(opts.ocr_options, "use_cls", bool(use_cls)) # type: ignore[attr-defined] - if ocr_langs: - setattr(opts.ocr_options, "lang", list(ocr_langs)) # type: ignore[attr-defined] - if force_full_page_ocr is not None: - setattr(opts.ocr_options, "force_full_page_ocr", bool(force_full_page_ocr)) # type: ignore[attr-defined] - + if enable_ocr: try: - setattr(opts, "images_scale", float(images_scale)) + self._log.warning( + "Docling Phase-1 OCR is no longer supported. " + "Ignoring enable_ocr/force_full_page_ocr; use Corpus.ocr(backend='deepseek') instead." + ) except Exception: pass - self._active_pdf_options = opts - self._current_ocr_enabled = bool(enable_ocr) - - # Create a multi-format DocumentConverter using the built PDF options - pdf_backend = DoclingParseV2DocumentBackend - if not enable_ocr: - try: - if getattr(self, "use_pypdfium_backend", False): - pdf_backend = PyPdfiumDocumentBackend - self.pdf_backend_name = "pypdfium" - except Exception: - pdf_backend = DoclingParseV2DocumentBackend - if opts is None: - opts = self.pipeline_options - active_backend = pdf_backend - - self.converter = DocumentConverter( - allowed_formats=[ - InputFormat.PDF, - InputFormat.DOCX, - InputFormat.XML_JATS, - InputFormat.HTML, - InputFormat.PPTX, - InputFormat.CSV, - InputFormat.MD, - ], - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=opts, - pipeline_cls=StandardPdfPipeline, - backend=active_backend, - ), - InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline), - InputFormat.XML_JATS: XMLJatsFormatOption(), - InputFormat.HTML: HTMLFormatOption(), - InputFormat.PPTX: PowerpointFormatOption(), - InputFormat.CSV: CsvFormatOption(), - InputFormat.MD: MarkdownFormatOption(), - }, - ) - self._active_pdf_backend = active_backend + active_backend = DoclingParseV2DocumentBackend + device_str = self._current_device_str() or "cuda:0" + _, opts = build_layout_pipeline( + 
device=device_str, + images_scale=float(images_scale), + formula_enrichment=bool(formula_enrichment), + code_enrichment=bool(code_enrichment), + ) + try: + opts.do_ocr = False + setattr(opts, "images_scale", float(images_scale)) except Exception: - # Fallback to legacy inline configuration path - if enable_ocr: - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError( - "RapidOCR ONNX models/keys not found. Ensure models exist under glossapi.models/rapidocr or set GLOSSAPI_RAPIDOCR_ONNX_DIR." - ) - langs = ocr_langs or ["el", "en"] - ocr_opts = RapidOcrOptions( - backend="onnxruntime", - lang=langs, - force_full_page_ocr=bool(force_full_page_ocr), - use_det=True, - use_cls=bool(use_cls), - use_rec=True, - text_score=float(text_score), - det_model_path=r.det, - rec_model_path=r.rec, - cls_model_path=r.cls, - print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - self.pipeline_options.ocr_options = ocr_opts - # Attach core toggles to existing pipeline_options - try: - self.pipeline_options.do_ocr = bool(enable_ocr) - self.pipeline_options.do_formula_enrichment = bool(formula_enrichment) - self.pipeline_options.do_code_enrichment = bool(code_enrichment) - try: - setattr(self.pipeline_options, "images_scale", float(images_scale)) - except Exception: - pass - except Exception: - pass - if not enable_ocr: - try: - setattr(self.pipeline_options, "ocr_options", None) - except Exception: - pass + pass - pdf_backend = DoclingParseV2DocumentBackend - if not enable_ocr: - try: - if getattr(self, "use_pypdfium_backend", False): - pdf_backend = PyPdfiumDocumentBackend - self.pdf_backend_name = "pypdfium" - except Exception: - pdf_backend = DoclingParseV2DocumentBackend - - active_backend = pdf_backend - - self.converter = DocumentConverter( - allowed_formats=[ - InputFormat.PDF, - InputFormat.DOCX, - InputFormat.XML_JATS, - InputFormat.HTML, - InputFormat.PPTX, - InputFormat.CSV, - InputFormat.MD, - ], - 
format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=self.pipeline_options, - pipeline_cls=StandardPdfPipeline, - backend=active_backend, - ), - }, - ) + self._active_pdf_options = opts + self._current_ocr_enabled = False - self._active_pdf_options = self.pipeline_options - self._current_ocr_enabled = bool(enable_ocr) - self._active_pdf_backend = active_backend + pdf_backend = DoclingParseV2DocumentBackend + try: + if getattr(self, "use_pypdfium_backend", False): + pdf_backend = PyPdfiumDocumentBackend + self.pdf_backend_name = "pypdfium" + except Exception: + pdf_backend = DoclingParseV2DocumentBackend + active_backend = pdf_backend + + self.converter = DocumentConverter( + allowed_formats=[ + InputFormat.PDF, + InputFormat.DOCX, + InputFormat.XML_JATS, + InputFormat.HTML, + InputFormat.PPTX, + InputFormat.CSV, + InputFormat.MD, + ], + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=opts, + pipeline_cls=StandardPdfPipeline, + backend=active_backend, + ), + InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline), + InputFormat.XML_JATS: XMLJatsFormatOption(), + InputFormat.HTML: HTMLFormatOption(), + InputFormat.PPTX: PowerpointFormatOption(), + InputFormat.CSV: CsvFormatOption(), + InputFormat.MD: MarkdownFormatOption(), + }, + ) + self._active_pdf_backend = active_backend # Record last configuration for reuse try: self._last_extractor_cfg = self._cfg_signature( - enable_ocr=enable_ocr, + enable_ocr=False, force_full_page_ocr=force_full_page_ocr, text_score=text_score, images_scale=images_scale, diff --git a/src/glossapi/ocr/__init__.py b/src/glossapi/ocr/__init__.py index bb167c4..df79456 100644 --- a/src/glossapi/ocr/__init__.py +++ b/src/glossapi/ocr/__init__.py @@ -1,7 +1,7 @@ """Lightweight OCR backend package. Exports minimal, import-safe helpers for OCR backends. 
Heavy -dependencies (vLLM, transformers, PyMuPDF) are imported lazily +dependencies (transformers, PyMuPDF) are imported lazily inside the specific backend functions so importing this package does not require GPU stacks or model weights. """ @@ -12,17 +12,14 @@ __all__ = [ "deepseek", - "rapidocr", "math", "utils", "deepseek_runner", - "rapidocr_dispatch", ] -_SUBPACKAGES = {"deepseek", "rapidocr", "math", "utils"} +_SUBPACKAGES = {"deepseek", "math", "utils"} _ALIASES = { "deepseek_runner": "glossapi.ocr.deepseek.runner", - "rapidocr_dispatch": "glossapi.ocr.rapidocr.dispatch", } diff --git a/src/glossapi/ocr/deepseek/__init__.py b/src/glossapi/ocr/deepseek/__init__.py index 5326c42..a5fb1ca 100644 --- a/src/glossapi/ocr/deepseek/__init__.py +++ b/src/glossapi/ocr/deepseek/__init__.py @@ -1,4 +1,4 @@ -"""DeepSeek OCR backend with a lightweight stub fallback.""" +"""DeepSeek OCR backend.""" from .runner import run_for_files from . import preflight diff --git a/src/glossapi/ocr/deepseek/preflight.py b/src/glossapi/ocr/deepseek/preflight.py index 76810e6..6669707 100644 --- a/src/glossapi/ocr/deepseek/preflight.py +++ b/src/glossapi/ocr/deepseek/preflight.py @@ -1,17 +1,16 @@ -"""Preflight checks for the DeepSeek OCR CLI environment.""" +"""Preflight checks for the DeepSeek OCR environment.""" from __future__ import annotations import dataclasses import os -import shutil import sys from pathlib import Path from typing import Dict, Iterable, List, Optional -DEFAULT_SCRIPT = Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py" -DEFAULT_MODEL_DIR = Path.cwd() / "deepseek-ocr" / "DeepSeek-OCR" -DEFAULT_LIB_DIR = Path.cwd() / "deepseek-ocr" / "libjpeg-turbo" / "lib" +REPO_ROOT = Path(__file__).resolve().parents[4] +DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" +DEFAULT_MODEL_DIR = REPO_ROOT / "deepseek-ocr-2-model" / "DeepSeek-OCR-2" @dataclasses.dataclass(frozen=True) @@ -46,9 +45,6 @@ def summarize(self) -> str: def 
_ensure_path(path: Path, label: str, errors: List[CheckResult]) -> Optional[Path]: - if not path: - errors.append(CheckResult(label, False, "Not provided")) - return None if not path.exists(): errors.append(CheckResult(label, False, f"Missing at {path}")) return None @@ -58,38 +54,45 @@ def _ensure_path(path: Path, label: str, errors: List[CheckResult]) -> Optional[ def check_deepseek_env( env: Optional[Dict[str, str]] = None, *, - check_flashinfer: bool = True, + check_torch: bool = True, ) -> PreflightReport: - """Validate DeepSeek CLI prerequisites without running the model.""" + """Validate DeepSeek OCR prerequisites without running the model.""" env = dict(env or os.environ) errors: List[CheckResult] = [] warnings: List[CheckResult] = [] infos: List[CheckResult] = [] - allow_cli = env.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "0") == "1" - allow_stub = env.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") == "1" + allow_cli = env.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") == "1" + allow_stub = env.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1" if not allow_cli: - warnings.append( + errors.append( CheckResult( "allow_cli", False, - "Set GLOSSAPI_DEEPSEEK_ALLOW_CLI=1 to force the real CLI.", + "DeepSeek OCR requires the real CLI/runtime. Set GLOSSAPI_DEEPSEEK_ALLOW_CLI=1.", ) ) if allow_stub: - warnings.append( + errors.append( CheckResult( "allow_stub", False, - "Set GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 to fail instead of falling back to stub output.", + "Stub execution is no longer supported. 
Set GLOSSAPI_DEEPSEEK_ALLOW_STUB=0.", ) ) - script = Path(env.get("GLOSSAPI_DEEPSEEK_VLLM_SCRIPT") or DEFAULT_SCRIPT) - _ensure_path(script, "vllm_script", errors) + script = Path( + env.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT") + or DEFAULT_SCRIPT + ) + _ensure_path(script, "runner_script", errors) - python_bin = Path(env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON") or sys.executable) + python_bin = Path( + env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON") + or env.get("GLOSSAPI_DEEPSEEK_PYTHON") + or sys.executable + ) _ensure_path(python_bin, "deepseek_python", errors) model_dir = Path( @@ -99,7 +102,7 @@ def check_deepseek_env( ) model_dir = _ensure_path(model_dir, "model_dir", errors) if model_dir: - has_weights = any(model_dir.glob("*.safetensors")) or (model_dir / "model-00001-of-000001.safetensors").exists() + has_weights = any(model_dir.glob("*.safetensors")) has_config = (model_dir / "config.json").exists() if not has_weights or not has_config: errors.append( @@ -110,34 +113,21 @@ def check_deepseek_env( ) ) - ld_path_env = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") - lib_dir = Path(ld_path_env) if ld_path_env else DEFAULT_LIB_DIR - _ensure_path(lib_dir, "ld_library_path", errors) - - cc1plus_path = shutil.which("cc1plus", path=env.get("PATH", "")) - if not cc1plus_path: - errors.append( - CheckResult( - "cc1plus", - False, - "C++ toolchain missing (cc1plus not on PATH); install g++ and ensure PATH includes gcc's cc1plus.", - ) - ) - else: - infos.append(CheckResult("cc1plus", True, f"Found at {cc1plus_path}")) - - if check_flashinfer: + if check_torch: try: - import flashinfer # type: ignore + import torch # type: ignore - infos.append(CheckResult("flashinfer", True, f"flashinfer {flashinfer.__version__} import ok")) + infos.append(CheckResult("torch", True, f"torch {torch.__version__} import ok")) + if not torch.cuda.is_available(): + warnings.append(CheckResult("cuda", False, "Torch CUDA is not available.")) except Exception as exc: # pragma: no cover - depends on env - 
errors.append(CheckResult("flashinfer", False, f"flashinfer import failed: {exc}")) + errors.append(CheckResult("torch", False, f"torch import failed: {exc}")) return PreflightReport(errors=errors, warnings=warnings, infos=infos) def main(argv: Optional[Iterable[str]] = None) -> int: + del argv report = check_deepseek_env() summary = report.summarize() if summary: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py new file mode 100644 index 0000000..0e0e868 --- /dev/null +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -0,0 +1,188 @@ +"""CLI wrapper for DeepSeek-OCR-2 inference over PDF files.""" + +from __future__ import annotations + +import argparse +import json +import re +import tempfile +from pathlib import Path +from typing import Iterable, List + +import fitz +import torch +from PIL import Image +from transformers import AutoModel, AutoTokenizer + +PROMPT = "\n<|grounding|>Convert the document to markdown. 
" +PAGE_SPLIT = "\n<--- Page Split --->\n" + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--input-dir", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--model-dir", required=True) + parser.add_argument("--files", nargs="*", default=[]) + parser.add_argument("--max-pages", type=int, default=None) + parser.add_argument("--device", default="cuda") + parser.add_argument("--content-debug", action="store_true") + return parser.parse_args() + + +def _iter_pdfs(input_dir: Path, files: List[str]) -> List[Path]: + if files: + return [(input_dir / name).resolve() for name in files] + return sorted(input_dir.glob("*.pdf")) + + +def _render_pages(pdf_path: Path, max_pages: int | None) -> List[Image.Image]: + images: List[Image.Image] = [] + doc = fitz.open(pdf_path) + try: + page_count = doc.page_count if max_pages is None else min(doc.page_count, max_pages) + zoom = 144 / 72.0 + matrix = fitz.Matrix(zoom, zoom) + for idx in range(page_count): + page = doc[idx] + pixmap = page.get_pixmap(matrix=matrix, alpha=False) + img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples) + images.append(img) + finally: + doc.close() + return images + + +def _clean_markdown(text: str) -> str: + text = (text or "").replace("<|end▁of▁sentence|>", "").strip() + pattern = re.compile(r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)", re.DOTALL) + matches = pattern.findall(text) + for full_match, label, _coords in matches: + if label == "image": + text = text.replace(full_match, "") + else: + text = text.replace(full_match, "") + return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() + + +def _load_model(model_dir: Path, device: str): + attn_impl = "flash_attention_2" + try: + import flash_attn # noqa: F401 + except Exception: + attn_impl = "eager" + tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) + model = 
AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) + if device.startswith("cuda"): + model = model.eval().to(device).to(torch.bfloat16) + else: + model = model.eval().to(device) + return tokenizer, model + + +def _infer_page(model, tokenizer, image_path: Path, output_dir: Path) -> str: + result = model.infer( + tokenizer, + prompt=PROMPT, + image_file=str(image_path), + output_path=str(output_dir), + base_size=1024, + image_size=768, + crop_mode=True, + save_results=False, + eval_mode=True, + ) + return _clean_markdown(str(result)) + + +def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) -> None: + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + progress_dir = output_dir / "sidecars" / "ocr_progress" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + progress_dir.mkdir(parents=True, exist_ok=True) + (md_dir / f"{stem}.md").write_text(markdown.strip() + "\n", encoding="utf-8") + metrics = { + "page_count": page_count, + "model": "deepseek-ai/DeepSeek-OCR-2", + } + (metrics_dir / f"{stem}.metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8") + partial_path = progress_dir / f"{stem}.partial.md" + if partial_path.exists(): + partial_path.unlink() + + +def _write_progress( + output_dir: Path, + stem: str, + page_outputs: List[str], + total_pages: int, + completed_pages: int, +) -> None: + """Emit lightweight progress artifacts during long OCR runs.""" + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + progress_dir = output_dir / "sidecars" / "ocr_progress" + metrics_dir.mkdir(parents=True, exist_ok=True) + progress_dir.mkdir(parents=True, exist_ok=True) + partial_markdown = PAGE_SPLIT.join(page_outputs).strip() + if partial_markdown: + (progress_dir / f"{stem}.partial.md").write_text(partial_markdown + "\n", encoding="utf-8") + 
progress = { + "completed_pages": completed_pages, + "total_pages": total_pages, + "status": "running" if completed_pages < total_pages else "complete", + "model": "deepseek-ai/DeepSeek-OCR-2", + } + (metrics_dir / f"{stem}.progress.json").write_text( + json.dumps(progress, indent=2), + encoding="utf-8", + ) + + +def main() -> int: + args = _parse_args() + input_dir = Path(args.input_dir).resolve() + output_dir = Path(args.output_dir).resolve() + model_dir = Path(args.model_dir).resolve() + pdfs = _iter_pdfs(input_dir, args.files) + if not pdfs: + return 0 + + tokenizer, model = _load_model(model_dir, args.device) + + for pdf_path in pdfs: + images = _render_pages(pdf_path, args.max_pages) + page_outputs: List[str] = [] + total_pages = len(images) + _write_progress(output_dir, pdf_path.stem, page_outputs, total_pages, 0) + with tempfile.TemporaryDirectory(prefix=f"{pdf_path.stem}_deepseek_") as tmp_dir_str: + tmp_dir = Path(tmp_dir_str) + for idx, image in enumerate(images): + page_png = tmp_dir / f"page_{idx + 1:04d}.png" + image.save(page_png, format="PNG") + page_text = _infer_page(model, tokenizer, page_png, tmp_dir / f"page_{idx + 1:04d}") + if args.content_debug: + page_text = f"\n{page_text}".strip() + page_outputs.append(page_text) + _write_progress( + output_dir, + pdf_path.stem, + page_outputs, + total_pages, + idx + 1, + ) + markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" + _write_outputs(output_dir, pdf_path.stem, markdown, len(images)) + + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index d68f05c..2568665 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -1,4 +1,4 @@ -"""DeepSeek OCR runner with stub and optional CLI dispatch.""" +"""DeepSeek OCR runner.""" from __future__ import annotations @@ -17,6 +17,8 @@ _pypdfium2 = None LOGGER = 
logging.getLogger(__name__) +REPO_ROOT = Path(__file__).resolve().parents[4] +DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" def _page_count(pdf_path: Path) -> int: @@ -32,12 +34,13 @@ def _run_cli( input_dir: Path, output_dir: Path, *, + files: List[str], + model_dir: Path, python_bin: Optional[Path], script: Path, max_pages: Optional[int], content_debug: bool, - gpu_memory_utilization: Optional[float] = None, - disable_fp8_kv: bool = False, + device: Optional[str], ) -> None: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -47,78 +50,62 @@ def _run_cli( str(input_dir), "--output-dir", str(output_dir), + "--model-dir", + str(model_dir), ] + if files: + cmd += ["--files", *files] if max_pages is not None: cmd += ["--max-pages", str(max_pages)] if content_debug: cmd.append("--content-debug") - if gpu_memory_utilization is not None: - cmd += ["--gpu-memory-utilization", str(gpu_memory_utilization)] - if disable_fp8_kv: - cmd.append("--no-fp8-kv") + if device: + cmd += ["--device", str(device)] env = os.environ.copy() if shutil.which("cc1plus", path=env.get("PATH", "")) is None: - # FlashInfer JIT (via vLLM) needs a C++ toolchain; add a known cc1plus location if missing. 
for candidate in sorted(Path("/usr/lib/gcc/x86_64-linux-gnu").glob("*/cc1plus")): - env["PATH"] = f"{candidate.parent}:{env.get('PATH','')}" + env["PATH"] = f"{candidate.parent}:{env.get('PATH', '')}" break ld_path = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") if ld_path: - env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH','')}" + env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH', '')}" - LOGGER.info("Running DeepSeek CLI: %s", " ".join(cmd)) + LOGGER.info("Running DeepSeek OCR CLI: %s", " ".join(cmd)) subprocess.run(cmd, check=True, env=env) # nosec: controlled arguments -def _run_one_pdf(pdf_path: Path, md_out: Path, metrics_out: Path, cfg: Dict[str, Any]) -> Dict[str, Any]: - """Stub processor for a single PDF.""" - page_count = _page_count(pdf_path) - max_pages = cfg.get("max_pages") - if max_pages is not None and page_count: - page_count = min(page_count, max_pages) - - md_lines = [ - f"# DeepSeek OCR (stub) — {pdf_path.name}", - "", - f"Pages: {page_count if page_count else 'unknown'}", - ] - if cfg.get("content_debug"): - md_lines.append("") - md_lines.append("") - md_out.parent.mkdir(parents=True, exist_ok=True) - md_out.write_text("\n".join(md_lines) + "\n", encoding="utf-8") - - metrics = {"page_count": page_count} - metrics_out.parent.mkdir(parents=True, exist_ok=True) - metrics_out.write_text(json.dumps(metrics, indent=2), encoding="utf-8") - return metrics - - def run_for_files( self_ref: Any, files: Iterable[str], *, - model_dir: Optional[Path] = None, # kept for API compatibility + model_dir: Optional[Path] = None, output_dir: Optional[Path] = None, - log_dir: Optional[Path] = None, # unused placeholder to mirror rapidocr + log_dir: Optional[Path] = None, # kept for API compatibility max_pages: Optional[int] = None, - allow_stub: bool = True, - allow_cli: bool = False, + allow_stub: bool = False, # ignored after stub removal; kept for compatibility + allow_cli: bool = True, # ignored after stub removal; kept for 
compatibility python_bin: Optional[Path] = None, vllm_script: Optional[Path] = None, content_debug: bool = False, persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved - device: Optional[str] = None, # reserved - gpu_memory_utilization: Optional[float] = None, - disable_fp8_kv: bool = False, + device: Optional[str] = None, + gpu_memory_utilization: Optional[float] = None, # reserved + disable_fp8_kv: bool = False, # reserved **_: Any, ) -> Dict[str, Any]: - """Run DeepSeek OCR for the provided files. + """Run DeepSeek OCR for the provided files.""" + + requested_stub = bool(allow_stub) + del log_dir, allow_stub, allow_cli, persist_engine, precision + del gpu_memory_utilization, disable_fp8_kv - Returns a mapping of stem -> minimal metadata (page_count). - """ + if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": + raise RuntimeError( + "DeepSeek stub execution has been removed. " + "Unset GLOSSAPI_DEEPSEEK_ALLOW_STUB and configure the real DeepSeek runtime." 
+ ) file_list = [str(f) for f in files or []] if not file_list: @@ -131,67 +118,63 @@ def run_for_files( md_dir.mkdir(parents=True, exist_ok=True) metrics_dir.mkdir(parents=True, exist_ok=True) - env_allow_stub = os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") == "1" - env_allow_cli = os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "0") == "1" - - use_cli = allow_cli or env_allow_cli - use_stub = allow_stub and env_allow_stub - - script_path = Path(vllm_script) if vllm_script else Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py" - # Optional GPU memory utilization override (env wins over kwarg) - env_gpu_mem = os.environ.get("GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION") - gpu_mem_fraction = gpu_memory_utilization - if env_gpu_mem: - try: - gpu_mem_fraction = float(env_gpu_mem) - except Exception: - gpu_mem_fraction = gpu_memory_utilization - disable_fp8_kv = disable_fp8_kv or os.environ.get("GLOSSAPI_DEEPSEEK_NO_FP8_KV") == "1" - - if use_cli and script_path.exists(): - try: - _run_cli( - input_root, - out_root, - python_bin=python_bin, - script=script_path, - max_pages=max_pages, - content_debug=content_debug, - gpu_memory_utilization=gpu_mem_fraction, - disable_fp8_kv=disable_fp8_kv, - ) - results: Dict[str, Any] = {} - for name in file_list: - pdf_path = (input_root / name).resolve() - stem = Path(name).stem - md_path = md_dir / f"{stem}.md" - metrics_path = metrics_dir / f"{stem}.metrics.json" - if not md_path.exists() or not md_path.read_text(encoding="utf-8").strip(): - placeholder = [ - f"# DeepSeek OCR — {pdf_path.name}", - "", - "[[Blank page]]", - ] - md_path.parent.mkdir(parents=True, exist_ok=True) - md_path.write_text("\n".join(placeholder) + "\n", encoding="utf-8") - page_count = _page_count(pdf_path) - if not metrics_path.exists(): - metrics_path.parent.mkdir(parents=True, exist_ok=True) - metrics_path.write_text(json.dumps({"page_count": page_count}, indent=2), encoding="utf-8") - results[stem] = {"page_count": page_count} - return results - 
except Exception as exc: - if not use_stub: - raise - LOGGER.warning("DeepSeek CLI failed (%s); falling back to stub output", exc) - - cfg = {"max_pages": max_pages, "content_debug": content_debug} + model_root = Path( + model_dir + or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR", "") + or (REPO_ROOT / "deepseek-ocr-2-model" / "DeepSeek-OCR-2") + ) + if not model_root.exists(): + raise FileNotFoundError( + "DeepSeek model directory not found. Set model_dir or GLOSSAPI_DEEPSEEK_MODEL_DIR." + ) + + script_path = Path( + vllm_script + or os.environ.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", "") + or DEFAULT_SCRIPT + ) + if not script_path.exists(): + raise FileNotFoundError(f"DeepSeek OCR runner script not found: {script_path}") + + python_exe = Path( + python_bin + or os.environ.get("GLOSSAPI_DEEPSEEK_PYTHON", "") + or os.environ.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON", "") + or sys.executable + ) + if not python_exe.exists(): + raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") + + _run_cli( + input_dir=input_root, + output_dir=out_root, + files=file_list, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ) + results: Dict[str, Any] = {} for name in file_list: pdf_path = (input_root / name).resolve() stem = Path(name).stem md_path = md_dir / f"{stem}.md" metrics_path = metrics_dir / f"{stem}.metrics.json" - results[stem] = _run_one_pdf(pdf_path, md_path, metrics_path, cfg) + if not md_path.exists(): + raise FileNotFoundError(f"DeepSeek OCR did not produce markdown for {name}: {md_path}") + if not md_path.read_text(encoding="utf-8").strip(): + raise RuntimeError(f"DeepSeek OCR produced empty markdown for {name}: {md_path}") + page_count = _page_count(pdf_path) + if metrics_path.exists(): + try: + results[stem] = json.loads(metrics_path.read_text(encoding="utf-8")) + continue + except Exception: + pass + results[stem] = {"page_count": page_count} + 
metrics_path.write_text(json.dumps(results[stem], indent=2), encoding="utf-8") return results diff --git a/src/glossapi/ocr/docling/__init__.py b/src/glossapi/ocr/docling/__init__.py new file mode 100644 index 0000000..28d4b0a --- /dev/null +++ b/src/glossapi/ocr/docling/__init__.py @@ -0,0 +1,5 @@ +"""Docling PDF pipeline helpers used by GlossAPI.""" + +from .pipeline import build_layout_pipeline + +__all__ = ["build_layout_pipeline"] diff --git a/src/glossapi/ocr/docling/pipeline.py b/src/glossapi/ocr/docling/pipeline.py new file mode 100644 index 0000000..aea64fd --- /dev/null +++ b/src/glossapi/ocr/docling/pipeline.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from typing import Tuple + +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + LayoutOptions, + PdfPipelineOptions, + PictureDescriptionApiOptions, + TableFormerMode, + TableStructureOptions, +) + + +def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: + """Return accelerator options and whether CUDA was requested.""" + dev = device or "cuda:0" + if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): + acc = AcceleratorOptions(device=dev) + want_cuda = dev.lower().startswith("cuda") + else: + want_cuda = str(dev).lower().startswith("cuda") + acc = AcceleratorOptions( + device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU + ) + return acc, want_cuda + + +def _apply_common_pdf_options( + *, + acc: AcceleratorOptions, + images_scale: float, + formula_enrichment: bool, + code_enrichment: bool, +) -> PdfPipelineOptions: + table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) + try: + if hasattr(table_opts, "do_cell_matching"): + table_opts.do_cell_matching = True + except Exception: + pass + + opts = PdfPipelineOptions( + accelerator_options=acc, + layout_options=LayoutOptions(), + do_ocr=False, + do_table_structure=True, + do_formula_enrichment=bool(formula_enrichment), + 
do_code_enrichment=bool(code_enrichment), + force_backend_text=False, + generate_parsed_pages=False, + table_structure_options=table_opts, + allow_external_plugins=True, + ) + try: + if hasattr(opts, "do_picture_description"): + opts.do_picture_description = False + if getattr(opts, "picture_description_options", None) is None: + opts.picture_description_options = PictureDescriptionApiOptions() + if hasattr(opts, "enable_remote_services"): + opts.enable_remote_services = False + except Exception: + pass + try: + setattr(opts, "images_scale", images_scale) + except Exception: + pass + return opts + + +def build_layout_pipeline( + *, + device: str = "cuda:0", + images_scale: float = 1.25, + formula_enrichment: bool = False, + code_enrichment: bool = False, +) -> Tuple[object, PdfPipelineOptions]: + """Create a Docling layout-only PDF pipeline.""" + + acc, _ = _resolve_accelerator(device) + opts = _apply_common_pdf_options( + acc=acc, + images_scale=float(images_scale), + formula_enrichment=formula_enrichment, + code_enrichment=code_enrichment, + ) + + try: + from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + except Exception: # pragma: no cover + from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + + pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] + return pipeline, opts diff --git a/src/glossapi/ocr/docling_pipeline.py b/src/glossapi/ocr/docling_pipeline.py new file mode 100644 index 0000000..ef85950 --- /dev/null +++ b/src/glossapi/ocr/docling_pipeline.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from typing import Tuple + +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + LayoutOptions, + PdfPipelineOptions, + PictureDescriptionApiOptions, + TableFormerMode, + TableStructureOptions, +) + + +def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: + """Return accelerator options and whether CUDA was 
requested.""" + dev = device or "cuda:0" + if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): + acc = AcceleratorOptions(device=dev) + want_cuda = dev.lower().startswith("cuda") + else: + want_cuda = str(dev).lower().startswith("cuda") + acc = AcceleratorOptions( + device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU + ) + return acc, want_cuda + + +def build_layout_pipeline( + *, + device: str = "cuda:0", + images_scale: float = 1.25, + formula_enrichment: bool = False, + code_enrichment: bool = False, +) -> Tuple[object, PdfPipelineOptions]: + """Build the Docling PDF pipeline used for Phase-1 extraction.""" + + table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) + try: + if hasattr(table_opts, "do_cell_matching"): + table_opts.do_cell_matching = True + except Exception: + pass + + acc, _ = _resolve_accelerator(device) + opts = PdfPipelineOptions( + accelerator_options=acc, + layout_options=LayoutOptions(), + do_ocr=False, + do_table_structure=True, + do_formula_enrichment=bool(formula_enrichment), + do_code_enrichment=bool(code_enrichment), + force_backend_text=False, + generate_parsed_pages=False, + table_structure_options=table_opts, + allow_external_plugins=True, + ) + try: + if hasattr(opts, "do_picture_description"): + opts.do_picture_description = False + if getattr(opts, "picture_description_options", None) is None: + opts.picture_description_options = PictureDescriptionApiOptions() + if hasattr(opts, "enable_remote_services"): + opts.enable_remote_services = False + except Exception: + pass + try: + setattr(opts, "images_scale", float(images_scale)) + except Exception: + pass + + try: + from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + except Exception: # pragma: no cover + from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + + pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] + return pipeline, opts + + +__all__ = 
["build_layout_pipeline"] diff --git a/src/glossapi/ocr/rapidocr/__init__.py b/src/glossapi/ocr/rapidocr/__init__.py deleted file mode 100644 index c0d1232..0000000 --- a/src/glossapi/ocr/rapidocr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -"""RapidOCR subpackage with lazy re-exports.""" - -from __future__ import annotations - -from importlib import import_module -from typing import Any - -__all__ = [ - "dispatch", - "docling_pipeline", - "pool", - "safe", - "onnx", - "_paths", - "pipeline", -] - - -def __getattr__(name: str) -> Any: - if name in __all__: - return import_module(f"glossapi.ocr.rapidocr.{name}") - raise AttributeError(name) - - -def __dir__() -> list[str]: - return sorted(set(globals().keys()) | set(__all__)) diff --git a/src/glossapi/ocr/rapidocr/__init__.py.backup b/src/glossapi/ocr/rapidocr/__init__.py.backup deleted file mode 100644 index 865f119..0000000 --- a/src/glossapi/ocr/rapidocr/__init__.py.backup +++ /dev/null @@ -1,6 +0,0 @@ -"""RapidOCR subpackage (shim).""" - -from __future__ import annotations - -__all__ = ["dispatch"] - diff --git a/src/glossapi/ocr/rapidocr/_paths.py b/src/glossapi/ocr/rapidocr/_paths.py deleted file mode 100644 index 4c1cc2a..0000000 --- a/src/glossapi/ocr/rapidocr/_paths.py +++ /dev/null @@ -1,114 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, Tuple -import importlib -import os - - -@dataclass -class ResolvedOnnx: - det: Optional[str] - rec: Optional[str] - cls: Optional[str] - keys: Optional[str] - - -def _find_first(base: Path, patterns: list[str]) -> Optional[str]: - for pat in patterns: - for p in base.rglob(pat): - if p.is_file(): - return str(p) - return None - - -def _resolve_packaged_cls_fallback() -> Optional[str]: - try: - rapidocr = importlib.import_module("rapidocr") - base = Path(rapidocr.__file__).resolve().parent / "models" - pref = base / "ch_ppocr_mobile_v2.0_cls_infer.onnx" - if pref.exists(): - return 
str(pref) - return _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - except Exception: - return None - - -def resolve_packaged_onnx_and_keys() -> ResolvedOnnx: - """Locate ONNX det/rec/cls and Greek keys packaged with the glossapi package. - - Search order: - 1) GLOSSAPI_RAPIDOCR_ONNX_DIR (env var) with heuristic file names - 2) Under the installed glossapi package folder `models/` and common subfolders - 3) CLS only: fallback to RapidOCR’s bundled cls model if missing - """ - # 1) Explicit override directory - override = os.getenv("GLOSSAPI_RAPIDOCR_ONNX_DIR") - if override: - base = Path(override) - det = _find_first(base, [ - "**/det/**/inference.onnx", - "*det*server*onnx", - "*PP*det*.onnx", - "det*.onnx", - ]) - rec = _find_first(base, [ - "**/rec/**/inference.onnx", - "*el*rec*onnx", - "*greek*rec*onnx", - "*PP*rec*.onnx", - "rec*.onnx", - ]) - cls = _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - keys = _find_first(base, ["*greek*keys*.txt", "*ppocr*keys*.txt", "*keys*.txt"]) - if det or rec or cls or keys: - return ResolvedOnnx(det, rec, cls, keys) - - # 2) Search inside installed glossapi package - try: - glossapi = importlib.import_module("glossapi") - pkg_root = Path(glossapi.__file__).resolve().parent - # Candidate asset directories inside the package - candidates = [ - pkg_root / "models", - pkg_root / "models" / "rapidocr", - pkg_root / "models" / "rapidocr" / "onnx", - pkg_root / "models" / "rapidocr" / "keys", - pkg_root / "resources", - pkg_root / "assets", - pkg_root / "data", - ] - det = rec = cls = keys = None - for base in candidates: - if not base.exists(): - continue - det = det or _find_first(base, [ - "**/det/**/inference.onnx", - "*det*server*onnx", - "*PP*det*.onnx", - "det*.onnx", - ]) - rec = rec or _find_first(base, [ - "**/rec/**/inference.onnx", - "*el*rec*onnx", - "*greek*rec*onnx", - "*PP*rec*.onnx", - "rec*.onnx", - ]) - cls = cls or _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - keys = keys or 
_find_first(base, ["*greek*keys*.txt", "*ppocr*keys*.txt", "*keys*.txt"]) - - if cls is None: - cls = _resolve_packaged_cls_fallback() - return ResolvedOnnx(det, rec, cls, keys) - except Exception: - return ResolvedOnnx(None, None, _resolve_packaged_cls_fallback(), None) - - -def summarize_resolution() -> Tuple[bool, str]: - r = resolve_packaged_onnx_and_keys() - ok = bool(r.det and r.rec and r.cls and r.keys) - msg = f"det={bool(r.det)} rec={bool(r.rec)} cls={bool(r.cls)} keys={bool(r.keys)}" - return ok, msg - diff --git a/src/glossapi/ocr/rapidocr/dispatch.py b/src/glossapi/ocr/rapidocr/dispatch.py deleted file mode 100644 index 7deeba2..0000000 --- a/src/glossapi/ocr/rapidocr/dispatch.py +++ /dev/null @@ -1,33 +0,0 @@ -from __future__ import annotations - -from typing import Iterable, Optional - - -def run_via_extract( - corpus, - files: Iterable[str], - *, - export_doc_json: bool = False, - internal_debug: bool = False, - content_debug: Optional[bool] = None, -) -> None: - """Thin adapter that forwards to Corpus.extract for RapidOCR/Docling. - - This exists for symmetry with deepseek_runner and to keep the OCR package - as the single entry point for OCR backends. - """ - # Note: internal_debug/content_debug are no-ops for the Docling/RapidOCR path. - # Docling's output already produces a single concatenated Markdown document. 
- corpus.extract( - input_format="pdf", - num_threads=1, # let extract decide; override in tests if needed - accel_type="CUDA", - force_ocr=True, - formula_enrichment=False, - code_enrichment=False, - filenames=list(files), - skip_existing=False, - export_doc_json=bool(export_doc_json), - emit_formula_index=bool(export_doc_json), - phase1_backend="docling", - ) diff --git a/src/glossapi/ocr/rapidocr/docling_pipeline.py b/src/glossapi/ocr/rapidocr/docling_pipeline.py deleted file mode 100644 index bb8988f..0000000 --- a/src/glossapi/ocr/rapidocr/docling_pipeline.py +++ /dev/null @@ -1,501 +0,0 @@ -"""Docling + RapidOCR (ONNX) pipeline for batch PDF OCR. - -Provides build_pipeline() and convert_dir() mirroring the behavior of the -repro script greek_pdf_ocr.py, but self-contained inside glossapi and with -packaged ONNX models/keys. Includes robust logging and native Docling timeout. -""" -from __future__ import annotations - -import argparse -import logging -import os -import sys -import time -import inspect -import importlib -from pathlib import Path -from typing import Iterable, Optional, Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import ( - ConversionResult, - DocumentConverter, - PdfFormatOption, -) -from docling.datamodel.settings import settings - -from glossapi.ocr.rapidocr._paths import resolve_packaged_onnx_and_keys -from glossapi.metrics import compute_per_page_metrics -# Ensure RapidOCR factory is registered (avoids masked errors in older paths) -import docling.models.rapid_ocr_model # noqa: F401 - - -log = logging.getLogger(__name__) - - -def _maybe_import_torch(*, force: bool = False): - torch_mod = sys.modules.get("torch") - if torch_mod is not None: - return torch_mod - try: - return 
importlib.import_module("torch") # type: ignore - except Exception: - return None - return None - - -def _available_ort_providers() -> str: - try: - import onnxruntime as ort # type: ignore - return ",".join(ort.get_available_providers()) - except Exception as e: - return f"unavailable: {e}" - - -def _supports_native_timeout(converter: DocumentConverter) -> Optional[str]: - try: - sig = inspect.signature(converter.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - pass - return None - - -def _convert_with_timeout(converter: DocumentConverter, *, source: str, raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return converter.convert(source=source, **kw) - - -def _convert_all_with_timeout(converter: DocumentConverter, *, sources: Iterable[str], raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return list(converter.convert_all(sources, **kw)) - - -def build_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - # Delegate to canonical pipeline builder to avoid duplication - try: - from glossapi.ocr.rapidocr.pipeline import build_rapidocr_pipeline # type: ignore - except Exception as _e: # pragma: no cover - # Backward-compat fallback: inline builder (kept minimal to satisfy tests) - from docling.datamodel.pipeline_options import AcceleratorOptions, TableStructureOptions, TableFormerMode, LayoutOptions, PdfPipelineOptions, RapidOcrOptions # type: ignore - dev = device or 
"cuda:0" - acc = AcceleratorOptions(device=dev) - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError("Packaged RapidOCR ONNX models/keys not found under glossapi.models.") - ocr_opts = RapidOcrOptions( - backend="onnxruntime", lang=["el", "en"], force_full_page_ocr=False, - use_det=True, use_cls=False, use_rec=True, text_score=text_score, - det_model_path=r.det, rec_model_path=r.rec, cls_model_path=r.cls, print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - opts = PdfPipelineOptions( - accelerator_options=acc, - ocr_options=ocr_opts, - layout_options=LayoutOptions(), - do_ocr=True, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - from docling.document_converter import DocumentConverter, PdfFormatOption # type: ignore - from docling.datamodel.base_models import InputFormat # type: ignore - return DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}), opts - return build_rapidocr_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - -def convert_dir( - input_dir: Path, - output_dir: Path, - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, - normalize_output: bool = True, - timeout_s: Optional[int] = 600, -) -> None: - input_dir = Path(input_dir) - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Device-aware preflight: only enforce CUDA provider when device requests 
CUDA - want_cuda = isinstance(device, str) and device.lower().startswith("cuda") - if want_cuda: - try: - import onnxruntime as _ort # type: ignore - _providers = _ort.get_available_providers() - if "CUDAExecutionProvider" not in _providers: - raise RuntimeError(f"CUDAExecutionProvider not available in onnxruntime providers={_providers}") - except Exception as e: - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - if formula_enrichment and want_cuda: - try: - torch_mod = _maybe_import_torch(force=True) - if torch_mod is None or not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") - except Exception as e: - raise RuntimeError(f"Torch CUDA preflight failed: {e}") - - # Optional: tune CodeFormula batch size and math precision when enrichment is requested - if formula_enrichment: - try: - torch_mod = _maybe_import_torch() - if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - try: - torch_mod.set_float32_matmul_precision("high") - except Exception: - pass - except Exception: - pass - - engine, opts = build_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - # Logging block - log.info("Docling+RapidOCR pipeline ready") - log.info("device=%s text_score=%.2f images_scale=%.2f formula=%s code=%s", device, text_score, images_scale, formula_enrichment, code_enrichment) - log.info("ORT providers: %s", _available_ort_providers()) - log.info("Caches: HF_HOME=%s XDG_CACHE_HOME=%s DOCLING_CACHE_DIR=%s", os.getenv("HF_HOME"), os.getenv("XDG_CACHE_HOME"), os.getenv("DOCLING_CACHE_DIR")) - try: - r = resolve_packaged_onnx_and_keys() - import os as _os - log.info( - "Models: det=%s rec=%s cls=%s keys=%s", - _os.path.basename(r.det) if r.det else None, - _os.path.basename(r.rec) if r.rec else None, - _os.path.basename(r.cls) if r.cls else 
None, - _os.path.basename(r.keys) if r.keys else None, - ) - except Exception: - pass - - # Collect PDFs - pdfs = sorted(str(p) for p in input_dir.rglob("*.pdf") if p.is_file()) - if not pdfs: - log.warning("No PDFs under %s", input_dir) - return - - # Enable timing profile - try: - settings.debug.profile_pipeline_timings = True - except Exception: - pass - - total_start = time.time() - # If we got a StandardPdfPipeline, it has a .convert method similar in spirit - # to DocumentConverter.convert; detect native timeout support by signature. - def _native_timeout_kw(obj) -> Optional[str]: - try: - sig = inspect.signature(obj.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - return None - return None - - tkw = _native_timeout_kw(engine) - for src in pdfs: - try: - kwargs = {} - if tkw and timeout_s is not None: - kwargs[tkw] = int(timeout_s) - conv = engine.convert(source=src, **kwargs) # type: ignore - _export(conv, output_dir, normalize_output=normalize_output) - # Per-page metrics and per-page console logs - try: - per_page = compute_per_page_metrics(conv) - # Harmonize with GlossExtract: write to sibling json/metrics/ - metrics_dir = output_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - pp = metrics_dir / f"{Path(src).stem}.per_page.metrics.json" - import json as _json - pp.write_text(_json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8") - for row in per_page.get("pages", []): - log.info("[PAGE] %s p%d: parse=%.3fs ocr=%.3fs formulas=%d code=%d", - Path(src).name, - int(row.get("page_no", 0)), - float(row.get("parse_sec", 0.0)), - float(row.get("ocr_sec", 0.0)), - int(row.get("formula_count", 0)), - int(row.get("code_count", 0))) - except Exception as _e: - log.warning("Failed to compute per-page metrics for %s: %s", src, _e) - log.info("[OK] %s", src) - except Exception as e: - log.error("[FAIL] %s: %s", src, e) - log.info("Done in %.2fs", 
time.time() - total_start) - - -def _normalize_text(s: str) -> str: - import unicodedata, re - zw = re.compile(r"[\u200B\u200C\u200D\uFEFF]") - s = unicodedata.normalize("NFC", s) - return zw.sub("", s) - - -def _normalize_obj(o): - if isinstance(o, str): - return _normalize_text(o) - if isinstance(o, list): - return [_normalize_obj(x) for x in o] - if isinstance(o, dict): - return {k: _normalize_obj(v) for k, v in o.items()} - return o - - -def _export(conv: ConversionResult, out_dir: Path, *, normalize_output: bool) -> None: - doc = conv.document - p = Path(conv.input.file) - md_path = out_dir / f"{p.stem}.md" - # Write Docling JSON under sibling json/ directory (no JSON in markdown dir) - json_dir = out_dir.parent / "json" - json_dir.mkdir(parents=True, exist_ok=True) - json_path = json_dir / f"{p.stem}.docling.json" - # Harmonize metrics location with GlossExtract: sibling json/metrics/ - metrics_dir = out_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - metrics_path = metrics_dir / f"{p.stem}.metrics.json" - - md = doc.export_to_markdown() - if normalize_output: - md = _normalize_text(md) - md_path.write_text(md, encoding="utf-8") - # Export DoclingDocument JSON via helper (compressed by default) - try: - from glossapi.ocr.utils.json_io import export_docling_json # type: ignore - # Attach minimal meta for provenance - meta = {"source_pdf_relpath": str(p)} - export_docling_json(doc, json_path, compress="zstd", meta=meta) # type: ignore[arg-type] - except Exception: - # Fallback: write plain JSON under json/ without compression - try: - import json as _json - dd = doc.export_to_dict() - if normalize_output: - dd = _normalize_obj(dd) - json_path.write_text(_json.dumps(dd, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - # Timings if present - try: - from typing import Any, Dict, List - def _q(vals: list[float], q: float) -> float: - if not vals: - return 0.0 - s = sorted(vals) - i = 
int(round((len(s) - 1) * q)) - return float(s[i]) - metrics: Dict[str, Any] = {"file": str(p), "timings": {}} - for key, item in conv.timings.items(): - times = list(item.times) - cnt = int(item.count) - tot = float(sum(times)) if times else 0.0 - avg = float(tot / cnt) if cnt else 0.0 - metrics["timings"][key] = { - "scope": str(item.scope.value) if hasattr(item, "scope") else "unknown", - "count": cnt, - "total_sec": tot, - "avg_sec": avg, - "p50_sec": _q(times, 0.50), - "p90_sec": _q(times, 0.90), - } - import json as _json - metrics_path.write_text(_json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - -def _compute_per_page_metrics(conv: ConversionResult): - try: - doc = conv.document - except Exception: - return {"pages": []} - try: - page_count = len(doc.pages) # type: ignore[attr-defined] - except Exception: - page_count = 0 - timings = {} - try: - for key, item in conv.timings.items(): - times = list(item.times) - timings[key] = { - "scope": str(getattr(getattr(item, 'scope', None), 'value', 'unknown')), - "times": times, - "total": float(sum(times)) if times else float(getattr(item, 'total', 0.0)), - } - except Exception: - pass - def _pt(k): - arr = timings.get(k, {}).get("times", []) or [] - if page_count and len(arr) == page_count: - return [float(x) for x in arr] - return [float(x) for x in (arr + [0.0] * page_count)[:page_count]] - ocr = _pt("ocr") - parse = _pt("page_parse") - layout = _pt("layout") - table = _pt("table_structure") - # counts with sanitization and capping - fcnt = [0] * max(1, page_count) - fch = [0] * max(1, page_count) - ftr = [0] * max(1, page_count) - ftrc = [0] * max(1, page_count) - ccnt = [0] * max(1, page_count) - try: - as_dict = doc.export_to_dict() - import re as _re - _run_pat = _re.compile(r"\\\\\s*&(?P(?:\\quad|\\;|\\:|\\,|\\\\s|\s){200,})") - _ws_collapse = _re.compile(r"(?:(?:\\quad|\\;|\\:|\\,|\\\\s)|\s){2,}") - _CAP = 3000 - def _sanitize(s: str): - dropped=0 - 
m=_run_pat.search(s) - if m: - s_new=s[:m.start('ws')]; dropped+=len(s)-len(s_new); s=s_new - if len(s)>_CAP: - cut=s.rfind('\\\\',0,_CAP); cut = cut if cut>=0 else _CAP; dropped+=len(s)-cut; s=s[:cut] - s2=_ws_collapse.sub(' ', s) - return s2, dropped - def _walk(label, cnt, chars=False): - for node in as_dict.get("texts", []): - if str(node.get("label")) != label: - continue - raw = str(node.get("text") or node.get("orig") or "") - txt, dropped = _sanitize(raw) if label=='formula' else (raw,0) - ch = len(txt) - for prov in node.get("prov", []) or []: - pno = int(prov.get("page_no") or 0) - if 1 <= pno <= len(cnt): - cnt[pno - 1] += 1 - if chars: - fch[pno - 1] += ch - if label=='formula' and dropped: - ftr[pno - 1] += 1 - ftrc[pno - 1] += int(dropped) - _walk("formula", fcnt, True) - _walk("code", ccnt, False) - except Exception: - pass - try: - den_total = float(timings.get("doc_enrich", {}).get("total", 0.0)) - except Exception: - den_total = 0.0 - shares = [0.0] * max(1, page_count) - if den_total and page_count: - s = float(sum(fch)) or float(sum(fcnt)) or 0.0 - if s > 0: - base = fch if sum(fch) > 0 else fcnt - shares = [den_total * (float(x) / s) for x in base] - rows = [] - n = max(page_count, len(ocr), len(parse)) - for i in range(n): - rows.append({ - "page_no": i + 1, - "ocr_sec": float(ocr[i]) if i < len(ocr) else 0.0, - "parse_sec": float(parse[i]) if i < len(parse) else 0.0, - "layout_sec": float(layout[i]) if i < len(layout) else 0.0, - "table_sec": float(table[i]) if i < len(table) else 0.0, - "formula_count": int(fcnt[i]) if i < len(fcnt) else 0, - "formula_chars": int(fch[i]) if i < len(fch) else 0, - "formula_truncated": int(ftr[i]) if i < len(ftr) else 0, - "formula_truncated_chars": int(ftrc[i]) if i < len(ftrc) else 0, - "code_count": int(ccnt[i]) if i < len(ccnt) else 0, - "doc_enrich_share_sec": float(shares[i]) if i < len(shares) else 0.0, - }) - return {"file": str(getattr(conv.input.file, 'name', 'unknown')), "page_count": 
int(page_count), "totals": {"doc_enrich_total_sec": den_total}, "pages": rows} - - -def _setup_logging(level: int = logging.INFO) -> None: - logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(name)s: %(message)s") - - -if __name__ == "__main__": - _setup_logging() - ap = argparse.ArgumentParser(description="Batch OCR with Docling + RapidOCR (ONNX)") - ap.add_argument("input_dir", type=Path) - ap.add_argument("output_dir", type=Path) - ap.add_argument("--device", default=os.getenv("GLOSSAPI_DOCLING_DEVICE", "cuda:0")) - ap.add_argument("--text-score", type=float, default=float(os.getenv("GLOSSAPI_TEXT_SCORE", "0.45"))) - ap.add_argument("--images-scale", type=float, default=float(os.getenv("GLOSSAPI_IMAGES_SCALE", "1.25"))) - ap.add_argument("--docling-formula", dest="docling_formula", action="store_true", help="Enable formula enrichment (CodeFormula)") - ap.add_argument("--no-docling-formula", dest="docling_formula", action="store_false") - ap.set_defaults(docling_formula=False) - ap.add_argument("--formula-batch", type=int, default=int(os.getenv("GLOSSAPI_FORMULA_BATCH", "8")), help="CodeFormula batch size (default 8)") - ap.add_argument("--docling-code", dest="docling_code", action="store_true", help="Enable code enrichment") - ap.add_argument("--no-docling-code", dest="docling_code", action="store_false") - ap.set_defaults(docling_code=False) - ap.add_argument("--normalize-output", action="store_true") - ap.add_argument("--no-normalize-output", dest="normalize_output", action="store_false") - ap.set_defaults(normalize_output=True) - ap.add_argument("--timeout-s", type=int, default=int(os.getenv("GLOSSAPI_DOCLING_TIMEOUT", "600"))) - args = ap.parse_args() - # Apply formula batch size if requested - try: - if getattr(args, "docling_formula", False): - from docling.models.code_formula_model import CodeFormulaModel # type: ignore - if isinstance(args.formula_batch, int) and args.formula_batch > 0: - CodeFormulaModel.elements_batch_size = 
int(args.formula_batch) # type: ignore[attr-defined] - except Exception: - pass - convert_dir( - args.input_dir, - args.output_dir, - device=args.device, - text_score=args["text_score"] if isinstance(args, dict) else args.text_score, - images_scale=args.images_scale, - formula_enrichment=args.docling_formula, - code_enrichment=args.docling_code, - normalize_output=args.normalize_output, - timeout_s=args.timeout_s, - ) diff --git a/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup b/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup deleted file mode 100644 index f80344d..0000000 --- a/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup +++ /dev/null @@ -1,501 +0,0 @@ -"""Docling + RapidOCR (ONNX) pipeline for batch PDF OCR. - -Provides build_pipeline() and convert_dir() mirroring the behavior of the -repro script greek_pdf_ocr.py, but self-contained inside glossapi and with -packaged ONNX models/keys. Includes robust logging and native Docling timeout. -""" -from __future__ import annotations - -import argparse -import logging -import os -import sys -import time -import inspect -import importlib -from pathlib import Path -from typing import Iterable, Optional, Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import ( - ConversionResult, - DocumentConverter, - PdfFormatOption, -) -from docling.datamodel.settings import settings - -from glossapi._rapidocr_paths import resolve_packaged_onnx_and_keys -from glossapi.metrics import compute_per_page_metrics -# Ensure RapidOCR factory is registered (avoids masked errors in older paths) -import docling.models.rapid_ocr_model # noqa: F401 - - -log = logging.getLogger(__name__) - - -def _maybe_import_torch(*, force: bool = False): - torch_mod = sys.modules.get("torch") - if 
torch_mod is not None: - return torch_mod - try: - return importlib.import_module("torch") # type: ignore - except Exception: - return None - return None - - -def _available_ort_providers() -> str: - try: - import onnxruntime as ort # type: ignore - return ",".join(ort.get_available_providers()) - except Exception as e: - return f"unavailable: {e}" - - -def _supports_native_timeout(converter: DocumentConverter) -> Optional[str]: - try: - sig = inspect.signature(converter.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - pass - return None - - -def _convert_with_timeout(converter: DocumentConverter, *, source: str, raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return converter.convert(source=source, **kw) - - -def _convert_all_with_timeout(converter: DocumentConverter, *, sources: Iterable[str], raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return list(converter.convert_all(sources, **kw)) - - -def build_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - # Delegate to canonical pipeline builder to avoid duplication - try: - from glossapi._pipeline import build_rapidocr_pipeline # type: ignore - except Exception as _e: # pragma: no cover - # Backward-compat fallback: inline builder (kept minimal to satisfy tests) - from docling.datamodel.pipeline_options import AcceleratorOptions, TableStructureOptions, TableFormerMode, LayoutOptions, PdfPipelineOptions, 
RapidOcrOptions # type: ignore - dev = device or "cuda:0" - acc = AcceleratorOptions(device=dev) - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError("Packaged RapidOCR ONNX models/keys not found under glossapi.models.") - ocr_opts = RapidOcrOptions( - backend="onnxruntime", lang=["el", "en"], force_full_page_ocr=False, - use_det=True, use_cls=False, use_rec=True, text_score=text_score, - det_model_path=r.det, rec_model_path=r.rec, cls_model_path=r.cls, print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - opts = PdfPipelineOptions( - accelerator_options=acc, - ocr_options=ocr_opts, - layout_options=LayoutOptions(), - do_ocr=True, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - from docling.document_converter import DocumentConverter, PdfFormatOption # type: ignore - from docling.datamodel.base_models import InputFormat # type: ignore - return DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}), opts - return build_rapidocr_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - -def convert_dir( - input_dir: Path, - output_dir: Path, - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, - normalize_output: bool = True, - timeout_s: Optional[int] = 600, -) -> None: - input_dir = Path(input_dir) - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Device-aware preflight: 
only enforce CUDA provider when device requests CUDA - want_cuda = isinstance(device, str) and device.lower().startswith("cuda") - if want_cuda: - try: - import onnxruntime as _ort # type: ignore - _providers = _ort.get_available_providers() - if "CUDAExecutionProvider" not in _providers: - raise RuntimeError(f"CUDAExecutionProvider not available in onnxruntime providers={_providers}") - except Exception as e: - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - if formula_enrichment and want_cuda: - try: - torch_mod = _maybe_import_torch(force=True) - if torch_mod is None or not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") - except Exception as e: - raise RuntimeError(f"Torch CUDA preflight failed: {e}") - - # Optional: tune CodeFormula batch size and math precision when enrichment is requested - if formula_enrichment: - try: - torch_mod = _maybe_import_torch() - if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - try: - torch_mod.set_float32_matmul_precision("high") - except Exception: - pass - except Exception: - pass - - engine, opts = build_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - # Logging block - log.info("Docling+RapidOCR pipeline ready") - log.info("device=%s text_score=%.2f images_scale=%.2f formula=%s code=%s", device, text_score, images_scale, formula_enrichment, code_enrichment) - log.info("ORT providers: %s", _available_ort_providers()) - log.info("Caches: HF_HOME=%s XDG_CACHE_HOME=%s DOCLING_CACHE_DIR=%s", os.getenv("HF_HOME"), os.getenv("XDG_CACHE_HOME"), os.getenv("DOCLING_CACHE_DIR")) - try: - r = resolve_packaged_onnx_and_keys() - import os as _os - log.info( - "Models: det=%s rec=%s cls=%s keys=%s", - _os.path.basename(r.det) if r.det else None, - _os.path.basename(r.rec) if r.rec 
else None, - _os.path.basename(r.cls) if r.cls else None, - _os.path.basename(r.keys) if r.keys else None, - ) - except Exception: - pass - - # Collect PDFs - pdfs = sorted(str(p) for p in input_dir.rglob("*.pdf") if p.is_file()) - if not pdfs: - log.warning("No PDFs under %s", input_dir) - return - - # Enable timing profile - try: - settings.debug.profile_pipeline_timings = True - except Exception: - pass - - total_start = time.time() - # If we got a StandardPdfPipeline, it has a .convert method similar in spirit - # to DocumentConverter.convert; detect native timeout support by signature. - def _native_timeout_kw(obj) -> Optional[str]: - try: - sig = inspect.signature(obj.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - return None - return None - - tkw = _native_timeout_kw(engine) - for src in pdfs: - try: - kwargs = {} - if tkw and timeout_s is not None: - kwargs[tkw] = int(timeout_s) - conv = engine.convert(source=src, **kwargs) # type: ignore - _export(conv, output_dir, normalize_output=normalize_output) - # Per-page metrics and per-page console logs - try: - per_page = compute_per_page_metrics(conv) - # Harmonize with GlossExtract: write to sibling json/metrics/ - metrics_dir = output_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - pp = metrics_dir / f"{Path(src).stem}.per_page.metrics.json" - import json as _json - pp.write_text(_json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8") - for row in per_page.get("pages", []): - log.info("[PAGE] %s p%d: parse=%.3fs ocr=%.3fs formulas=%d code=%d", - Path(src).name, - int(row.get("page_no", 0)), - float(row.get("parse_sec", 0.0)), - float(row.get("ocr_sec", 0.0)), - int(row.get("formula_count", 0)), - int(row.get("code_count", 0))) - except Exception as _e: - log.warning("Failed to compute per-page metrics for %s: %s", src, _e) - log.info("[OK] %s", src) - except Exception as e: - log.error("[FAIL] 
%s: %s", src, e) - log.info("Done in %.2fs", time.time() - total_start) - - -def _normalize_text(s: str) -> str: - import unicodedata, re - zw = re.compile(r"[\u200B\u200C\u200D\uFEFF]") - s = unicodedata.normalize("NFC", s) - return zw.sub("", s) - - -def _normalize_obj(o): - if isinstance(o, str): - return _normalize_text(o) - if isinstance(o, list): - return [_normalize_obj(x) for x in o] - if isinstance(o, dict): - return {k: _normalize_obj(v) for k, v in o.items()} - return o - - -def _export(conv: ConversionResult, out_dir: Path, *, normalize_output: bool) -> None: - doc = conv.document - p = Path(conv.input.file) - md_path = out_dir / f"{p.stem}.md" - # Write Docling JSON under sibling json/ directory (no JSON in markdown dir) - json_dir = out_dir.parent / "json" - json_dir.mkdir(parents=True, exist_ok=True) - json_path = json_dir / f"{p.stem}.docling.json" - # Harmonize metrics location with GlossExtract: sibling json/metrics/ - metrics_dir = out_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - metrics_path = metrics_dir / f"{p.stem}.metrics.json" - - md = doc.export_to_markdown() - if normalize_output: - md = _normalize_text(md) - md_path.write_text(md, encoding="utf-8") - # Export DoclingDocument JSON via helper (compressed by default) - try: - from glossapi.ocr.utils.json_io import export_docling_json # type: ignore - # Attach minimal meta for provenance - meta = {"source_pdf_relpath": str(p)} - export_docling_json(doc, json_path, compress="zstd", meta=meta) # type: ignore[arg-type] - except Exception: - # Fallback: write plain JSON under json/ without compression - try: - import json as _json - dd = doc.export_to_dict() - if normalize_output: - dd = _normalize_obj(dd) - json_path.write_text(_json.dumps(dd, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - # Timings if present - try: - from typing import Any, Dict, List - def _q(vals: list[float], q: float) -> float: - if not vals: - return 
0.0 - s = sorted(vals) - i = int(round((len(s) - 1) * q)) - return float(s[i]) - metrics: Dict[str, Any] = {"file": str(p), "timings": {}} - for key, item in conv.timings.items(): - times = list(item.times) - cnt = int(item.count) - tot = float(sum(times)) if times else 0.0 - avg = float(tot / cnt) if cnt else 0.0 - metrics["timings"][key] = { - "scope": str(item.scope.value) if hasattr(item, "scope") else "unknown", - "count": cnt, - "total_sec": tot, - "avg_sec": avg, - "p50_sec": _q(times, 0.50), - "p90_sec": _q(times, 0.90), - } - import json as _json - metrics_path.write_text(_json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - -def _compute_per_page_metrics(conv: ConversionResult): - try: - doc = conv.document - except Exception: - return {"pages": []} - try: - page_count = len(doc.pages) # type: ignore[attr-defined] - except Exception: - page_count = 0 - timings = {} - try: - for key, item in conv.timings.items(): - times = list(item.times) - timings[key] = { - "scope": str(getattr(getattr(item, 'scope', None), 'value', 'unknown')), - "times": times, - "total": float(sum(times)) if times else float(getattr(item, 'total', 0.0)), - } - except Exception: - pass - def _pt(k): - arr = timings.get(k, {}).get("times", []) or [] - if page_count and len(arr) == page_count: - return [float(x) for x in arr] - return [float(x) for x in (arr + [0.0] * page_count)[:page_count]] - ocr = _pt("ocr") - parse = _pt("page_parse") - layout = _pt("layout") - table = _pt("table_structure") - # counts with sanitization and capping - fcnt = [0] * max(1, page_count) - fch = [0] * max(1, page_count) - ftr = [0] * max(1, page_count) - ftrc = [0] * max(1, page_count) - ccnt = [0] * max(1, page_count) - try: - as_dict = doc.export_to_dict() - import re as _re - _run_pat = _re.compile(r"\\\\\s*&(?P(?:\\quad|\\;|\\:|\\,|\\\\s|\s){200,})") - _ws_collapse = _re.compile(r"(?:(?:\\quad|\\;|\\:|\\,|\\\\s)|\s){2,}") - _CAP = 3000 - def _sanitize(s: 
str): - dropped=0 - m=_run_pat.search(s) - if m: - s_new=s[:m.start('ws')]; dropped+=len(s)-len(s_new); s=s_new - if len(s)>_CAP: - cut=s.rfind('\\\\',0,_CAP); cut = cut if cut>=0 else _CAP; dropped+=len(s)-cut; s=s[:cut] - s2=_ws_collapse.sub(' ', s) - return s2, dropped - def _walk(label, cnt, chars=False): - for node in as_dict.get("texts", []): - if str(node.get("label")) != label: - continue - raw = str(node.get("text") or node.get("orig") or "") - txt, dropped = _sanitize(raw) if label=='formula' else (raw,0) - ch = len(txt) - for prov in node.get("prov", []) or []: - pno = int(prov.get("page_no") or 0) - if 1 <= pno <= len(cnt): - cnt[pno - 1] += 1 - if chars: - fch[pno - 1] += ch - if label=='formula' and dropped: - ftr[pno - 1] += 1 - ftrc[pno - 1] += int(dropped) - _walk("formula", fcnt, True) - _walk("code", ccnt, False) - except Exception: - pass - try: - den_total = float(timings.get("doc_enrich", {}).get("total", 0.0)) - except Exception: - den_total = 0.0 - shares = [0.0] * max(1, page_count) - if den_total and page_count: - s = float(sum(fch)) or float(sum(fcnt)) or 0.0 - if s > 0: - base = fch if sum(fch) > 0 else fcnt - shares = [den_total * (float(x) / s) for x in base] - rows = [] - n = max(page_count, len(ocr), len(parse)) - for i in range(n): - rows.append({ - "page_no": i + 1, - "ocr_sec": float(ocr[i]) if i < len(ocr) else 0.0, - "parse_sec": float(parse[i]) if i < len(parse) else 0.0, - "layout_sec": float(layout[i]) if i < len(layout) else 0.0, - "table_sec": float(table[i]) if i < len(table) else 0.0, - "formula_count": int(fcnt[i]) if i < len(fcnt) else 0, - "formula_chars": int(fch[i]) if i < len(fch) else 0, - "formula_truncated": int(ftr[i]) if i < len(ftr) else 0, - "formula_truncated_chars": int(ftrc[i]) if i < len(ftrc) else 0, - "code_count": int(ccnt[i]) if i < len(ccnt) else 0, - "doc_enrich_share_sec": float(shares[i]) if i < len(shares) else 0.0, - }) - return {"file": str(getattr(conv.input.file, 'name', 'unknown')), 
"page_count": int(page_count), "totals": {"doc_enrich_total_sec": den_total}, "pages": rows} - - -def _setup_logging(level: int = logging.INFO) -> None: - logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(name)s: %(message)s") - - -if __name__ == "__main__": - _setup_logging() - ap = argparse.ArgumentParser(description="Batch OCR with Docling + RapidOCR (ONNX)") - ap.add_argument("input_dir", type=Path) - ap.add_argument("output_dir", type=Path) - ap.add_argument("--device", default=os.getenv("GLOSSAPI_DOCLING_DEVICE", "cuda:0")) - ap.add_argument("--text-score", type=float, default=float(os.getenv("GLOSSAPI_TEXT_SCORE", "0.45"))) - ap.add_argument("--images-scale", type=float, default=float(os.getenv("GLOSSAPI_IMAGES_SCALE", "1.25"))) - ap.add_argument("--docling-formula", dest="docling_formula", action="store_true", help="Enable formula enrichment (CodeFormula)") - ap.add_argument("--no-docling-formula", dest="docling_formula", action="store_false") - ap.set_defaults(docling_formula=False) - ap.add_argument("--formula-batch", type=int, default=int(os.getenv("GLOSSAPI_FORMULA_BATCH", "8")), help="CodeFormula batch size (default 8)") - ap.add_argument("--docling-code", dest="docling_code", action="store_true", help="Enable code enrichment") - ap.add_argument("--no-docling-code", dest="docling_code", action="store_false") - ap.set_defaults(docling_code=False) - ap.add_argument("--normalize-output", action="store_true") - ap.add_argument("--no-normalize-output", dest="normalize_output", action="store_false") - ap.set_defaults(normalize_output=True) - ap.add_argument("--timeout-s", type=int, default=int(os.getenv("GLOSSAPI_DOCLING_TIMEOUT", "600"))) - args = ap.parse_args() - # Apply formula batch size if requested - try: - if getattr(args, "docling_formula", False): - from docling.models.code_formula_model import CodeFormulaModel # type: ignore - if isinstance(args.formula_batch, int) and args.formula_batch > 0: - 
CodeFormulaModel.elements_batch_size = int(args.formula_batch) # type: ignore[attr-defined] - except Exception: - pass - convert_dir( - args.input_dir, - args.output_dir, - device=args.device, - text_score=args["text_score"] if isinstance(args, dict) else args.text_score, - images_scale=args.images_scale, - formula_enrichment=args.docling_formula, - code_enrichment=args.docling_code, - normalize_output=args.normalize_output, - timeout_s=args.timeout_s, - ) diff --git a/src/glossapi/ocr/rapidocr/onnx.py b/src/glossapi/ocr/rapidocr/onnx.py deleted file mode 100644 index 57430d1..0000000 --- a/src/glossapi/ocr/rapidocr/onnx.py +++ /dev/null @@ -1,105 +0,0 @@ -"""OCR helpers for GlossAPI using Docling + RapidOCR (ONNXRuntime). - -GPU-first OCR that auto-discovers packaged ONNX models and Greek keys within -the installed `glossapi` package. Designed as a drop-in for Corpus.ocr(). -""" -from __future__ import annotations - -from pathlib import Path -from typing import Optional, Dict, Any, Tuple - -_PIPELINE_CACHE: dict[str, Tuple[object, object]] = {} - - -def _build_pipeline( - device: Optional[str] = None, - *, - use_cls: Optional[bool] = None, - text_score: Optional[float] = None, - images_scale: Optional[float] = None, -): - # Delegate to canonical builder to avoid duplication - from glossapi.ocr.rapidocr.pipeline import build_rapidocr_pipeline - - engine, opts = build_rapidocr_pipeline( - device=(device or "cuda:0"), - text_score=(0.45 if text_score is None else float(text_score)), - images_scale=(1.25 if images_scale is None else float(images_scale)), - formula_enrichment=False, - code_enrichment=False, - ) - # Apply use_cls override if requested - try: - if use_cls is not None and hasattr(opts, "ocr_options"): - setattr(opts.ocr_options, "use_cls", bool(use_cls)) # type: ignore[attr-defined] - except Exception: - pass - return engine, opts - - -def run_rapidocr_onnx( - pdf_path: Path | str, - *, - device: Optional[str] = None, - use_cls: Optional[bool] = None, - 
text_score: Optional[float] = None, - images_scale: Optional[float] = None, - max_pages: Optional[int] = None, -) -> Dict[str, Any]: - """Run Docling + RapidOCR (ONNX) OCR on a PDF and return markdown text. - - Returns - ------- - dict with keys: - - markdown_text: str - - duration_s: float - - pages: int - - models: dict with file names of det/rec/cls/keys - """ - from time import perf_counter - pdf_p = Path(pdf_path) - if not pdf_p.exists(): - raise FileNotFoundError(pdf_p) - - key = str(device or "cuda:0").lower() - cached = _PIPELINE_CACHE.get(key) - if cached is None: - pipe, r = _build_pipeline(device=device, use_cls=use_cls, text_score=text_score, images_scale=images_scale) - _PIPELINE_CACHE[key] = (pipe, r) - else: - pipe, r = cached # type: ignore[misc] - - t0 = perf_counter() - conv = pipe.convert(source=str(pdf_p)) # type: ignore[attr-defined] - doc = conv.document - md_text = doc.export_to_markdown() - duration = perf_counter() - t0 - - # Attempt to get page count from conv/document - pages = 0 - try: - if hasattr(doc, "pages"): - pages = len(doc.pages) # type: ignore[attr-defined] - except Exception: - pages = 0 - - # Return model identifiers as file names only (no full paths) - import os as _os - models = { - "det": _os.path.basename(r.det) if r.det else None, - "rec": _os.path.basename(r.rec) if r.rec else None, - "cls": _os.path.basename(r.cls) if r.cls else None, - "keys": _os.path.basename(r.keys) if r.keys else None, - } - - return { - "markdown_text": md_text or "", - "duration_s": duration, - "pages": int(pages), - "models": models, - } - - -__all__ = [ - "run_rapidocr_onnx", -] diff --git a/src/glossapi/ocr/rapidocr/pipeline.py b/src/glossapi/ocr/rapidocr/pipeline.py deleted file mode 100644 index a623c3d..0000000 --- a/src/glossapi/ocr/rapidocr/pipeline.py +++ /dev/null @@ -1,229 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Tuple - -from docling.datamodel.base_models import InputFormat -from 
docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PictureDescriptionApiOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import DocumentConverter, PdfFormatOption - -from ._paths import resolve_packaged_onnx_and_keys -from .pool import GLOBAL_RAPID_OCR_POOL -from .safe import SafeRapidOcrModel, patch_docling_rapidocr - -_logger = logging.getLogger(__name__) - -patch_docling_rapidocr() - - -def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: - """Return accelerator options and whether CUDA was requested.""" - dev = device or "cuda:0" - if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): - acc = AcceleratorOptions(device=dev) - want_cuda = dev.lower().startswith("cuda") - else: - want_cuda = str(dev).lower().startswith("cuda") - acc = AcceleratorOptions( - device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU - ) - return acc, want_cuda - - -def _apply_common_pdf_options( - *, - acc: AcceleratorOptions, - images_scale: float, - formula_enrichment: bool, - code_enrichment: bool, -) -> PdfPipelineOptions: - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - try: - if hasattr(table_opts, "do_cell_matching"): - table_opts.do_cell_matching = True - except Exception: - pass - - opts = PdfPipelineOptions( - accelerator_options=acc, - layout_options=LayoutOptions(), - do_ocr=False, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - # Prefer lightweight placeholder picture descriptions to avoid heavy VLM backends. 
- try: - if hasattr(opts, "do_picture_description"): - opts.do_picture_description = False - if getattr(opts, "picture_description_options", None) is None: - opts.picture_description_options = PictureDescriptionApiOptions() - if hasattr(opts, "enable_remote_services"): - opts.enable_remote_services = False - except Exception: - pass - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - return opts - - -def build_layout_pipeline( - *, - device: str = "cuda:0", - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Builder for a Docling PDF pipeline without RapidOCR. - - Returns ``(converter, PdfPipelineOptions)`` where ``converter`` is a - ``StandardPdfPipeline`` configured for layout extraction only. - """ - - acc, _ = _resolve_accelerator(device) - opts = _apply_common_pdf_options( - acc=acc, - images_scale=float(images_scale), - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] - return pipeline, opts - - -def build_rapidocr_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Canonical builder for Docling + RapidOCR pipeline. - - Returns a tuple (engine, PdfPipelineOptions). Prefers explicit RapidOCR injection - when supported; otherwise returns a DocumentConverter using the factory path. - """ - - def _fallback_layout(reason: str) -> Tuple[object, PdfPipelineOptions]: - _logger.warning( - "RapidOCR pipeline fallback: %s. 
Using Docling layout-only configuration.", - reason, - ) - pipeline, opts = build_layout_pipeline( - device=device, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - return pipeline, opts - - acc, want_cuda = _resolve_accelerator(device) - - # Optional provider preflight only when CUDA requested - if want_cuda: - try: - import onnxruntime as ort # type: ignore - - prov = ort.get_available_providers() - if "CUDAExecutionProvider" not in prov: - raise RuntimeError(f"CUDAExecutionProvider not available: {prov}") - except Exception as e: # pragma: no cover - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - return _fallback_layout("packaged RapidOCR ONNX assets missing") - - ocr_opts = RapidOcrOptions( - backend="onnxruntime", - lang=["el", "en"], - force_full_page_ocr=False, - use_det=True, - use_cls=False, - use_rec=True, - text_score=text_score, - det_model_path=r.det, - rec_model_path=r.rec, - cls_model_path=r.cls, - print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - - opts = _apply_common_pdf_options( - acc=acc, - images_scale=float(images_scale), - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - opts.do_ocr = True - opts.ocr_options = ocr_opts - - # Prefer explicit injection of RapidOCR model when available - try: - from docling.models.rapid_ocr_model import RapidOcrModel # type: ignore - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - import inspect - - sig = inspect.signature(StandardPdfPipeline.__init__) - if "ocr_model" not in sig.parameters: - raise RuntimeError("Docling build does not support RapidOCR injection") - - def _factory(): - try: - return 
SafeRapidOcrModel(True, None, ocr_opts, acc) # type: ignore[arg-type] - except Exception: # pragma: no cover - # Fall back to the stock implementation if our wrapper misbehaves. - return RapidOcrModel(True, None, ocr_opts, acc) # type: ignore[arg-type] - - pooled_model = GLOBAL_RAPID_OCR_POOL.get( - str(acc.device), - ocr_opts, - _factory, - expected_type=SafeRapidOcrModel, - ) - pipeline = StandardPdfPipeline(opts, ocr_model=pooled_model) # type: ignore - return pipeline, opts - except Exception as exc: - _logger.warning( - "RapidOCR injection unavailable (%s); using DocumentConverter factory path.", - exc, - ) - - # Fallback: use DocumentConverter factory - try: - converter = DocumentConverter( - format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)} - ) - return converter, opts - except Exception as exc: - return _fallback_layout(f"DocumentConverter failed: {exc}") - - -__all__ = ["build_layout_pipeline", "build_rapidocr_pipeline"] diff --git a/src/glossapi/ocr/rapidocr/pool.py b/src/glossapi/ocr/rapidocr/pool.py deleted file mode 100644 index db1e8f2..0000000 --- a/src/glossapi/ocr/rapidocr/pool.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Shared RapidOCR engine pooling utilities.""" -from __future__ import annotations - -from dataclasses import dataclass -from threading import Lock -from typing import Callable, Dict, Optional, Union, Type - -from docling.datamodel.pipeline_options import RapidOcrOptions - - -@dataclass(frozen=True) -class _PoolKey: - device: str - det_model_path: str - rec_model_path: str - cls_model_path: str - lang: Tuple[str, ...] 
- text_score: float - use_det: bool - use_cls: bool - use_rec: bool - - -class RapidOcrEnginePool: - """Process-local cache of RapidOCR models keyed by configuration.""" - - def __init__(self) -> None: - self._lock = Lock() - self._cache: Dict[_PoolKey, object] = {} - - def _make_key(self, device: str, opts: RapidOcrOptions) -> _PoolKey: - lang = tuple(opts.lang or []) - return _PoolKey( - device=str(device), - det_model_path=str(getattr(opts, "det_model_path", "")), - rec_model_path=str(getattr(opts, "rec_model_path", "")), - cls_model_path=str(getattr(opts, "cls_model_path", "")), - lang=lang, - text_score=float(getattr(opts, "text_score", 0.0)), - use_det=bool(getattr(opts, "use_det", True)), - use_cls=bool(getattr(opts, "use_cls", False)), - use_rec=bool(getattr(opts, "use_rec", True)), - ) - - def get( - self, - device: str, - opts: RapidOcrOptions, - factory: Callable[[], object], - *, - expected_type: Optional[Union[Type[object], tuple[Type[object], ...]]] = None, - ) -> object: - key = self._make_key(device, opts) - with self._lock: - model = self._cache.get(key) - if expected_type is not None and model is not None and not isinstance(model, expected_type): - self._cache.pop(key, None) - model = None - if model is None: - model = factory() - if expected_type is None or isinstance(model, expected_type): - self._cache[key] = model - return model - - def clear(self) -> None: - with self._lock: - self._cache.clear() - - -GLOBAL_RAPID_OCR_POOL = RapidOcrEnginePool() - -__all__ = ["RapidOcrEnginePool", "GLOBAL_RAPID_OCR_POOL"] diff --git a/src/glossapi/ocr/rapidocr/safe.py b/src/glossapi/ocr/rapidocr/safe.py deleted file mode 100644 index 5534563..0000000 --- a/src/glossapi/ocr/rapidocr/safe.py +++ /dev/null @@ -1,301 +0,0 @@ -"""Temporary wrappers around Docling's RapidOCR integration. - -The upstream Docling release (2.48.x) does not tolerate RapidOCR returning -``None`` for a given crop. 
That bubbles up as an AttributeError inside the -conversion loop and the entire document fails. Until Docling includes a fix, we -wrap the loader so that ``None`` simply means "no detections" and processing -continues. Once Docling ships a release with the guard we can drop this shim and -revert to the vanilla ``RapidOcrModel``. -""" - -from __future__ import annotations - -import importlib.util -import sys -from collections.abc import Iterable -from pathlib import Path -from typing import Optional, Type - -import numpy - -from docling.datamodel.base_models import Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import OcrOptions, RapidOcrOptions -from docling.models.rapid_ocr_model import RapidOcrModel as _RapidOcrModel -from docling.models.rapid_ocr_model import TextCell, _log -from docling.utils.profiling import TimeRecorder -from docling_core.types.doc import BoundingBox, CoordOrigin -from docling_core.types.doc.page import BoundingRectangle - -from ._paths import resolve_packaged_onnx_and_keys - - -class SafeRapidOcrModel(_RapidOcrModel): - """Drop-in RapidOCR wrapper that copes with ``None`` OCR results. - - Docling 2.48.0 assumes ``self.reader`` always returns an object with - ``boxes/txts/scores``. RapidOCR occasionally yields ``None`` for problematic - crops, which crashes the extractor. We normalise the return value before the - original list(zip(...)) call and treat anything unexpected as "no boxes". - Remove this once Docling hardens the upstream implementation. - """ - - # NOTE: keep signature identical so StandardPdfPipeline can instantiate it. 
- _rapidocr_available: Optional[bool] = None - - def __init__( - self, - enabled: bool, - artifacts_path: Optional[Path], - options: RapidOcrOptions, - accelerator_options, - ): - rapidocr_available = self._rapidocr_available - if rapidocr_available is None: - rapidocr_available = bool( - importlib.util.find_spec("rapidocr") is not None or "rapidocr" in sys.modules - ) - SafeRapidOcrModel._rapidocr_available = rapidocr_available - - effective_enabled = bool(enabled and rapidocr_available) - if enabled and not rapidocr_available: - _log.warning( - "RapidOCR python package not found; continuing with Docling pipeline OCR disabled." - ) - - if effective_enabled: - try: - resolved = resolve_packaged_onnx_and_keys() - - _log.warning( - 'SafeRapidOcrModel initial options: det=%s rec=%s cls=%s keys=%s', - getattr(options, 'det_model_path', None), - getattr(options, 'rec_model_path', None), - getattr(options, 'cls_model_path', None), - getattr(options, 'rec_keys_path', None), - ) - - if resolved.det: - options.det_model_path = resolved.det - if resolved.rec: - options.rec_model_path = resolved.rec - if resolved.cls: - options.cls_model_path = resolved.cls - if resolved.keys: - options.rec_keys_path = resolved.keys - - try: - from rapidocr.ch_ppocr_rec import main as _rapidocr_rec_main - - if not getattr(_rapidocr_rec_main.TextRecognizer, '_glossapi_patch', False): - original_get_character_dict = _rapidocr_rec_main.TextRecognizer.get_character_dict - - def _patched_get_character_dict(self, cfg): - try: - current_keys = cfg.get('keys_path', None) - current_rec_keys = cfg.get('rec_keys_path', None) - if current_rec_keys is None and current_keys is not None: - cfg['rec_keys_path'] = current_keys - _log.warning('Patched RapidOCR cfg: set rec_keys_path from keys_path=%s', current_keys) - else: - _log.warning('Patched RapidOCR cfg: existing rec_keys_path=%s keys_path=%s', current_rec_keys, current_keys) - except Exception: - _log.warning('RapidOCR cfg inspection failed', 
exc_info=True) - return original_get_character_dict(self, cfg) - - _rapidocr_rec_main.TextRecognizer.get_character_dict = _patched_get_character_dict - _rapidocr_rec_main.TextRecognizer._glossapi_patch = True - except Exception: - _log.warning('Failed to patch RapidOCR TextRecognizer for keys fallback', exc_info=True) - - _log.warning( - 'SafeRapidOcrModel using packaged assets: det=%s rec=%s cls=%s keys=%s', - options.det_model_path, - options.rec_model_path, - options.cls_model_path, - options.rec_keys_path, - ) - except Exception: - _log.warning( - 'SafeRapidOcrModel bootstrap failed to resolve packaged assets', - exc_info=True, - ) - - super().__init__( - enabled=effective_enabled, - artifacts_path=artifacts_path, - options=options, - accelerator_options=accelerator_options, - ) - - @classmethod - def get_options_type(cls) -> Type[OcrOptions]: - return RapidOcrOptions - - def _normalise_result(self, result): - """Return an iterable of (bbox, text, score) triples. - - RapidOCR returns ``None`` or semi-populated structures in some corner - cases. We swallow those and log a one-line warning so the page still - progresses through the pipeline. 
- """ - - if result is None: - _log.warning("RapidOCR returned None; skipping crop") - return [] - boxes = getattr(result, "boxes", None) - txts = getattr(result, "txts", None) - scores = getattr(result, "scores", None) - if boxes is None or txts is None or scores is None: - _log.warning("RapidOCR returned incomplete data; treating crop as empty") - return [] - try: - return list(zip(boxes.tolist(), txts, scores)) - except Exception as exc: # pragma: no cover - defensive only - _log.warning("RapidOCR result normalisation failed: %s", exc) - return [] - - def __call__( - self, conv_res: ConversionResult, page_batch: Iterable[Page] - ) -> Iterable[Page]: - if not self.enabled: - yield from page_batch - return - - for page in page_batch: - assert page._backend is not None - if not page._backend.is_valid(): - yield page - continue - - with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) - - all_ocr_cells = [] - for ocr_rect in ocr_rects: - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) - im = numpy.array(high_res_image) - raw_result = self.reader( - im, - use_det=self.options.use_det, - use_cls=self.options.use_cls, - use_rec=self.options.use_rec, - ) - result = self._normalise_result(raw_result) - del high_res_image - del im - - if not result: - continue - - cells = [ - TextCell( - index=ix, - text=line[1], - orig=line[1], - confidence=line[2], - from_ocr=True, - rect=BoundingRectangle.from_bounding_box( - BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, - ) - ), - ) - for ix, line in enumerate(result) - ] - all_ocr_cells.extend(cells) - - self.post_process_cells(all_ocr_cells, page) - - from docling.datamodel.settings import settings - - if settings.debug.visualize_ocr: - 
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) - - yield page - - -def patch_docling_rapidocr() -> bool: - """Replace Docling's RapidOcrModel with the safe shim if available.""" - - try: - import docling.models.rapid_ocr_model as rapid_module - except Exception: # pragma: no cover - Docling missing - return False - - current = getattr(rapid_module, "RapidOcrModel", None) - if current is SafeRapidOcrModel: - return False - - rapid_module.RapidOcrModel = SafeRapidOcrModel - try: - from docling.models.factories import get_ocr_factory # type: ignore - import logging - except Exception: - return True - - try: - factory = get_ocr_factory() - options_type = SafeRapidOcrModel.get_options_type() - - if hasattr(factory, "classes"): - factory.classes[options_type] = SafeRapidOcrModel - elif hasattr(factory, "_classes"): - factory._classes[options_type] = SafeRapidOcrModel - logging.getLogger(__name__).info( - "Registered SafeRapidOcrModel for %s", options_type - ) - try: - from docling.pipeline import standard_pdf_pipeline as _std_pdf # type: ignore - from docling.datamodel.pipeline_options import RapidOcrOptions # type: ignore - from functools import lru_cache - except Exception as _exc: # pragma: no cover - best effort - logging.getLogger(__name__).warning( - "Docling factory patch limited to local mutation: %s", _exc - ) - else: - original_get_factory = getattr( - _std_pdf.get_ocr_factory, "__wrapped__", _std_pdf.get_ocr_factory - ) - - def _ensure_safe(factory_obj): - try: - current = factory_obj.classes.get(RapidOcrOptions) - if current is not SafeRapidOcrModel: - factory_obj.classes[RapidOcrOptions] = SafeRapidOcrModel - except AttributeError: - current = getattr(factory_obj, "_classes", {}).get(RapidOcrOptions) - if current is not SafeRapidOcrModel: - getattr(factory_obj, "_classes", {})[RapidOcrOptions] = SafeRapidOcrModel - return factory_obj - - @lru_cache(maxsize=None) - def _patched_get_ocr_factory(allow_external_plugins: bool = False): - return 
_ensure_safe(original_get_factory(allow_external_plugins)) - - _patched_get_ocr_factory.__wrapped__ = original_get_factory # type: ignore[attr-defined] - _std_pdf.get_ocr_factory = _patched_get_ocr_factory # type: ignore[attr-defined] - try: - _ensure_safe(_std_pdf.get_ocr_factory(False)) - except Exception: - pass - except Exception as exc: # pragma: no cover - best effort - import logging - - logging.getLogger(__name__).warning( - "Failed to re-register SafeRapidOcrModel: %s", exc - ) - return True - - -__all__ = ["SafeRapidOcrModel", "patch_docling_rapidocr"] diff --git a/tests/test_corpus_guards.py b/tests/test_corpus_guards.py index 29db5be..424d359 100644 --- a/tests/test_corpus_guards.py +++ b/tests/test_corpus_guards.py @@ -50,12 +50,6 @@ def make_corpus(tmp_path): return Corpus(input_dir=input_dir, output_dir=output_dir) -def set_onnx_providers(monkeypatch, providers): - stub = SimpleNamespace(get_available_providers=lambda: providers) - monkeypatch.setitem(sys.modules, "onnxruntime", stub) - return stub - - def set_torch_stub(monkeypatch, *, available: bool, device_count: int): cuda_ns = SimpleNamespace( is_available=lambda: available, @@ -70,8 +64,7 @@ def test_prime_extractor_requires_cuda_for_ocr(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() - set_torch_stub(monkeypatch, available=True, device_count=1) - set_onnx_providers(monkeypatch, ["CPUExecutionProvider"]) + set_torch_stub(monkeypatch, available=False, device_count=0) with pytest.raises(RuntimeError) as exc: corpus.prime_extractor( @@ -81,7 +74,7 @@ def test_prime_extractor_requires_cuda_for_ocr(tmp_path, monkeypatch): phase1_backend="docling", ) - assert "CUDAExecutionProvider" in str(exc.value) + assert "Torch CUDA is not available" in str(exc.value) def test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch): @@ -89,8 +82,6 @@ def test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch corpus.extractor = 
DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - with pytest.raises(RuntimeError) as exc: corpus.prime_extractor( input_format="pdf", @@ -106,8 +97,6 @@ def test_prime_extractor_configures_safe_backend_for_text_layer(tmp_path, monkey corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=True, device_count=1) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - corpus.prime_extractor( input_format="pdf", accel_type="CPU", @@ -125,8 +114,6 @@ def test_prime_extractor_configures_docling_backend_for_ocr(tmp_path, monkeypatc corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=True, device_count=2) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - corpus.prime_extractor( input_format="pdf", accel_type="CUDA", @@ -147,8 +134,6 @@ def test_prime_extractor_requires_cuda_for_formula_enrichment(tmp_path, monkeypa corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - with pytest.raises(RuntimeError) as exc: corpus.prime_extractor( input_format="pdf", diff --git a/tests/test_deepseek_preflight.py b/tests/test_deepseek_preflight.py index 1900a2b..73e761d 100644 --- a/tests/test_deepseek_preflight.py +++ b/tests/test_deepseek_preflight.py @@ -1,5 +1,4 @@ import sys -from pathlib import Path from glossapi.ocr.deepseek.preflight import check_deepseek_env @@ -9,45 +8,34 @@ def test_preflight_reports_missing_components(tmp_path): "GLOSSAPI_DEEPSEEK_ALLOW_CLI": "0", "GLOSSAPI_DEEPSEEK_ALLOW_STUB": "1", "GLOSSAPI_DEEPSEEK_TEST_PYTHON": str(tmp_path / "missing_python"), - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT": str(tmp_path / "missing_script.py"), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT": str(tmp_path / "missing_script.py"), "GLOSSAPI_DEEPSEEK_MODEL_DIR": str(tmp_path / "missing_model"), - "GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH": str(tmp_path / 
"missing_lib"), - "PATH": str(tmp_path), # no cc1plus here } - report = check_deepseek_env(env, check_flashinfer=False) + report = check_deepseek_env(env, check_torch=False) names = {c.name for c in report.errors} + assert "allow_cli" in names + assert "allow_stub" in names assert "deepseek_python" in names - assert "vllm_script" in names + assert "runner_script" in names assert "model_dir" in names - assert "ld_library_path" in names - assert "cc1plus" in names assert not report.ok def test_preflight_passes_with_complete_env(tmp_path): - script = tmp_path / "run_pdf_ocr_vllm.py" + script = tmp_path / "run_pdf_ocr_transformers.py" script.write_text("#!/usr/bin/env python3\n", encoding="utf-8") - model_dir = tmp_path / "DeepSeek-OCR" + model_dir = tmp_path / "DeepSeek-OCR-2" model_dir.mkdir() (model_dir / "config.json").write_text("{}", encoding="utf-8") (model_dir / "model-00001-of-000001.safetensors").write_bytes(b"stub") - lib_dir = tmp_path / "libjpeg" - lib_dir.mkdir() - fake_bin = tmp_path / "bin" - fake_bin.mkdir() - cc1plus = fake_bin / "cc1plus" - cc1plus.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") - cc1plus.chmod(0o755) env = { "GLOSSAPI_DEEPSEEK_ALLOW_CLI": "1", "GLOSSAPI_DEEPSEEK_ALLOW_STUB": "0", "GLOSSAPI_DEEPSEEK_TEST_PYTHON": sys.executable, - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT": str(script), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT": str(script), "GLOSSAPI_DEEPSEEK_MODEL_DIR": str(model_dir), - "GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH": str(lib_dir), - "PATH": str(fake_bin), } - report = check_deepseek_env(env, check_flashinfer=False) + report = check_deepseek_env(env, check_torch=False) assert report.ok assert not report.errors diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py new file mode 100644 index 0000000..a5a93e4 --- /dev/null +++ b/tests/test_deepseek_runner_contract.py @@ -0,0 +1,62 @@ +from pathlib import Path + +import pandas as pd +import pytest + + +def _mk_corpus(tmp_path: Path): + from glossapi 
import Corpus + + root = tmp_path / "corpus" + root.mkdir() + return Corpus(input_dir=root, output_dir=root) + + +def test_deepseek_backend_rejects_stub_mode(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + df = pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ) + parquet_path = dl_dir / "download_results.parquet" + df.to_parquet(parquet_path, index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%real\n") + + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") + + with pytest.raises(RuntimeError, match="stub execution has been removed"): + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) + + updated = pd.read_parquet(parquet_path).set_index("filename") + assert bool(updated.loc[fname, "ocr_success"]) is False + assert bool(updated.loc[fname, "needs_ocr"]) is True + + +def test_progress_artifacts_stay_out_of_canonical_markdown(tmp_path): + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _write_outputs, _write_progress + + output_dir = tmp_path / "output" + _write_progress( + output_dir=output_dir, + stem="doc", + page_outputs=["page one"], + total_pages=5, + completed_pages=1, + ) + + canonical_markdown = output_dir / "markdown" / "doc.md" + progress_markdown = output_dir / "sidecars" / "ocr_progress" / "doc.partial.md" + progress_json = output_dir / "json" / "metrics" / "doc.progress.json" + + assert not canonical_markdown.exists() + assert progress_markdown.exists() + assert progress_json.exists() + + _write_outputs(output_dir=output_dir, stem="doc", markdown="final", page_count=5) + + assert canonical_markdown.exists() + assert canonical_markdown.read_text(encoding="utf-8") == "final\n" + assert not progress_markdown.exists() diff --git a/tests/test_deepseek_runner_stub.py b/tests/test_deepseek_runner_stub.py deleted file mode 100644 index 
aee5177..0000000 --- a/tests/test_deepseek_runner_stub.py +++ /dev/null @@ -1,59 +0,0 @@ -from pathlib import Path - -import pandas as pd - - -def _mk_corpus(tmp_path: Path): - from glossapi import Corpus - - root = tmp_path / "corpus" - root.mkdir() - return Corpus(input_dir=root, output_dir=root) - - -def test_deepseek_backend_stub_runs_and_updates_parquet(tmp_path, monkeypatch): - corpus = _mk_corpus(tmp_path) - - # Seed a minimal metadata parquet with one bad file - dl_dir = corpus.output_dir / "download_results" - dl_dir.mkdir(parents=True, exist_ok=True) - fname = "doc.pdf" - df = pd.DataFrame( - [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] - ) - parquet_path = dl_dir / "download_results.parquet" - df.to_parquet(parquet_path, index=False) - - # Create an empty placeholder file for the PDF - (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") - - # Monkeypatch the runner internal to avoid heavy imports - from glossapi.ocr.deepseek import runner - - def fake_run_one(pdf_path, md_out, metrics_out, cfg): - md_out.parent.mkdir(parents=True, exist_ok=True) - metrics_out.parent.mkdir(parents=True, exist_ok=True) - md_out.write_text("deepseek stub output\n", encoding="utf-8") - metrics_out.write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") - return {"page_count": 1} - - monkeypatch.setattr(runner, "_run_one_pdf", fake_run_one) - - # Run OCR via dispatcher - corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) - - # Artifacts exist - stem = "doc" - md = corpus.output_dir / "markdown" / f"{stem}.md" - metrics = corpus.output_dir / "json" / "metrics" / f"{stem}.metrics.json" - assert md.exists(), "Markdown output should be created by deepseek stub" - assert metrics.exists(), "Metrics JSON should be created by deepseek stub" - - # Parquet updated - updated = pd.read_parquet(parquet_path).set_index("filename") - row = updated.loc[fname] - assert bool(row["ocr_success"]) is True - assert 
bool(row["needs_ocr"]) is False - # extraction_mode is optional; if present assert value - if "extraction_mode" in updated.columns: - assert updated.loc[fname, "extraction_mode"] == "deepseek" diff --git a/tests/test_ocr_backends_smoke.py b/tests/test_ocr_backends_smoke.py index 0419ba4..6c410c5 100644 --- a/tests/test_ocr_backends_smoke.py +++ b/tests/test_ocr_backends_smoke.py @@ -11,7 +11,7 @@ def _mk_corpus(tmp_path: Path): return Corpus(input_dir=root, output_dir=root) -def test_cross_backend_smoke_with_stubs(tmp_path, monkeypatch): +def test_deepseek_ocr_then_math_only_smoke(tmp_path, monkeypatch): corpus = _mk_corpus(tmp_path) # Two PDFs: one needs OCR, one does not (for math-only later) @@ -28,7 +28,7 @@ def test_cross_backend_smoke_with_stubs(tmp_path, monkeypatch): parquet_path = dl_dir / "download_results.parquet" df.to_parquet(parquet_path, index=False) - # DeepSeek stub for OCR + # DeepSeek runner is stubbed here only to avoid the heavy model during unit tests. from glossapi.ocr.deepseek import runner def fake_run_for_files(self_ref, files, **kwargs): @@ -45,7 +45,7 @@ def fake_run_for_files(self_ref, files, **kwargs): # Run DeepSeek OCR for bad files corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=True, mode="ocr_bad_then_math") - # RapidOCR math-only pass: ensure JSON for clean.pdf and run math + # Math-only pass: ensure JSON for clean.pdf and run math json_dir = corpus.output_dir / "json" json_dir.mkdir(parents=True, exist_ok=True) (json_dir / "clean.docling.json").write_text("{}", encoding="utf-8") @@ -58,7 +58,7 @@ def fake_enrich(files=None, **kwargs): monkeypatch.setattr(corpus, "formula_enrich_from_json", fake_enrich) - corpus.ocr(backend="rapidocr", fix_bad=False, math_enhance=True, mode="math_only") + corpus.ocr(backend="deepseek", fix_bad=False, math_enhance=True, mode="math_only") # Verify updated = pd.read_parquet(parquet_path).set_index("filename") diff --git a/tests/test_ocr_dispatch_backends.py 
b/tests/test_ocr_dispatch_backends.py index 965692b..3779d07 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -51,29 +51,7 @@ def fail_math(*args, **kwargs): assert calls.get("files") == [fname] -def test_rapidocr_backend_routes_to_extract_with_docling(tmp_path, monkeypatch): +def test_invalid_backend_is_rejected(tmp_path): corpus = _mk_corpus(tmp_path) - - # Seed minimal metadata parquet that flags a single file for OCR - dl_dir = corpus.output_dir / "download_results" - dl_dir.mkdir(parents=True, exist_ok=True) - df = pd.DataFrame([ - {"filename": "doc.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False} - ]) - df.to_parquet(dl_dir / "download_results.parquet", index=False) - - captured = {} - - def fake_extract(**kwargs): - captured.update(kwargs) - return None - - monkeypatch.setattr(corpus, "extract", fake_extract) - - corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False, use_gpus="single", devices=[0]) - - assert captured, "Expected extract() to be called for rapidocr backend" - assert captured.get("force_ocr") is True - assert captured.get("phase1_backend") == "docling" - files = captured.get("filenames") or [] - assert files and files[0] == "doc.pdf" + with pytest.raises(ValueError, match="backend must be 'deepseek'"): + corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False) diff --git a/tests/test_ocr_imports.py b/tests/test_ocr_imports.py index 3487619..094e72b 100644 --- a/tests/test_ocr_imports.py +++ b/tests/test_ocr_imports.py @@ -8,32 +8,19 @@ def test_import_ocr_package_is_lightweight(): import glossapi.ocr as ocr assert hasattr(ocr, "deepseek") - assert hasattr(ocr, "rapidocr") # New subpackages remain importable lazily import glossapi.ocr.deepseek.runner as deepseek_runner - import glossapi.ocr.rapidocr.dispatch as rapid_dispatch assert ocr.deepseek.runner is deepseek_runner - assert ocr.rapidocr.dispatch is rapid_dispatch assert ocr.deepseek_runner is 
deepseek_runner - assert ocr.rapidocr_dispatch is rapid_dispatch assert hasattr(deepseek_runner, "run_for_files") - assert hasattr(rapid_dispatch, "run_via_extract") # Utilities module always available (pure Python) from glossapi.ocr.utils import json_io as utils_json assert hasattr(utils_json, "export_docling_json") - if importlib.util.find_spec("docling") is not None: - try: - from glossapi.ocr.rapidocr import pool as rapid_pool - except ModuleNotFoundError: - pytest.skip("Docling optional dependencies not available") - else: - assert hasattr(rapid_pool, "GLOBAL_RAPID_OCR_POOL") - if importlib.util.find_spec("docling_core") is not None: try: from glossapi.ocr.math import enrich_from_docling_json, RoiEntry diff --git a/tests/test_pipeline_smoke.py b/tests/test_pipeline_smoke.py index 4fe7464..7dae1b7 100644 --- a/tests/test_pipeline_smoke.py +++ b/tests/test_pipeline_smoke.py @@ -1,4 +1,5 @@ import os +import sys from pathlib import Path import pandas as pd @@ -7,10 +8,6 @@ pytest.importorskip("docling") pytest.importorskip("glossapi_rs_cleaner") -pytest.importorskip( - "onnxruntime", reason="RapidOCR/DeepSeek end-to-end tests require onnxruntime" -) -import onnxruntime as ort # noqa: E402 from glossapi import Corpus from glossapi.corpus import _resolve_skiplist_path @@ -106,11 +103,8 @@ def _assert_dir_contents( pytest.fail(f"Unexpected file {entry} in {root}") -@pytest.mark.rapidocr -def test_pipeline_smoke_and_artifacts(tmp_path): +def test_pipeline_smoke_and_artifacts(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for pipeline smoke test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -146,6 +140,21 @@ def test_pipeline_smoke_and_artifacts(tmp_path): assert bool(needs.get("blank.pdf")), "Blank PDF should be flagged for OCR" assert not bool(needs.get("text.pdf")) + from glossapi.ocr.deepseek 
import runner as deepseek_runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for name in files: + stem = Path(name).stem + (markdown_dir / f"{stem}.md").write_text("[[Blank page]]\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {Path(name).stem: {"page_count": 1} for name in files} + + monkeypatch.setattr(deepseek_runner, "run_for_files", fake_run_for_files) + corpus.ocr( mode="ocr_bad", use_gpus="single", @@ -193,15 +202,8 @@ def test_pipeline_smoke_and_artifacts(tmp_path): assert sections_file.exists() -@pytest.mark.rapidocr def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for docling pipeline test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" - - assert torch.cuda.is_available(), "CUDA GPU expected for docling pipeline test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -256,6 +258,25 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert bool(greek_row["needs_ocr"]), "Greek consonant doc should require OCR rerun" assert "non_greek_text" in str(greek_row.get("filter", "")), "Filter should record non-Greek text" + from glossapi.ocr.deepseek import runner as deepseek_runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for 
name in files: + stem = Path(name).stem + if stem == "greek_consonants": + text = documents["greek_consonants"] + else: + text = documents.get(stem) or "[[Blank page]]" + (markdown_dir / f"{stem}.md").write_text(f"{text}\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {Path(name).stem: {"page_count": 1} for name in files} + + monkeypatch.setattr(deepseek_runner, "run_for_files", fake_run_for_files) + corpus.ocr( fix_bad=True, math_enhance=True, @@ -268,6 +289,15 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert not bool(greek_after["needs_ocr"]), "Greek consonant doc should be resolved after OCR rerun" assert bool(greek_after.get("ocr_success", False)), "OCR rerun should mark greek consonant doc as success" + corpus.ocr( + backend="deepseek", + fix_bad=False, + math_enhance=True, + mode="math_only", + use_gpus="single", + devices=[device_idx], + ) + json_dir = corpus_dir / "json" assert json_dir.exists(), "Docling JSON directory should exist after extraction" for stem in documents: @@ -304,11 +334,8 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert not skiplist_path.read_text(encoding="utf-8").strip(), "Fatal skip-list should remain empty" -@pytest.mark.rapidocr def test_clean_skips_files_with_successful_ocr(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for OCR recovery test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -384,8 +411,8 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): script = Path( os.environ.get( - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", - Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py", + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + Path.cwd() / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py", 
) ) if not script.exists(): @@ -393,8 +420,8 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): python_bin = Path( os.environ.get( - "GLOSSAPI_DEEPSEEK_TEST_PYTHON", - Path("/mnt/data/glossAPI/deepseek_venv/bin/python"), + "GLOSSAPI_DEEPSEEK_PYTHON", + os.environ.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON", sys.executable), ) ) if not python_bin.exists(): @@ -409,29 +436,17 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): if not model_dir.exists(): pytest.skip(f"DeepSeek model directory missing: {model_dir}") - lib_path = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") - if not lib_path: - candidate = Path.cwd() / "deepseek-ocr" / "libjpeg-turbo" / "lib" - if candidate.exists(): - lib_path = str(candidate) - if not lib_path or not Path(lib_path).exists(): - pytest.skip("Set GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH to the libjpeg-turbo library directory") - - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" - device_idx = 0 if torch.cuda.device_count() > 1: device_idx = torch.cuda.current_device() - # Force the CLI path (no stub fallback) and point to the desired interpreter/script. + # Force the real runner path and point to the desired interpreter/script. 
monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", str(python_bin)) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", str(script)) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH", lib_path) - monkeypatch.setenv("VLLM_ALLOW_REMOTE_CODE", "1") + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", str(script)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir)) existing_py_path = os.environ.get("PYTHONPATH", "") src_path = str(Path.cwd() / "src") if existing_py_path: @@ -439,13 +454,6 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): else: monkeypatch.setenv("PYTHONPATH", src_path) - import glossapi.ocr.deepseek.runner as deepseek_runner - - def _raise_if_stub(*_args, **_kwargs): - raise AssertionError("DeepSeek fallback stub should not run in CLI smoke test") - - monkeypatch.setattr(deepseek_runner, "_run_one_pdf", _raise_if_stub) - corpus_dir = tmp_path / "corpus" corpus_dir.mkdir() diff --git a/tests/test_rapidocr_patch.py b/tests/test_rapidocr_patch.py deleted file mode 100644 index 93a8ca5..0000000 --- a/tests/test_rapidocr_patch.py +++ /dev/null @@ -1,368 +0,0 @@ -import importlib -import sys -import types -from pathlib import Path -from types import SimpleNamespace - -import numpy as np -import pytest - - -def _clear_modules(prefix: str) -> None: - for name in list(sys.modules): - if name == prefix or name.startswith(f"{prefix}."): - sys.modules.pop(name, None) - - -def _install_docling_stub(*, supports_injection: bool) -> None: - _clear_modules("docling") - _clear_modules("docling_core") - _clear_modules("glossapi") - - def register(name: str) -> types.ModuleType: - module = types.ModuleType(name) - sys.modules[name] = module - return module - - docling = register("docling") - 
register("docling.backend") - register("docling.backend.docling_parse_backend").DoclingParseDocumentBackend = object - register("docling.backend.docling_parse_v2_backend").DoclingParseV2DocumentBackend = object - register("docling.backend.pypdfium2_backend").PyPdfiumDocumentBackend = object - - base_models = register("docling.datamodel.base_models") - - class InputFormat: - PDF = "pdf" - DOCX = "docx" - XML_JATS = "xml" - HTML = "html" - PPTX = "pptx" - CSV = "csv" - MD = "md" - - class ConversionStatus: - SUCCESS = "success" - PARTIAL_SUCCESS = "partial" - - class Page: - def __init__(self): - self._backend = types.SimpleNamespace( - is_valid=lambda: True, - get_page_image=lambda *args, **kwargs: types.SimpleNamespace() - ) - - base_models.InputFormat = InputFormat - base_models.ConversionStatus = ConversionStatus - base_models.Page = Page - - pipeline_opts = register("docling.datamodel.pipeline_options") - - class AcceleratorDevice: - AUTO = "auto" - CUDA = "cuda" - MPS = "mps" - CPU = "cpu" - - class AcceleratorOptions: - def __init__(self, num_threads=None, device=None): - self.num_threads = num_threads - self.device = device - - class PdfPipelineOptions: - def __init__(self, **_kwargs): - self.ocr_options = None - self.do_ocr = False - - class RapidOcrOptions: - def __init__(self, **kwargs): - for key, value in kwargs.items(): - setattr(self, key, value) - self.rec_keys_path = None - - class OcrOptions: - pass - - class LayoutOptions: - pass - - class TableStructureOptions: - def __init__(self, mode=None): - self.mode = mode - self.do_cell_matching = False - - class TableFormerMode: - ACCURATE = "accurate" - - class PictureDescriptionApiOptions: - pass - - pipeline_opts.AcceleratorDevice = AcceleratorDevice - pipeline_opts.AcceleratorOptions = AcceleratorOptions - pipeline_opts.PdfPipelineOptions = PdfPipelineOptions - pipeline_opts.RapidOcrOptions = RapidOcrOptions - pipeline_opts.OcrOptions = OcrOptions - pipeline_opts.LayoutOptions = LayoutOptions - 
pipeline_opts.TableStructureOptions = TableStructureOptions - pipeline_opts.TableFormerMode = TableFormerMode - pipeline_opts.PictureDescriptionApiOptions = PictureDescriptionApiOptions - - register("docling.datamodel.document").ConversionResult = object - - settings_mod = register("docling.datamodel.settings") - - class _Debug: - def __init__(self): - self.profile_pipeline_timings = False - self.visualize_ocr = False - - class _Settings: - def __init__(self): - self.debug = _Debug() - - settings_mod.settings = _Settings() - - converter_mod = register("docling.document_converter") - - class DocumentConverter: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - class PdfFormatOption: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - converter_mod.DocumentConverter = DocumentConverter - converter_mod.PdfFormatOption = PdfFormatOption - converter_mod.WordFormatOption = object - converter_mod.HTMLFormatOption = object - converter_mod.XMLJatsFormatOption = object - converter_mod.PowerpointFormatOption = object - converter_mod.MarkdownFormatOption = object - converter_mod.CsvFormatOption = object - - register("docling.pipeline.simple_pipeline").SimplePipeline = object - - pipelines_mod = register("docling.pipelines.standard_pdf_pipeline") - pipeline_mod = register("docling.pipeline.standard_pdf_pipeline") - - if supports_injection: - class StandardPdfPipeline: - def __init__(self, opts, ocr_model=None, **_): - self.opts = opts - self.ocr_model = ocr_model - else: - class StandardPdfPipeline: - def __init__(self, opts, **_): - self.opts = opts - - pipelines_mod.StandardPdfPipeline = StandardPdfPipeline - pipeline_mod.StandardPdfPipeline = StandardPdfPipeline - - rapid_module = register("docling.models.rapid_ocr_model") - - class DummyReader: - def __call__(self, *_args, **_kwargs): - return [] - - class RapidOcrModel: - def __init__(self, enabled, artifacts_path, options, accelerator_options): - 
self.enabled = enabled - self.reader = DummyReader() - self.options = options - - def get_ocr_rects(self, _page): - return [] - - def post_process_cells(self, _cells, _page): - pass - - class TextCell: - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - class _Log: - @staticmethod - def warning(_msg, *args, **kwargs): - return None - - rapid_module.RapidOcrModel = RapidOcrModel - rapid_module.TextCell = TextCell - rapid_module._log = _Log() - - utils_mod = register("docling.utils") - profiling_mod = register("docling.utils.profiling") - - class TimeRecorder: - def __init__(self, *_args, **_kwargs): - pass - - def __enter__(self): - return self - - def __exit__(self, *exc): - return False - - profiling_mod.TimeRecorder = TimeRecorder - utils_mod.profiling = profiling_mod - - register("docling.models") - - core_doc = register("docling_core.types.doc") - - class BoundingBox: - @staticmethod - def from_tuple(coord, origin=None): - return SimpleNamespace(coord=coord, origin=origin) - - class CoordOrigin: - TOPLEFT = "topleft" - - core_doc.BoundingBox = BoundingBox - core_doc.CoordOrigin = CoordOrigin - - core_page = register("docling_core.types.doc.page") - - class BoundingRectangle: - @staticmethod - def from_bounding_box(box): - return box - - core_page.BoundingRectangle = BoundingRectangle - - -def _install_onnxruntime_stub(): - sys.modules['onnxruntime'] = types.SimpleNamespace( - get_available_providers=lambda: ['CUDAExecutionProvider'] - ) - - -def _make_safe_ocr() -> SimpleNamespace: - """Return an instantiated SafeRapidOcrModel with stubbed dependencies.""" - rapid_opts = sys.modules['docling.datamodel.pipeline_options'].RapidOcrOptions() - accel_opts = sys.modules['docling.datamodel.pipeline_options'].AcceleratorOptions(device='cuda:0') - from glossapi.ocr.rapidocr.safe import SafeRapidOcrModel - - return SafeRapidOcrModel(enabled=True, artifacts_path=None, options=rapid_opts, accelerator_options=accel_opts) - - -@pytest.fixture(autouse=True) 
-def _cleanup_modules(): - yield - for name in [n for n in list(sys.modules) if n.startswith('glossapi') and '_rapidocr_paths' not in n]: - if name.startswith('glossapi_rs_'): - continue - sys.modules.pop(name, None) - _clear_modules('docling') - _clear_modules('docling_core') - sys.modules.pop('onnxruntime', None) - - -def test_patch_runs_on_import(): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - rapid_module = sys.modules['docling.models.rapid_ocr_model'] - from glossapi.ocr.rapidocr.safe import SafeRapidOcrModel, patch_docling_rapidocr - - assert rapid_module.RapidOcrModel is SafeRapidOcrModel - - patch_docling_rapidocr() - assert rapid_module.RapidOcrModel is SafeRapidOcrModel - - -def test_build_rapidocr_pipeline_injects_when_supported(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - glossapi_mod = importlib.import_module('glossapi') - pipeline = importlib.reload(importlib.import_module('glossapi.ocr.rapidocr.pipeline')) - - monkeypatch.setattr( - pipeline, - 'resolve_packaged_onnx_and_keys', - lambda: SimpleNamespace(det='det', rec='rec', cls='cls', keys='keys'), - ) - - captured = {} - - def fake_pool_get(device, opts, factory, expected_type): - model = factory() - assert isinstance(model, pipeline.SafeRapidOcrModel) - assert expected_type is pipeline.SafeRapidOcrModel - captured['device'] = device - captured['opts'] = opts - return SimpleNamespace() - - monkeypatch.setattr(pipeline, 'GLOBAL_RAPID_OCR_POOL', SimpleNamespace(get=fake_pool_get)) - - engine, opts = pipeline.build_rapidocr_pipeline(device='cuda:0') - assert hasattr(engine, 'ocr_model') - assert captured['device'] == 'cuda:0' - assert opts.do_ocr is True - - -def test_build_rapidocr_pipeline_falls_back_without_injection(monkeypatch): - _install_docling_stub(supports_injection=False) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - pipeline = 
importlib.reload(importlib.import_module('glossapi.ocr.rapidocr.pipeline')) - - monkeypatch.setattr( - pipeline, - 'resolve_packaged_onnx_and_keys', - lambda: SimpleNamespace(det='det', rec='rec', cls='cls', keys='keys'), - ) - - def fail_pool(*_args, **_kwargs): - raise AssertionError('Pool should not be used when injection unsupported') - - monkeypatch.setattr(pipeline, 'GLOBAL_RAPID_OCR_POOL', SimpleNamespace(get=fail_pool)) - - engine, opts = pipeline.build_rapidocr_pipeline(device='cuda:0') - converter_mod = importlib.import_module('docling.document_converter') - assert isinstance(engine, converter_mod.DocumentConverter) - assert opts.do_ocr is True - - -def test_safe_rapidocr_normalises_none(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - model = _make_safe_ocr() - - assert model._normalise_result(None) == [] - - -def test_safe_rapidocr_normalises_incomplete_and_valid_data(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - model = _make_safe_ocr() - - class IncompleteResult: - boxes = None - txts = ['foo'] - scores = [0.9] - - assert model._normalise_result(IncompleteResult()) == [] - - box = np.array([ - [[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]], - ]) - - class FullResult: - boxes = box - txts = ['foo'] - scores = [0.9] - - output = model._normalise_result(FullResult()) - assert output == [ - (box[0].tolist(), 'foo', 0.9) - ] From 83f7bf276078059e6665dec3ed6d548881cda8df Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 9 Mar 2026 23:45:29 +0200 Subject: [PATCH 06/26] Add GitHub Pages docs workflow --- .github/workflows/docs.yml | 40 ++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 8 +------- 2 files changed, 41 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/docs.yml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 
index 0000000..6c7fcbd --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,40 @@ +name: Build and Deploy Docs + +on: + push: + branches: + - development + - main + - master + workflow_dispatch: + +permissions: + contents: write + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install MkDocs + run: | + python -m pip install --upgrade pip + pip install mkdocs mkdocs-material + + - name: Build site + run: mkdocs build --strict + + - name: Deploy to gh-pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./site + publish_branch: gh-pages + force_orphan: true diff --git a/mkdocs.yml b/mkdocs.yml index 1776dd5..ba0a1e4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -42,14 +42,8 @@ nav: - Troubleshooting: troubleshooting.md - Compatibility And Regression Matrix: testing/compatibility_matrix.md - Reference: - - Corpus API: api/corpus.md + - Corpus API: api_corpus_tmp.md - Math Enrichment Runtime: math_enrichment_runtime.md - - Divio Skeleton: - - Overview: divio/overview.md - - Tutorials: divio/tutorials.md - - How-to Guides: divio/how_to_guides.md - - Reference: divio/reference.md - - Explanation: divio/explanation.md docs_dir: docs markdown_extensions: - admonition From 1bf4261d3a2a0597c37bce0b68c8d0faa773b5e7 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 9 Mar 2026 23:54:20 +0200 Subject: [PATCH 07/26] Fix docs links for Pages build --- .github/workflows/docs.yml | 2 +- docs/index.md | 16 +--------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 6c7fcbd..4719481 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -26,7 +26,7 @@ jobs: - name: Install MkDocs run: | python -m pip install --upgrade pip - pip install mkdocs mkdocs-material + pip install 
'mkdocs<2' 'mkdocs-material<10' - name: Build site run: mkdocs build --strict diff --git a/docs/index.md b/docs/index.md index d8ec279..01ad63c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,29 +7,15 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Quickstart Recipes](quickstart.md) — common extraction/OCR flows in copy-paste form. - [Lightweight PDF Corpus](lightweight_corpus.md) — 20 one-page PDFs for smoke testing without Docling or GPUs. -## Understand the architecture -- [Architecture Overview](architecture/index.md) — the end-to-end staged model and why it exists. -- [Core Design Principles](architecture/core_design_principles.md) — the design constraints that shape the pipeline. -- [Docling Throughput and Batching](architecture/docling_throughput_and_batching.md) — how throughput and stability trade off. -- [Failure Recovery and Skiplist](architecture/docling_failure_recovery_and_skiplist.md) — how the pipeline survives problematic PDFs. -- [Greek Text Validation](architecture/greek_text_validation.md) — why extraction success is not enough for Greek corpora. -- [Metadata, Artifacts, and Run Diagnostics](architecture/metadata_artifacts_and_run_diagnostics.md) — how provenance and operational state are retained. -- [Artifact Layout and Stage Handoffs](architecture/artifact_layout_and_stage_handoffs.md) — how folders, filenames, and metadata glue the stages together. -- [Resumability, Recovery, and Retention](architecture/resumability_recovery_and_retention.md) — how the current design supports reruns and where storage pressure appears. -- [DeepSeek-Only Upgrade Roadmap](architecture/deepseek_only_upgrade_roadmap.md) — the staged simplification plan for OCR and dependency upgrades. - ## Learn the pipeline - [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. - [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. 
- [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. -- [Stage Reference](stages/index.md) breaks down each pipeline stage as a contract. ## Configure and debug - [Configuration](configuration.md) lists all environment knobs. - [Troubleshooting](troubleshooting.md) captures the most common pitfalls. - [AWS Job Distribution](aws_job_distribution.md) describes large-scale scheduling. -- [Compatibility And Regression Matrix](testing/compatibility_matrix.md) defines the release-validation gates for the migration and upgrades. ## Reference -- [Corpus API](api/corpus.md) details public methods and parameters. -- `docs/divio/` contains placeholder pages for the upcoming Divio restructuring—feel free to open PRs fleshing them out. +- [Corpus API](api_corpus_tmp.md) details public methods and parameters. From 79cc99c237ba9ea685e0e94bc349907c26502bf7 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 10 Mar 2026 01:31:58 +0200 Subject: [PATCH 08/26] docs: map pipeline concepts to implementation --- README.md | 17 ++++ docs/api/corpus.md | 216 +++++++++++++++++++++++++++++++++++++++++++++ docs/code_map.md | 61 +++++++++++++ docs/index.md | 4 +- docs/pipeline.md | 105 +++++++++++++++++++--- mkdocs.yml | 4 +- 6 files changed, 395 insertions(+), 12 deletions(-) create mode 100644 docs/api/corpus.md create mode 100644 docs/code_map.md diff --git a/README.md b/README.md index e581361..5ad3b1f 100644 --- a/README.md +++ b/README.md @@ -80,11 +80,28 @@ Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `depend See the refreshed docs (`docs/index.md`) for detailed environment notes, CUDA/ORT combinations, and troubleshooting tips. ## Repo Landmarks +- `docs/code_map.md`: fast map from pipeline ideas to implementing classes and files. +- `docs/pipeline.md`: stage contracts, key parameters, and artifact outputs. - `samples/lightweight_pdf_corpus/`: 20 one-page PDFs with manifest + expected Markdown. 
- `src/glossapi/`: Corpus pipeline, cleaners, and orchestration logic. - `tests/test_pipeline_smoke.py`: Minimal regression entry point (uses the lightweight corpus). - `docs/`: MkDocs site with onboarding, pipeline recipes, and configuration guides. +## Pipeline map + +Use this as the shortest path from a documentation concept to the public call that implements it. + +| Stage | Main call | Important parameters | Writes | +| --- | --- | --- | --- | +| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `export_doc_json`, `emit_formula_index` | `markdown/<stem>.md`, `json/<stem>.docling.json(.zst)`, `json/metrics/*.json` | +| Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/<stem>.md`, updated parquet metrics/flags | +| OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/<stem>.md`, optional `json/<stem>.latex_map.jsonl` | +| Section | `Corpus.section()` | uses cleaner/parquet outputs to choose inputs | `sections/sections_for_annotation.parquet` | +| Annotate | `Corpus.annotate(...)` | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` | +| Triage math density | `Corpus.triage_math()` | no required args | updated `download_results/*.parquet` routing columns | +| JSONL export | `Corpus.jsonl(...)` | `output_path` | merged training/export JSONL | + ## Contributing - Run `pytest tests/test_pipeline_smoke.py` for a fast end-to-end check. - Regenerate the lightweight corpus via `generate_pdfs.py` and commit the updated PDFs + manifest together.
diff --git a/docs/api/corpus.md b/docs/api/corpus.md new file mode 100644 index 0000000..40f8c47 --- /dev/null +++ b/docs/api/corpus.md @@ -0,0 +1,216 @@ +# API Reference — `glossapi.Corpus` + +The `Corpus` class is the high‑level entrypoint for the pipeline. Below are the most commonly used methods. + +Use this page as a compact contract reference. For the stage-by-stage artifact view, see `../pipeline.md`. For the source-level ownership map, see `../code_map.md`. + +## Constructor + +```python +glossapi.Corpus( + input_dir: str | Path, + output_dir: str | Path, + section_classifier_model_path: str | Path | None = None, + extraction_model_path: str | Path | None = None, + metadata_path: str | Path | None = None, + annotation_mapping: dict[str, str] | None = None, + downloader_config: dict[str, Any] | None = None, + log_level: int = logging.INFO, + verbose: bool = False, +) +``` + +- `input_dir`: source files (PDF/DOCX/HTML/…) +- `output_dir`: pipeline outputs (markdown, json, sections, …) +- `downloader_config`: defaults for `download()` (e.g., concurrency, cookies) +- Main side effects: creates the standard output folders and lazy-initializes the extractor, sectioner, and classifier. + +## extract() + +```python +extract( + input_format: str = 'all', + num_threads: int | None = None, + accel_type: str = 'CUDA', # 'CPU'|'CUDA'|'MPS'|'Auto' + *, + force_ocr: bool = False, + formula_enrichment: bool = False, + code_enrichment: bool = False, + filenames: list[str] | None = None, + skip_existing: bool = True, + use_gpus: str = 'single', # 'single'|'multi' + devices: list[int] | None = None, + use_cls: bool = False, + benchmark_mode: bool = False, + export_doc_json: bool = True, + emit_formula_index: bool = False, +) -> None +``` + +- Purpose: Phase‑1 extraction from source files into markdown plus optional JSON intermediates. 
+- Typical inputs: + - files already present in `downloads/` + - or explicit `file_paths` +- Important parameters: + - `phase1_backend='safe'|'docling'|'auto'`: PyPDFium for stability vs Docling for native layout/OCR + - `force_ocr=True`: turn on OCR during extraction + - `use_gpus='multi'`: use all visible GPUs through a shared work queue + - `export_doc_json=True`: write `json/<stem>.docling.json(.zst)` + - `emit_formula_index=True`: also write `json/<stem>.formula_index.jsonl` +- Main outputs: + - `markdown/<stem>.md` + - `json/<stem>.docling.json(.zst)` when enabled + - `json/metrics/<stem>.metrics.json` + - `json/metrics/<stem>.per_page.metrics.json` + +## clean() + +```python +clean( + input_dir: str | Path | None = None, + threshold: float = 0.10, + num_threads: int | None = None, + drop_bad: bool = True, +) -> None +``` + +- Purpose: run the Rust cleaner/noise pipeline and decide which documents are safe for downstream processing. +- Typical inputs: + - `markdown/*.md` + - metadata parquet if present +- Important parameters: + - `threshold`: badness threshold + - `drop_bad`: whether to remove bad files from downstream selection + - `empty_char_threshold`, `empty_min_pages`: heuristics for OCR rerun recommendation +- Main outputs: + - `clean_markdown/<stem>.md` + - cleaner report parquet + - updated parquet columns such as `filter`, `needs_ocr`, and metrics fields +- Operational note: this stage is the quality gate that drives `section()` and `ocr()`.
+ +## ocr() + +```python +ocr( + *, + fix_bad: bool = True, + mode: str | None = None, + device: str | None = None, + model_dir: str | Path | None = None, + max_pages: int | None = None, + persist_engine: bool = True, + limit: int | None = None, + dpi: int | None = None, + precision: str | None = None, + math_enhance: bool = True, + math_targets: dict[str, list[tuple[int,int]]] | None = None, + math_batch_size: int = 8, + math_dpi_base: int = 220, + use_gpus: str = 'single', + devices: list[int] | None = None, + force: bool | None = None, +) -> None +``` + +- Purpose: selective OCR retry and optional Phase‑2 math/code enrichment. +- Mode selection: + - `ocr_bad`: rerun OCR only for cleaner-flagged docs + - `math_only`: run enrichment from existing Docling JSON + - `ocr_bad_then_math`: OCR flagged docs, then enrich them +- Important parameters: + - `mode`, `fix_bad`, `math_enhance` + - `use_gpus`, `devices` + - `math_targets` to restrict enrichment to specific items +- Main outputs: + - refreshed `markdown/<stem>.md` + - refreshed cleaner/parquet metadata after OCR reruns + - `json/<stem>.latex_map.jsonl` when enrichment runs + +## formula_enrich_from_json() + +```python +formula_enrich_from_json( + files: list[str] | None = None, + *, + device: str = 'cuda', + batch_size: int = 8, + dpi_base: int = 220, + targets_by_stem: dict[str, list[tuple[int,int]]] | None = None, +) -> None +``` + +- Purpose: Phase‑2 GPU enrichment from previously exported Docling JSON.
+- Typical inputs: + - `json/<stem>.docling.json(.zst)` + - optional formula/code index data +- Important parameters: + - `files`: restrict to specific stems + - `device`, `batch_size`, `dpi_base` + - `targets_by_stem`: target specific `(page_no, item_index)` tuples +- Main outputs: + - enriched markdown back into `markdown/<stem>.md` + - `json/<stem>.latex_map.jsonl` + +## section(), annotate() + +```python +section() -> None +annotate(annotation_type: str = 'text', fully_annotate: bool = True) -> None +``` + +- `section()`: + - purpose: convert markdown into one row per section with structural flags + - inputs: markdown selected by cleaner/parquet metadata + - outputs: `sections/sections_for_annotation.parquet` +- `annotate()`: + - purpose: classify sections and optionally expand them into full document structure + - important parameters: `annotation_type='text'|'chapter'|'auto'`, `fully_annotate` + - outputs: `classified_sections.parquet` and `fully_annotated_sections.parquet` + +## download() + +```python +download( + input_parquet: str | Path, + *, + links_column: str | None = None, + parallelize_by: str | None = None, + verbose: bool | None = None, + **kwargs, +) -> pd.DataFrame +``` + +- Purpose: fetch source files described in a parquet dataset. +- Typical inputs: + - an explicit `input_parquet` + - or the first parquet file found in `input_dir` +- Important parameters: + - `links_column`: override URL column name + - `parallelize_by`: choose grouping for the scheduler + - downloader kwargs via `**kwargs` for concurrency, SSL, cookies, retries, checkpoints, etc. +- Main outputs: + - downloaded files in `downloads/` + - partial/final results in `download_results/` + - returned `pd.DataFrame` with download status and metadata + +## triage_math() + +- Purpose: summarize per-page metrics and recommend Phase‑2 for math-dense docs.
+- Inputs: `json/metrics/<stem>.per_page.metrics.json` +- Outputs: updated `download_results` parquet with routing fields such as formula totals and phase recommendation + +## Suggested Reading Order + +1. `download()` if you start from URLs. +2. `extract()` for Phase‑1 layout/markdown. +3. `clean()` to decide what needs OCR. +4. `ocr()` if you need OCR retry or Phase‑2 enrichment. +5. `section()` and `annotate()` for structured downstream outputs. + +--- + +See also: +- Code map: ../code_map.md +- Pipeline overview and artifacts: ../pipeline.md +- Configuration and environment variables: ../configuration.md +- OCR and math enrichment details: ../ocr_and_math_enhancement.md diff --git a/docs/code_map.md b/docs/code_map.md new file mode 100644 index 0000000..97f12d5 --- /dev/null +++ b/docs/code_map.md @@ -0,0 +1,61 @@ +# Code Map + +This page maps the main documentation ideas to the code that implements them. It is +meant to help you move from "what does GlossAPI do?" to "where do I change it?" +without reading the entire repo. + +## Top-Level Entry Points + +| Area | Main code | Responsibility | +| --- | --- | --- | +| Public package entry | `src/glossapi/__init__.py` | Applies the RapidOCR patch on import and exports `Corpus`, `GlossSectionClassifier`, `GlossDownloader`, and related classes. | +| High-level orchestration | `src/glossapi/corpus.py` | Coordinates the end-to-end pipeline and owns the main folder/artifact conventions. | +| Phase-1 extraction engine | `src/glossapi/gloss_extract.py` | Builds/reuses Docling converters, handles safe vs Docling backend selection, batching, timeouts, resumption, and artifact export. | + +## Pipeline Stages + +| Stage | Main methods/classes | Notes | +| --- | --- | --- | +| Download | `Corpus.download()`, `GlossDownloader.download_files()` | Supports URL expansion, deduplication, checkpoints, per-domain scheduling, and resume.
| +| Extract | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.ensure_extractor()`, `GlossExtract.extract_path()` | Handles backend choice, GPU preflight, and single- vs multi-GPU dispatch. | +| Clean / quality gate | `Corpus.clean()` | Runs the Rust cleaner and merges quality metrics back into parquet metadata. | +| OCR retry / math follow-up | `Corpus.ocr()`, `Corpus.formula_enrich_from_json()` | Re-runs OCR only for flagged documents and optionally performs Phase-2 math/code enrichment from JSON. | +| Sectioning | `Corpus.section()`, `GlossSection.to_parquet()` | Converts markdown documents into section rows for later classification. | +| Classification / annotation | `Corpus.annotate()`, `GlossSectionClassifier.classify_sections()`, `GlossSectionClassifier.fully_annotate()` | Runs the SVM classifier and post-processes section labels into final document structure. | +| Export / triage | `Corpus.jsonl()`, `Corpus.triage_math()` | Produces training/export JSONL and computes routing hints for math-dense documents. | + +## Backend and Runtime Helpers + +| File | Responsibility | +| --- | --- | +| `src/glossapi/_pipeline.py` | Canonical builders for layout-only and RapidOCR-backed Docling pipelines. | +| `src/glossapi/rapidocr_safe.py` | Monkey-patch/shim for Docling 2.48.x so problematic OCR crops do not crash whole documents. | +| `src/glossapi/_rapidocr_paths.py` | Resolves packaged RapidOCR ONNX models and Greek keys, with env-var override support. | +| `src/glossapi/ocr_pool.py` | Reuses RapidOCR model instances where possible. | +| `src/glossapi/json_io.py` | Writes and reads compressed Docling JSON artifacts. | +| `src/glossapi/triage.py` | Summarizes per-page formula density and updates parquet routing metadata. | +| `src/glossapi/metrics.py` | Computes per-page parse/OCR/formula metrics from Docling conversions. 
| + +## Rust Extensions + +| Crate | Path | Purpose | +| --- | --- | --- | +| Cleaner | `rust/glossapi_rs_cleaner` | Markdown cleaning, script/noise filtering, and report generation used by `Corpus.clean()`. | +| Noise metrics | `rust/glossapi_rs_noise` | Fast quality metrics used by the broader pipeline and package build configuration. | + +## Tests To Read First + +| Test | Why it matters | +| --- | --- | +| `tests/test_pipeline_smoke.py` | Best high-level example of the intended artifact flow through extract -> clean -> OCR -> section. | +| `tests/test_corpus_guards.py` | Shows the contract around backend selection and GPU preflight. | +| `tests/test_jsonl_export.py` | Shows how final JSONL export merges cleaned markdown, parquet metadata, and math metrics. | +| `tests/test_rapidocr_patch.py` | Covers the Docling/RapidOCR compatibility patch and fallback paths. | + +## If You Need To Change... + +- Download scheduling or resume behavior: start in `src/glossapi/gloss_downloader.py`. +- Phase-1 parsing, OCR selection, or artifact generation: start in `src/glossapi/corpus.py` and `src/glossapi/gloss_extract.py`. +- Docling/RapidOCR wiring or provider issues: start in `src/glossapi/_pipeline.py`, `src/glossapi/rapidocr_safe.py`, and `src/glossapi/_rapidocr_paths.py`. +- Section labels or section-annotation rules: start in `src/glossapi/gloss_section_classifier.py`. +- Output folder contracts or stage sequencing: start in `src/glossapi/corpus.py`. diff --git a/docs/index.md b/docs/index.md index 01ad63c..997d2d8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,6 +8,7 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Lightweight PDF Corpus](lightweight_corpus.md) — 20 one-page PDFs for smoke testing without Docling or GPUs. ## Learn the pipeline +- [Code Map](code_map.md) links the main documentation ideas to the classes and files that implement them. 
- [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. - [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. - [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. @@ -18,4 +19,5 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [AWS Job Distribution](aws_job_distribution.md) describes large-scale scheduling. ## Reference -- [Corpus API](api_corpus_tmp.md) details public methods and parameters. +- [Corpus API](api/corpus.md) gives the compact contract view of the main public methods. +- [Legacy Corpus API Notes](api_corpus_tmp.md) remains available while the docs are being consolidated. diff --git a/docs/pipeline.md b/docs/pipeline.md index cb11662..cacc8c4 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -6,16 +6,88 @@ GlossAPI is a staged pipeline. You can enter at any stage and use the same folde The `Corpus` class is the stable surface of the project. New functionality should plug into the existing phase mixins so callers can stick to the small set of entrypoints (`download()`, `extract()`, `clean()`, `ocr()`, `section()`, `annotate()`, `export/jsonl*()`). The expected usage pattern is a short script that chains these calls; avoid ad-hoc monkeypatches or bypassing the orchestrator when adding features so downstream users retain resumability and consistent artifacts. 
-## Stages - -- Download (optional): fetch source files from URLs → `downloads/` -- Extract (Phase‑1): parse PDFs to Markdown; optional GPU OCR → `markdown/.md` -- Clean: compute quality metrics and filter low‑quality items; decide which to OCR -- OCR (compat shim): re‑run extract on filtered items with `force_ocr=True` -- JSON + index (optional): emit `json/.docling.json(.zst)` and `json/.formula_index.jsonl` for Phase‑2 -- Enrich (Phase‑2): decode FORMULA/CODE from JSON on GPU → overwrite `markdown/.md`, write `json/.latex_map.jsonl` -- Section: produce `sections/sections_for_annotation.parquet` -- Annotate: classify sections; produce `classified_sections.parquet` and `fully_annotated_sections.parquet` +## Stage Map + +| Stage | Main code | Typical inputs | Important parameters | Main outputs | +| --- | --- | --- | --- | --- | +| Download | `Corpus.download()`, `GlossDownloader.download_files()` | metadata parquet with a URL column | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Extract (Phase‑1) | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.extract_path()` | files in `downloads/` or explicit paths | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `devices`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | +| Clean | `Corpus.clean()` | `markdown/*.md` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, cleaner report parquet, parquet flags such as `filter` and `needs_ocr` | +| OCR retry | `Corpus.ocr(mode='ocr_bad'...)` | parquet rows flagged by cleaner | `mode`, `fix_bad`, `use_gpus`, `devices` | refreshed `markdown/.md`, refreshed cleaner/parquet metadata | +| Phase‑2 enrich | `Corpus.ocr(mode='math_only'...)`, `Corpus.formula_enrich_from_json()` | `json/.docling.json(.zst)` and optional formula index | `math_enhance`, `math_batch_size`, `math_dpi_base`, 
`targets_by_stem` | updated `markdown/.md`, `json/.latex_map.jsonl` | +| Section | `Corpus.section()`, `GlossSection.to_parquet()` | markdown selected by cleaner/parquet | no major public knobs | `sections/sections_for_annotation.parquet` | +| Annotate | `Corpus.annotate()`, `GlossSectionClassifier.classify_sections()`, `GlossSectionClassifier.fully_annotate()` | section parquet and classifier model | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` | +| Triage / export | `Corpus.triage_math()`, `Corpus.jsonl()` | metrics, parquet metadata, cleaned markdown | output path for JSONL | parquet routing hints, JSONL export | + +## Stage Contracts + +### 1. Download + +- Main code: `Corpus.download()` -> `GlossDownloader.download_files()` +- Purpose: read a metadata parquet, expand list/JSON URL cells, deduplicate URLs, download supported file types, and checkpoint progress. +- Typical inputs: + - a parquet file in `input_dir` or an explicit `input_parquet` + - a URL column such as `url` or `links_column` +- Main outputs: + - downloaded files in `downloads/` + - partial/final results in `download_results/` +- Read this next if you want the scheduler details: `gloss_downloader.py` + +### 2. Extract (Phase‑1) + +- Main code: `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.ensure_extractor()`, `GlossExtract.extract_path()` +- Purpose: convert source files to markdown and optional intermediate JSON artifacts. +- Typical inputs: + - files already present in `downloads/` + - or explicit `file_paths` +- Important parameters: + - `phase1_backend='safe'|'docling'|'auto'` + - `force_ocr=True` to turn on OCR during extraction + - `use_gpus='single'|'multi'` + - `export_doc_json` and `emit_formula_index` for later Phase‑2 work +- Main outputs: + - canonical markdown in `markdown/.md` + - optional Docling JSON and index artifacts in `json/` + - per-document and per-page metrics in `json/metrics/` + +### 3. 
Clean + +- Main code: `Corpus.clean()` +- Purpose: run the Rust cleaner, compute quality/noise signals, and decide what should continue downstream. +- Typical inputs: + - `markdown/*.md` + - metadata parquet if one exists +- Important parameters: + - `threshold` and `drop_bad` + - `empty_char_threshold` and `empty_min_pages` for OCR fallback decisions +- Main outputs: + - cleaned markdown in `clean_markdown/` + - merged parquet metadata including OCR-related flags + +### 4. OCR Retry and Phase‑2 Enrichment + +- Main code: `Corpus.ocr()` and `Corpus.formula_enrich_from_json()` +- Purpose: + - rerun OCR only for documents marked bad by the cleaner + - optionally decode formula/code regions from Docling JSON into markdown +- Modes: + - `ocr_bad` + - `math_only` + - `ocr_bad_then_math` +- Main outputs: + - refreshed `markdown/.md` + - `json/.latex_map.jsonl` when math/code enrichment runs + +### 5. Section and Annotate + +- Main code: `Corpus.section()`, `GlossSection.to_parquet()`, `Corpus.annotate()`, `GlossSectionClassifier.*` +- Purpose: + - split markdown into sections suitable for classification + - classify sections and optionally expand coarse labels into full document structure +- Main outputs: + - `sections/sections_for_annotation.parquet` + - `classified_sections.parquet` + - `fully_annotated_sections.parquet` ## Artifact Layout @@ -44,6 +116,19 @@ Notes: - Enriched Markdown replaces the plain Markdown (single canonical location). - Metrics lived under `markdown/` in earlier versions; they now live under `json/metrics/`. - When math enrichment cannot recover after the configured number of respawns, the corresponding PDFs and Docling artifacts are copied into the `problematic_math/` folders above and the stems are added to the fatal skip-list for later review. +- The same folder can act as both `input_dir` and `output_dir`; the pipeline creates its own subdirectories under that root. 
+ +## Readability Shortcut + +If you only need the shortest path through the system: + +1. `Corpus.download()` if you start from URLs. +2. `Corpus.extract()` for Phase‑1 markdown. +3. `Corpus.clean()` to decide what needs OCR. +4. `Corpus.ocr()` for selective OCR and optional math/code enrichment. +5. `Corpus.section()` and `Corpus.annotate()` for structured outputs. + +If you need to jump from these ideas to the source files, see `code_map.md`. ## Exporting corpora diff --git a/mkdocs.yml b/mkdocs.yml index ba0a1e4..43b70fa 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -42,7 +42,9 @@ nav: - Troubleshooting: troubleshooting.md - Compatibility And Regression Matrix: testing/compatibility_matrix.md - Reference: - - Corpus API: api_corpus_tmp.md + - Code Map: code_map.md + - Corpus API: api/corpus.md + - Legacy Corpus API Notes: api_corpus_tmp.md - Math Enrichment Runtime: math_enrichment_runtime.md docs_dir: docs markdown_extensions: From 379b8f0ff65817ee481153a14fe35b96039bd22a Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 17 Mar 2026 22:55:01 +0200 Subject: [PATCH 09/26] Handle HTML download interstitials --- src/glossapi/gloss_downloader.py | 85 +++++++++++++++++++++ tests/test_gloss_downloader_dynamic_html.py | 53 +++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 tests/test_gloss_downloader_dynamic_html.py diff --git a/src/glossapi/gloss_downloader.py b/src/glossapi/gloss_downloader.py index f9a7bf2..5afba9c 100644 --- a/src/glossapi/gloss_downloader.py +++ b/src/glossapi/gloss_downloader.py @@ -765,6 +765,77 @@ def infer_file_extension(self, url: str, headers: Dict[str, str], content: bytes # 5) Fall back to URL ext if any, otherwise 'bin' return url_ext if url_ext else 'bin' + + def _url_looks_like_file_endpoint(self, url: str) -> bool: + """Return True when the URL shape suggests a direct file download endpoint.""" + try: + lowered = str(url or "").lower() + except Exception: + return False + hints = ( + ".pdf", + ".docx", + ".pptx", + 
".xml", + ".csv", + "/pdf", + "format=pdf", + "type=pdf", + "download", + "attachment", + "/file", + "getfile.php", + ) + return any(token in lowered for token in hints) + + def _detect_html_interstitial(self, url: str, headers: Dict[str, str], content: bytes) -> Optional[str]: + """ + Detect HTML challenge/viewer pages that should not count as successful downloads. + + We still allow regular HTML documents, but fail fast on common interstitials + such as WAF challenge pages and JavaScript-only document viewers. + """ + try: + lower_headers = {str(k).lower(): str(v).lower() for k, v in (headers or {}).items()} + lower_body = (content or b"")[: 1 << 17].decode("utf-8", errors="ignore").lower() + except Exception: + lower_headers = {} + lower_body = "" + + if not lower_body: + return None + + if ( + "x-amzn-waf-action" in lower_headers + or "awswafintegration" in lower_body + or "challenge.js" in lower_body + or "verify that you're not a robot" in lower_body + ): + return ( + "HTML challenge page returned instead of a document; " + "browser automation or cookie bootstrap is required" + ) + + viewer_markers = ( + "fliphtml5_pages", + "monitor:player:html5", + "javascript/loadingjs.js", + "javascript/main.js", + "bookconfig.totalpagecount", + "getfile.php?lib=", + ) + viewer_hits = sum(1 for marker in viewer_markers if marker in lower_body) + if viewer_hits >= 2: + return ( + "HTML document viewer returned instead of a downloadable file; " + "a source-specific fetcher with persisted cookies/redirect handling is required" + ) + + content_type = lower_headers.get("content-type", "") + if self._url_looks_like_file_endpoint(url) and "text/html" in content_type: + return "Expected a file-like response but received HTML instead" + + return None async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], rate_limiter: RateLimiter, retry_count: int = 0, @@ -916,6 +987,15 @@ async def download_file(self, row_index: int, url: str, semaphore: 
Optional[asyn await f.write(chunk) # Infer extension using URL, headers and first bytes file_ext = self.infer_file_extension(url, resp_headers, bytes(head)) + if file_ext == 'html': + html_issue = self._detect_html_interstitial(url, resp_headers, bytes(head)) + if html_issue: + try: + os.remove(tmp_path) + except Exception: + pass + self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") + return False, "", file_ext, html_issue, retry_count if not self.is_supported_format(file_ext): # Clean up temp and report try: @@ -946,6 +1026,11 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn session, requester, url, headers, timeout ) file_ext = self.infer_file_extension(url, resp_headers, content) + if file_ext == 'html': + html_issue = self._detect_html_interstitial(url, resp_headers, content) + if html_issue: + self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") + return False, "", file_ext, html_issue, retry_count if not self.is_supported_format(file_ext): self.logger.warning(f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}") return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count diff --git a/tests/test_gloss_downloader_dynamic_html.py b/tests/test_gloss_downloader_dynamic_html.py new file mode 100644 index 0000000..a1bd678 --- /dev/null +++ b/tests/test_gloss_downloader_dynamic_html.py @@ -0,0 +1,53 @@ +from glossapi.gloss_downloader import GlossDownloader + + +def test_detects_waf_challenge_html(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + headers = { + "Content-Type": "text/html; charset=UTF-8", + "x-amzn-waf-action": "challenge", + } + body = b""" + + + """ + + assert downloader.infer_file_extension(url, headers, body) == "html" + error = downloader._detect_html_interstitial(url, headers, body) + + assert error is not None + assert "challenge page" in error.lower() + + +def test_detects_js_document_viewer_html(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://freader.ekt.gr/eadd/index.php?doc=60819&lang=el" + headers = { + "Content-Type": "text/html; charset=UTF-8", + } + body = b""" + + + + + """ + + assert downloader.infer_file_extension(url, headers, body) == "html" + error = downloader._detect_html_interstitial(url, headers, body) + + assert error is not None + assert "document viewer" in error.lower() + + +def test_regular_html_document_is_still_allowed(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://example.org/article" + headers = { + "Content-Type": "text/html; charset=UTF-8", + } + body = b"""Article +

Normal HTML document

Body text.

""" + + assert downloader.infer_file_extension(url, headers, body) == "html" + assert downloader._detect_html_interstitial(url, headers, body) is None From aca4dbb695dfc83d70e03887491df82e0efc4fdd Mon Sep 17 00:00:00 2001 From: fffoivos Date: Wed, 18 Mar 2026 01:26:18 +0200 Subject: [PATCH 10/26] Add browser-gated download mode --- README.md | 61 ++- docs/api/corpus.md | 23 + docs/stages/download.md | 25 ++ install_glossapi.py | 23 + pyproject.toml | 4 + src/glossapi/__init__.py | 4 + src/glossapi/corpus/phase_download.py | 35 +- src/glossapi/download_policy.py | 125 ++++++ src/glossapi/gloss_browser_downloader.py | 415 +++++++++++++++++ src/glossapi/gloss_downloader.py | 545 ++++++++++++++--------- src/glossapi/scripts/install_glossapi.py | 230 ++++++++++ tests/test_browser_gloss_downloader.py | 297 ++++++++++++ tests/test_install_glossapi.py | 51 +++ 13 files changed, 1618 insertions(+), 220 deletions(-) create mode 100644 install_glossapi.py create mode 100644 src/glossapi/download_policy.py create mode 100644 src/glossapi/gloss_browser_downloader.py create mode 100644 src/glossapi/scripts/install_glossapi.py create mode 100644 tests/test_browser_gloss_downloader.py create mode 100644 tests/test_install_glossapi.py diff --git a/README.md b/README.md index 5ad3b1f..953c03b 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,65 @@ Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `depend `setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. `setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`. +If you want a guided install that asks which phases you plan to use, run: + +```bash +python install_glossapi.py +``` + +That wizard keeps browser-gated download support (`playwright`) and the dedicated DeepSeek OCR runtime out of the main environment unless you explicitly select them. 
+ +## Browser-Gated Download Mode + +`Corpus.download(...)` now supports three high-level routes for file acquisition: + +- `download_mode="standard"`: direct HTTP downloader only +- `download_mode="auto"`: direct HTTP first, then browser-assisted recovery when the response is a recoverable browser-gated interstitial +- `download_mode="browser"`: go straight to browser-assisted acquisition for known browser-gated file endpoints + +Use `browser_mode=True` as a legacy alias for `download_mode="browser"`. + +### Policy-driven routing + +If you know which domains require browser bootstrap, route them with a policy file instead of probing every URL: + +```yaml +default: + downloader: standard + +rules: + - match: + domains: [eur-lex.europa.eu] + downloader: browser + + - match: + url_regex: "https://example.org/protected/.*" + downloader: auto +``` + +```python +from glossapi import Corpus + +corpus = Corpus(input_dir="out", output_dir="out") +corpus.download( + input_parquet="input_urls.parquet", + download_policy_file="download_policy.yml", +) +``` + +### Operational notes + +- Browser mode is for browser-gated file endpoints, not viewer-only sources. +- Browser sessions are cached per domain so a successful bootstrap can be reused across multiple files. +- Successful downloads still land in `downloads/`; extraction continues to consume only real files from that directory. +- Viewer-style sources still fail cleanly in `download_results/*.parquet` and do not create fake files. + +### Regression strategy + +The checked-in browser download tests use mocked browser/session flows and fake PDF bytes rather than hard-coded live URLs. + +For manual smoke checks against live browser-gated sources, build an ad hoc parquet locally and run it outside the committed test suite. + **DeepSeek runtime checklist** - Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR. 
- Export these to force the real runtime and avoid silent stub output: @@ -93,7 +152,7 @@ Use this as the shortest path from a documentation concept to the public call th | Stage | Main call | Important parameters | Writes | | --- | --- | --- | --- | -| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, `download_mode="standard"\|"auto"\|"browser"`, `download_policy_file`, downloader kwargs | `downloads/`, `download_results/*.parquet` | | Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | | Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, updated parquet metrics/flags | | OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/.md`, optional `json/.latex_map.jsonl` | diff --git a/docs/api/corpus.md b/docs/api/corpus.md index 40f8c47..2fb796c 100644 --- a/docs/api/corpus.md +++ b/docs/api/corpus.md @@ -187,12 +187,35 @@ download( - Important parameters: - `links_column`: override URL column name - `parallelize_by`: choose grouping for the scheduler + - `download_mode`: one of `standard`, `auto`, or `browser` + - `browser_mode=True`: alias for `download_mode="browser"` + - `download_policy_file`: route specific domains/URL patterns to `standard`, `auto`, or `browser` - downloader kwargs via `**kwargs` for concurrency, SSL, cookies, retries, checkpoints, etc. 
- Main outputs: - downloaded files in `downloads/` - partial/final results in `download_results/` - returned `pd.DataFrame` with download status and metadata +Browser-capable download mode is intended for browser-gated file endpoints where a real file still exists behind session/bootstrap checks. It is not a general viewer extractor. Viewer-only sources should still fail cleanly with a recorded error and no local file artifact. + +Example: + +```python +corpus.download( + input_parquet="input_urls.parquet", + download_mode="browser", +) +``` + +Policy-routed example: + +```python +corpus.download( + input_parquet="input_urls.parquet", + download_policy_file="download_policy.yml", +) +``` + ## triage_math() - Purpose: summarize per-page metrics and recommend Phase‑2 for math-dense docs. diff --git a/docs/stages/download.md b/docs/stages/download.md index 99bc4f8..c70c551 100644 --- a/docs/stages/download.md +++ b/docs/stages/download.md @@ -8,6 +8,7 @@ The download stage acquires source documents from parquet-based URL metadata and - read URL-bearing parquet input - download files concurrently +- route known browser-gated sources through browser-assisted acquisition when configured - retain source metadata context - avoid refetching previously successful downloads - assign stable-enough local filenames for downstream processing @@ -42,10 +43,34 @@ Typical issues include: - transient network failures - rate limiting +- browser-gated file endpoints that return HTML challenge/interstitial pages +- viewer-only sources that should fail cleanly instead of being recorded as successful downloads - duplicate URLs - filename collisions - partially completed corpus fetches +## Browser-gated sources + +The downloader now distinguishes between: + +- direct file endpoints +- browser-gated file endpoints +- viewer-only/document-reader sources + +For browser-gated file endpoints: + +- `download_mode="auto"` probes with direct HTTP and escalates to a browser session when it detects 
a recoverable interstitial +- `download_mode="browser"` goes directly to the browser-assisted path +- `download_policy_file=...` can route known domains or URL patterns to the correct path without probing every file + +Browser-assisted mode is designed for retrievable file endpoints, not for sources that only expose page images, tiles, HTML/SVG re-rendering, or DRM-wrapped readers. + +## Session reuse + +Browser-assisted mode reuses cached browser session state per domain so multiple files from the same protected source do not need a fresh browser bootstrap every time. + +This keeps the browser as a session-bootstrap resource rather than the main downloader. + ## Contributor note Any change to filename assignment or result parquet structure can have downstream impact on: diff --git a/install_glossapi.py b/install_glossapi.py new file mode 100644 index 0000000..ef7a7c9 --- /dev/null +++ b/install_glossapi.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import sys +from pathlib import Path + + +def _bootstrap_repo_src() -> None: + repo_root = Path(__file__).resolve().parent + src_dir = repo_root / "src" + src_str = str(src_dir) + if src_str not in sys.path: + sys.path.insert(0, src_str) + + +def main() -> int: + _bootstrap_repo_src() + from glossapi.scripts.install_glossapi import main as _main + + return int(_main()) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/pyproject.toml b/pyproject.toml index 60b23f8..3c045db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,10 @@ classifiers = [ ] [project.optional-dependencies] +# Browser automation fallback for browser-gated file endpoints +browser = [ + "playwright>=1.52,<2", +] # Docling extraction/layout stack docling = [ "docling==2.48.0", diff --git a/src/glossapi/__init__.py b/src/glossapi/__init__.py index c92d336..14f0c31 100644 --- a/src/glossapi/__init__.py +++ b/src/glossapi/__init__.py @@ -9,6 +9,7 @@ 'Sampler', 'Section', 'GlossDownloader', + 
'BrowserGlossDownloader', ] def __getattr__(name: str): @@ -31,6 +32,9 @@ def __getattr__(name: str): if name == 'GlossDownloader': from .gloss_downloader import GlossDownloader # type: ignore return GlossDownloader + if name == 'BrowserGlossDownloader': + from .gloss_browser_downloader import BrowserGlossDownloader # type: ignore + return BrowserGlossDownloader raise AttributeError(name) try: diff --git a/src/glossapi/corpus/phase_download.py b/src/glossapi/corpus/phase_download.py index 38179fd..c543076 100644 --- a/src/glossapi/corpus/phase_download.py +++ b/src/glossapi/corpus/phase_download.py @@ -19,6 +19,7 @@ import pandas as pd from .._naming import canonical_stem +from ..gloss_browser_downloader import BrowserGlossDownloader from ..gloss_downloader import GlossDownloader # Avoid importing section/classifier here; download phase does not use them. from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path @@ -212,6 +213,22 @@ def _looks_like_list(s: str) -> bool: # Initialize downloader configuration (kwargs take precedence) dl_cfg = dict(self.downloader_config) dl_cfg.update(kwargs) + browser_mode = dl_cfg.pop('browser_mode', None) + if browser_mode is not None and 'download_mode' not in dl_cfg: + dl_cfg['download_mode'] = 'browser' if browser_mode else 'standard' + download_mode = str(dl_cfg.pop('download_mode', 'standard')).strip().lower() + policy_requested = bool(dl_cfg.get('download_policy_file') or dl_cfg.get('download_policy')) + if download_mode in {'standard', 'default', 'http'} and not policy_requested: + downloader_cls = GlossDownloader + default_download_route = 'standard' + elif download_mode in {'browser', 'browser_protected'} or policy_requested: + downloader_cls = BrowserGlossDownloader + default_download_route = 'browser' if download_mode in {'browser', 'browser_protected'} else 'standard' + elif download_mode in {'auto', 'browser_fallback'}: + downloader_cls = BrowserGlossDownloader + default_download_route = 'auto' + else: + 
raise ValueError(f"Unsupported download_mode: {download_mode}") # Allow caller to override which column holds links if links_column: url_column = links_column @@ -232,14 +249,18 @@ def _looks_like_list(s: str) -> bool: except Exception: pass - downloader = GlossDownloader( - url_column=url_column, - output_dir=str(self.output_dir), - log_level=self.logger.level, - verbose=verbose if verbose is not None else self.verbose, + downloader_kwargs = { + "url_column": url_column, + "output_dir": str(self.output_dir), + "log_level": self.logger.level, + "verbose": verbose if verbose is not None else self.verbose, **{k: v for k, v in dl_cfg.items() if k not in {'input_parquet'}}, - _used_filename_bases=used_bases - ) + "_used_filename_bases": used_bases, + } + if downloader_cls is BrowserGlossDownloader: + downloader_kwargs["default_download_route"] = default_download_route + + downloader = downloader_cls(**downloader_kwargs) # Download files self.logger.info(f"Downloading files from URLs in {input_parquet}...") diff --git a/src/glossapi/download_policy.py b/src/glossapi/download_policy.py new file mode 100644 index 0000000..f42e043 --- /dev/null +++ b/src/glossapi/download_policy.py @@ -0,0 +1,125 @@ +"""Policy routing for downloader selection.""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, Optional +from urllib.parse import urlparse + +import yaml + +VALID_DOWNLOADERS = {"standard", "browser", "auto"} +ROUTE_OPTION_KEYS = { + "browser_timeout_ms", + "browser_post_load_wait_ms", + "browser_engine", + "browser_headless", + "browser_session_ttl_seconds", +} + + +def _normalize_downloader(value: Any, default: str = "standard") -> str: + normalized = str(value or default).strip().lower() + if normalized in {"default", "http"}: + normalized = "standard" + if normalized in {"browser_fallback"}: + normalized = "auto" + if normalized in {"browser_protected"}: + normalized = 
"browser" + if normalized not in VALID_DOWNLOADERS: + raise ValueError(f"Unsupported downloader route: {value}") + return normalized + + +@dataclass(frozen=True) +class DownloadPolicyMatch: + domains: tuple[str, ...] = () + url_regex: Optional[re.Pattern[str]] = None + + def matches(self, url: str) -> bool: + parsed = urlparse(url) + hostname = (parsed.hostname or "").lower() + if self.domains: + matched_domain = any( + hostname == domain or hostname.endswith(f".{domain}") + for domain in self.domains + ) + if not matched_domain: + return False + if self.url_regex and not self.url_regex.search(url): + return False + return True + + +@dataclass(frozen=True) +class DownloadPolicyRule: + matcher: DownloadPolicyMatch + downloader: str + options: Dict[str, Any] + + def matches(self, url: str) -> bool: + return self.matcher.matches(url) + + +@dataclass(frozen=True) +class DownloadPolicy: + default_downloader: str = "standard" + default_options: Dict[str, Any] | None = None + rules: tuple[DownloadPolicyRule, ...] 
= () + + def resolve(self, url: str) -> tuple[str, Dict[str, Any]]: + for rule in self.rules: + if rule.matches(url): + return rule.downloader, dict(rule.options) + return self.default_downloader, dict(self.default_options or {}) + + +def _extract_route_options(data: Dict[str, Any]) -> Dict[str, Any]: + return {key: value for key, value in data.items() if key in ROUTE_OPTION_KEYS} + + +def _build_matcher(raw: Dict[str, Any]) -> DownloadPolicyMatch: + domains = tuple(str(item).strip().lower() for item in (raw.get("domains") or []) if str(item).strip()) + url_regex = raw.get("url_regex") + compiled = re.compile(str(url_regex)) if url_regex else None + return DownloadPolicyMatch(domains=domains, url_regex=compiled) + + +def build_download_policy(data: Dict[str, Any]) -> DownloadPolicy: + default_block = dict(data.get("default") or {}) + default_downloader = _normalize_downloader(default_block.get("downloader"), default="standard") + default_options = _extract_route_options(default_block) + + rules = [] + for raw_rule in data.get("rules") or []: + raw_rule = dict(raw_rule or {}) + matcher = _build_matcher(dict(raw_rule.get("match") or {})) + downloader = _normalize_downloader(raw_rule.get("downloader"), default=default_downloader) + options = _extract_route_options(raw_rule) + rules.append(DownloadPolicyRule(matcher=matcher, downloader=downloader, options=options)) + + return DownloadPolicy( + default_downloader=default_downloader, + default_options=default_options, + rules=tuple(rules), + ) + + +def load_download_policy(path: str | Path) -> DownloadPolicy: + policy_path = Path(path).expanduser().resolve() + payload = yaml.safe_load(policy_path.read_text(encoding="utf-8")) or {} + if not isinstance(payload, dict): + raise ValueError("Download policy file must define a mapping at the top level") + return build_download_policy(payload) + + +__all__ = [ + "DownloadPolicy", + "DownloadPolicyMatch", + "DownloadPolicyRule", + "VALID_DOWNLOADERS", + "build_download_policy", + 
"load_download_policy", +] diff --git a/src/glossapi/gloss_browser_downloader.py b/src/glossapi/gloss_browser_downloader.py new file mode 100644 index 0000000..1fc41fa --- /dev/null +++ b/src/glossapi/gloss_browser_downloader.py @@ -0,0 +1,415 @@ +"""Browser-capable downloader mode for browser-gated file endpoints.""" + +from __future__ import annotations + +import asyncio +import os +import time +from dataclasses import dataclass +from urllib.parse import urlparse +from typing import Any, Dict, Optional, Tuple + +import aiofiles +import aiohttp + +from .download_policy import DownloadPolicy, load_download_policy +from .gloss_downloader import GlossDownloader + + +@dataclass +class BrowserSessionState: + user_agent: str + cookie_header: str + cached_at: float + + +class BrowserGlossDownloader(GlossDownloader): + """ + Downloader variant that retries browser-gated file endpoints via Playwright. + + This mode only targets file endpoints that are protected by browser/session + checks. It intentionally does not attempt viewer-style extraction. 
+ """ + + def __init__( + self, + *args, + browser_timeout_ms: int = 60000, + browser_post_load_wait_ms: int = 3000, + browser_engine: str = "chromium", + browser_headless: bool = True, + browser_session_ttl_seconds: int = 900, + browser_max_parallel_bootstraps: int = 2, + default_download_route: str = "auto", + **kwargs, + ): + super().__init__(*args, **kwargs) + self.browser_timeout_ms = int(browser_timeout_ms) + self.browser_post_load_wait_ms = int(browser_post_load_wait_ms) + self.browser_engine = str(browser_engine or "chromium") + self.browser_headless = bool(browser_headless) + self.browser_session_ttl_seconds = int(browser_session_ttl_seconds) + self.browser_max_parallel_bootstraps = max(1, int(browser_max_parallel_bootstraps)) + self.browser_bootstrap_semaphore = asyncio.Semaphore(self.browser_max_parallel_bootstraps) + self._browser_session_cache: Dict[str, BrowserSessionState] = {} + self._browser_session_locks: Dict[str, asyncio.Lock] = {} + self.default_download_route = str(default_download_route or "auto").strip().lower() + self.policy = self._load_policy() + + def _load_policy(self) -> Optional[DownloadPolicy]: + if self.download_policy is not None: + return self.download_policy + if self.download_policy_file: + return load_download_policy(self.download_policy_file) + return None + + def _resolve_route(self, url: str) -> tuple[str, Dict[str, Any]]: + if self.policy is not None: + return self.policy.resolve(url) + return self.default_download_route, {} + + def _route_setting(self, route_options: Dict[str, Any], name: str, fallback: Any) -> Any: + return route_options.get(name, fallback) + + def _domain_key(self, url: str) -> str: + return self._extract_base_domain(url) or (urlparse(url).hostname or "").lower() + + def _choose_browser_bootstrap_url(self, url: str) -> str: + if self._url_looks_like_file_endpoint(url): + return self.get_base_url(url) + return url + + def _should_ignore_navigation_exception(self, url: str, exc: Exception) -> bool: + 
message = str(exc) + if self._url_looks_like_file_endpoint(url) and "net::ERR_ABORTED" in message: + return True + return False + + def _session_lock_for_domain(self, domain_key: str) -> asyncio.Lock: + lock = self._browser_session_locks.get(domain_key) + if lock is None: + lock = asyncio.Lock() + self._browser_session_locks[domain_key] = lock + return lock + + def _is_browser_session_fresh(self, state: BrowserSessionState, route_options: Dict[str, Any]) -> bool: + ttl = int(self._route_setting(route_options, "browser_session_ttl_seconds", self.browser_session_ttl_seconds)) + if ttl <= 0: + return False + return (time.time() - state.cached_at) < ttl + + def _should_attempt_browser_recovery(self, url: str, html_issue: str) -> bool: + issue = str(html_issue or "").lower() + if "document viewer returned" in issue: + return False + if "challenge page returned" in issue: + return True + if "cookie bootstrap is required" in issue: + return True + if "expected a file-like response but received html instead" in issue: + return self._url_looks_like_file_endpoint(url) + return False + + def _build_ssl_connector(self) -> Optional[aiohttp.TCPConnector]: + connector = None + if not self.ssl_verify: + connector = aiohttp.TCPConnector(ssl=False) + elif self.ssl_cafile: + import ssl as _ssl + + ctx = _ssl.create_default_context(cafile=self.ssl_cafile) + connector = aiohttp.TCPConnector(ssl=ctx) + return connector + + def _domain_cookies_for_url(self, url: str) -> Dict[str, str]: + cookies: Dict[str, str] = {} + for domain_pattern, domain_cookies in self.domain_cookies.items(): + if domain_pattern in url: + cookies.update(domain_cookies) + return cookies + + async def _write_recovered_file(self, row_index: int, filename: str, body: bytes) -> None: + tmp_path = self.downloads_dir / f".part_browser_{row_index}" + async with aiofiles.open(tmp_path, "wb") as handle: + await handle.write(body) + final_path = self.downloads_dir / filename + os.replace(tmp_path, final_path) + + async def 
_fetch_with_browser_session_state( + self, + *, + url: str, + referer: Optional[str], + state: BrowserSessionState, + ) -> Tuple[bytes, Dict[str, str], Dict[str, Any]]: + request_headers = { + "User-Agent": state.user_agent, + "Accept": "application/pdf,application/octet-stream,*/*;q=0.8", + } + if state.cookie_header: + request_headers["Cookie"] = state.cookie_header + if referer: + request_headers["Referer"] = referer + + connector = self._build_ssl_connector() + timeout = aiohttp.ClientTimeout(total=min(max(self.request_timeout, 30), 180)) + async with aiohttp.ClientSession(connector=connector) as session: + async with session.get(url, headers=request_headers, timeout=timeout) as response: + response.raise_for_status() + body = await response.read() + response_headers = {str(k): str(v) for k, v in (response.headers or {}).items()} + return body, response_headers, {"candidate_url": url, "session_reused": True} + + async def _bootstrap_browser_session_state( + self, + *, + url: str, + referer: Optional[str], + route_options: Dict[str, Any], + ) -> tuple[BrowserSessionState, list[tuple[str, Dict[str, str], str]]]: + timeout_ms = int(self._route_setting(route_options, "browser_timeout_ms", self.browser_timeout_ms)) + post_load_wait_ms = int( + self._route_setting(route_options, "browser_post_load_wait_ms", self.browser_post_load_wait_ms) + ) + browser_engine = str(self._route_setting(route_options, "browser_engine", self.browser_engine)) + browser_headless = bool(self._route_setting(route_options, "browser_headless", self.browser_headless)) + + try: + from playwright.async_api import async_playwright + except ImportError as exc: # pragma: no cover - exercised via monkeypatch + raise RuntimeError( + "Browser download mode requires the optional 'browser' dependencies " + "(install Playwright and browser binaries)" + ) from exc + + accepted_responses: list[tuple[str, Dict[str, str], str]] = [] + bootstrap_url = self._choose_browser_bootstrap_url(url) + + async with 
self.browser_bootstrap_semaphore: + async with async_playwright() as playwright: + browser_type = getattr(playwright, browser_engine, None) + if browser_type is None: + raise RuntimeError(f"Unsupported browser engine: {browser_engine}") + + browser = await browser_type.launch(headless=browser_headless) + context = await browser.new_context(ignore_https_errors=not self.ssl_verify) + parsed = urlparse(url) + browser_cookies = [ + { + "name": key, + "value": str(value), + "domain": parsed.hostname or "", + "path": "/", + } + for key, value in self._domain_cookies_for_url(url).items() + ] + if browser_cookies: + await context.add_cookies(browser_cookies) + page = await context.new_page() + if referer: + await page.set_extra_http_headers({"Referer": referer}) + + async def _route_filter(route: Any) -> None: + req = route.request + if req.resource_type in {"image", "media", "font"}: + await route.abort() + return + req_url = str(req.url or "") + if "googletagmanager" in req_url or "google-analytics.com" in req_url: + await route.abort() + return + await route.continue_() + + await page.route("**/*", _route_filter) + + def _record_response(response: Any) -> None: + try: + response_headers = {str(k): str(v) for k, v in (response.headers or {}).items()} + file_ext = self.infer_file_extension(response.url, response_headers, b"") + if file_ext and file_ext != "html" and self.is_supported_format(file_ext): + accepted_responses.append((response.url, response_headers, file_ext)) + except Exception: + return + + page.on("response", _record_response) + + try: + main_response = None + try: + main_response = await page.goto(bootstrap_url, wait_until="networkidle", timeout=timeout_ms) + except Exception as exc: + if not self._should_ignore_navigation_exception(bootstrap_url, exc): + raise + if main_response is not None: + main_headers = {str(k): str(v) for k, v in (main_response.headers or {}).items()} + main_ext = self.infer_file_extension(main_response.url, main_headers, b"") + if 
main_ext and main_ext != "html" and self.is_supported_format(main_ext): + accepted_responses.insert(0, (main_response.url, main_headers, main_ext)) + if not accepted_responses and post_load_wait_ms > 0: + await page.wait_for_timeout(post_load_wait_ms) + + browser_user_agent = await page.evaluate("() => navigator.userAgent") + browser_cookies = await context.cookies() + finally: + await browser.close() + + cookie_header = "; ".join( + f"{cookie['name']}={cookie['value']}" for cookie in browser_cookies if cookie.get("name") + ) + return BrowserSessionState( + user_agent=browser_user_agent, + cookie_header=cookie_header, + cached_at=time.time(), + ), accepted_responses + + async def _download_via_browser_session( + self, + *, + url: str, + referer: Optional[str], + route_options: Optional[Dict[str, Any]] = None, + force_refresh: bool = False, + ) -> Tuple[bytes, Dict[str, str], Dict[str, Any]]: + options = dict(route_options or {}) + domain_key = self._domain_key(url) + state = self._browser_session_cache.get(domain_key) + if state and self._is_browser_session_fresh(state, options) and not force_refresh: + try: + return await self._fetch_with_browser_session_state(url=url, referer=referer, state=state) + except Exception: + pass + + lock = self._session_lock_for_domain(domain_key) + async with lock: + state = self._browser_session_cache.get(domain_key) + if state and self._is_browser_session_fresh(state, options) and not force_refresh: + try: + return await self._fetch_with_browser_session_state(url=url, referer=referer, state=state) + except Exception: + pass + + state, accepted_responses = await self._bootstrap_browser_session_state( + url=url, + referer=referer, + route_options=options, + ) + self._browser_session_cache[domain_key] = state + candidate_url = accepted_responses[0][0] if accepted_responses else url + body, response_headers, meta = await self._fetch_with_browser_session_state( + url=candidate_url, + referer=referer, + state=state, + ) + meta.update({ + 
"candidate_url": candidate_url, + "session_reused": False, + "domain_key": domain_key, + }) + return body, response_headers, meta + + async def _download_browser_route( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + route_options: Dict[str, Any], + ) -> Tuple[bool, str, str, str, int]: + try: + body, response_headers, meta = await self._download_via_browser_session( + url=url, + referer=referer, + route_options=route_options, + ) + except Exception as exc: + error_msg = f"Browser-routed download failed: {exc}" + self.logger.warning(error_msg) + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 + return await self._finalize_download_result( + row_index=row_index, + url=meta.get("candidate_url") or url, + resp_headers=response_headers, + content=body, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + + async def _preflight_download( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + route, route_options = self._resolve_route(url) + if route != "browser": + return None + return await self._download_browser_route( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + route_options=route_options, + ) + + async def _recover_html_interstitial( + self, + *, + row_index: int, + url: str, + headers: Dict[str, str], + content: bytes, + html_issue: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + route, route_options = self._resolve_route(url) + if route == "standard": + return None + if route == "auto" and not self._should_attempt_browser_recovery(url, html_issue): + return None + + try: + body, response_headers, meta = await self._download_via_browser_session( + url=url, + 
referer=referer, + route_options=route_options, + ) + except Exception as exc: + message = f"{html_issue}; browser recovery failed: {exc}" + self.logger.warning(message) + return False, "", "html", message, retry_count + 1 + + file_ext = self.infer_file_extension(meta["candidate_url"], response_headers, body) + if file_ext == "html": + message = ( + f"{html_issue}; browser recovery still returned HTML from {meta['candidate_url']}" + ) + self.logger.warning(message) + return False, "", file_ext, message, retry_count + 1 + if not self.is_supported_format(file_ext): + message = ( + f"{html_issue}; browser recovery returned unsupported format: {file_ext}" + ) + self.logger.warning(message) + return False, "", file_ext or "", message, retry_count + 1 + + if filename_base and str(filename_base).strip(): + filename = f"{filename_base}.{file_ext}" + else: + filename = self.generate_filename(row_index, file_ext) + + await self._write_recovered_file(row_index, filename, body) + self.logger.info( + "Recovered browser-gated download via browser mode: %s -> %s", + url, + filename, + ) + return True, filename, file_ext, "", retry_count diff --git a/src/glossapi/gloss_downloader.py b/src/glossapi/gloss_downloader.py index 5afba9c..ffce858 100644 --- a/src/glossapi/gloss_downloader.py +++ b/src/glossapi/gloss_downloader.py @@ -141,6 +141,8 @@ def __init__( error_burst_window: int = 20, error_burst_threshold: float = 0.5, park_403_seconds: float = 600.0, + download_policy_file: Optional[Union[str, Path]] = None, + download_policy: Optional[Any] = None, _used_filename_bases: Optional[Set[str]] = None, ): """ @@ -241,6 +243,8 @@ def verbose_log(self, message, level=logging.DEBUG): self.checkpoint_seconds = float(checkpoint_seconds) if checkpoint_seconds else None # Warnings JSON path self.domain_warnings_path = self.output_dir / 'domain_scheduler_warnings.json' + self.download_policy_file = Path(download_policy_file).expanduser().resolve() if download_policy_file else None + 
self.download_policy = download_policy # Progress logger (separate file; default to output logs dir) self.progress_logger = self.logger @@ -836,41 +840,44 @@ def _detect_html_interstitial(self, url: str, headers: Dict[str, str], content: return "Expected a file-like response but received HTML instead" return None - - async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], - rate_limiter: RateLimiter, retry_count: int = 0, - filename_base: Optional[str] = None, - referer: Optional[str] = None) -> Tuple[bool, str, str, str, int]: - """ - Download a file from a URL - - Args: - row_index: Index in the dataframe - url: URL to download - semaphore: Semaphore for concurrency control - rate_limiter: Rate limiter for API limits - retry_count: Current retry count - Returns: - Tuple[bool, str, str, str, int]: (success, filename, file_ext, error_message, retry_count) - """ - if not url or pd.isna(url): - return False, "", "", "Empty URL", retry_count - - # Get a new user-agent for each request - user_agent = next(self.user_agents) - domain = urlparse(url).netloc - - # Ensure URL has scheme + + async def _recover_html_interstitial( + self, + *, + row_index: int, + url: str, + headers: Dict[str, str], + content: bytes, + html_issue: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + """Allow subclasses to recover from HTML interstitials via alternate fetch modes.""" + return None + + async def _preflight_download( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + """Allow subclasses to short-circuit the direct HTTP path for known routes.""" + return None + + def _normalize_request_url(self, url: str) -> str: if not url.startswith(("http://", "https://")): - url = f"https://{url}" - - # Get base URL for referer header + return 
f"https://{url}" + return url + + def _build_request_headers(self, url: str, user_agent: str, referer: Optional[str]) -> Dict[str, str]: + domain = urlparse(url).netloc base_url = self.get_base_url(url) - - # Enhanced headers with common browser-like attributes to bypass 403 errors - # Prefer caller-provided referer (e.g., the external_link page) - _referer = (referer or '').strip() - headers = { + referer_value = (referer or '').strip() + return { 'User-Agent': user_agent, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', @@ -884,74 +891,296 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'TE': 'trailers', - 'Referer': _referer if _referer else f"https://www.google.com/search?q={domain}", + 'Referer': referer_value if referer_value else f"https://www.google.com/search?q={domain}", 'Origin': base_url, 'DNT': '1' } - - # Check for domain-specific cookies - cookies = {} + + def _resolve_request_cookies(self, url: str) -> Dict[str, str]: + cookies: Dict[str, str] = {} for domain_pattern, domain_cookies in self.domain_cookies.items(): if domain_pattern in url: cookies.update(domain_cookies) # If the domain needs dynamic values like random IDs - for key, value in cookies.items(): + for key, value in list(cookies.items()): if 'random.randint' in str(value): # Replace with an actual random value (only supporting this pattern for now) - if 'session-id' in value: + if 'session-id' in str(value): cookies[key] = f"session-id-{random.randint(100000000, 999999999)}" + return cookies + + def _build_request_timeout(self, retry_count: int) -> aiohttp.ClientTimeout: + return aiohttp.ClientTimeout( + total=min(self.request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes + connect=min(30 * (1.2 ** retry_count), 60), # Cap connect timeout at 1 minute + sock_connect=min(30 * (1.2 ** retry_count), 60), # Cap 
socket connect at 1 minute + sock_read=min(60 * (1.2 ** retry_count), 120) # Cap socket read at 2 minutes + ) + + def _build_session_connector(self, url: str) -> Optional[aiohttp.TCPConnector]: + connector = None + url_base = self._extract_base_domain(url) + force_insecure = url_base in getattr(self, '_domains_ssl_insecure', set()) + if (not self.ssl_verify) or force_insecure: + connector = aiohttp.TCPConnector(ssl=False) + elif self.ssl_cafile: + import ssl as _ssl + ctx = _ssl.create_default_context(cafile=self.ssl_cafile) + connector = aiohttp.TCPConnector(ssl=ctx) + return connector + + async def _bootstrap_download_session( + self, + session: aiohttp.ClientSession, + url: str, + headers: Dict[str, str], + ) -> Dict[str, str]: + headers = await self.setup_session(session, url, headers) + + # Set a shorter timeout for the initial connection attempt + base_timeout = aiohttp.ClientTimeout(total=10) + try: + # Visit the base domain to establish cookies if needed + base_domain = urlparse(url).netloc + if any(domain in base_domain for domain in self.domain_cookies.keys()): + base_url = f"https://{base_domain}" + async with session.get(base_url, headers=headers, timeout=base_timeout): + pass + except Exception as e: + # Non-fatal error, just log and continue + self.logger.debug(f"Initial base URL visit failed: {str(e)}") + return headers + + def _best_effort_url_extension(self, url: str) -> str: + try: + return self.get_file_extension_from_url(url) + except Exception: + return "" + + def _build_output_filename(self, row_index: int, file_ext: str, filename_base: Optional[str]) -> str: + if filename_base and str(filename_base).strip(): + return f"{filename_base}.{file_ext}" + return self.generate_filename(row_index, file_ext) + + def _cleanup_temp_file(self, tmp_path: Optional[Path]) -> None: + if not tmp_path: + return + try: + os.remove(tmp_path) + except Exception: + pass + + def _move_temp_file_to_final(self, tmp_path: Path, filename: str) -> None: + final_path = 
Path(self.downloads_dir) / filename + try: + os.replace(tmp_path, final_path) + except Exception: + try: + os.rename(tmp_path, final_path) + except Exception: + pass + + async def _finalize_download_result( + self, + *, + row_index: int, + url: str, + resp_headers: Dict[str, str], + content: bytes, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + tmp_path: Optional[Path] = None, + ) -> Tuple[bool, str, str, str, int]: + file_ext = self.infer_file_extension(url, resp_headers, content) + if file_ext == 'html': + html_issue = self._detect_html_interstitial(url, resp_headers, content) + if html_issue: + self._cleanup_temp_file(tmp_path) + recovered = await self._recover_html_interstitial( + row_index=row_index, + url=url, + headers=resp_headers, + content=content, + html_issue=html_issue, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + if recovered is not None: + return recovered + self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") + return False, "", file_ext, html_issue, retry_count + if not self.is_supported_format(file_ext): + self._cleanup_temp_file(tmp_path) + self.logger.warning( + f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}" + ) + return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count + + filename = self._build_output_filename(row_index, file_ext, filename_base) + if tmp_path is not None: + self._move_temp_file_to_final(tmp_path, filename) + else: + await self.write_file(filename, content, self.downloads_dir) + self.logger.info(f"Successfully downloaded {filename} from {url}") + return True, filename, file_ext, "", retry_count + + async def _download_via_streaming_get( + self, + *, + session: aiohttp.ClientSession, + row_index: int, + url: str, + headers: Dict[str, str], + timeout: aiohttp.ClientTimeout, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Tuple[bool, str, str, str, int]: + from tenacity import AsyncRetrying + + head = bytearray() + async for attempt in AsyncRetrying( + stop=stop_after_attempt(max(1, int(self.max_retries))), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=(retry_if_exception_type(aiohttp.ClientError) | + retry_if_exception_type(asyncio.TimeoutError)), + before_sleep=before_sleep_log(logging.getLogger(__name__), logging.INFO), + reraise=True, + ): + with attempt: + async with session.get(url, headers=headers, timeout=timeout) as response: + response.raise_for_status() + resp_headers = dict(response.headers or {}) + tmp_path = Path(self.downloads_dir) / f".part_{row_index}" + async with aiofiles.open(tmp_path, 'wb') as f: + async for chunk in response.content.iter_chunked(1 << 16): + if chunk: + if len(head) < (1 << 16): + need = (1 << 16) - len(head) + head.extend(chunk[:need]) + await f.write(chunk) + return await self._finalize_download_result( + row_index=row_index, + url=url, + resp_headers=resp_headers, + content=bytes(head), + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + tmp_path=tmp_path, + ) + return False, "", "", "Retry exhaustion", retry_count + 1 + + async def 
_download_via_buffered_request( + self, + *, + session: aiohttp.ClientSession, + requester: str, + row_index: int, + url: str, + headers: Dict[str, str], + timeout: aiohttp.ClientTimeout, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Tuple[bool, str, str, str, int]: + content, status, resp_headers = await self.make_request( + session, requester, url, headers, timeout + ) + return await self._finalize_download_result( + row_index=row_index, + url=url, + resp_headers=resp_headers, + content=content, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + + def _build_http_error_result( + self, + url: str, + error: aiohttp.ClientResponseError, + retry_count: int, + ) -> Tuple[bool, str, str, str, int]: + status = error.status + self.logger.warning(f"Received {status} for {url}") + + if self.verbose: + self.logger.debug(f"HTTP Error Details - Status: {error.status}, Message: {error.message}") + self.logger.debug(f"Headers: {error.headers if hasattr(error, 'headers') else 'No headers available'}") + self.logger.debug(f"Request info: {error.request_info if hasattr(error, 'request_info') else 'No request info available'}") + + retry_after = None + try: + hdrs = dict(getattr(error, 'headers', {}) or {}) + for k, v in hdrs.items(): + if k.lower() == 'retry-after': + val = str(v).strip() + if val.isdigit(): + retry_after = int(val) + else: + try: + dt = parsedate_to_datetime(val) + retry_after = max(0, int((dt.timestamp() - time.time()))) + except Exception: + retry_after = None + break + except Exception: + retry_after = None + error_msg = f"HTTP {status}: {str(error)}" + if status in (429, 503) and retry_after is not None: + error_msg += f" retry_after={retry_after}" + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 + + async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], + rate_limiter: RateLimiter, retry_count: int = 0, + 
filename_base: Optional[str] = None, + referer: Optional[str] = None) -> Tuple[bool, str, str, str, int]: + """ + Download a file from a URL + + Args: + row_index: Index in the dataframe + url: URL to download + semaphore: Semaphore for concurrency control + rate_limiter: Rate limiter for API limits + retry_count: Current retry count + Returns: + Tuple[bool, str, str, str, int]: (success, filename, file_ext, error_message, retry_count) + """ + if not url or pd.isna(url): + return False, "", "", "Empty URL", retry_count + + url = self._normalize_request_url(url) + user_agent = next(self.user_agents) + headers = self._build_request_headers(url, user_agent, referer) + cookies = self._resolve_request_cookies(url) if semaphore: await semaphore.acquire() try: - # Apply rate limiting await rate_limiter.acquire() - - # Implement exponential backoff sleep_time = self.sleep * (2 ** retry_count) await asyncio.sleep(random.uniform(sleep_time, sleep_time * 1.5)) - - # Set up timeout with exponential backoff - timeout = aiohttp.ClientTimeout( - total=min(self.request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes - connect=min(30 * (1.2 ** retry_count), 60), # Cap connect timeout at 1 minute - sock_connect=min(30 * (1.2 ** retry_count), 60), # Cap socket connect at 1 minute - sock_read=min(60 * (1.2 ** retry_count), 120) # Cap socket read at 2 minutes + preflight = await self._preflight_download( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, ) - + if preflight is not None: + return preflight + timeout = self._build_request_timeout(retry_count) + try: - # Prepare optional SSL connector - connector = None - # Domain-specific insecure override (discovered via ping) - url_base = self._extract_base_domain(url) - _force_insecure = url_base in getattr(self, '_domains_ssl_insecure', set()) - if (not self.ssl_verify) or _force_insecure: - connector = aiohttp.TCPConnector(ssl=False) - elif self.ssl_cafile: - import 
ssl as _ssl - ctx = _ssl.create_default_context(cafile=self.ssl_cafile) - connector = aiohttp.TCPConnector(ssl=ctx) - # Create a new session for each download to avoid cookie contamination + connector = self._build_session_connector(url) async with aiohttp.ClientSession(cookies=cookies, connector=connector) as session: try: - # Try to access the base domain first to establish cookies - headers = await self.setup_session(session, url, headers) - - # Set a shorter timeout for the initial connection attempt - base_timeout = aiohttp.ClientTimeout(total=10) - try: - # Visit the base domain to establish cookies if needed - base_domain = urlparse(url).netloc - if any(domain in base_domain for domain in self.domain_cookies.keys()): - base_url = f"https://{base_domain}" - async with session.get(base_url, headers=headers, timeout=base_timeout): - pass - except Exception as e: - # Non-fatal error, just log and continue - self.logger.debug(f"Initial base URL visit failed: {str(e)}") - pass - - # Choose request method and perform streaming for GET + headers = await self._bootstrap_download_session(session, url, headers) requester = self.request_method.lower() try: @@ -960,126 +1189,30 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn self.verbose_log(f"Headers: {headers}") if requester == 'get': - # Streaming GET with retries - from tenacity import AsyncRetrying - head = bytearray() - resp_headers = {} - async for attempt in AsyncRetrying( - stop=stop_after_attempt(max(1, int(self.max_retries))), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=(retry_if_exception_type(aiohttp.ClientError) | - retry_if_exception_type(asyncio.TimeoutError)), - before_sleep=before_sleep_log(logging.getLogger(__name__), logging.INFO), - reraise=True, - ): - with attempt: - async with session.get(url, headers=headers, timeout=timeout) as response: - response.raise_for_status() - resp_headers = dict(response.headers or {}) - # Write to a temp file first 
- tmp_path = Path(self.downloads_dir) / f".part_{row_index}" - async with aiofiles.open(tmp_path, 'wb') as f: - async for chunk in response.content.iter_chunked(1 << 16): - if chunk: - if len(head) < (1 << 16): - need = (1 << 16) - len(head) - head.extend(chunk[:need]) - await f.write(chunk) - # Infer extension using URL, headers and first bytes - file_ext = self.infer_file_extension(url, resp_headers, bytes(head)) - if file_ext == 'html': - html_issue = self._detect_html_interstitial(url, resp_headers, bytes(head)) - if html_issue: - try: - os.remove(tmp_path) - except Exception: - pass - self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") - return False, "", file_ext, html_issue, retry_count - if not self.is_supported_format(file_ext): - # Clean up temp and report - try: - os.remove(tmp_path) - except Exception: - pass - self.logger.warning(f"Unsupported file format after inference: {file_ext}. Supported formats: {', '.join(self.supported_formats)}") - return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count - # Decide final filename - if filename_base and str(filename_base).strip(): - filename = f"{filename_base}.{file_ext}" - else: - filename = self.generate_filename(row_index, file_ext) - final_path = Path(self.downloads_dir) / filename - try: - os.replace(tmp_path, final_path) - except Exception: - # Fallback to copy/rename - try: - os.rename(tmp_path, final_path) - except Exception: - pass - self.logger.info(f"Successfully downloaded {filename} from {url}") - return True, filename, file_ext, "", retry_count - else: - # Fallback to non-streaming POST - content, status, resp_headers = await self.make_request( - session, requester, url, headers, timeout + return await self._download_via_streaming_get( + session=session, + row_index=row_index, + url=url, + headers=headers, + timeout=timeout, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, ) - file_ext = 
self.infer_file_extension(url, resp_headers, content) - if file_ext == 'html': - html_issue = self._detect_html_interstitial(url, resp_headers, content) - if html_issue: - self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") - return False, "", file_ext, html_issue, retry_count - if not self.is_supported_format(file_ext): - self.logger.warning(f"Unsupported file format after inference: {file_ext}. Supported formats: {', '.join(self.supported_formats)}") - return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count - if filename_base and str(filename_base).strip(): - filename = f"{filename_base}.{file_ext}" - else: - filename = self.generate_filename(row_index, file_ext) - await self.write_file(filename, content, self.downloads_dir) - self.logger.info(f"Successfully downloaded {filename} from {url}") - return True, filename, file_ext, "", retry_count + return await self._download_via_buffered_request( + session=session, + requester=requester, + row_index=row_index, + url=url, + headers=headers, + timeout=timeout, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) except aiohttp.ClientResponseError as e: - # Handle HTTP errors - status = e.status - self.logger.warning(f"Received {status} for {url}") - - # Detailed verbose logging for HTTP errors - if self.verbose: - self.logger.debug(f"HTTP Error Details - Status: {e.status}, Message: {e.message}") - self.logger.debug(f"Headers: {e.headers if hasattr(e, 'headers') else 'No headers available'}") - self.logger.debug(f"Request info: {e.request_info if hasattr(e, 'request_info') else 'No request info available'}") - - # Build error with optional Retry-After info - retry_after = None - try: - hdrs = dict(getattr(e, 'headers', {}) or {}) - for k, v in hdrs.items(): - if k.lower() == 'retry-after': - val = str(v).strip() - if val.isdigit(): - retry_after = int(val) - else: - try: - dt = parsedate_to_datetime(val) - retry_after = max(0, 
int((dt.timestamp() - time.time()))) - except Exception: - retry_after = None - break - except Exception: - retry_after = None - error_msg = f"HTTP {status}: {str(e)}" - if status in (429, 503) and retry_after is not None: - error_msg += f" retry_after={retry_after}" - # Best-effort ext from URL if possible - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return self._build_http_error_result(url, e, retry_count) except Exception as e: error_msg = str(e) @@ -1092,11 +1225,7 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn import traceback self.logger.debug(f"Traceback: {traceback.format_exc()}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 except asyncio.TimeoutError: self.logger.error(f"Overall timeout exceeded for {url}") @@ -1108,22 +1237,14 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn except aiohttp.ClientError as e: error_msg = str(e) self.logger.error(f"ClientError while downloading {url}: {error_msg}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 except asyncio.TimeoutError: self.logger.error(f"Timeout while downloading {url}") return False, "", "", "Timeout", retry_count + 1 except Exception as e: error_msg = str(e) self.logger.error(f"Error while downloading {url}: {error_msg}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 finally: 
if semaphore: try: diff --git a/src/glossapi/scripts/install_glossapi.py b/src/glossapi/scripts/install_glossapi.py new file mode 100644 index 0000000..195d662 --- /dev/null +++ b/src/glossapi/scripts/install_glossapi.py @@ -0,0 +1,230 @@ +"""Guided installer for GlossAPI extras.""" + +from __future__ import annotations + +import argparse +import os +import shlex +import subprocess +import shutil +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Set + + +PHASE_TO_EXTRAS: Dict[str, Set[str]] = { + "download": set(), + "browser_download": {"browser"}, + "extract": {"docling"}, + "ocr": set(), + "docs": {"docs"}, +} + + +@dataclass(frozen=True) +class InstallPlan: + phases: tuple[str, ...] + extras: tuple[str, ...] + editable: bool + include_cuda: bool + needs_deepseek_runtime: bool + + +def _supports_color() -> bool: + return sys.stdout.isatty() and os.environ.get("TERM") not in {"", "dumb", None} + + +def _style(text: str, code: str) -> str: + if not _supports_color(): + return text + return f"\033[{code}m{text}\033[0m" + + +def _prompt_yes_no(question: str, default: bool = False) -> bool: + suffix = "[Y/n]" if default else "[y/N]" + while True: + raw = input(f"{question} {suffix} ").strip().lower() + if not raw: + return default + if raw in {"y", "yes"}: + return True + if raw in {"n", "no"}: + return False + print("Please answer 'y' or 'n'.") + + +def _resolve_phase_selection(tokens: Iterable[str]) -> List[str]: + resolved: List[str] = [] + seen: Set[str] = set() + for token in tokens: + phase = str(token).strip().lower() + if not phase: + continue + if phase not in PHASE_TO_EXTRAS: + raise ValueError(f"Unsupported phase '{token}'. 
Valid phases: {', '.join(sorted(PHASE_TO_EXTRAS))}") + if phase not in seen: + seen.add(phase) + resolved.append(phase) + return resolved + + +def build_install_plan( + *, + phases: Sequence[str], + editable: bool, + include_cuda: bool, +) -> InstallPlan: + selected = _resolve_phase_selection(phases) + extras: Set[str] = set() + for phase in selected: + extras.update(PHASE_TO_EXTRAS[phase]) + if include_cuda: + extras.add("cuda") + return InstallPlan( + phases=tuple(selected), + extras=tuple(sorted(extras)), + editable=bool(editable), + include_cuda=bool(include_cuda), + needs_deepseek_runtime=("ocr" in selected), + ) + + +def build_pip_command(plan: InstallPlan, repo_root: Path) -> List[str]: + target = "." + if plan.extras: + target = f".[{','.join(plan.extras)}]" + cmd = [sys.executable, "-m", "pip", "install"] + if plan.editable: + cmd.append("-e") + cmd.append(target) + return cmd + + +def build_deepseek_command(repo_root: Path) -> Optional[List[str]]: + script = repo_root / "dependency_setup" / "setup_deepseek_uv.sh" + if not script.exists(): + return None + shell = shutil.which("bash") or shutil.which("sh") + if not shell: + return None + return [shell, str(script)] + + +def _interactive_plan(default_editable: bool) -> InstallPlan: + print(_style("GlossAPI Installer", "1;36")) + print("Select only the phases you plan to use so optional dependencies stay minimal.\n") + + selected: List[str] = ["download"] + print(_style("Core", "1;37")) + print(" download: base downloader/data pipeline dependencies") + if _prompt_yes_no("Add browser-gated download support?", default=False): + selected.append("browser_download") + if _prompt_yes_no("Add extraction support (Docling)?", default=False): + selected.append("extract") + if _prompt_yes_no("Add OCR support (DeepSeek backend)?", default=False): + selected.append("ocr") + if _prompt_yes_no("Add docs tooling?", default=False): + selected.append("docs") + include_cuda = _prompt_yes_no("Include CUDA extras where 
relevant?", default=False) + editable = _prompt_yes_no("Install in editable mode?", default=default_editable) + return build_install_plan(phases=selected, editable=editable, include_cuda=include_cuda) + + +def _plan_summary(plan: InstallPlan, command: Sequence[str]) -> str: + extras = ", ".join(plan.extras) if plan.extras else "(none)" + phases = ", ".join(plan.phases) if plan.phases else "(none)" + return "\n".join( + [ + _style("Install plan", "1;32"), + f" phases: {phases}", + f" extras: {extras}", + f" editable: {'yes' if plan.editable else 'no'}", + f" command: {shlex.join(command)}", + f" deepseek runtime: {'separate setup required' if plan.needs_deepseek_runtime else 'not requested'}", + ] + ) + + +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="python install_glossapi.py", + description="Guided installer for GlossAPI optional dependency groups.", + ) + parser.add_argument( + "--phases", + default="", + help=( + "Comma-separated phases to install. Valid values: " + + ", ".join(sorted(PHASE_TO_EXTRAS)) + + ". If omitted, an interactive wizard is shown." 
+ ), + ) + parser.add_argument( + "--cuda", + action="store_true", + help="Include the CUDA extra.", + ) + parser.add_argument( + "--editable", + dest="editable", + action="store_true", + help="Install in editable mode.", + ) + parser.add_argument( + "--no-editable", + dest="editable", + action="store_false", + help="Install as a regular package.", + ) + parser.set_defaults(editable=True) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print the computed pip command without running it.", + ) + parser.add_argument( + "--yes", + action="store_true", + help="Skip confirmation prompts in non-interactive mode.", + ) + return parser + + +def main(argv: Sequence[str] | None = None) -> int: + args = build_arg_parser().parse_args(argv) + repo_root = Path(__file__).resolve().parents[3] + + if args.phases.strip(): + plan = build_install_plan( + phases=[token for token in args.phases.split(",") if token.strip()], + editable=args.editable, + include_cuda=bool(args.cuda), + ) + else: + plan = _interactive_plan(default_editable=bool(args.editable)) + + command = build_pip_command(plan, repo_root) + print(_plan_summary(plan, command)) + deepseek_command = build_deepseek_command(repo_root) if plan.needs_deepseek_runtime else None + if deepseek_command: + print(f" deepseek command: {shlex.join(deepseek_command)}") + + if args.dry_run: + return 0 + if not args.yes and not args.phases.strip(): + if not _prompt_yes_no("Run this install command now?", default=True): + print("Aborted.") + return 1 + + completed = subprocess.run(command, cwd=repo_root) + if completed.returncode != 0: + return int(completed.returncode) + if plan.needs_deepseek_runtime and deepseek_command: + print(_style("Provisioning dedicated DeepSeek runtime…", "1;33")) + completed = subprocess.run(deepseek_command, cwd=repo_root) + return int(completed.returncode) + + +if __name__ == "__main__": # pragma: no cover - CLI entrypoint + raise SystemExit(main()) diff --git 
a/tests/test_browser_gloss_downloader.py b/tests/test_browser_gloss_downloader.py new file mode 100644 index 0000000..9412d23 --- /dev/null +++ b/tests/test_browser_gloss_downloader.py @@ -0,0 +1,297 @@ +import asyncio + +import pandas as pd + +from glossapi import Corpus +from glossapi.download_policy import build_download_policy +from glossapi.gloss_browser_downloader import BrowserGlossDownloader, BrowserSessionState +import glossapi.corpus.phase_download as phase_download_mod + + +def test_browser_downloader_skips_viewer_interstitial(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + called = False + + async def _fake_browser_download(**kwargs): + nonlocal called + called = True + return b"%PDF-1.7\n", {"Content-Type": "application/pdf"}, {"candidate_url": kwargs["url"]} + + monkeypatch.setattr(downloader, "_download_via_browser_session", _fake_browser_download) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + url="https://freader.ekt.gr/eadd/index.php?doc=60819&lang=el", + headers={"Content-Type": "text/html"}, + content=b"", + html_issue=( + "HTML document viewer returned instead of a downloadable file; " + "a source-specific fetcher with persisted cookies/redirect handling is required" + ), + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result is None + assert called is False + + +def test_browser_downloader_recovers_challenge_page(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + async def _fake_browser_download(**kwargs): + return ( + b"%PDF-1.7\n%dummy\n", + {"Content-Type": "application/pdf"}, + {"candidate_url": "https://example.org/file.pdf"}, + ) + + monkeypatch.setattr(downloader, "_download_via_browser_session", _fake_browser_download) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + url="https://example.org/file.pdf", + headers={"Content-Type": "text/html"}, + 
content=b"challenge", + html_issue=( + "HTML challenge page returned instead of a document; " + "browser automation or cookie bootstrap is required" + ), + retry_count=1, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 1) + assert (tmp_path / "downloads" / "AAA_000.pdf").read_bytes().startswith(b"%PDF-1.7") + assert not (tmp_path / "downloads" / ".part_browser_0").exists() + + +def test_browser_downloader_domain_cookie_lookup(tmp_path): + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + domain_cookies={"eur-lex.europa.eu": {"token": "abc123"}}, + ) + + cookies = downloader._domain_cookies_for_url( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + ) + + assert cookies == {"token": "abc123"} + + +def test_browser_downloader_bootstrap_url_uses_base_for_file_endpoints(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + assert downloader._choose_browser_bootstrap_url( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + ) == "https://eur-lex.europa.eu" + + +def test_browser_downloader_ignores_err_aborted_for_file_navigation(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + assert downloader._should_ignore_navigation_exception( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360", + RuntimeError("Page.goto: net::ERR_ABORTED"), + ) + assert not downloader._should_ignore_navigation_exception( + "https://example.org/article", + RuntimeError("Page.goto: net::ERR_ABORTED"), + ) + + +def test_browser_downloader_uses_default_browser_route_for_preflight(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="browser") + + async def _fake_download_browser_route(**kwargs): + return True, "AAA_000.pdf", "pdf", "", 0 + + monkeypatch.setattr(downloader, "_download_browser_route", _fake_download_browser_route) + + result = 
asyncio.run( + downloader._preflight_download( + row_index=0, + url="https://example.org/file.pdf", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + + +def test_browser_downloader_reuses_cached_domain_session(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="auto") + bootstraps = 0 + fetches = 0 + + async def _fake_fetch_with_browser_session_state(**kwargs): + nonlocal fetches + fetches += 1 + return b"%PDF-1.7\n", {"Content-Type": "application/pdf"}, {"candidate_url": kwargs["url"]} + + async def _bootstrap(**kwargs): + nonlocal bootstraps + bootstraps += 1 + return BrowserSessionState(user_agent="UA", cookie_header="a=b", cached_at=10_000.0), [] + + monkeypatch.setattr(downloader, "_bootstrap_browser_session_state", _bootstrap) + monkeypatch.setattr(downloader, "_fetch_with_browser_session_state", _fake_fetch_with_browser_session_state) + monkeypatch.setattr("glossapi.gloss_browser_downloader.time.time", lambda: 10_100.0) + + first = asyncio.run( + downloader._download_via_browser_session(url="https://eur-lex.europa.eu/file.pdf", referer=None) + ) + second = asyncio.run( + downloader._download_via_browser_session(url="https://eur-lex.europa.eu/file2.pdf", referer=None) + ) + + assert first[0].startswith(b"%PDF") + assert second[0].startswith(b"%PDF") + assert bootstraps == 1 + assert fetches == 2 + + +def test_browser_downloader_policy_routes_domain_to_browser(tmp_path, monkeypatch): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["eur-lex.europa.eu"]}, + "downloader": "browser", + "browser_timeout_ms": 1234, + } + ], + } + ) + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + download_policy=policy, + default_download_route="standard", + ) + + observed = {} + + async def _fake_download_browser_route(**kwargs): + observed.update(kwargs) + 
return True, "AAA_000.pdf", "pdf", "", 0 + + monkeypatch.setattr(downloader, "_download_browser_route", _fake_download_browser_route) + + result = asyncio.run( + downloader._preflight_download( + row_index=0, + url="https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + assert observed["route_options"]["browser_timeout_ms"] == 1234 + + +def test_corpus_download_mode_selects_browser_downloader(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://example.org/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["cls"] = "browser" + observed["kwargs"] = kwargs + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://example.org/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + result = corpus.download(input_parquet=input_parquet, download_mode="browser") + + assert observed["cls"] == "browser" + assert observed["kwargs"]["default_download_route"] == "browser" + assert bool(result["download_success"].iloc[0]) is True + assert (tmp_path / "download_results" / f"download_results_{input_parquet.name}").exists() + + +def test_corpus_browser_mode_alias_selects_browser_downloader(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://example.org/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["cls"] = "browser" + + def 
download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://example.org/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + corpus.download(input_parquet=input_parquet, browser_mode=True) + + assert observed["cls"] == "browser" + + +def test_corpus_policy_file_selects_browser_router(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://eur-lex.europa.eu/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + policy_path = tmp_path / "download_policy.yml" + policy_path.write_text( + "default:\n downloader: standard\nrules:\n - match:\n domains: [eur-lex.europa.eu]\n downloader: browser\n", + encoding="utf-8", + ) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["kwargs"] = kwargs + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://eur-lex.europa.eu/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + corpus.download(input_parquet=input_parquet, download_policy_file=policy_path) + + assert observed["kwargs"]["download_policy_file"] == policy_path.resolve() + assert observed["kwargs"]["default_download_route"] == "standard" diff --git a/tests/test_install_glossapi.py b/tests/test_install_glossapi.py new file mode 100644 index 0000000..5226429 --- /dev/null +++ b/tests/test_install_glossapi.py @@ -0,0 +1,51 @@ +from pathlib import Path + +from glossapi.scripts.install_glossapi import ( + build_deepseek_command, + build_install_plan, + 
build_pip_command, +) + + +def test_build_install_plan_collects_phase_extras(): + plan = build_install_plan( + phases=["download", "browser_download", "extract", "ocr"], + editable=True, + include_cuda=False, + ) + + assert plan.phases == ("download", "browser_download", "extract", "ocr") + assert set(plan.extras) == {"browser", "docling"} + assert plan.editable is True + assert plan.needs_deepseek_runtime is True + + +def test_build_install_plan_adds_cuda_extra(): + plan = build_install_plan( + phases=["download"], + editable=False, + include_cuda=True, + ) + + assert set(plan.extras) == {"cuda"} + assert plan.editable is False + assert plan.needs_deepseek_runtime is False + + +def test_build_pip_command_uses_editable_install(): + plan = build_install_plan( + phases=["download", "browser_download"], + editable=True, + include_cuda=False, + ) + command = build_pip_command(plan, Path("/tmp/repo")) + + assert command[:4] == [command[0], "-m", "pip", "install"] + assert "-e" in command + assert command[-1] == ".[browser]" + + +def test_build_deepseek_command_points_to_setup_script(): + command = build_deepseek_command(Path("/tmp/repo")) + + assert command is None or command[0] From 96241f97cdf7db51c1e4cdc56cdaff0af71fbbad Mon Sep 17 00:00:00 2001 From: adidev001 Date: Sat, 21 Mar 2026 02:07:27 +0530 Subject: [PATCH 11/26] docs: document pipeline artifact contract and runtime outputs --- docs/pipeline.md | 51 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/docs/pipeline.md b/docs/pipeline.md index cacc8c4..2f4b9dd 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -53,16 +53,21 @@ The `Corpus` class is the stable surface of the project. New functionality shoul ### 3. Clean - Main code: `Corpus.clean()` -- Purpose: run the Rust cleaner, compute quality/noise signals, and decide what should continue downstream. 
+- Purpose: run the Rust cleaner, remove low-quality or noisy markdown, + and mark documents that may need OCR retry before moving on. - Typical inputs: - `markdown/*.md` - - metadata parquet if one exists + - metadata parquet, if available - Important parameters: - `threshold` and `drop_bad` - `empty_char_threshold` and `empty_min_pages` for OCR fallback decisions - Main outputs: - cleaned markdown in `clean_markdown/` - - merged parquet metadata including OCR-related flags + - updated parquet metadata with quality and OCR-related flags +- Runtime/debug artifacts: + - `.processing_state.pkl` keeps track of progress so interrupted runs can resume + - `problematic_files/` keeps files that could not be cleaned successfully + - `timeout_files/` keeps files that exceeded the cleaning time limit ### 4. OCR Retry and Phase‑2 Enrichment @@ -91,26 +96,40 @@ The `Corpus` class is the stable surface of the project. New functionality shoul ## Artifact Layout -``` +The tree below shows the main folders and files GlossAPI can create under +the output directory. 
+ +To make the layout easier to follow, artifacts are grouped by the role they +play in the pipeline: + +- canonical — the main outputs a stage is expected to produce, and the + files later stages usually depend on +- runtime — state files used to resume work safely if a run is interrupted +- debug — extra files kept around when something fails or needs a closer look + OUT/ -├── downloads/ -│ └── problematic_math/ -├── download_results/ -├── markdown/ +├── downloads/ (canonical) +│ └── problematic_math/ (debug) +├── download_results/ (canonical) +├── markdown/ (canonical) +│ └── .md +├── clean_markdown/ (canonical) │ └── .md -├── json/ +├── json/ (canonical) │ ├── .docling.json(.zst) │ ├── .formula_index.jsonl │ ├── .latex_map.jsonl │ ├── metrics/ -│ ├── .metrics.json -│ └── .per_page.metrics.json -│ └── problematic_math/ -├── sections/ +│ │ ├── .metrics.json +│ │ └── .per_page.metrics.json +│ └── problematic_math/ (debug) +├── sections/ (canonical) │ └── sections_for_annotation.parquet -├── classified_sections.parquet -└── fully_annotated_sections.parquet -``` +├── classified_sections.parquet (canonical) +├── fully_annotated_sections.parquet (canonical) +├── .processing_state.pkl (runtime) +├── problematic_files/ (debug) +└── timeout_files/ (debug) Notes: - Enriched Markdown replaces the plain Markdown (single canonical location). 
From 00aed533af81d314e7bdb7f905a0e46f565bbf3e Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 24 Mar 2026 15:33:50 +0200 Subject: [PATCH 12/26] Upgrade Docling and simplify OCR runtime --- .gitignore | 1 - README.md | 3 +- dependency_setup/deepseek_uv/pyproject.toml | 2 +- dependency_setup/deepseek_uv/uv.lock | 964 +----------------- dependency_setup/dependency_notes.md | 2 +- .../requirements-glossapi-docling.txt | 10 +- .../requirements-glossapi-vanilla.txt | 10 +- dependency_setup/setup_glossapi.sh | 3 - docs/api/corpus.md | 6 +- docs/api_corpus_tmp.md | 4 +- .../deepseek_only_upgrade_roadmap.md | 262 ----- docs/code_map.md | 23 +- docs/configuration.md | 12 + docs/getting_started.md | 1 + docs/multi_gpu.md | 3 +- docs/ocr_and_math_enhancement.md | 2 +- docs/pipeline.md | 6 +- docs/testing/compatibility_matrix.md | 8 +- pyproject.toml | 4 +- requirements.txt | 15 +- src/glossapi/corpus/corpus_orchestrator.py | 56 +- src/glossapi/corpus/phase_extract.py | 105 +- src/glossapi/gloss_extract.py | 34 +- src/glossapi/ocr/deepseek/runner.py | 5 +- src/glossapi/ocr/docling/pipeline.py | 47 + src/glossapi/ocr/docling_pipeline.py | 81 +- src/glossapi/scripts/ocr_gpu_batch.py | 14 +- tests/test_corpus_guards.py | 31 +- tests/test_ocr_dispatch_backends.py | 2 +- tests/test_pipeline_smoke.py | 4 - 30 files changed, 274 insertions(+), 1446 deletions(-) delete mode 100644 docs/architecture/deepseek_only_upgrade_roadmap.md diff --git a/.gitignore b/.gitignore index 929a8c5..74f3edc 100644 --- a/.gitignore +++ b/.gitignore @@ -82,4 +82,3 @@ deepseek-ocr/DeepSeek-OCR-empty/ # Local DeepSeek checkout and repro scripts (keep out of master) deepseek-ocr/ deepseek-ocr-2/ -repro_rapidocr_onnx/ diff --git a/README.md b/README.md index 953c03b..04be81a 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `depend ``` `setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. 
`setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`. +The uv-managed DeepSeek runtime is OCR-only on purpose: it installs `glossapi[deepseek]` and does not carry the Docling layout stack. If you want a guided install that asks which phases you plan to use, run: @@ -153,7 +154,7 @@ Use this as the shortest path from a documentation concept to the public call th | Stage | Main call | Important parameters | Writes | | --- | --- | --- | --- | | Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, `download_mode="standard"|"auto"|"browser"`, `download_policy_file`, downloader kwargs | `downloads/`, `download_results/*.parquet` | -| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | +| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `use_gpus`, `workers_per_device`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | | Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, updated parquet metrics/flags | | OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/.md`, optional `json/.latex_map.jsonl` | | Section | `Corpus.section()` | uses cleaner/parquet outputs to choose inputs | `sections/sections_for_annotation.parquet` | diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml index 809b499..a1caa65 100644 --- a/dependency_setup/deepseek_uv/pyproject.toml +++ b/dependency_setup/deepseek_uv/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 
execution" requires-python = ">=3.11,<3.13" dependencies = [ - "glossapi[docling,deepseek]", + "glossapi[deepseek]", "torch==2.6.0", "torchvision==0.21.0", "torchaudio==2.6.0", diff --git a/dependency_setup/deepseek_uv/uv.lock b/dependency_setup/deepseek_uv/uv.lock index f5eefaa..4f99980 100644 --- a/dependency_setup/deepseek_uv/uv.lock +++ b/dependency_setup/deepseek_uv/uv.lock @@ -119,15 +119,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, ] -[[package]] -name = "annotated-types" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, -] - [[package]] name = "attrs" version = "25.4.0" @@ -137,19 +128,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, ] -[[package]] -name = "beautifulsoup4" -version = "4.14.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "soupsieve" }, - { name = "typing-extensions" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, -] - [[package]] name = "certifi" version = "2026.2.25" @@ -270,132 +248,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, ] -[[package]] -name = "dill" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315, upload-time = "2026-01-19T02:36:56.85Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" }, -] - -[[package]] -name = "docling" -version = "2.48.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "accelerate" }, - { name = "beautifulsoup4" }, - { name = "certifi" }, - { name = "docling-core", extra = ["chunking"] }, - { name = "docling-ibm-models" }, - { name = "docling-parse" }, - { name = 
"easyocr" }, - { name = "filetype" }, - { name = "huggingface-hub" }, - { name = "lxml" }, - { name = "marko" }, - { name = "openpyxl" }, - { name = "pandas" }, - { name = "pillow" }, - { name = "pluggy" }, - { name = "pydantic" }, - { name = "pydantic-settings" }, - { name = "pylatexenc" }, - { name = "pypdfium2" }, - { name = "python-docx" }, - { name = "python-pptx" }, - { name = "requests" }, - { name = "rtree" }, - { name = "scipy" }, - { name = "tqdm" }, - { name = "typer" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/be/32/e117cb0dcc76c93828d2cd9b45c3f8ccf6c86314a60e9c65f16067d3df26/docling-2.48.0.tar.gz", hash = "sha256:e94a5f75c544ec1bbb9169d2f4da72e1f497fd2fcda57cfacc454c93b1c92a8e", size = 189422, upload-time = "2025-08-26T05:31:02.666Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/32/a9c6677c66178a397b89b5b6fe1e7b3d3de98ddc2b331fbcd7440419b9f0/docling-2.48.0-py3-none-any.whl", hash = "sha256:8a1c1dfd5ed84cadb0f81fcb1464e5d501c4bfaa121e15306e09e3c0c983cc3e", size = 212266, upload-time = "2025-08-26T05:31:00.779Z" }, -] - -[[package]] -name = "docling-core" -version = "2.68.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "defusedxml" }, - { name = "jsonref" }, - { name = "jsonschema" }, - { name = "latex2mathml" }, - { name = "pandas" }, - { name = "pillow" }, - { name = "pydantic" }, - { name = "pyyaml" }, - { name = "tabulate" }, - { name = "typer" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5e/b7/95e329d143528decd8f6af5d4db6c2d6bc3dc40f9d53ee5b7d5b901dfe11/docling_core-2.68.0.tar.gz", hash = "sha256:261ecb6281d45fcf0559640297eda728f8f7dd4fe8c8bf7ced42dbf9b4e46223", size = 267551, upload-time = "2026-03-07T12:20:24.523Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/66/d8bbe25dec2bb91d9090b939349b1c9b94c307edceada46c5bc6f213a569/docling_core-2.68.0-py3-none-any.whl", hash = 
"sha256:175145398c005399819a7cfe7b634257caaaecfbb4451840b8ddb31fc2f5ac12", size = 247092, upload-time = "2026-03-07T12:20:23.172Z" }, -] - -[package.optional-dependencies] -chunking = [ - { name = "semchunk" }, - { name = "transformers" }, - { name = "tree-sitter" }, - { name = "tree-sitter-c" }, - { name = "tree-sitter-javascript" }, - { name = "tree-sitter-python" }, - { name = "tree-sitter-typescript" }, -] - -[[package]] -name = "docling-ibm-models" -version = "3.11.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "accelerate" }, - { name = "docling-core" }, - { name = "huggingface-hub" }, - { name = "jsonlines" }, - { name = "numpy" }, - { name = "pillow" }, - { name = "pydantic" }, - { name = "rtree" }, - { name = "safetensors", extra = ["torch"] }, - { name = "torch" }, - { name = "torchvision" }, - { name = "tqdm" }, - { name = "transformers" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b6/91/f883e0a2b3466e1126dfd4463f386c70f5b90d271c27b6f5a97d2f8312e6/docling_ibm_models-3.11.0.tar.gz", hash = "sha256:454401563a8e79cb33b718bc559d9bacca8a0183583e48f8e616c9184c1f5eb1", size = 87721, upload-time = "2026-01-23T12:29:35.384Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/5d/97e9c2e10fbd3ee1723ac82c335f8211a9633c0397cc11ed057c3ba4006e/docling_ibm_models-3.11.0-py3-none-any.whl", hash = "sha256:68f7961069d643bfdab21b1c9ef24a979db293496f4c2283d95b1025a9ac5347", size = 87352, upload-time = "2026-01-23T12:29:34.045Z" }, -] - -[[package]] -name = "docling-parse" -version = "4.7.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "docling-core" }, - { name = "pillow" }, - { name = "pydantic" }, - { name = "pywin32", marker = "sys_platform == 'win32'" }, - { name = "tabulate" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bb/7a/653c3b11920113217724fab9b4740f9f8964864f92a2a27590accecec5ac/docling_parse-4.7.3.tar.gz", hash = 
"sha256:5936e6bcb7969c2a13f38ecc75cada3b0919422dc845e96da4b0b7b3bbc394ce", size = 67646746, upload-time = "2026-01-14T14:18:19.376Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6c/81/dd317e0bce475153dc08a60a9a8615b1a04d4d3c9803175e6cb7b7e9b49b/docling_parse-4.7.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:66896bbe925073e4d48f18ec29dcd611a390d6b2378fae72125e77b020cd5664", size = 14615974, upload-time = "2026-01-14T14:17:30.246Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b5/088590e0b32fd0a393ca419c644d1435a1c99fa6b2a87888eef4d0fdea33/docling_parse-4.7.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:281347b3e937c1a5ffa6f8774ee603b64a0899fe8a6885573dec7eb48a3421d8", size = 14981051, upload-time = "2026-01-14T14:17:32.426Z" }, - { url = "https://files.pythonhosted.org/packages/b7/63/2b6c9127924487573d5419d58ec77955f0b7c0a923c8232ad461d71039aa/docling_parse-4.7.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3d86c51f9ce35a1b40b2f410f7271d9bd5fc58e7240f4cae7fdd2cef757e671", size = 15092586, upload-time = "2026-01-14T14:17:34.634Z" }, - { url = "https://files.pythonhosted.org/packages/af/89/ed27a83eb113bdf0b0f82f3c30a0db3c005df58b236f6487b232dacdb57a/docling_parse-4.7.3-cp311-cp311-win_amd64.whl", hash = "sha256:3b04459cc97a8a4929622e341b9981e23987a63af07db599afc5e1c4d389060b", size = 16144866, upload-time = "2026-01-14T14:17:36.742Z" }, - { url = "https://files.pythonhosted.org/packages/d6/26/9d86ae12699a25b7233f76ce062253e9c14e57781e00166b792b3a9d56db/docling_parse-4.7.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:d89231aa4fba3e38b80c11beb8edc07569e934c1f3935b51f57904fefe958ba5", size = 14616739, upload-time = "2026-01-14T14:17:38.567Z" }, - { url = "https://files.pythonhosted.org/packages/f2/fd/1aebb8a7f15d658f3be858ddbbc4ef7206089d540a7df0dcd4b846b99901/docling_parse-4.7.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:dffd19ed373b0da5cea124606b183489a8686c3d18643e94485be1bdda5713ea", size = 14980782, upload-time = "2026-01-14T14:17:40.659Z" }, - { url = "https://files.pythonhosted.org/packages/3e/47/a722527c9f89c65f69f8a463be4f12ad73bae18132f29d8de8b2d9f6f082/docling_parse-4.7.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc32b6f25a673e41b9a8112b6b841284f60dbac9427b7848a03b435460f74aee", size = 15092450, upload-time = "2026-01-14T14:17:42.838Z" }, - { url = "https://files.pythonhosted.org/packages/91/c7/316373a92ba42c2aeaee128fc77a34333449fe3e820b9d524e0ee396ea35/docling_parse-4.7.3-cp312-cp312-win_amd64.whl", hash = "sha256:ef691045623863624f2cb7347572d0262a53cb84940ef7dd851d9f13a2eb8833", size = 16147359, upload-time = "2026-01-14T14:17:44.906Z" }, -] - [[package]] name = "easydict" version = "1.13" @@ -405,28 +257,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/ec/fa6963f1198172c2b75c9ab6ecefb3045991f92f75f5eb41b6621b198123/easydict-1.13-py3-none-any.whl", hash = "sha256:6b787daf4dcaf6377b4ad9403a5cee5a86adbc0ca9a5bcf5410e9902002aeac2", size = 6804, upload-time = "2024-03-04T12:04:39.508Z" }, ] -[[package]] -name = "easyocr" -version = "1.7.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ninja" }, - { name = "numpy" }, - { name = "opencv-python-headless" }, - { name = "pillow" }, - { name = "pyclipper" }, - { name = "python-bidi" }, - { name = "pyyaml" }, - { name = "scikit-image" }, - { name = "scipy" }, - { name = "shapely" }, - { name = "torch" }, - { name = "torchvision" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/84/4a2cab0e6adde6a85e7ba543862e5fc0250c51f3ac721a078a55cdcff250/easyocr-1.7.2-py3-none-any.whl", hash = "sha256:5be12f9b0e595d443c9c3d10b0542074b50f0ec2d98b141a109cd961fd1c177c", size = 2870178, upload-time = "2024-09-24T11:34:43.554Z" }, -] - [[package]] name = "einops" version = "0.8.2" @@ -436,15 +266,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, ] -[[package]] -name = "et-xmlfile" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, -] - [[package]] name = "filelock" version = "3.25.0" @@ -454,15 +275,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, ] -[[package]] -name = "filetype" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/bb/29/745f7d30d47fe0f251d3ad3dc2978a23141917661998763bebb6da007eb1/filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb", size = 998020, upload-time = "2022-11-02T17:34:04.141Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = 
"sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" }, -] - [[package]] name = "fonttools" version = "4.61.1" @@ -597,9 +409,6 @@ deepseek = [ { name = "tokenizers" }, { name = "transformers" }, ] -docling = [ - { name = "docling" }, -] [package.metadata] requires-dist = [ @@ -608,7 +417,7 @@ requires-dist = [ { name = "aiofiles", specifier = ">=23.0.0" }, { name = "aiohttp", specifier = ">=3.8.0" }, { name = "dask", specifier = ">=2022.1.0" }, - { name = "docling", marker = "extra == 'docling'", specifier = "==2.48.0" }, + { name = "docling", marker = "extra == 'docling'", specifier = "==2.81.0" }, { name = "easydict", marker = "extra == 'deepseek'" }, { name = "einops", marker = "extra == 'deepseek'" }, { name = "ftfy", specifier = ">=6.0.0" }, @@ -616,9 +425,10 @@ requires-dist = [ { name = "joblib", specifier = ">=1.0.0" }, { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.5" }, { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5" }, - { name = "numpy", specifier = "<2" }, + { name = "numpy", specifier = ">=1.26,<3" }, { name = "pandas", specifier = ">=1.3.0" }, { name = "pillow", marker = "extra == 'deepseek'", specifier = "==10.4.0" }, + { name = "playwright", marker = "extra == 'browser'", specifier = ">=1.52,<2" }, { name = "pyarrow", specifier = ">=7.0.0" }, { name = "pymupdf", marker = "extra == 'deepseek'", specifier = "==1.24.10" }, { name = "pypdfium2", specifier = ">=4.0.0" }, @@ -632,14 +442,14 @@ requires-dist = [ { name = "transformers", marker = "extra == 'deepseek'", specifier = "==4.46.3" }, { name = "zstandard", specifier = ">=0.22.0" }, ] -provides-extras = ["docling", "cuda", "deepseek", "docs"] +provides-extras = ["browser", "docling", "cuda", "deepseek", "docs"] [[package]] name = "glossapi-deepseek-runtime" version = "0.1.0" source = { virtual = "." 
} dependencies = [ - { name = "glossapi", extra = ["deepseek", "docling"] }, + { name = "glossapi", extra = ["deepseek"] }, { name = "torch" }, { name = "torchaudio" }, { name = "torchvision" }, @@ -653,7 +463,7 @@ test = [ [package.metadata] requires-dist = [ - { name = "glossapi", extras = ["docling", "deepseek"], editable = "../../" }, + { name = "glossapi", extras = ["deepseek"], editable = "../../" }, { name = "torch", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, { name = "torchaudio", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, { name = "torchvision", specifier = "==0.21.0", index = "https://download.pytorch.org/whl/cu118" }, @@ -709,19 +519,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] -[[package]] -name = "imageio" -version = "2.37.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "pillow" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a3/6f/606be632e37bf8d05b253e8626c2291d74c691ddc7bcdf7d6aaf33b32f6a/imageio-2.37.2.tar.gz", hash = "sha256:0212ef2727ac9caa5ca4b2c75ae89454312f440a756fcfc8ef1993e718f50f8a", size = 389600, upload-time = "2025-11-04T14:29:39.898Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fb/fe/301e0936b79bcab4cacc7548bf2853fc28dced0a578bab1f7ef53c9aa75b/imageio-2.37.2-py3-none-any.whl", hash = "sha256:ad9adfb20335d718c03de457358ed69f141021a333c40a53e57273d8a5bd0b9b", size = 317646, upload-time = "2025-11-04T14:29:37.948Z" }, -] - [[package]] name = "img2pdf" version = "0.6.3" @@ -777,75 +574,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, ] -[[package]] -name = "jsonlines" -version = "4.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/35/87/bcda8e46c88d0e34cad2f09ee2d0c7f5957bccdb9791b0b934ec84d84be4/jsonlines-4.0.0.tar.gz", hash = "sha256:0c6d2c09117550c089995247f605ae4cf77dd1533041d366351f6f298822ea74", size = 11359, upload-time = "2023-09-01T12:34:44.187Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f8/62/d9ba6323b9202dd2fe166beab8a86d29465c41a0288cbe229fac60c1ab8d/jsonlines-4.0.0-py3-none-any.whl", hash = "sha256:185b334ff2ca5a91362993f42e83588a360cf95ce4b71a73548502bda52a7c55", size = 8701, upload-time = "2023-09-01T12:34:42.563Z" }, -] - -[[package]] -name = "jsonref" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814, upload-time = "2023-01-16T16:10:04.455Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425, upload-time = "2023-01-16T16:10:02.255Z" }, -] - -[[package]] -name = "jsonschema" -version = "4.26.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, - { name = "jsonschema-specifications" }, - { name = "referencing" }, - { name = "rpds-py" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, -] - -[[package]] -name = "jsonschema-specifications" -version = "2025.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "referencing" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, -] - -[[package]] -name = "latex2mathml" -version = "3.78.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1a/26/57b1034c08922d0aefea79430a5e0006ffaee4f0ec59d566613f667ab2f7/latex2mathml-3.78.1.tar.gz", hash = "sha256:f941db80bf41db33f31df87b304e8b588f8166b813b0257c11c98f7a9d0aac71", size = 74030, upload-time = "2025-08-29T23:34:23.178Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3e/76/d661ea2e529c3d464f9efd73f9ac31626b45279eb4306e684054ea20e3d4/latex2mathml-3.78.1-py3-none-any.whl", hash = 
"sha256:f089b6d75e85b937f99693c93e8c16c0804008672c3dd2a3d25affd36f238100", size = 73892, upload-time = "2025-08-29T23:34:21.98Z" }, -] - -[[package]] -name = "lazy-loader" -version = "0.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "packaging" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/49/ac/21a1f8aa3777f5658576777ea76bfb124b702c520bbe90edf4ae9915eafa/lazy_loader-0.5.tar.gz", hash = "sha256:717f9179a0dbed357012ddad50a5ad3d5e4d9a0b8712680d4e687f5e6e6ed9b3", size = 15294, upload-time = "2026-03-06T15:45:09.054Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" }, -] - [[package]] name = "locket" version = "1.0.0" @@ -897,27 +625,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095, upload-time = "2025-04-23T01:46:48.521Z" }, ] -[[package]] -name = "markdown-it-py" -version = "4.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mdurl" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = 
"2025-08-11T12:57:51.923Z" }, -] - -[[package]] -name = "marko" -version = "2.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e3/2f/050b6d485f052ddf17d76a41f9334d6fb2a8a85df35347a12d97ed3bc5c1/marko-2.2.2.tar.gz", hash = "sha256:6940308e655f63733ca518c47a68ec9510279dbb916c83616e4c4b5829f052e8", size = 143641, upload-time = "2026-01-05T11:04:41.935Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/83/f8/36d79bac5701e6786f9880c61bbe57574760a13c1af84ab71e5ed21faecc/marko-2.2.2-py3-none-any.whl", hash = "sha256:f064ae8c10416285ad1d96048dc11e98ef04e662d3342ae416f662b70aa7959e", size = 42701, upload-time = "2026-01-05T11:04:40.75Z" }, -] - [[package]] name = "markupsafe" version = "3.0.3" @@ -948,34 +655,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, ] -[[package]] -name = "mdurl" -version = "0.1.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, -] - -[[package]] -name = "mpire" -version = "2.10.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pygments" }, - { name = "pywin32", marker = "sys_platform == 'win32'" 
}, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3a/93/80ac75c20ce54c785648b4ed363c88f148bf22637e10c9863db4fbe73e74/mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97", size = 271270, upload-time = "2024-05-07T14:00:31.815Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/20/14/1db1729ad6db4999c3a16c47937d601fcb909aaa4224f5eca5a2f145a605/mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb", size = 272756, upload-time = "2024-05-07T14:00:29.633Z" }, -] - -[package.optional-dependencies] -dill = [ - { name = "multiprocess" }, -] - [[package]] name = "mpmath" version = "1.3.0" @@ -1030,24 +709,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, ] -[[package]] -name = "multiprocess" -version = "0.70.19" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "dill" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989, upload-time = "2026-01-19T06:47:39.744Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/aa/714635c727dbfc251139226fa4eaf1b07f00dc12d9cd2eb25f931adaf873/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1bbf1b69af1cf64cd05f65337d9215b88079ec819cd0ea7bac4dab84e162efe7", size = 144743, upload-time = "2026-01-19T06:47:24.562Z" }, - { url = 
"https://files.pythonhosted.org/packages/0f/e1/155f6abf5e6b5d9cef29b6d0167c180846157a4aca9b9bee1a217f67c959/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:5be9ec7f0c1c49a4f4a6fd20d5dda4aeabc2d39a50f4ad53720f1cd02b3a7c2e", size = 144738, upload-time = "2026-01-19T06:47:26.636Z" }, - { url = "https://files.pythonhosted.org/packages/af/cb/f421c2869d75750a4f32301cc20c4b63fab6376e9a75c8e5e655bdeb3d9b/multiprocess-0.70.19-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1c3dce098845a0db43b32a0b76a228ca059a668071cfeaa0f40c36c0b1585d45", size = 144741, upload-time = "2026-01-19T06:47:27.985Z" }, - { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948, upload-time = "2026-01-19T06:47:32.325Z" }, - { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457, upload-time = "2026-01-19T06:47:33.711Z" }, - { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281, upload-time = "2026-01-19T06:47:35.037Z" }, - { url = "https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477, upload-time = "2026-01-19T06:47:38.619Z" }, -] - [[package]] name = "networkx" version = "3.6.1" @@ -1057,32 +718,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, ] -[[package]] -name = "ninja" -version = "1.13.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/73/79a0b22fc731989c708068427579e840a6cf4e937fe7ae5c5d0b7356ac22/ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978", size = 242558, upload-time = "2025-08-11T15:10:19.421Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/74/d02409ed2aa865e051b7edda22ad416a39d81a84980f544f8de717cab133/ninja-1.13.0-py3-none-macosx_10_9_universal2.whl", hash = "sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1", size = 310125, upload-time = "2025-08-11T15:09:50.971Z" }, - { url = "https://files.pythonhosted.org/packages/8e/de/6e1cd6b84b412ac1ef327b76f0641aeb5dcc01e9d3f9eee0286d0c34fd93/ninja-1.13.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630", size = 177467, upload-time = "2025-08-11T15:09:52.767Z" }, - { url = "https://files.pythonhosted.org/packages/c8/83/49320fb6e58ae3c079381e333575fdbcf1cca3506ee160a2dcce775046fa/ninja-1.13.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c", size = 187834, upload-time = "2025-08-11T15:09:54.115Z" }, - { url = "https://files.pythonhosted.org/packages/56/c7/ba22748fb59f7f896b609cd3e568d28a0a367a6d953c24c461fe04fc4433/ninja-1.13.0-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e", size = 202736, upload-time = "2025-08-11T15:09:55.745Z" }, - { url = 
"https://files.pythonhosted.org/packages/79/22/d1de07632b78ac8e6b785f41fa9aad7a978ec8c0a1bf15772def36d77aac/ninja-1.13.0-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988", size = 179034, upload-time = "2025-08-11T15:09:57.394Z" }, - { url = "https://files.pythonhosted.org/packages/ed/de/0e6edf44d6a04dabd0318a519125ed0415ce437ad5a1ec9b9be03d9048cf/ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa", size = 180716, upload-time = "2025-08-11T15:09:58.696Z" }, - { url = "https://files.pythonhosted.org/packages/54/28/938b562f9057aaa4d6bfbeaa05e81899a47aebb3ba6751e36c027a7f5ff7/ninja-1.13.0-py3-none-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1", size = 146843, upload-time = "2025-08-11T15:10:00.046Z" }, - { url = "https://files.pythonhosted.org/packages/2a/fb/d06a3838de4f8ab866e44ee52a797b5491df823901c54943b2adb0389fbb/ninja-1.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2", size = 154402, upload-time = "2025-08-11T15:10:01.657Z" }, - { url = "https://files.pythonhosted.org/packages/31/bf/0d7808af695ceddc763cf251b84a9892cd7f51622dc8b4c89d5012779f06/ninja-1.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f", size = 552388, upload-time = "2025-08-11T15:10:03.349Z" }, - { url = "https://files.pythonhosted.org/packages/9d/70/c99d0c2c809f992752453cce312848abb3b1607e56d4cd1b6cded317351a/ninja-1.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714", size = 472501, upload-time = "2025-08-11T15:10:04.735Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/43/c217b1153f0e499652f5e0766da8523ce3480f0a951039c7af115e224d55/ninja-1.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72", size = 638280, upload-time = "2025-08-11T15:10:06.512Z" }, - { url = "https://files.pythonhosted.org/packages/8c/45/9151bba2c8d0ae2b6260f71696330590de5850e5574b7b5694dce6023e20/ninja-1.13.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db", size = 642420, upload-time = "2025-08-11T15:10:08.35Z" }, - { url = "https://files.pythonhosted.org/packages/3c/fb/95752eb635bb8ad27d101d71bef15bc63049de23f299e312878fc21cb2da/ninja-1.13.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5", size = 585106, upload-time = "2025-08-11T15:10:09.818Z" }, - { url = "https://files.pythonhosted.org/packages/c1/31/aa56a1a286703800c0cbe39fb4e82811c277772dc8cd084f442dd8e2938a/ninja-1.13.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96", size = 707138, upload-time = "2025-08-11T15:10:11.366Z" }, - { url = "https://files.pythonhosted.org/packages/34/6f/5f5a54a1041af945130abdb2b8529cbef0cdcbbf9bcf3f4195378319d29a/ninja-1.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200", size = 581758, upload-time = "2025-08-11T15:10:13.295Z" }, - { url = "https://files.pythonhosted.org/packages/95/97/51359c77527d45943fe7a94d00a3843b81162e6c4244b3579fe8fc54cb9c/ninja-1.13.0-py3-none-win32.whl", hash = "sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9", size = 267201, upload-time = "2025-08-11T15:10:15.158Z" }, - { url = "https://files.pythonhosted.org/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl", hash = 
"sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e", size = 309975, upload-time = "2025-08-11T15:10:16.697Z" }, - { url = "https://files.pythonhosted.org/packages/df/93/a7b983643d1253bb223234b5b226e69de6cda02b76cdca7770f684b795f5/ninja-1.13.0-py3-none-win_arm64.whl", hash = "sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9", size = 290806, upload-time = "2025-08-11T15:10:18.018Z" }, -] - [[package]] name = "numpy" version = "1.26.4" @@ -1210,35 +845,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/ad/973a187b137a3d45dc3faac421ef1275fb41fc169fd3889e2d5ceb0daa54/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:979f5b2aef5da164c5c53c64c85c3dfa61b8b4704f4f963bb568bf98fa8472e8", size = 99130, upload-time = "2024-08-16T23:58:33.479Z" }, ] -[[package]] -name = "opencv-python-headless" -version = "4.11.0.86" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/36/2f/5b2b3ba52c864848885ba988f24b7f105052f68da9ab0e693cc7c25b0b30/opencv-python-headless-4.11.0.86.tar.gz", hash = "sha256:996eb282ca4b43ec6a3972414de0e2331f5d9cda2b41091a49739c19fb843798", size = 95177929, upload-time = "2025-01-16T13:53:40.22Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/53/2c50afa0b1e05ecdb4603818e85f7d174e683d874ef63a6abe3ac92220c8/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:48128188ade4a7e517237c8e1e11a9cdf5c282761473383e77beb875bb1e61ca", size = 37326460, upload-time = "2025-01-16T13:52:57.015Z" }, - { url = "https://files.pythonhosted.org/packages/3b/43/68555327df94bb9b59a1fd645f63fafb0762515344d2046698762fc19d58/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81", size = 56723330, upload-time = "2025-01-16T13:55:45.731Z" }, - { url = 
"https://files.pythonhosted.org/packages/45/be/1438ce43ebe65317344a87e4b150865c5585f4c0db880a34cdae5ac46881/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6efabcaa9df731f29e5ea9051776715b1bdd1845d7c9530065c7951d2a2899eb", size = 29487060, upload-time = "2025-01-16T13:51:59.625Z" }, - { url = "https://files.pythonhosted.org/packages/dd/5c/c139a7876099916879609372bfa513b7f1257f7f1a908b0bdc1c2328241b/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b", size = 49969856, upload-time = "2025-01-16T13:53:29.654Z" }, - { url = "https://files.pythonhosted.org/packages/95/dd/ed1191c9dc91abcc9f752b499b7928aacabf10567bb2c2535944d848af18/opencv_python_headless-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b", size = 29324425, upload-time = "2025-01-16T13:52:49.048Z" }, - { url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" }, -] - -[[package]] -name = "openpyxl" -version = "3.1.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "et-xmlfile" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = 
"sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, -] - [[package]] name = "packaging" version = "26.0" @@ -1433,111 +1039,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, ] -[[package]] -name = "pyclipper" -version = "1.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f6/21/3c06205bb407e1f79b73b7b4dfb3950bd9537c4f625a68ab5cc41177f5bc/pyclipper-1.4.0.tar.gz", hash = "sha256:9882bd889f27da78add4dd6f881d25697efc740bf840274e749988d25496c8e1", size = 54489, upload-time = "2025-12-01T13:15:35.015Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/de/e3/64cf7794319b088c288706087141e53ac259c7959728303276d18adc665d/pyclipper-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:adcb7ca33c5bdc33cd775e8b3eadad54873c802a6d909067a57348bcb96e7a2d", size = 264281, upload-time = "2025-12-01T13:14:55.47Z" }, - { url = "https://files.pythonhosted.org/packages/34/cd/44ec0da0306fa4231e76f1c2cb1fa394d7bde8db490a2b24d55b39865f69/pyclipper-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fd24849d2b94ec749ceac7c34c9f01010d23b6e9d9216cf2238b8481160e703d", size = 139426, upload-time = "2025-12-01T13:14:56.683Z" }, - { url = "https://files.pythonhosted.org/packages/ad/88/d8f6c6763ea622fe35e19c75d8b39ed6c55191ddc82d65e06bc46b26cb8e/pyclipper-1.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1b6c8d75ba20c6433c9ea8f1a0feb7e4d3ac06a09ad1fd6d571afc1ddf89b869", size = 989649, upload-time = "2025-12-01T13:14:58.28Z" }, - { url = 
"https://files.pythonhosted.org/packages/ff/e9/ea7d68c8c4af3842d6515bedcf06418610ad75f111e64c92c1d4785a1513/pyclipper-1.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58e29d7443d7cc0e83ee9daf43927730386629786d00c63b04fe3b53ac01462c", size = 962842, upload-time = "2025-12-01T13:15:00.044Z" }, - { url = "https://files.pythonhosted.org/packages/4e/b7/0b4a272d8726e51ab05e2b933d8cc47f29757fb8212e38b619e170e6015c/pyclipper-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a8d2b5fb75ebe57e21ce61e79a9131edec2622ff23cc665e4d1d1f201bc1a801", size = 95098, upload-time = "2025-12-01T13:15:01.359Z" }, - { url = "https://files.pythonhosted.org/packages/3a/76/4901de2919198bb2bd3d989f86d4a1dff363962425bb2d63e24e6c990042/pyclipper-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:e9b973467d9c5fa9bc30bb6ac95f9f4d7c3d9fc25f6cf2d1cc972088e5955c01", size = 104362, upload-time = "2025-12-01T13:15:02.439Z" }, - { url = "https://files.pythonhosted.org/packages/90/1b/7a07b68e0842324d46c03e512d8eefa9cb92ba2a792b3b4ebf939dafcac3/pyclipper-1.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:222ac96c8b8281b53d695b9c4fedc674f56d6d4320ad23f1bdbd168f4e316140", size = 265676, upload-time = "2025-12-01T13:15:04.15Z" }, - { url = "https://files.pythonhosted.org/packages/6b/dd/8bd622521c05d04963420ae6664093f154343ed044c53ea260a310c8bb4d/pyclipper-1.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f3672dbafbb458f1b96e1ee3e610d174acb5ace5bd2ed5d1252603bb797f2fc6", size = 140458, upload-time = "2025-12-01T13:15:05.76Z" }, - { url = "https://files.pythonhosted.org/packages/7a/06/6e3e241882bf7d6ab23d9c69ba4e85f1ec47397cbbeee948a16cf75e21ed/pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d1f807e2b4760a8e5c6d6b4e8c1d71ef52b7fe1946ff088f4fa41e16a881a5ca", size = 978235, upload-time = "2025-12-01T13:15:06.993Z" }, - { url = 
"https://files.pythonhosted.org/packages/cf/f4/3418c1cd5eea640a9fa2501d4bc0b3655fa8d40145d1a4f484b987990a75/pyclipper-1.4.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ce1f83c9a4e10ea3de1959f0ae79e9a5bd41346dff648fee6228ba9eaf8b3872", size = 961388, upload-time = "2025-12-01T13:15:08.467Z" }, - { url = "https://files.pythonhosted.org/packages/ac/94/c85401d24be634af529c962dd5d781f3cb62a67cd769534df2cb3feee97a/pyclipper-1.4.0-cp312-cp312-win32.whl", hash = "sha256:3ef44b64666ebf1cb521a08a60c3e639d21b8c50bfbe846ba7c52a0415e936f4", size = 95169, upload-time = "2025-12-01T13:15:10.098Z" }, - { url = "https://files.pythonhosted.org/packages/97/77/dfea08e3b230b82ee22543c30c35d33d42f846a77f96caf7c504dd54fab1/pyclipper-1.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:d1e5498d883b706a4ce636247f0d830c6eb34a25b843a1b78e2c969754ca9037", size = 104619, upload-time = "2025-12-01T13:15:11.592Z" }, - { url = "https://files.pythonhosted.org/packages/18/59/81050abdc9e5b90ffc2c765738c5e40e9abd8e44864aaa737b600f16c562/pyclipper-1.4.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:98b2a40f98e1fc1b29e8a6094072e7e0c7dfe901e573bf6cfc6eb7ce84a7ae87", size = 126495, upload-time = "2025-12-01T13:15:33.743Z" }, -] - -[[package]] -name = "pydantic" -version = "2.12.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "annotated-types" }, - { name = "pydantic-core" }, - { name = "typing-extensions" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = 
"sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, -] - -[[package]] -name = "pydantic-core" -version = "2.41.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = "2025-11-04T13:39:31.373Z" }, - { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, - { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, - { url = "https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, - { url = 
"https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, - { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, - { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, - { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, - { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = "2025-11-04T13:39:46.238Z" }, - { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, - { url = 
"https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, - { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, - { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, - { url = "https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, - { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, - { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, - { url = 
"https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, - { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, - { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, - { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, - { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" }, - { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, 
upload-time = "2025-11-04T13:40:12.004Z" }, - { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, - { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, - { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, - { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, - { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, - { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, - { url = 
"https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, - { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, - { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, - { url = "https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, - { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, - { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = 
"sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, - { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, - { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, - { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, - { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, - { url = 
"https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, - { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, - { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" }, - { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, -] - -[[package]] -name = "pydantic-settings" -version = "2.13.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pydantic" }, - { name = "python-dotenv" }, - { name = "typing-inspection" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/52/6d/fffca34caecc4a3f97bda81b2098da5e8ab7efc9a66e819074a11955d87e/pydantic_settings-2.13.1.tar.gz", hash = "sha256:b4c11847b15237fb0171e1462bf540e294affb9b86db4d9aa5c01730bdbe4025", size = 223826, upload-time = "2026-02-19T13:45:08.055Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/00/4b/ccc026168948fec4f7555b9164c724cf4125eac006e176541483d2c959be/pydantic_settings-2.13.1-py3-none-any.whl", hash = "sha256:d56fd801823dbeae7f0975e1f8c8e25c258eb75d278ea7abb5d9cebb01b56237", size = 58929, upload-time = "2026-02-19T13:45:06.034Z" }, -] - [[package]] name = "pygments" version = "2.19.2" @@ -1547,12 +1048,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] -[[package]] -name = "pylatexenc" -version = "2.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" } - [[package]] name = "pymupdf" version = "1.24.10" @@ -1629,54 +1124,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, ] -[[package]] -name = "python-bidi" -version = "0.6.7" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/ed/e3/c0c8bf6fca79ac946a28d57f116e3b9e5b10a4469b6f70bf73f3744c49bf/python_bidi-0.6.7.tar.gz", hash = "sha256:c10065081c0e137975de5d9ba2ff2306286dbf5e0c586d4d5aec87c856239b41", size = 45503, upload-time = "2025-10-22T09:52:49.624Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/de/c30a13ad95239507af472a5fc2cadd2e5e172055068f12ac39b37922c7f8/python_bidi-0.6.7-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a8892a7da0f617135fe9c92dc7070d13a0f96ab3081f9db7ff5b172a3905bd78", size = 274420, upload-time = "2025-10-22T09:51:58.262Z" }, - { url = "https://files.pythonhosted.org/packages/ad/9f/be5efef7eea5f1e2a6415c4052a988f594dcf5a11a15103f2718d324a35b/python_bidi-0.6.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:06650a164e63e94dc8a291cc9d415b4027cb1cce125bc9b02dac0f34d535ed47", size = 264586, upload-time = "2025-10-22T09:51:49.255Z" }, - { url = "https://files.pythonhosted.org/packages/87/ec/2c374b6de35870817ffb3512c0666ea8c3794ef923b5586c69451e0e5395/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6df7be07af867ec1d121c92ea827efad4d77b25457c06eeab477b601e82b2340", size = 293672, upload-time = "2025-10-22T09:50:58.504Z" }, - { url = "https://files.pythonhosted.org/packages/29/1a/722d7d7128bdc9a530351a0d2fdf2ff5f4af66a865a6bca925f99832e2cc/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:73a88dc333efc42281bd800d5182c8625c6e11d109fc183fe3d7a11d48ab1150", size = 302643, upload-time = "2025-10-22T09:51:06.419Z" }, - { url = "https://files.pythonhosted.org/packages/24/d7/5b9b593dd58fc745233d8476e9f4e0edd437547c78c58340619868470349/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f24189dc3aea3a0a94391a047076e1014306b39ba17d7a38ebab510553cd1a97", size = 441692, upload-time = "2025-10-22T09:51:15.39Z" }, - { url = 
"https://files.pythonhosted.org/packages/08/b9/16e7a1db5f022da6654e89875d231ec2e044d42ef7b635feeff61cee564c/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a507fe6928a27a308e04ebf2065719b7850d1bf9ff1924f4e601ef77758812bd", size = 326933, upload-time = "2025-10-22T09:51:23.631Z" }, - { url = "https://files.pythonhosted.org/packages/e0/a6/45aaec301292c6a07a9cc3168f5d1a92c8adc2ef36a3cd1f227b9caa980c/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbbffb948a32f9783d1a28bc0c53616f0a76736ed1e7c1d62e3e99a8dfaab869", size = 302034, upload-time = "2025-10-22T09:51:41.347Z" }, - { url = "https://files.pythonhosted.org/packages/71/a3/7e42cce6e153c21b4e5cc96d429a5910909823f6fedd174b64ff67bc76a7/python_bidi-0.6.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f7e507e1e798ebca77ddc9774fd405107833315ad802cfdaa1ab07b6d9154fc8", size = 315738, upload-time = "2025-10-22T09:51:33.409Z" }, - { url = "https://files.pythonhosted.org/packages/43/7c/a5e4c0acc8e6ca61953b4add0576f0483f63b809b5389154e5da13927b0b/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:849a57d39feaf897955d0b19bbf4796bea53d1bcdf83b82e0a7b059167eb2049", size = 473968, upload-time = "2025-10-22T09:52:07.624Z" }, - { url = "https://files.pythonhosted.org/packages/b1/aa/a18bc3cbab7a0e598cbe7b89f2c0913aedcc66dcafce9a4c357465c87859/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:5ebc19f24e65a1f5c472e26d88e78b9d316e293bc6f205f32de4c4e99276336e", size = 567038, upload-time = "2025-10-22T09:52:18.594Z" }, - { url = "https://files.pythonhosted.org/packages/92/46/fc6c54a8b5bfbee50e650f885ddef4f8c4f92880467ea0bc2bf133747048/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:24388c77cb00b8aa0f9c84beb7e3e523a3dac4f786ece64a1d8175a07b24da72", size = 493970, upload-time = "2025-10-22T09:52:29.815Z" }, - { url = 
"https://files.pythonhosted.org/packages/e3/f1/2c15f5b938b2e087e4e950cc14dcead5bedbaabfc6c576dac15739bc0c91/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:19737d217088ef27014f98eac1827c5913e6fb1dea96332ed84ede61791070d9", size = 465161, upload-time = "2025-10-22T09:52:40.517Z" }, - { url = "https://files.pythonhosted.org/packages/56/d7/73a70a1fb819152485521b8dfe627e14ba9d3d5a65213244ab099adf3600/python_bidi-0.6.7-cp311-cp311-win32.whl", hash = "sha256:95c9de7ebc55ffb777548f2ecaf4b96b0fa0c92f42bf4d897b9f4cd164ec7394", size = 157033, upload-time = "2025-10-22T09:52:59.228Z" }, - { url = "https://files.pythonhosted.org/packages/68/84/06999dc54ea047fe33209af7150df4202ab7ad52deeb66b2c2040ac07884/python_bidi-0.6.7-cp311-cp311-win_amd64.whl", hash = "sha256:898db0ea3e4aaa95b7fecba02a7560dfbf368f9d85053f2875f6d610c4d4ec2c", size = 161282, upload-time = "2025-10-22T09:52:51.467Z" }, - { url = "https://files.pythonhosted.org/packages/e5/03/5b2f3e73501d0f41ebc2b075b49473047c6cdfc3465cf890263fc69e3915/python_bidi-0.6.7-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:11c51579e01f768446a7e13a0059fea1530936a707abcbeaad9467a55cb16073", size = 272536, upload-time = "2025-10-22T09:51:59.721Z" }, - { url = "https://files.pythonhosted.org/packages/31/77/c6048e938a73e5a7c6fa3d5e3627a5961109daa728c2e7d050567cecdc26/python_bidi-0.6.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47deaada8949af3a790f2cd73b613f9bfa153b4c9450f91c44a60c3109a81f73", size = 263258, upload-time = "2025-10-22T09:51:50.328Z" }, - { url = "https://files.pythonhosted.org/packages/57/56/ed4dc501cab7de70ce35cd435c86278e4eb1caf238c80bc72297767c9219/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b38ddfab41d10e780edb431edc30aec89bee4ce43d718e3896e99f33dae5c1d3", size = 292700, upload-time = "2025-10-22T09:50:59.628Z" }, - { url = 
"https://files.pythonhosted.org/packages/77/6a/1bf06d7544c940ffddd97cd0e02c55348a92163c5495fa18e34217dfbebe/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2a93b0394cc684d64356b0475858c116f1e335ffbaba388db93bf47307deadfa", size = 300881, upload-time = "2025-10-22T09:51:07.507Z" }, - { url = "https://files.pythonhosted.org/packages/22/1d/ce7577a8f50291c06e94f651ac5de0d1678fc2642af26a5dad9901a0244f/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ec1694134961b71ac05241ac989b49ccf08e232b5834d5fc46f8a7c3bb1c13a9", size = 439125, upload-time = "2025-10-22T09:51:16.559Z" }, - { url = "https://files.pythonhosted.org/packages/a3/87/4cf6dcd58e22f0fd904e7a161c6b73a5f9d17d4d49073fcb089ba62f1469/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8047c33b85f7790474a1f488bef95689f049976a4e1c6f213a8d075d180a93e4", size = 325816, upload-time = "2025-10-22T09:51:25.12Z" }, - { url = "https://files.pythonhosted.org/packages/2a/0a/4028a088e29ce8f1673e85ec9f64204fc368355c3207e6a71619c2b4579a/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d9de35eb5987da27dd81e371c52142dd8e924bd61c1006003071ea05a735587", size = 300550, upload-time = "2025-10-22T09:51:42.739Z" }, - { url = "https://files.pythonhosted.org/packages/1f/05/cac15eba462d5a2407ac4ef1c792c45a948652b00c6bd81eaab3834a62d2/python_bidi-0.6.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a99d898ad1a399d9c8cab5561b3667fd24f4385820ac90c3340aa637aa5adfc9", size = 313017, upload-time = "2025-10-22T09:51:34.905Z" }, - { url = "https://files.pythonhosted.org/packages/4b/b1/3ba91b9ea60fa54a9aa730a5fe432bd73095d55be371244584fc6818eae1/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5debaab33562fdfc79ffdbd8d9c51cf07b8529de0e889d8cd145d78137aab21e", size = 472798, upload-time = "2025-10-22T09:52:09.079Z" }, - { url = 
"https://files.pythonhosted.org/packages/50/40/4bf5fb7255e35c218174f322a4d4c80b63b2604d73adc6e32f843e700824/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c11c62a3cdb9d1426b1536de9e3446cb09c7d025bd4df125275cae221f214899", size = 565234, upload-time = "2025-10-22T09:52:19.703Z" }, - { url = "https://files.pythonhosted.org/packages/bd/81/ad23fb85bff69d0a25729cd3834254b87c3c7caa93d657c8f8edcbed08f6/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6c051f2d28ca542092d01da8b5fe110fb6191ff58d298a54a93dc183bece63bf", size = 491844, upload-time = "2025-10-22T09:52:31.216Z" }, - { url = "https://files.pythonhosted.org/packages/65/85/103baaf142b2838f583b71904a2454fa31bd2a912ff505c25874f45d6c3e/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:95867a07c5dee0ea2340fe1d0e4f6d9f5c5687d473193b6ee6f86fa44aac45d1", size = 463753, upload-time = "2025-10-22T09:52:41.943Z" }, - { url = "https://files.pythonhosted.org/packages/54/c3/6a5c3b9f42a6b188430c83a7e70a76bc7c0db3354302fce7c8ed94a0c062/python_bidi-0.6.7-cp312-cp312-win32.whl", hash = "sha256:4c73cd980d45bb967799c7f0fc98ea93ae3d65b21ef2ba6abef6a057720bf483", size = 155820, upload-time = "2025-10-22T09:53:00.254Z" }, - { url = "https://files.pythonhosted.org/packages/45/c4/683216398ee3abf6b9bb0f26ae15c696fabbe36468ba26d5271f0c11b343/python_bidi-0.6.7-cp312-cp312-win_amd64.whl", hash = "sha256:d524a4ba765bae9b950706472a77a887a525ed21144fe4b41f6190f6e57caa2c", size = 159966, upload-time = "2025-10-22T09:52:52.547Z" }, - { url = "https://files.pythonhosted.org/packages/b8/4e/6135798d84b62eea70c0f9435301c2a4ba854e87be93a3fcd1d935266d24/python_bidi-0.6.7-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c9a679b24f5c6f366a0dec75745e1abeae2f597f033d0d54c74cbe62e7e6ae28", size = 276275, upload-time = "2025-10-22T09:52:05.078Z" }, - { url = 
"https://files.pythonhosted.org/packages/74/83/2123596d43e552af9e2806e361646fa579f34a1d1e9e2c1707a0ab6a02dd/python_bidi-0.6.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:05fe5971110013610f0db40505d0b204edc756e92eafac1372a464f8b9162b11", size = 266951, upload-time = "2025-10-22T09:51:56.216Z" }, - { url = "https://files.pythonhosted.org/packages/5c/8c/8d1e1501717227a6d52fc7b9c47a3de61486b024fbdd4821bfad724c0699/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17572944e6d8fb616d111fc702c759da2bf7cedab85a3e4fa2af0c9eb95ed438", size = 295745, upload-time = "2025-10-22T09:51:04.438Z" }, - { url = "https://files.pythonhosted.org/packages/fd/ff/ef04e7f9067c2c5d862b9f8d9a192486c500c8aa295f0fb756c25ab47fc8/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3b63d19f3f56ff7f99bce5ca9ef8c811dbf0f509d8e84c1bc06105ed26a49528", size = 304123, upload-time = "2025-10-22T09:51:12.559Z" }, - { url = "https://files.pythonhosted.org/packages/be/72/b973895e257a7d4cc8365ab094612f6ee885df863a4964d8865b9f534b67/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1350033431d75be749273236dcfc808e54404cd6ece6204cdb1bc4ccc163455", size = 442484, upload-time = "2025-10-22T09:51:21.575Z" }, - { url = "https://files.pythonhosted.org/packages/c1/1a/68ca9d10bc309828e8cdb2d57a30dd7e5753ac8520c8d7a0322daeb9eef7/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c5fb99f774748de283fadf915106f130b74be1bade934b7f73a7a8488b95da1", size = 329149, upload-time = "2025-10-22T09:51:31.232Z" }, - { url = "https://files.pythonhosted.org/packages/03/40/ab450c06167a7de596d99b1ba5cee2c605b3ff184baccf08210ede706b1b/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d28e2bdcadf5b6161bb4ee9313ce41eac746ba57e744168bf723a415a11af05", size = 303529, 
upload-time = "2025-10-22T09:51:46.997Z" }, - { url = "https://files.pythonhosted.org/packages/ec/c5/585b5c413e3b77a32500fb877ea30aa23c45a6064dbd7fe77d87b72cd90b/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3777ae3e088e94df854fbcbd8d59f9239b74aac036cb6bbd19f8035c8e42478", size = 317753, upload-time = "2025-10-22T09:51:39.272Z" }, - { url = "https://files.pythonhosted.org/packages/f9/05/b7b4b447890d614ccb40633f4d65f334bcf9fe3ad13be33aaa54dcbc34f3/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:77bb4cbadf4121db395189065c58c9dd5d1950257cc1983004e6df4a3e2f97ad", size = 476054, upload-time = "2025-10-22T09:52:15.856Z" }, - { url = "https://files.pythonhosted.org/packages/ca/94/64f6d2c09c4426918345b54ca8902f94b663eadd744c9dd89070f546c9bc/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:f1fe71c203f66bc169a393964d5702f9251cfd4d70279cb6453fdd42bd2e675f", size = 568365, upload-time = "2025-10-22T09:52:27.556Z" }, - { url = "https://files.pythonhosted.org/packages/fc/d2/c39a6b82aa0fcedac7cbe6078b78bb9089b43d903f8e00859e42b504bb8e/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:d87ed09e5c9b6d2648e8856a4e556147b9d3cd4d63905fa664dd6706bc414256", size = 495292, upload-time = "2025-10-22T09:52:38.306Z" }, - { url = "https://files.pythonhosted.org/packages/0a/8d/a80f37ab92118e305d7b574306553599f81534c50b4eb23ef34ebe09c09c/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:766d5f5a686eb99b53168a7bdfb338035931a609bdbbcb537cef9e050a86f359", size = 467159, upload-time = "2025-10-22T09:52:48.603Z" }, -] - [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -1689,43 +1136,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = 
"sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] -[[package]] -name = "python-docx" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "lxml" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/f7/eddfe33871520adab45aaa1a71f0402a2252050c14c7e3009446c8f4701c/python_docx-1.2.0.tar.gz", hash = "sha256:7bc9d7b7d8a69c9c02ca09216118c86552704edc23bac179283f2e38f86220ce", size = 5723256, upload-time = "2025-06-16T20:46:27.921Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" }, -] - -[[package]] -name = "python-dotenv" -version = "1.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135, upload-time = "2026-03-01T16:00:26.196Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, -] - -[[package]] -name = "python-pptx" -version = "1.0.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "lxml" }, - { name = "pillow" }, - { name = "typing-extensions" }, - { name = "xlsxwriter" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/52/a9/0c0db8d37b2b8a645666f7fd8accea4c6224e013c42b1d5c17c93590cd06/python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095", size = 10109297, upload-time = "2024-08-07T17:33:37.772Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, -] - [[package]] name = "pytz" version = "2026.1.post1" @@ -1735,19 +1145,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/99/781fe0c827be2742bcc775efefccb3b048a3a9c6ce9aec0cbf4a101677e5/pytz-2026.1.post1-py2.py3-none-any.whl", hash = "sha256:f2fd16142fda348286a75e1a524be810bb05d444e5a081f37f7affc635035f7a", size = 510489, upload-time = "2026-03-03T07:47:49.167Z" }, ] -[[package]] -name = "pywin32" -version = "311" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/af/449a6a91e5d6db51420875c54f6aff7c97a86a3b13a0b4f1a5c13b988de3/pywin32-311-cp311-cp311-win32.whl", hash = "sha256:184eb5e436dea364dcd3d2316d577d625c0351bf237c4e9a5fabbcfa5a58b151", size = 8697031, upload-time = "2025-07-14T20:13:13.266Z" }, - { url = "https://files.pythonhosted.org/packages/51/8f/9bb81dd5bb77d22243d33c8397f09377056d5c687aa6d4042bea7fbf8364/pywin32-311-cp311-cp311-win_amd64.whl", hash = "sha256:3ce80b34b22b17ccbd937a6e78e7225d80c52f5ab9940fe0506a1a16f3dab503", size = 9508308, upload-time = "2025-07-14T20:13:15.147Z" }, - { url = "https://files.pythonhosted.org/packages/44/7b/9c2ab54f74a138c491aba1b1cd0795ba61f144c711daea84a88b63dc0f6c/pywin32-311-cp311-cp311-win_arm64.whl", hash = "sha256:a733f1388e1a842abb67ffa8e7aad0e70ac519e09b0f6a784e65a136ec7cefd2", size = 8703930, upload-time = "2025-07-14T20:13:16.945Z" }, - { url = 
"https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" }, - { url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" }, - { url = "https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" }, -] - [[package]] name = "pyyaml" version = "6.0.3" @@ -1775,20 +1172,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, ] -[[package]] -name = "referencing" -version = "0.37.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, - { name = "rpds-py" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = 
"sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, -] - [[package]] name = "regex" version = "2026.2.28" @@ -1844,85 +1227,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, ] -[[package]] -name = "rich" -version = "14.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown-it-py" }, - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, -] - -[[package]] -name = "rpds-py" -version = "0.30.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/6e/f964e88b3d2abee2a82c1ac8366da848fce1c6d834dc2132c3fda3970290/rpds_py-0.30.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425", size = 370157, upload-time = "2025-11-30T20:21:53.789Z" }, - 
{ url = "https://files.pythonhosted.org/packages/94/ba/24e5ebb7c1c82e74c4e4f33b2112a5573ddc703915b13a073737b59b86e0/rpds_py-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d", size = 359676, upload-time = "2025-11-30T20:21:55.475Z" }, - { url = "https://files.pythonhosted.org/packages/84/86/04dbba1b087227747d64d80c3b74df946b986c57af0a9f0c98726d4d7a3b/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4", size = 389938, upload-time = "2025-11-30T20:21:57.079Z" }, - { url = "https://files.pythonhosted.org/packages/42/bb/1463f0b1722b7f45431bdd468301991d1328b16cffe0b1c2918eba2c4eee/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f", size = 402932, upload-time = "2025-11-30T20:21:58.47Z" }, - { url = "https://files.pythonhosted.org/packages/99/ee/2520700a5c1f2d76631f948b0736cdf9b0acb25abd0ca8e889b5c62ac2e3/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4", size = 525830, upload-time = "2025-11-30T20:21:59.699Z" }, - { url = "https://files.pythonhosted.org/packages/e0/ad/bd0331f740f5705cc555a5e17fdf334671262160270962e69a2bdef3bf76/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97", size = 412033, upload-time = "2025-11-30T20:22:00.991Z" }, - { url = "https://files.pythonhosted.org/packages/f8/1e/372195d326549bb51f0ba0f2ecb9874579906b97e08880e7a65c3bef1a99/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89", size = 390828, upload-time = "2025-11-30T20:22:02.723Z" }, - { url = 
"https://files.pythonhosted.org/packages/ab/2b/d88bb33294e3e0c76bc8f351a3721212713629ffca1700fa94979cb3eae8/rpds_py-0.30.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d", size = 404683, upload-time = "2025-11-30T20:22:04.367Z" }, - { url = "https://files.pythonhosted.org/packages/50/32/c759a8d42bcb5289c1fac697cd92f6fe01a018dd937e62ae77e0e7f15702/rpds_py-0.30.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038", size = 421583, upload-time = "2025-11-30T20:22:05.814Z" }, - { url = "https://files.pythonhosted.org/packages/2b/81/e729761dbd55ddf5d84ec4ff1f47857f4374b0f19bdabfcf929164da3e24/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7", size = 572496, upload-time = "2025-11-30T20:22:07.713Z" }, - { url = "https://files.pythonhosted.org/packages/14/f6/69066a924c3557c9c30baa6ec3a0aa07526305684c6f86c696b08860726c/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed", size = 598669, upload-time = "2025-11-30T20:22:09.312Z" }, - { url = "https://files.pythonhosted.org/packages/5f/48/905896b1eb8a05630d20333d1d8ffd162394127b74ce0b0784ae04498d32/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85", size = 561011, upload-time = "2025-11-30T20:22:11.309Z" }, - { url = "https://files.pythonhosted.org/packages/22/16/cd3027c7e279d22e5eb431dd3c0fbc677bed58797fe7581e148f3f68818b/rpds_py-0.30.0-cp311-cp311-win32.whl", hash = "sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c", size = 221406, upload-time = "2025-11-30T20:22:13.101Z" }, - { url = 
"https://files.pythonhosted.org/packages/fa/5b/e7b7aa136f28462b344e652ee010d4de26ee9fd16f1bfd5811f5153ccf89/rpds_py-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825", size = 236024, upload-time = "2025-11-30T20:22:14.853Z" }, - { url = "https://files.pythonhosted.org/packages/14/a6/364bba985e4c13658edb156640608f2c9e1d3ea3c81b27aa9d889fff0e31/rpds_py-0.30.0-cp311-cp311-win_arm64.whl", hash = "sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229", size = 229069, upload-time = "2025-11-30T20:22:16.577Z" }, - { url = "https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" }, - { url = "https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" }, - { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" }, - { url = "https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" }, - { url = 
"https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" }, - { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" }, - { url = "https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" }, - { url = "https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" }, - { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" }, - { url = "https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" }, - { url = 
"https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" }, - { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" }, - { url = "https://files.pythonhosted.org/packages/6f/ab/d5d5e3bcedb0a77f4f613706b750e50a5a3ba1c15ccd3665ecc636c968fd/rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf", size = 223782, upload-time = "2025-11-30T20:22:37.271Z" }, - { url = "https://files.pythonhosted.org/packages/39/3b/f786af9957306fdc38a74cef405b7b93180f481fb48453a114bb6465744a/rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b", size = 240463, upload-time = "2025-11-30T20:22:39.021Z" }, - { url = "https://files.pythonhosted.org/packages/f3/d2/b91dc748126c1559042cfe41990deb92c4ee3e2b415f6b5234969ffaf0cc/rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e", size = 230868, upload-time = "2025-11-30T20:22:40.493Z" }, - { url = "https://files.pythonhosted.org/packages/69/71/3f34339ee70521864411f8b6992e7ab13ac30d8e4e3309e07c7361767d91/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58", size = 372292, upload-time = "2025-11-30T20:24:16.537Z" }, - { url = 
"https://files.pythonhosted.org/packages/57/09/f183df9b8f2d66720d2ef71075c59f7e1b336bec7ee4c48f0a2b06857653/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a", size = 362128, upload-time = "2025-11-30T20:24:18.086Z" }, - { url = "https://files.pythonhosted.org/packages/7a/68/5c2594e937253457342e078f0cc1ded3dd7b2ad59afdbf2d354869110a02/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb", size = 391542, upload-time = "2025-11-30T20:24:20.092Z" }, - { url = "https://files.pythonhosted.org/packages/49/5c/31ef1afd70b4b4fbdb2800249f34c57c64beb687495b10aec0365f53dfc4/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c", size = 404004, upload-time = "2025-11-30T20:24:22.231Z" }, - { url = "https://files.pythonhosted.org/packages/e3/63/0cfbea38d05756f3440ce6534d51a491d26176ac045e2707adc99bb6e60a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3", size = 527063, upload-time = "2025-11-30T20:24:24.302Z" }, - { url = "https://files.pythonhosted.org/packages/42/e6/01e1f72a2456678b0f618fc9a1a13f882061690893c192fcad9f2926553a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5", size = 413099, upload-time = "2025-11-30T20:24:25.916Z" }, - { url = "https://files.pythonhosted.org/packages/b8/25/8df56677f209003dcbb180765520c544525e3ef21ea72279c98b9aa7c7fb/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738", size = 392177, upload-time = 
"2025-11-30T20:24:27.834Z" }, - { url = "https://files.pythonhosted.org/packages/4a/b4/0a771378c5f16f8115f796d1f437950158679bcd2a7c68cf251cfb00ed5b/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f", size = 406015, upload-time = "2025-11-30T20:24:29.457Z" }, - { url = "https://files.pythonhosted.org/packages/36/d8/456dbba0af75049dc6f63ff295a2f92766b9d521fa00de67a2bd6427d57a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877", size = 423736, upload-time = "2025-11-30T20:24:31.22Z" }, - { url = "https://files.pythonhosted.org/packages/13/64/b4d76f227d5c45a7e0b796c674fd81b0a6c4fbd48dc29271857d8219571c/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a", size = 573981, upload-time = "2025-11-30T20:24:32.934Z" }, - { url = "https://files.pythonhosted.org/packages/20/91/092bacadeda3edf92bf743cc96a7be133e13a39cdbfd7b5082e7ab638406/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4", size = 599782, upload-time = "2025-11-30T20:24:35.169Z" }, - { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" }, -] - -[[package]] -name = "rtree" -version = "1.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/09/7302695875a019514de9a5dd17b8320e7a19d6e7bc8f85dcfb79a4ce2da3/rtree-1.4.1.tar.gz", hash = "sha256:c6b1b3550881e57ebe530cc6cffefc87cd9bf49c30b37b894065a9f810875e46", size = 52425, 
upload-time = "2025-08-13T19:32:01.413Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/d9/108cd989a4c0954e60b3cdc86fd2826407702b5375f6dfdab2802e5fed98/rtree-1.4.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d672184298527522d4914d8ae53bf76982b86ca420b0acde9298a7a87d81d4a4", size = 468484, upload-time = "2025-08-13T19:31:50.593Z" }, - { url = "https://files.pythonhosted.org/packages/f3/cf/2710b6fd6b07ea0aef317b29f335790ba6adf06a28ac236078ed9bd8a91d/rtree-1.4.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a7e48d805e12011c2cf739a29d6a60ae852fb1de9fc84220bbcef67e6e595d7d", size = 436325, upload-time = "2025-08-13T19:31:52.367Z" }, - { url = "https://files.pythonhosted.org/packages/55/e1/4d075268a46e68db3cac51846eb6a3ab96ed481c585c5a1ad411b3c23aad/rtree-1.4.1-py3-none-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:efa8c4496e31e9ad58ff6c7df89abceac7022d906cb64a3e18e4fceae6b77f65", size = 459789, upload-time = "2025-08-13T19:31:53.926Z" }, - { url = "https://files.pythonhosted.org/packages/d1/75/e5d44be90525cd28503e7f836d077ae6663ec0687a13ba7810b4114b3668/rtree-1.4.1-py3-none-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12de4578f1b3381a93a655846900be4e3d5f4cd5e306b8b00aa77c1121dc7e8c", size = 507644, upload-time = "2025-08-13T19:31:55.164Z" }, - { url = "https://files.pythonhosted.org/packages/fd/85/b8684f769a142163b52859a38a486493b05bafb4f2fb71d4f945de28ebf9/rtree-1.4.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b558edda52eca3e6d1ee629042192c65e6b7f2c150d6d6cd207ce82f85be3967", size = 1454478, upload-time = "2025-08-13T19:31:56.808Z" }, - { url = "https://files.pythonhosted.org/packages/e9/a4/c2292b95246b9165cc43a0c3757e80995d58bc9b43da5cb47ad6e3535213/rtree-1.4.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f155bc8d6bac9dcd383481dee8c130947a4866db1d16cb6dff442329a038a0dc", size = 1555140, upload-time = "2025-08-13T19:31:58.031Z" }, - { url = 
"https://files.pythonhosted.org/packages/74/25/5282c8270bfcd620d3e73beb35b40ac4ab00f0a898d98ebeb41ef0989ec8/rtree-1.4.1-py3-none-win_amd64.whl", hash = "sha256:efe125f416fd27150197ab8521158662943a40f87acab8028a1aac4ad667a489", size = 389358, upload-time = "2025-08-13T19:31:59.247Z" }, - { url = "https://files.pythonhosted.org/packages/3f/50/0a9e7e7afe7339bd5e36911f0ceb15fed51945836ed803ae5afd661057fd/rtree-1.4.1-py3-none-win_arm64.whl", hash = "sha256:3d46f55729b28138e897ffef32f7ce93ac335cb67f9120125ad3742a220800f0", size = 355253, upload-time = "2025-08-13T19:32:00.296Z" }, -] - [[package]] name = "safetensors" version = "0.7.0" @@ -1945,47 +1249,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, ] -[package.optional-dependencies] -torch = [ - { name = "numpy" }, - { name = "packaging" }, - { name = "torch" }, -] - -[[package]] -name = "scikit-image" -version = "0.26.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "imageio" }, - { name = "lazy-loader" }, - { name = "networkx" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pillow" }, - { name = "scipy" }, - { name = "tifffile" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a1/b4/2528bb43c67d48053a7a649a9666432dc307d66ba02e3a6d5c40f46655df/scikit_image-0.26.0.tar.gz", hash = "sha256:f5f970ab04efad85c24714321fcc91613fcb64ef2a892a13167df2f3e59199fa", size = 22729739, upload-time = "2025-12-20T17:12:21.824Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/16/8a407688b607f86f81f8c649bf0d68a2a6d67375f18c2d660aba20f5b648/scikit_image-0.26.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b1ede33a0fb3731457eaf53af6361e73dd510f449dac437ab54573b26788baf0", size = 12355510, 
upload-time = "2025-12-20T17:10:31.628Z" }, - { url = "https://files.pythonhosted.org/packages/6b/f9/7efc088ececb6f6868fd4475e16cfafc11f242ce9ab5fc3557d78b5da0d4/scikit_image-0.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7af7aa331c6846bd03fa28b164c18d0c3fd419dbb888fb05e958ac4257a78fdd", size = 12056334, upload-time = "2025-12-20T17:10:34.559Z" }, - { url = "https://files.pythonhosted.org/packages/9f/1e/bc7fb91fb5ff65ef42346c8b7ee8b09b04eabf89235ab7dbfdfd96cbd1ea/scikit_image-0.26.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ea6207d9e9d21c3f464efe733121c0504e494dbdc7728649ff3e23c3c5a4953", size = 13297768, upload-time = "2025-12-20T17:10:37.733Z" }, - { url = "https://files.pythonhosted.org/packages/a5/2a/e71c1a7d90e70da67b88ccc609bd6ae54798d5847369b15d3a8052232f9d/scikit_image-0.26.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74aa5518ccea28121f57a95374581d3b979839adc25bb03f289b1bc9b99c58af", size = 13711217, upload-time = "2025-12-20T17:10:40.935Z" }, - { url = "https://files.pythonhosted.org/packages/d4/59/9637ee12c23726266b91296791465218973ce1ad3e4c56fc81e4d8e7d6e1/scikit_image-0.26.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d5c244656de905e195a904e36dbc18585e06ecf67d90f0482cbde63d7f9ad59d", size = 14337782, upload-time = "2025-12-20T17:10:43.452Z" }, - { url = "https://files.pythonhosted.org/packages/e7/5c/a3e1e0860f9294663f540c117e4bf83d55e5b47c281d475cc06227e88411/scikit_image-0.26.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:21a818ee6ca2f2131b9e04d8eb7637b5c18773ebe7b399ad23dcc5afaa226d2d", size = 14805997, upload-time = "2025-12-20T17:10:45.93Z" }, - { url = "https://files.pythonhosted.org/packages/d3/c6/2eeacf173da041a9e388975f54e5c49df750757fcfc3ee293cdbbae1ea0a/scikit_image-0.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:9490360c8d3f9a7e85c8de87daf7c0c66507960cf4947bb9610d1751928721c7", size = 11878486, upload-time = "2025-12-20T17:10:48.246Z" }, - 
{ url = "https://files.pythonhosted.org/packages/c3/a4/a852c4949b9058d585e762a66bf7e9a2cd3be4795cd940413dfbfbb0ce79/scikit_image-0.26.0-cp311-cp311-win_arm64.whl", hash = "sha256:0baa0108d2d027f34d748e84e592b78acc23e965a5de0e4bb03cf371de5c0581", size = 11346518, upload-time = "2025-12-20T17:10:50.575Z" }, - { url = "https://files.pythonhosted.org/packages/99/e8/e13757982264b33a1621628f86b587e9a73a13f5256dad49b19ba7dc9083/scikit_image-0.26.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d454b93a6fa770ac5ae2d33570f8e7a321bb80d29511ce4b6b78058ebe176e8c", size = 12376452, upload-time = "2025-12-20T17:10:52.796Z" }, - { url = "https://files.pythonhosted.org/packages/e3/be/f8dd17d0510f9911f9f17ba301f7455328bf13dae416560126d428de9568/scikit_image-0.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3409e89d66eff5734cd2b672d1c48d2759360057e714e1d92a11df82c87cba37", size = 12061567, upload-time = "2025-12-20T17:10:55.207Z" }, - { url = "https://files.pythonhosted.org/packages/b3/2b/c70120a6880579fb42b91567ad79feb4772f7be72e8d52fec403a3dde0c6/scikit_image-0.26.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c717490cec9e276afb0438dd165b7c3072d6c416709cc0f9f5a4c1070d23a44", size = 13084214, upload-time = "2025-12-20T17:10:57.468Z" }, - { url = "https://files.pythonhosted.org/packages/f4/a2/70401a107d6d7466d64b466927e6b96fcefa99d57494b972608e2f8be50f/scikit_image-0.26.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7df650e79031634ac90b11e64a9eedaf5a5e06fcd09bcd03a34be01745744466", size = 13561683, upload-time = "2025-12-20T17:10:59.49Z" }, - { url = "https://files.pythonhosted.org/packages/13/a5/48bdfd92794c5002d664e0910a349d0a1504671ef5ad358150f21643c79a/scikit_image-0.26.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:cefd85033e66d4ea35b525bb0937d7f42d4cdcfed2d1888e1570d5ce450d3932", size = 14112147, upload-time = "2025-12-20T17:11:02.083Z" }, - { url = 
"https://files.pythonhosted.org/packages/ee/b5/ac71694da92f5def5953ca99f18a10fe98eac2dd0a34079389b70b4d0394/scikit_image-0.26.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3f5bf622d7c0435884e1e141ebbe4b2804e16b2dd23ae4c6183e2ea99233be70", size = 14661625, upload-time = "2025-12-20T17:11:04.528Z" }, - { url = "https://files.pythonhosted.org/packages/23/4d/a3cc1e96f080e253dad2251bfae7587cf2b7912bcd76fd43fd366ff35a87/scikit_image-0.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:abed017474593cd3056ae0fe948d07d0747b27a085e92df5474f4955dd65aec0", size = 11911059, upload-time = "2025-12-20T17:11:06.61Z" }, - { url = "https://files.pythonhosted.org/packages/35/8a/d1b8055f584acc937478abf4550d122936f420352422a1a625eef2c605d8/scikit_image-0.26.0-cp312-cp312-win_arm64.whl", hash = "sha256:4d57e39ef67a95d26860c8caf9b14b8fb130f83b34c6656a77f191fa6d1d04d8", size = 11348740, upload-time = "2025-12-20T17:11:09.118Z" }, -] - [[package]] name = "scikit-learn" version = "1.6.1" @@ -2041,19 +1304,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340, upload-time = "2026-02-23T00:19:12.024Z" }, ] -[[package]] -name = "semchunk" -version = "2.2.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mpire", extra = ["dill"] }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/62/96/c418c322730b385e81d4ab462e68dd48bb2dbda4d8efa17cad2ca468d9ac/semchunk-2.2.2.tar.gz", hash = "sha256:940e89896e64eeb01de97ba60f51c8c7b96c6a3951dfcf574f25ce2146752f52", size = 12271, upload-time = "2024-12-17T22:54:30.332Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/84/94ca7896c7df20032bcb09973e9a4d14c222507c0aadf22e89fa76bb0a04/semchunk-2.2.2-py3-none-any.whl", hash = 
"sha256:94ca19020c013c073abdfd06d79a7c13637b91738335f3b8cdb5655ee7cc94d2", size = 10271, upload-time = "2024-12-17T22:54:27.689Z" }, -] - [[package]] name = "setuptools" version = "82.0.0" @@ -2063,42 +1313,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/c6/76dc613121b793286a3f91621d7b75a2b493e0390ddca50f11993eadf192/setuptools-82.0.0-py3-none-any.whl", hash = "sha256:70b18734b607bd1da571d097d236cfcfacaf01de45717d59e6e04b96877532e0", size = 1003468, upload-time = "2026-02-08T15:08:38.723Z" }, ] -[[package]] -name = "shapely" -version = "2.1.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/4d/bc/0989043118a27cccb4e906a46b7565ce36ca7b57f5a18b78f4f1b0f72d9d/shapely-2.1.2.tar.gz", hash = "sha256:2ed4ecb28320a433db18a5bf029986aa8afcfd740745e78847e330d5d94922a9", size = 315489, upload-time = "2025-09-24T13:51:41.432Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/8d/1ff672dea9ec6a7b5d422eb6d095ed886e2e523733329f75fdcb14ee1149/shapely-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91121757b0a36c9aac3427a651a7e6567110a4a67c97edf04f8d55d4765f6618", size = 1820038, upload-time = "2025-09-24T13:50:15.628Z" }, - { url = "https://files.pythonhosted.org/packages/4f/ce/28fab8c772ce5db23a0d86bf0adaee0c4c79d5ad1db766055fa3dab442e2/shapely-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:16a9c722ba774cf50b5d4541242b4cce05aafd44a015290c82ba8a16931ff63d", size = 1626039, upload-time = "2025-09-24T13:50:16.881Z" }, - { url = "https://files.pythonhosted.org/packages/70/8b/868b7e3f4982f5006e9395c1e12343c66a8155c0374fdc07c0e6a1ab547d/shapely-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cc4f7397459b12c0b196c9efe1f9d7e92463cbba142632b4cc6d8bbbbd3e2b09", size = 3001519, upload-time = "2025-09-24T13:50:18.606Z" }, - { url = 
"https://files.pythonhosted.org/packages/13/02/58b0b8d9c17c93ab6340edd8b7308c0c5a5b81f94ce65705819b7416dba5/shapely-2.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:136ab87b17e733e22f0961504d05e77e7be8c9b5a8184f685b4a91a84efe3c26", size = 3110842, upload-time = "2025-09-24T13:50:21.77Z" }, - { url = "https://files.pythonhosted.org/packages/af/61/8e389c97994d5f331dcffb25e2fa761aeedfb52b3ad9bcdd7b8671f4810a/shapely-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:16c5d0fc45d3aa0a69074979f4f1928ca2734fb2e0dde8af9611e134e46774e7", size = 4021316, upload-time = "2025-09-24T13:50:23.626Z" }, - { url = "https://files.pythonhosted.org/packages/d3/d4/9b2a9fe6039f9e42ccf2cb3e84f219fd8364b0c3b8e7bbc857b5fbe9c14c/shapely-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ddc759f72b5b2b0f54a7e7cde44acef680a55019eb52ac63a7af2cf17cb9cd2", size = 4178586, upload-time = "2025-09-24T13:50:25.443Z" }, - { url = "https://files.pythonhosted.org/packages/16/f6/9840f6963ed4decf76b08fd6d7fed14f8779fb7a62cb45c5617fa8ac6eab/shapely-2.1.2-cp311-cp311-win32.whl", hash = "sha256:2fa78b49485391224755a856ed3b3bd91c8455f6121fee0db0e71cefb07d0ef6", size = 1543961, upload-time = "2025-09-24T13:50:26.968Z" }, - { url = "https://files.pythonhosted.org/packages/38/1e/3f8ea46353c2a33c1669eb7327f9665103aa3a8dfe7f2e4ef714c210b2c2/shapely-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:c64d5c97b2f47e3cd9b712eaced3b061f2b71234b3fc263e0fcf7d889c6559dc", size = 1722856, upload-time = "2025-09-24T13:50:28.497Z" }, - { url = "https://files.pythonhosted.org/packages/24/c0/f3b6453cf2dfa99adc0ba6675f9aaff9e526d2224cbd7ff9c1a879238693/shapely-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fe2533caae6a91a543dec62e8360fe86ffcdc42a7c55f9dfd0128a977a896b94", size = 1833550, upload-time = "2025-09-24T13:50:30.019Z" }, - { url = 
"https://files.pythonhosted.org/packages/86/07/59dee0bc4b913b7ab59ab1086225baca5b8f19865e6101db9ebb7243e132/shapely-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ba4d1333cc0bc94381d6d4308d2e4e008e0bd128bdcff5573199742ee3634359", size = 1643556, upload-time = "2025-09-24T13:50:32.291Z" }, - { url = "https://files.pythonhosted.org/packages/26/29/a5397e75b435b9895cd53e165083faed5d12fd9626eadec15a83a2411f0f/shapely-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0bd308103340030feef6c111d3eb98d50dc13feea33affc8a6f9fa549e9458a3", size = 2988308, upload-time = "2025-09-24T13:50:33.862Z" }, - { url = "https://files.pythonhosted.org/packages/b9/37/e781683abac55dde9771e086b790e554811a71ed0b2b8a1e789b7430dd44/shapely-2.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1e7d4d7ad262a48bb44277ca12c7c78cb1b0f56b32c10734ec9a1d30c0b0c54b", size = 3099844, upload-time = "2025-09-24T13:50:35.459Z" }, - { url = "https://files.pythonhosted.org/packages/d8/f3/9876b64d4a5a321b9dc482c92bb6f061f2fa42131cba643c699f39317cb9/shapely-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e9eddfe513096a71896441a7c37db72da0687b34752c4e193577a145c71736fc", size = 3988842, upload-time = "2025-09-24T13:50:37.478Z" }, - { url = "https://files.pythonhosted.org/packages/d1/a0/704c7292f7014c7e74ec84eddb7b109e1fbae74a16deae9c1504b1d15565/shapely-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:980c777c612514c0cf99bc8a9de6d286f5e186dcaf9091252fcd444e5638193d", size = 4152714, upload-time = "2025-09-24T13:50:39.9Z" }, - { url = "https://files.pythonhosted.org/packages/53/46/319c9dc788884ad0785242543cdffac0e6530e4d0deb6c4862bc4143dcf3/shapely-2.1.2-cp312-cp312-win32.whl", hash = "sha256:9111274b88e4d7b54a95218e243282709b330ef52b7b86bc6aaf4f805306f454", size = 1542745, upload-time = "2025-09-24T13:50:41.414Z" }, - { url = 
"https://files.pythonhosted.org/packages/ec/bf/cb6c1c505cb31e818e900b9312d514f381fbfa5c4363edfce0fcc4f8c1a4/shapely-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:743044b4cfb34f9a67205cee9279feaf60ba7d02e69febc2afc609047cb49179", size = 1722861, upload-time = "2025-09-24T13:50:43.35Z" }, -] - -[[package]] -name = "shellingham" -version = "1.5.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, -] - [[package]] name = "six" version = "1.17.0" @@ -2108,15 +1322,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] -[[package]] -name = "soupsieve" -version = "2.8.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = 
"sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" }, -] - [[package]] name = "sympy" version = "1.13.1" @@ -2129,15 +1334,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177, upload-time = "2024-07-19T09:26:48.863Z" }, ] -[[package]] -name = "tabulate" -version = "0.9.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, -] - [[package]] name = "tenacity" version = "9.1.4" @@ -2156,18 +1352,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, ] -[[package]] -name = "tifffile" -version = "2026.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c5/cb/2f6d79c7576e22c116352a801f4c3c8ace5957e9aced862012430b62e14f/tifffile-2026.3.3.tar.gz", hash = "sha256:d9a1266bed6f2ee1dd0abde2018a38b4f8b2935cb843df381d70ac4eac5458b7", size = 388745, upload-time = 
"2026-03-03T19:14:38.134Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/e4/e804505f87627cd8cdae9c010c47c4485fd8c1ce31a7dd0ab7fcc4707377/tifffile-2026.3.3-py3-none-any.whl", hash = "sha256:e8be15c94273113d31ecb7aa3a39822189dd11c4967e3cc88c178f1ad2fd1170", size = 243960, upload-time = "2026-03-03T19:14:35.808Z" }, -] - [[package]] name = "tokenizers" version = "0.20.3" @@ -2252,10 +1436,10 @@ dependencies = [ { name = "torch" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:089b54fb6d4f8348a07d4c460cf2da4da2de57f068154c1401b385626917d434" }, - { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:065ea2e015ef6d02ec289e0a5ecc4c8e7acd4b30a8612879637395e7e16217e4" }, - { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:e77fe770130b54fdbcecda829024fbd4235075e905f5c6019c19664577c70e1d" }, - { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:885bdd94f19f0dbad81e08c54f85ffbf10f00af8452c25d2b3b533cf2884d6b8" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:089b54fb6d4f8348a07d4c460cf2da4da2de57f068154c1401b385626917d434" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:065ea2e015ef6d02ec289e0a5ecc4c8e7acd4b30a8612879637395e7e16217e4" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:e77fe770130b54fdbcecda829024fbd4235075e905f5c6019c19664577c70e1d" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:885bdd94f19f0dbad81e08c54f85ffbf10f00af8452c25d2b3b533cf2884d6b8" }, ] 
[[package]] @@ -2268,10 +1452,10 @@ dependencies = [ { name = "torch" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:5ebe0267c872ac55b387008f772052bbf1f2fdfdd8afb011d4751e124759295e" }, - { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:4e1325aa1189f97c89ae008cf645b7de8f283853193bf68ea7750856c194b6cc" }, - { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:5d3679e0df9ab1725eaa7300d550cf8fe0a477119483bef12673957f30c768dc" }, - { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:301eefd1d4df6619fab94cae539cb0cdcb029cc992e4686ef97c8366f77cf6a4" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:5ebe0267c872ac55b387008f772052bbf1f2fdfdd8afb011d4751e124759295e" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:4e1325aa1189f97c89ae008cf645b7de8f283853193bf68ea7750856c194b6cc" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:5d3679e0df9ab1725eaa7300d550cf8fe0a477119483bef12673957f30c768dc" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:301eefd1d4df6619fab94cae539cb0cdcb029cc992e4686ef97c8366f77cf6a4" }, ] [[package]] @@ -2307,90 +1491,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl", hash = "sha256:a12ef6f52841fd190a3e5602145b542d03507222f2c64ebb7ee92e8788093aef", size = 10034536, upload-time = "2024-11-18T22:12:57.024Z" }, ] -[[package]] -name = 
"tree-sitter" -version = "0.25.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = "sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/22/88a1e00b906d26fa8a075dd19c6c3116997cb884bf1b3c023deb065a344d/tree_sitter-0.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8ca72d841215b6573ed0655b3a5cd1133f9b69a6fa561aecad40dca9029d75b", size = 146752, upload-time = "2025-09-25T17:37:24.775Z" }, - { url = "https://files.pythonhosted.org/packages/57/1c/22cc14f3910017b7a76d7358df5cd315a84fe0c7f6f7b443b49db2e2790d/tree_sitter-0.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc0351cfe5022cec5a77645f647f92a936b38850346ed3f6d6babfbeeeca4d26", size = 137765, upload-time = "2025-09-25T17:37:26.103Z" }, - { url = "https://files.pythonhosted.org/packages/1c/0c/d0de46ded7d5b34631e0f630d9866dab22d3183195bf0f3b81de406d6622/tree_sitter-0.25.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1799609636c0193e16c38f366bda5af15b1ce476df79ddaae7dd274df9e44266", size = 604643, upload-time = "2025-09-25T17:37:27.398Z" }, - { url = "https://files.pythonhosted.org/packages/34/38/b735a58c1c2f60a168a678ca27b4c1a9df725d0bf2d1a8a1c571c033111e/tree_sitter-0.25.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e65ae456ad0d210ee71a89ee112ac7e72e6c2e5aac1b95846ecc7afa68a194c", size = 632229, upload-time = "2025-09-25T17:37:28.463Z" }, - { url = "https://files.pythonhosted.org/packages/32/f6/cda1e1e6cbff5e28d8433578e2556d7ba0b0209d95a796128155b97e7693/tree_sitter-0.25.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:49ee3c348caa459244ec437ccc7ff3831f35977d143f65311572b8ba0a5f265f", size = 629861, upload-time = "2025-09-25T17:37:29.593Z" }, - { url = "https://files.pythonhosted.org/packages/f9/19/427e5943b276a0dd74c2a1f1d7a7393443f13d1ee47dedb3f8127903c080/tree_sitter-0.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:56ac6602c7d09c2c507c55e58dc7026b8988e0475bd0002f8a386cce5e8e8adc", size = 127304, upload-time = "2025-09-25T17:37:30.549Z" }, - { url = "https://files.pythonhosted.org/packages/eb/d9/eef856dc15f784d85d1397a17f3ee0f82df7778efce9e1961203abfe376a/tree_sitter-0.25.2-cp311-cp311-win_arm64.whl", hash = "sha256:b3d11a3a3ac89bb8a2543d75597f905a9926f9c806f40fcca8242922d1cc6ad5", size = 113990, upload-time = "2025-09-25T17:37:31.852Z" }, - { url = "https://files.pythonhosted.org/packages/3c/9e/20c2a00a862f1c2897a436b17edb774e831b22218083b459d0d081c9db33/tree_sitter-0.25.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ddabfff809ffc983fc9963455ba1cecc90295803e06e140a4c83e94c1fa3d960", size = 146941, upload-time = "2025-09-25T17:37:34.813Z" }, - { url = "https://files.pythonhosted.org/packages/ef/04/8512e2062e652a1016e840ce36ba1cc33258b0dcc4e500d8089b4054afec/tree_sitter-0.25.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c0c0ab5f94938a23fe81928a21cc0fac44143133ccc4eb7eeb1b92f84748331c", size = 137699, upload-time = "2025-09-25T17:37:36.349Z" }, - { url = "https://files.pythonhosted.org/packages/47/8a/d48c0414db19307b0fb3bb10d76a3a0cbe275bb293f145ee7fba2abd668e/tree_sitter-0.25.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd12d80d91d4114ca097626eb82714618dcdfacd6a5e0955216c6485c350ef99", size = 607125, upload-time = "2025-09-25T17:37:37.725Z" }, - { url = "https://files.pythonhosted.org/packages/39/d1/b95f545e9fc5001b8a78636ef942a4e4e536580caa6a99e73dd0a02e87aa/tree_sitter-0.25.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:b43a9e4c89d4d0839de27cd4d6902d33396de700e9ff4c5ab7631f277a85ead9", size = 635418, upload-time = "2025-09-25T17:37:38.922Z" }, - { url = "https://files.pythonhosted.org/packages/de/4d/b734bde3fb6f3513a010fa91f1f2875442cdc0382d6a949005cd84563d8f/tree_sitter-0.25.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbb1706407c0e451c4f8cc016fec27d72d4b211fdd3173320b1ada7a6c74c3ac", size = 631250, upload-time = "2025-09-25T17:37:40.039Z" }, - { url = "https://files.pythonhosted.org/packages/46/f2/5f654994f36d10c64d50a192239599fcae46677491c8dd53e7579c35a3e3/tree_sitter-0.25.2-cp312-cp312-win_amd64.whl", hash = "sha256:6d0302550bbe4620a5dc7649517c4409d74ef18558276ce758419cf09e578897", size = 127156, upload-time = "2025-09-25T17:37:41.132Z" }, - { url = "https://files.pythonhosted.org/packages/67/23/148c468d410efcf0a9535272d81c258d840c27b34781d625f1f627e2e27d/tree_sitter-0.25.2-cp312-cp312-win_arm64.whl", hash = "sha256:0c8b6682cac77e37cfe5cf7ec388844957f48b7bd8d6321d0ca2d852994e10d5", size = 113984, upload-time = "2025-09-25T17:37:42.074Z" }, -] - -[[package]] -name = "tree-sitter-c" -version = "0.24.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/f5/ba8cd08d717277551ade8537d3aa2a94b907c6c6e0fbcf4e4d8b1c747fa3/tree_sitter_c-0.24.1.tar.gz", hash = "sha256:7d2d0cda0b8dda428c81440c1e94367f9f13548eedca3f49768bde66b1422ad6", size = 228014, upload-time = "2025-05-24T17:32:58.384Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/15/c7/c817be36306e457c2d36cc324789046390d9d8c555c38772429ffdb7d361/tree_sitter_c-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9c06ac26a1efdcc8b26a8a6970fbc6997c4071857359e5837d4c42892d45fe1e", size = 80940, upload-time = "2025-05-24T17:32:49.967Z" }, - { url = "https://files.pythonhosted.org/packages/7a/42/283909467290b24fdbc29bb32ee20e409a19a55002b43175d66d091ca1a4/tree_sitter_c-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = 
"sha256:942bcd7cbecd810dcf7ca6f8f834391ebf0771a89479646d891ba4ca2fdfdc88", size = 86304, upload-time = "2025-05-24T17:32:51.271Z" }, - { url = "https://files.pythonhosted.org/packages/94/53/fb4f61d4e5f15ec3da85774a4df8e58d3b5b73036cf167f0203b4dd9d158/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a74cfd7a11ca5a961fafd4d751892ee65acae667d2818968a6f079397d8d28c", size = 109996, upload-time = "2025-05-24T17:32:52.119Z" }, - { url = "https://files.pythonhosted.org/packages/5e/e8/fc541d34ee81c386c5453c2596c1763e8e9cd7cb0725f39d7dfa2276afa4/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6a807705a3978911dc7ee26a7ad36dcfacb6adfc13c190d496660ec9bd66707", size = 98137, upload-time = "2025-05-24T17:32:53.361Z" }, - { url = "https://files.pythonhosted.org/packages/32/c6/d0563319cae0d5b5780a92e2806074b24afea2a07aa4c10599b899bda3ec/tree_sitter_c-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:789781afcb710df34144f7e2a20cd80e325114b9119e3956c6bd1dd2d365df98", size = 94148, upload-time = "2025-05-24T17:32:54.855Z" }, - { url = "https://files.pythonhosted.org/packages/50/5a/6361df7f3fa2310c53a0d26b4702a261c332da16fa9d801e381e3a86e25f/tree_sitter_c-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:290bff0f9c79c966496ebae45042f77543e6e4aea725f40587a8611d566231a8", size = 84703, upload-time = "2025-05-24T17:32:56.084Z" }, - { url = "https://files.pythonhosted.org/packages/22/6a/210a302e8025ac492cbaea58d3720d66b7d8034c5d747ac5e4d2d235aa25/tree_sitter_c-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:d46bbda06f838c2dcb91daf767813671fd366b49ad84ff37db702129267b46e1", size = 82715, upload-time = "2025-05-24T17:32:57.248Z" }, -] - -[[package]] -name = "tree-sitter-javascript" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/59/e0/e63103c72a9d3dfd89a31e02e660263ad84b7438e5f44ee82e443e65bbde/tree_sitter_javascript-0.25.0.tar.gz", hash = "sha256:329b5414874f0588a98f1c291f1b28138286617aa907746ffe55adfdcf963f38", size = 132338, upload-time = "2025-09-01T07:13:44.792Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/df/5106ac250cd03661ebc3cc75da6b3d9f6800a3606393a0122eca58038104/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b70f887fb269d6e58c349d683f59fa647140c410cfe2bee44a883b20ec92e3dc", size = 64052, upload-time = "2025-09-01T07:13:36.865Z" }, - { url = "https://files.pythonhosted.org/packages/b1/8f/6b4b2bc90d8ab3955856ce852cc9d1e82c81d7ab9646385f0e75ffd5b5d3/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:8264a996b8845cfce06965152a013b5d9cbb7d199bc3503e12b5682e62bb1de1", size = 66440, upload-time = "2025-09-01T07:13:37.962Z" }, - { url = "https://files.pythonhosted.org/packages/5f/c4/7da74ecdcd8a398f88bd003a87c65403b5fe0e958cdd43fbd5fd4a398fcf/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9dc04ba91fc8583344e57c1f1ed5b2c97ecaaf47480011b92fbeab8dda96db75", size = 99728, upload-time = "2025-09-01T07:13:38.755Z" }, - { url = "https://files.pythonhosted.org/packages/96/c8/97da3af4796495e46421e9344738addb3602fa6426ea695be3fcbadbee37/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:199d09985190852e0912da2b8d26c932159be314bc04952cf917ed0e4c633e6b", size = 106072, upload-time = "2025-09-01T07:13:39.798Z" }, - { url = "https://files.pythonhosted.org/packages/13/be/c964e8130be08cc9bd6627d845f0e4460945b158429d39510953bbcb8fcc/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dfcf789064c58dc13c0a4edb550acacfc6f0f280577f1e7a00de3e89fc7f8ddc", size = 104388, upload-time = 
"2025-09-01T07:13:40.866Z" }, - { url = "https://files.pythonhosted.org/packages/ee/89/9b773dee0f8961d1bb8d7baf0a204ab587618df19897c1ef260916f318ec/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1b852d3aee8a36186dbcc32c798b11b4869f9b5041743b63b65c2ef793db7a54", size = 98377, upload-time = "2025-09-01T07:13:41.838Z" }, - { url = "https://files.pythonhosted.org/packages/3b/dc/d90cb1790f8cec9b4878d278ad9faf7c8f893189ce0f855304fd704fc274/tree_sitter_javascript-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:e5ed840f5bd4a3f0272e441d19429b26eedc257abe5574c8546da6b556865e3c", size = 62975, upload-time = "2025-09-01T07:13:42.828Z" }, - { url = "https://files.pythonhosted.org/packages/2e/1f/f9eba1038b7d4394410f3c0a6ec2122b590cd7acb03f196e52fa57ebbe72/tree_sitter_javascript-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:622a69d677aa7f6ee2931d8c77c981a33f0ebb6d275aa9d43d3397c879a9bb0b", size = 61668, upload-time = "2025-09-01T07:13:43.803Z" }, -] - -[[package]] -name = "tree-sitter-python" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b8/8b/c992ff0e768cb6768d5c96234579bf8842b3a633db641455d86dd30d5dac/tree_sitter_python-0.25.0.tar.gz", hash = "sha256:b13e090f725f5b9c86aa455a268553c65cadf325471ad5b65cd29cac8a1a68ac", size = 159845, upload-time = "2025-09-11T06:47:58.159Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/64/a4e503c78a4eb3ac46d8e72a29c1b1237fa85238d8e972b063e0751f5a94/tree_sitter_python-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:14a79a47ddef72f987d5a2c122d148a812169d7484ff5c75a3db9609d419f361", size = 73790, upload-time = "2025-09-11T06:47:47.652Z" }, - { url = "https://files.pythonhosted.org/packages/e6/1d/60d8c2a0cc63d6ec4ba4e99ce61b802d2e39ef9db799bdf2a8f932a6cd4b/tree_sitter_python-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:480c21dbd995b7fe44813e741d71fed10ba695e7caab627fb034e3828469d762", 
size = 76691, upload-time = "2025-09-11T06:47:49.038Z" }, - { url = "https://files.pythonhosted.org/packages/aa/cb/d9b0b67d037922d60cbe0359e0c86457c2da721bc714381a63e2c8e35eba/tree_sitter_python-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:86f118e5eecad616ecdb81d171a36dde9bef5a0b21ed71ea9c3e390813c3baf5", size = 108133, upload-time = "2025-09-11T06:47:50.499Z" }, - { url = "https://files.pythonhosted.org/packages/40/bd/bf4787f57e6b2860f3f1c8c62f045b39fb32d6bac4b53d7a9e66de968440/tree_sitter_python-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be71650ca2b93b6e9649e5d65c6811aad87a7614c8c1003246b303f6b150f61b", size = 110603, upload-time = "2025-09-11T06:47:51.985Z" }, - { url = "https://files.pythonhosted.org/packages/5d/25/feff09f5c2f32484fbce15db8b49455c7572346ce61a699a41972dea7318/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e6d5b5799628cc0f24691ab2a172a8e676f668fe90dc60468bee14084a35c16d", size = 108998, upload-time = "2025-09-11T06:47:53.046Z" }, - { url = "https://files.pythonhosted.org/packages/75/69/4946da3d6c0df316ccb938316ce007fb565d08f89d02d854f2d308f0309f/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:71959832fc5d9642e52c11f2f7d79ae520b461e63334927e93ca46cd61cd9683", size = 107268, upload-time = "2025-09-11T06:47:54.388Z" }, - { url = "https://files.pythonhosted.org/packages/ed/a2/996fc2dfa1076dc460d3e2f3c75974ea4b8f02f6bc925383aaae519920e8/tree_sitter_python-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:9bcde33f18792de54ee579b00e1b4fe186b7926825444766f849bf7181793a76", size = 76073, upload-time = "2025-09-11T06:47:55.773Z" }, - { url = "https://files.pythonhosted.org/packages/07/19/4b5569d9b1ebebb5907d11554a96ef3fa09364a30fcfabeff587495b512f/tree_sitter_python-0.25.0-cp310-abi3-win_arm64.whl", hash = 
"sha256:0fbf6a3774ad7e89ee891851204c2e2c47e12b63a5edbe2e9156997731c128bb", size = 74169, upload-time = "2025-09-11T06:47:56.747Z" }, -] - -[[package]] -name = "tree-sitter-typescript" -version = "0.23.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1e/fc/bb52958f7e399250aee093751e9373a6311cadbe76b6e0d109b853757f35/tree_sitter_typescript-0.23.2.tar.gz", hash = "sha256:7b167b5827c882261cb7a50dfa0fb567975f9b315e87ed87ad0a0a3aedb3834d", size = 773053, upload-time = "2024-11-11T02:36:11.396Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/28/95/4c00680866280e008e81dd621fd4d3f54aa3dad1b76b857a19da1b2cc426/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3cd752d70d8e5371fdac6a9a4df9d8924b63b6998d268586f7d374c9fba2a478", size = 286677, upload-time = "2024-11-11T02:35:58.839Z" }, - { url = "https://files.pythonhosted.org/packages/8f/2f/1f36fda564518d84593f2740d5905ac127d590baf5c5753cef2a88a89c15/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c7cc1b0ff5d91bac863b0e38b1578d5505e718156c9db577c8baea2557f66de8", size = 302008, upload-time = "2024-11-11T02:36:00.733Z" }, - { url = "https://files.pythonhosted.org/packages/96/2d/975c2dad292aa9994f982eb0b69cc6fda0223e4b6c4ea714550477d8ec3a/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b1eed5b0b3a8134e86126b00b743d667ec27c63fc9de1b7bb23168803879e31", size = 351987, upload-time = "2024-11-11T02:36:02.669Z" }, - { url = "https://files.pythonhosted.org/packages/49/d1/a71c36da6e2b8a4ed5e2970819b86ef13ba77ac40d9e333cb17df6a2c5db/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e96d36b85bcacdeb8ff5c2618d75593ef12ebaf1b4eace3477e2bdb2abb1752c", size = 344960, upload-time = "2024-11-11T02:36:04.443Z" }, - { url = 
"https://files.pythonhosted.org/packages/7f/cb/f57b149d7beed1a85b8266d0c60ebe4c46e79c9ba56bc17b898e17daf88e/tree_sitter_typescript-0.23.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8d4f0f9bcb61ad7b7509d49a1565ff2cc363863644a234e1e0fe10960e55aea0", size = 340245, upload-time = "2024-11-11T02:36:06.473Z" }, - { url = "https://files.pythonhosted.org/packages/8b/ab/dd84f0e2337296a5f09749f7b5483215d75c8fa9e33738522e5ed81f7254/tree_sitter_typescript-0.23.2-cp39-abi3-win_amd64.whl", hash = "sha256:3f730b66396bc3e11811e4465c41ee45d9e9edd6de355a58bbbc49fa770da8f9", size = 278015, upload-time = "2024-11-11T02:36:07.631Z" }, - { url = "https://files.pythonhosted.org/packages/9f/e4/81f9a935789233cf412a0ed5fe04c883841d2c8fb0b7e075958a35c65032/tree_sitter_typescript-0.23.2-cp39-abi3-win_arm64.whl", hash = "sha256:05db58f70b95ef0ea126db5560f3775692f609589ed6f8dd0af84b7f19f1cbb7", size = 274052, upload-time = "2024-11-11T02:36:09.514Z" }, -] - [[package]] name = "triton" version = "3.2.0" @@ -2400,21 +1500,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365, upload-time = "2025-01-22T19:13:24.648Z" }, ] -[[package]] -name = "typer" -version = "0.16.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "rich" }, - { name = "shellingham" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/43/78/d90f616bf5f88f8710ad067c1f8705bf7618059836ca084e5bb2a0855d75/typer-0.16.1.tar.gz", hash = "sha256:d358c65a464a7a90f338e3bb7ff0c74ac081449e53884b12ba658cbd72990614", size = 102836, upload-time = "2025-08-18T19:18:22.898Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/2d/76/06dbe78f39b2203d2a47d5facc5df5102d0561e2807396471b5f7c5a30a1/typer-0.16.1-py3-none-any.whl", hash = "sha256:90ee01cb02d9b8395ae21ee3368421faf21fa138cb2a541ed369c08cec5237c9", size = 46397, upload-time = "2025-08-18T19:18:21.663Z" }, -] - [[package]] name = "typing-extensions" version = "4.15.0" @@ -2424,18 +1509,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] -[[package]] -name = "typing-inspection" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, -] - [[package]] name = "tzdata" version = "2025.3" @@ -2494,15 +1567,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993, upload-time = "2026-03-06T02:53:12.905Z" }, ] -[[package]] -name = "xlsxwriter" -version = "3.2.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/46/2c/c06ef49dc36e7954e55b802a8b231770d286a9758b3d936bd1e04ce5ba88/xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c", size = 215940, upload-time = "2025-09-16T00:16:21.63Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" }, -] - [[package]] name = "yarl" version = "1.23.0" diff --git a/dependency_setup/dependency_notes.md b/dependency_setup/dependency_notes.md index c7a5b58..e0f7707 100644 --- a/dependency_setup/dependency_notes.md +++ b/dependency_setup/dependency_notes.md @@ -2,7 +2,7 @@ ## Environment Profiles - **Docling** – main GlossAPI environment for extraction, cleaning, sectioning, annotation, and math/code enrichment. Uses `requirements-glossapi-docling.txt`. -- **DeepSeek** – dedicated OCR runtime managed with `uv`. Pins the tested Torch/Transformers stack in `dependency_setup/deepseek_uv/pyproject.toml`. +- **DeepSeek** – dedicated OCR runtime managed with `uv`. Pins the tested Torch/Transformers stack in `dependency_setup/deepseek_uv/pyproject.toml` and intentionally excludes the Docling layout stack. 
Recommended installation commands: ```bash diff --git a/dependency_setup/requirements-glossapi-docling.txt b/dependency_setup/requirements-glossapi-docling.txt index 402261a..73cb17f 100644 --- a/dependency_setup/requirements-glossapi-docling.txt +++ b/dependency_setup/requirements-glossapi-docling.txt @@ -1,6 +1,6 @@ # Core GlossAPI runtime (Docling extraction/layout) maturin>=1.5,<2.0 -numpy<2 +numpy>=1.26,<3 pandas>=1.3.0 python-dateutil>=2.8.2 pytz>=2021.1 @@ -16,10 +16,10 @@ tqdm>=4.67.0 pyyaml>=6.0 pypdfium2>=4.0.0 zstandard>=0.22.0 -docling==2.48.0 -docling-core==2.47.0 -docling-parse==4.4.0 -docling-ibm-models==3.9.1 +docling==2.81.0 +docling-core==2.70.2 +docling-parse==5.6.0 +docling-ibm-models==3.12.0 msgspec>=0.18.6 fpdf2>=2.7.0 cachetools diff --git a/dependency_setup/requirements-glossapi-vanilla.txt b/dependency_setup/requirements-glossapi-vanilla.txt index b13df49..eca76ba 100644 --- a/dependency_setup/requirements-glossapi-vanilla.txt +++ b/dependency_setup/requirements-glossapi-vanilla.txt @@ -1,6 +1,6 @@ # Core GlossAPI runtime (Docling without GPU OCR extras) maturin>=1.5,<2.0 -numpy<2 +numpy>=1.26,<3 pandas>=1.3.0 python-dateutil>=2.8.2 pytz>=2021.1 @@ -16,10 +16,10 @@ tqdm>=4.67.0 pyyaml>=6.0 pypdfium2>=4.0.0 zstandard>=0.22.0 -docling==2.48.0 -docling-core==2.47.0 -docling-parse==4.4.0 -docling-ibm-models==3.9.1 +docling==2.81.0 +docling-core==2.70.2 +docling-parse==5.6.0 +docling-ibm-models==3.12.0 msgspec>=0.18.6 fpdf2>=2.7.0 cachetools diff --git a/dependency_setup/setup_glossapi.sh b/dependency_setup/setup_glossapi.sh index 024095e..70e9754 100755 --- a/dependency_setup/setup_glossapi.sh +++ b/dependency_setup/setup_glossapi.sh @@ -77,9 +77,6 @@ case "${MODE}" in warn "Mode 'vanilla' is deprecated; using 'docling' instead." MODE="docling" ;; - rapidocr) - error "RapidOCR setup has been removed. Use --mode docling or --mode deepseek." - ;; docling|deepseek) ;; *) echo "Invalid mode '${MODE}'. Expected docling or deepseek." 
>&2 diff --git a/docs/api/corpus.md b/docs/api/corpus.md index 2fb796c..8b740d6 100644 --- a/docs/api/corpus.md +++ b/docs/api/corpus.md @@ -40,6 +40,7 @@ extract( skip_existing: bool = True, use_gpus: str = 'single', # 'single'|'multi' devices: list[int] | None = None, + workers_per_device: int = 1, use_cls: bool = False, benchmark_mode: bool = False, export_doc_json: bool = True, @@ -52,9 +53,10 @@ extract( - files already present in `downloads/` - or explicit `file_paths` - Important parameters: - - `phase1_backend='safe'|'docling'|'auto'`: PyPDFium for stability vs Docling for native layout/OCR - - `force_ocr=True`: turn on OCR during extraction + - `phase1_backend='safe'|'docling'|'auto'`: PyPDFium for stability vs Docling for native layout extraction + - `force_ocr`: deprecated no-op kept for compatibility; OCR remediation now lives in `Corpus.ocr(backend='deepseek')` - `use_gpus='multi'`: use all visible GPUs through a shared work queue + - `workers_per_device`: fan out more than one extraction worker onto a single visible GPU when measuring throughput - `export_doc_json=True`: write `json/.docling.json(.zst)` - `emit_formula_index=True`: also write `json/.formula_index.jsonl` - Main outputs: diff --git a/docs/api_corpus_tmp.md b/docs/api_corpus_tmp.md index 4181094..e584308 100644 --- a/docs/api_corpus_tmp.md +++ b/docs/api_corpus_tmp.md @@ -44,7 +44,7 @@ extract( ) -> None ``` -- Phase‑1 extraction; set `force_ocr=True` for OCR. +- Phase‑1 extraction; `force_ocr` is deprecated and ignored. - Docling layout JSON now writes by default (`json/.docling.json(.zst)`); set `emit_formula_index=True` to also produce `json/.formula_index.jsonl`. - Set `use_gpus='multi'` to use all visible GPUs (shared queue). @@ -85,7 +85,7 @@ ocr( ) -> None ``` -- Convenience shim that re‑runs `extract(force_ocr=True)` on cleaner-flagged documents and, by default, performs math/code enrichment unless `math_enhance=False`. 
+- Convenience shim that re-runs OCR on cleaner-flagged documents and, by default, performs math/code enrichment unless `math_enhance=False`. ## formula_enrich_from_json() diff --git a/docs/architecture/deepseek_only_upgrade_roadmap.md b/docs/architecture/deepseek_only_upgrade_roadmap.md deleted file mode 100644 index 6ebac64..0000000 --- a/docs/architecture/deepseek_only_upgrade_roadmap.md +++ /dev/null @@ -1,262 +0,0 @@ -# DeepSeek-Only Upgrade Roadmap - -This document describes the planned migration from a mixed OCR stack to a simpler pipeline that keeps Docling for extraction and structure, but uses DeepSeek as the only OCR backend. - -## Current status - -As of March 9, 2026, the following work has already been completed: - -- DeepSeek is the only supported OCR remediation backend in the pipeline -- stub execution is rejected for real OCR runs -- the dedicated DeepSeek runtime is managed through the uv-based setup flow -- RapidOCR implementation files and install profile have been removed -- real extract -> clean/evaluate -> OCR -> section validation has been run on capped Pergamos samples -- OCR progress artifacts were moved out of the canonical `markdown/` tree so downstream stages no longer treat them as real documents - -The following work is intentionally not part of the completed set yet: - -- Docling dependency upgrades -- page-level OCR reevaluation experiments -- broader corpus-level comparative benchmarking beyond the capped validation runs - -## Remaining TODO to wrap up the implemented changes - -These are the remaining tasks for closing out the already-implemented migration work: - -1. review and curate the final commit contents -2. keep only source, docs, and test changes that belong in the `development` branch -3. exclude local artifacts, downloaded models, disposable environments, and ad hoc validation output from the commit -4. optionally run one more small real-PDF compatibility slice if an extra release-confidence check is desired -5. 
create or switch to the `development` branch and push the finalized change set there - -This means the migration implementation itself is effectively done; what remains is mainly release hygiene and branch preparation. - -## Target architecture - -The target shape is: - -1. `download()` -2. `extract()` via safe backend or Docling -3. `clean()` and compute Greek-quality routing -4. `ocr()` via DeepSeek only for documents that need remediation -5. `section()` -6. `annotate()` -7. `export()` - -Important boundary: - -- keep `Docling` for extraction, layout, Markdown, JSON artifacts, and optional formula/code enrichment -- remove `RapidOCR` from the OCR path and installation surface -- enforce `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` for production and release validation - -This is a simplification, not a redesign of the entire pipeline contract. - -## Why this direction - -The current mixed OCR surface adds complexity in three places: - -- dependency installation and CUDA compatibility -- runtime branching and operational support burden -- validation burden when one OCR path succeeds and another fails differently - -The simplified design still preserves the important current properties: - -- selective OCR after Greek-quality validation -- Docling-generated layout and JSON artifacts for downstream stages -- explicit operational metadata and rerun semantics - -## Stage 1: DeepSeek-only OCR - -Goal: - -- make DeepSeek the only OCR remediation backend -- remove silent stub fallback from production paths - -Changes: - -- remove `rapidocr` as a supported OCR backend -- route `Corpus.ocr()` to DeepSeek only -- fail hard when DeepSeek runtime, weights, or CLI are unavailable -- keep the current document-level `needs_ocr` selection model - -Do not change in this stage: - -- Docling extraction contract -- sectioning and annotation behavior -- page-level routing policy -- formula/code enrichment policy - -Why this stage exists: - -- it gives the desired simplification without changing 
the rest of the pipeline contract at the same time -- it isolates OCR-engine risk from Docling-upgrade risk - -Success criteria: - -- no remaining production path imports or dispatches RapidOCR -- no final validation run succeeds via stub output -- documents flagged `needs_ocr=True` can still be remediated through DeepSeek - -Status: - -- completed - -## Stage 2: Installation simplification - -Goal: - -- reduce the environment surface to what the simplified pipeline actually needs - -Changes: - -- remove the `rapidocr` install profile and `onnxruntime-gpu` -- simplify setup profiles around: - - Docling extraction/runtime - - DeepSeek OCR runtime -- remove unused requirement baggage where it is not imported by GlossAPI itself -- make Python version constraints match current upstream reality - -Current constraint to fix: - -- GlossAPI currently declares `requires-python = ">=3.8"` while current Docling requires Python `>=3.10` - -Do not change in this stage: - -- pipeline behavior -- artifact layout -- OCR routing logic - -Why this stage exists: - -- environment simplification should follow architectural simplification -- it is easier to reason about required packages once RapidOCR is gone - -Success criteria: - -- setup documentation exposes only the supported environments -- install instructions no longer mention removed OCR components -- Python floor and dependency pins are internally consistent - -Status: - -- completed for the currently supported DeepSeek-only flow -- final branch hygiene and commit curation still remain - -## Stage 3: Docling upgrade - -Goal: - -- upgrade Docling after the OCR surface has already been simplified - -Changes: - -- update `docling` -- update `docling-core` -- update `docling-parse` -- update `docling-ibm-models` -- adapt any compatibility shims required by changed public APIs - -Do not change in this stage: - -- DeepSeek-only OCR decision -- page-level experiment -- formula/code enrichment policy unless explicitly validated - -Why 
this stage exists: - -- upgrading Docling before removing RapidOCR combines two unrelated breakage sources -- after Stage 1 the Docling integration surface is smaller and easier to validate - -Success criteria: - -- Phase-1 extraction still produces the documented canonical artifacts -- downstream sectioning, annotation, and export still consume the outputs -- metadata and resumability behavior do not regress - -Status: - -- deferred - -## Stage 4: Re-evaluate retained Docling capabilities - -Goal: - -- decide which Docling-powered features remain justified after the simplification - -Features to evaluate: - -- formula enrichment -- code enrichment -- table structure extraction -- any extra model/artifact prefetch currently required for non-default functionality - -Why this stage exists: - -- some capabilities may still be valuable for technical corpora -- some may only be increasing runtime and failure surface - -Rule: - -- do not remove formula/code enrichment just because it simplifies the stack -- remove it only if real-corpus evaluation shows little or no value - -Success criteria: - -- every retained capability has a measurable purpose -- every removed capability has an explicit evaluation-based justification - -Status: - -- pending - -## Stage 5: Page-level reevaluation experiment - -Goal: - -- test whether whole-document OCR reruns should be replaced or complemented by page-level escalation - -Experiment shape: - -- baseline branch: current document-level `needs_ocr` routing -- experiment branch: page-level or ROI-level routing - -What stays fixed: - -- DeepSeek remains the only OCR backend -- Docling remains the structured extraction/layout path - -Why this is separate: - -- it is an architectural experiment, not a prerequisite for the OCR simplification -- it should be compared against the stabilized DeepSeek-only baseline - -Primary evaluation questions: - -- does page-level escalation improve quality on long PDFs -- does it reduce OCR runtime and GPU 
cost -- does it preserve downstream sectioning and annotation quality - -Status: - -- pending - -## Non-goals for the first pass - -These are intentionally out of scope for the initial migration: - -- replacing Docling JSON/layout artifacts with DeepSeek-native structured artifacts -- merging all runtime concerns into one universal environment regardless of ecosystem constraints -- changing artifact layout at the same time as OCR simplification -- treating synthetic, mocked, or stubbed tests as sufficient release validation - -## Release sequence - -The intended order is: - -1. DeepSeek-only OCR and no-stub enforcement -2. installation simplification -3. Docling upgrade -4. retained-capability review -5. page-level experiment - -This order keeps one major architectural assumption changing at a time. diff --git a/docs/code_map.md b/docs/code_map.md index 97f12d5..8616def 100644 --- a/docs/code_map.md +++ b/docs/code_map.md @@ -8,8 +8,8 @@ without reading the entire repo. | Area | Main code | Responsibility | | --- | --- | --- | -| Public package entry | `src/glossapi/__init__.py` | Applies the RapidOCR patch on import and exports `Corpus`, `GlossSectionClassifier`, `GlossDownloader`, and related classes. | -| High-level orchestration | `src/glossapi/corpus.py` | Coordinates the end-to-end pipeline and owns the main folder/artifact conventions. | +| Public package entry | `src/glossapi/__init__.py` | Lazy-exports `Corpus`, `GlossSectionClassifier`, `GlossDownloader`, and related classes without pulling heavy runtime dependencies at import time. | +| High-level orchestration | `src/glossapi/corpus/corpus_orchestrator.py` | Coordinates the end-to-end pipeline and owns the main folder/artifact conventions. | | Phase-1 extraction engine | `src/glossapi/gloss_extract.py` | Builds/reuses Docling converters, handles safe vs Docling backend selection, batching, timeouts, resumption, and artifact export. 
| ## Pipeline Stages @@ -28,12 +28,11 @@ without reading the entire repo. | File | Responsibility | | --- | --- | -| `src/glossapi/_pipeline.py` | Canonical builders for layout-only and RapidOCR-backed Docling pipelines. | -| `src/glossapi/rapidocr_safe.py` | Monkey-patch/shim for Docling 2.48.x so problematic OCR crops do not crash whole documents. | -| `src/glossapi/_rapidocr_paths.py` | Resolves packaged RapidOCR ONNX models and Greek keys, with env-var override support. | -| `src/glossapi/ocr_pool.py` | Reuses RapidOCR model instances where possible. | -| `src/glossapi/json_io.py` | Writes and reads compressed Docling JSON artifacts. | -| `src/glossapi/triage.py` | Summarizes per-page formula density and updates parquet routing metadata. | +| `src/glossapi/ocr/docling/pipeline.py` | Canonical builder for the layout-only Docling Phase-1 pipeline, including runtime tuning knobs for the current Docling API. | +| `src/glossapi/ocr/docling_pipeline.py` | Compatibility re-export for the canonical Docling pipeline builder. | +| `src/glossapi/ocr/deepseek/runner.py` | Launches the DeepSeek OCR remediation path from `Corpus.ocr()`. | +| `src/glossapi/ocr/utils/json_io.py` | Writes and reads compressed Docling JSON artifacts. | +| `src/glossapi/corpus/phase_ocr_math.py` | Runs DeepSeek OCR remediation, math/code enrichment, and parquet status updates. | | `src/glossapi/metrics.py` | Computes per-page parse/OCR/formula metrics from Docling conversions. | ## Rust Extensions @@ -50,12 +49,12 @@ without reading the entire repo. | `tests/test_pipeline_smoke.py` | Best high-level example of the intended artifact flow through extract -> clean -> OCR -> section. | | `tests/test_corpus_guards.py` | Shows the contract around backend selection and GPU preflight. | | `tests/test_jsonl_export.py` | Shows how final JSONL export merges cleaned markdown, parquet metadata, and math metrics. 
| -| `tests/test_rapidocr_patch.py` | Covers the Docling/RapidOCR compatibility patch and fallback paths. | +| `tests/test_ocr_dispatch_backends.py` | Covers the DeepSeek-only OCR dispatch contract and backend validation. | ## If You Need To Change... - Download scheduling or resume behavior: start in `src/glossapi/gloss_downloader.py`. -- Phase-1 parsing, OCR selection, or artifact generation: start in `src/glossapi/corpus.py` and `src/glossapi/gloss_extract.py`. -- Docling/RapidOCR wiring or provider issues: start in `src/glossapi/_pipeline.py`, `src/glossapi/rapidocr_safe.py`, and `src/glossapi/_rapidocr_paths.py`. +- Phase-1 parsing, worker fanout, or artifact generation: start in `src/glossapi/corpus/phase_extract.py`, `src/glossapi/corpus/corpus_orchestrator.py`, and `src/glossapi/gloss_extract.py`. +- Docling pipeline wiring or runtime tuning: start in `src/glossapi/ocr/docling/pipeline.py` and `src/glossapi/gloss_extract.py`. - Section labels or section-annotation rules: start in `src/glossapi/gloss_section_classifier.py`. -- Output folder contracts or stage sequencing: start in `src/glossapi/corpus.py`. +- Output folder contracts or stage sequencing: start in `src/glossapi/corpus/corpus_orchestrator.py`. diff --git a/docs/configuration.md b/docs/configuration.md index 0810530..f8dd8de 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -18,6 +18,17 @@ GlossAPI exposes two Phase‑1 profiles. Use `Corpus.extract(..., phase1_backend Regardless of backend, the extractor clamps OMP/OpenBLAS/MKL pools to one thread per worker so multi‑GPU runs do not explode thread counts. +### Docling Runtime Tuning + +These optional knobs map directly to current Docling `PdfPipelineOptions` fields and are mainly useful for benchmarking on strong GPUs: + +- `GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE`: override Docling `layout_batch_size`. +- `GLOSSAPI_DOCLING_TABLE_BATCH_SIZE`: override Docling `table_batch_size`. 
+- `GLOSSAPI_DOCLING_OCR_BATCH_SIZE`: override Docling `ocr_batch_size` even though Phase‑1 OCR stays disabled. +- `GLOSSAPI_DOCLING_QUEUE_MAX_SIZE`: override Docling `queue_max_size`. +- `GLOSSAPI_DOCLING_DOCUMENT_TIMEOUT`: override Docling `document_timeout`. +- `GLOSSAPI_DOCLING_BATCH_POLL_INTERVAL`: override Docling `batch_polling_interval_seconds`. + ### DeepSeek optional dependencies Install DeepSeek backend extras to enable the DeepSeek OCR path. The recommended path is the dedicated `uv` environment: @@ -27,6 +38,7 @@ Install DeepSeek backend extras to enable the DeepSeek OCR path. The recommended ``` When using `backend='deepseek'`, equations are included inline in the OCR output; Phase‑2 math flags are accepted but skipped. +The dedicated uv profile is OCR-only and does not install the Docling extraction stack. ### DeepSeek runtime controls diff --git a/docs/getting_started.md b/docs/getting_started.md index 94a2325..e86d492 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -27,6 +27,7 @@ Use `dependency_setup/setup_glossapi.sh` for the main Docling environment and `d ``` `setup_glossapi.sh --mode deepseek` delegates to the same uv-based installer. Inspect `dependency_setup/dependency_notes.md` for the current pins and validation runs. Both setup paths install GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. +The dedicated DeepSeek uv environment is intentionally OCR-only: it installs `glossapi[deepseek]` and leaves Docling in the main environment. **DeepSeek runtime checklist** - Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the real runtime is reachable. diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index b1b8956..feb3283 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -8,10 +8,11 @@ file paths**, so no worker rescans directories. 
```python from glossapi import Corpus c = Corpus('IN', 'OUT') -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', workers_per_device=2) ``` - Workers are bound using `CUDA_VISIBLE_DEVICES=` and run Docling on `cuda:0` relative to each worker. +- `workers_per_device` defaults to `1`; raise it only when benchmarking a strong GPU such as an A100. - Threads auto‑tune when `num_threads=None` (roughly `min(cpu_count, 2 * #GPUs)`). Override explicitly if needed. - The controller persists extraction progress in `download_results/download_results.parquet` after each reported batch, so interrupted runs can resume cleanly without ad-hoc checkpoint files. diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index f401829..1c2b630 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -86,7 +86,7 @@ If you need Phase‑2 math on files that do not require OCR, run `math_only` aft Phase‑1 (extract): ```python -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', workers_per_device=2) ``` Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. diff --git a/docs/pipeline.md b/docs/pipeline.md index 2f4b9dd..2c00354 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -11,7 +11,7 @@ The `Corpus` class is the stable surface of the project. 
New functionality shoul | Stage | Main code | Typical inputs | Important parameters | Main outputs | | --- | --- | --- | --- | --- | | Download | `Corpus.download()`, `GlossDownloader.download_files()` | metadata parquet with a URL column | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | -| Extract (Phase‑1) | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.extract_path()` | files in `downloads/` or explicit paths | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `devices`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | +| Extract (Phase‑1) | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.extract_path()` | files in `downloads/` or explicit paths | `input_format`, `phase1_backend`, `use_gpus`, `devices`, `workers_per_device`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | | Clean | `Corpus.clean()` | `markdown/*.md` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, cleaner report parquet, parquet flags such as `filter` and `needs_ocr` | | OCR retry | `Corpus.ocr(mode='ocr_bad'...)` | parquet rows flagged by cleaner | `mode`, `fix_bad`, `use_gpus`, `devices` | refreshed `markdown/.md`, refreshed cleaner/parquet metadata | | Phase‑2 enrich | `Corpus.ocr(mode='math_only'...)`, `Corpus.formula_enrich_from_json()` | `json/.docling.json(.zst)` and optional formula index | `math_enhance`, `math_batch_size`, `math_dpi_base`, `targets_by_stem` | updated `markdown/.md`, `json/.latex_map.jsonl` | @@ -42,9 +42,11 @@ The `Corpus` class is the stable surface of the project. 
New functionality shoul - or explicit `file_paths` - Important parameters: - `phase1_backend='safe'|'docling'|'auto'` - - `force_ocr=True` to turn on OCR during extraction - `use_gpus='single'|'multi'` + - `workers_per_device` to fan out more than one extraction worker onto each GPU - `export_doc_json` and `emit_formula_index` for later Phase‑2 work +- Operational note: + - `force_ocr` is deprecated and ignored in Phase‑1; use `Corpus.ocr(backend='deepseek')` after `clean()` for OCR remediation - Main outputs: - canonical markdown in `markdown/.md` - optional Docling JSON and index artifacts in `json/` diff --git a/docs/testing/compatibility_matrix.md b/docs/testing/compatibility_matrix.md index 0c00d59..29a5e15 100644 --- a/docs/testing/compatibility_matrix.md +++ b/docs/testing/compatibility_matrix.md @@ -97,8 +97,8 @@ The following must remain true unless a change explicitly revises the contract a | ID | Level | Contract | Input | Run | Pass criteria | Negative assertions | | --- | --- | --- | --- | --- | --- | --- | -| `ENV-001` | L0 | Python and packaging | Fresh environment | install supported profile(s) | install completes on supported Python floor | no reference to removed RapidOCR profile | -| `ENV-002` | L0 | Dependency simplification | Fresh environment | import `glossapi`, `glossapi.ocr.deepseek`, extract-path modules | imports succeed | no runtime import of removed RapidOCR modules | +| `ENV-001` | L0 | Python and packaging | Fresh environment | install supported profile(s) | install completes on supported Python floor | no reference to removed legacy OCR install modes | +| `ENV-002` | L0 | Dependency simplification | Fresh environment | import `glossapi`, `glossapi.ocr.deepseek`, extract-path modules | imports succeed | no dead imports from removed OCR integrations | | `EXT-001` | L1 | Safe Phase-1 extraction | lightweight corpus | `Corpus.extract(input_format="pdf")` | canonical Markdown produced | extraction must not depend on OCR extras | | 
`EXT-002` | L2 | Docling Phase-1 extraction | real PDFs | `Corpus.extract(..., phase1_backend="docling", export_doc_json=True)` | Markdown, Docling JSON, metrics written to documented locations | artifact layout must not drift | | `CLN-001` | L1/L2 | Cleaner metadata contract | extracted docs | `clean(drop_bad=False)` | metadata parquet updated with routing-relevant fields | no collapse of `needs_ocr` behavior | @@ -129,7 +129,7 @@ Critical checks: - packaging metadata uses a supported Python minimum - setup docs expose only supported install paths -- removal of RapidOCR does not leave dead imports or entrypoints +- removal of the old OCR integration does not leave dead GlossAPI imports or entrypoints ## Extraction contract @@ -256,7 +256,7 @@ This keeps low-level compatibility failures from being confused with downstream - DeepSeek-only OCR path works on real PDFs - no-stub enforcement verified -- no remaining release dependency on RapidOCR +- no supported GlossAPI OCR backend remains besides DeepSeek ### Stage 2 exit criteria diff --git a/pyproject.toml b/pyproject.toml index 3c045db..f1613ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ requires-python = ">=3.10" dependencies = [ # Core pipeline deps "pandas>=1.3.0", - "numpy<2", + "numpy>=1.26,<3", "scikit-learn==1.6.1", "joblib>=1.0.0", "dask>=2022.1.0", @@ -43,7 +43,7 @@ browser = [ ] # Docling extraction/layout stack docling = [ - "docling==2.48.0", + "docling==2.81.0", ] # Optional CUDA layout acceleration (Docling) cuda = [ diff --git a/requirements.txt b/requirements.txt index 95f4678..32b555c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -### GlossAPI runtime requirements (aligned with repro_rapidocr_onnx) +### GlossAPI runtime requirements # Core pipeline deps pandas>=1.3.0 -numpy<2 +numpy>=1.26,<3 python-dateutil>=2.8.2 pytz>=2021.1 scikit-learn==1.6.1 @@ -15,17 +15,12 @@ ftfy>=6.0.0 tenacity>=8.0.0 tqdm>=4.67.0 -# Docling + RapidOCR ONNX stack 
-docling==2.48.0 -# Prefer RapidOCR core package; it works with the GPU ORT wheel without pulling -# the CPU-only 'onnxruntime' dependency. -rapidocr>=3.3.0 -onnxruntime-gpu==1.18.1 +# Docling extraction/layout stack +docling==2.81.0 pyyaml>=6.0 # Enrichment & JSON compression (required for Phase-2 math/code and JSON zstd) pypdfium2>=4.0.0 zstandard>=0.22.0 -# Optional: install Torch CUDA for GPU layout (not required for OCR) -# pip install --index-url https://download.pytorch.org/whl/cu121 torch==2.5.1 torchvision==0.20.1 +# Optional: install Torch CUDA for GPU-backed Docling layout / enrichment diff --git a/src/glossapi/corpus/corpus_orchestrator.py b/src/glossapi/corpus/corpus_orchestrator.py index dd2fad6..7f254f1 100644 --- a/src/glossapi/corpus/corpus_orchestrator.py +++ b/src/glossapi/corpus/corpus_orchestrator.py @@ -350,6 +350,8 @@ def _load_metadata(self) -> None: # Top-level worker function for multi-GPU extraction (picklable by multiprocessing) def gpu_extract_worker_queue( device_id: int, + worker_slot: int, + worker_key: str, in_dir: str, out_dir: str, work_q, # multiprocessing Queue of filename strings @@ -392,12 +394,13 @@ def _ensure_thread_caps(): _ensure_thread_caps() _status_proxy = status_map - _marker_path = _Path(marker_dir).expanduser() / f"gpu{device_id}.current" if marker_dir else None + _worker_label = worker_key or f"gpu{device_id}-w{worker_slot}" + _marker_path = _Path(marker_dir).expanduser() / f"{_worker_label}.current" if marker_dir else None def _update_current(batch_items: List[str]) -> None: if _status_proxy is not None: try: - _status_proxy[device_id] = list(batch_items) + _status_proxy[_worker_label] = list(batch_items) except Exception: pass if _marker_path is not None: @@ -409,7 +412,7 @@ def _update_current(batch_items: List[str]) -> None: def _clear_current() -> None: if _status_proxy is not None: try: - _status_proxy.pop(device_id, None) + _status_proxy.pop(_worker_label, None) except Exception: pass if _marker_path is not 
None: @@ -423,7 +426,7 @@ def _clear_current() -> None: if _log_dir: _log_path = _Path(_log_dir).expanduser() _log_path.mkdir(parents=True, exist_ok=True) - _worker_log_file = _log_path / f"gpu{device_id}_{_os.getpid()}.log" + _worker_log_file = _log_path / f"{_worker_label}_{_os.getpid()}.log" _worker_log_handle = open(_worker_log_file, "a", encoding="utf-8", buffering=1) _sys.stdout = _worker_log_handle _sys.stderr = _worker_log_handle @@ -458,9 +461,13 @@ def _clear_current() -> None: except Exception: _phys = "" try: - print(f"[GPU{device_id}] bound: CUDA_VISIBLE_DEVICES={_os.environ.get('CUDA_VISIBLE_DEVICES','')} pid={_os.getpid()} torch={_torch_name} ORT={_ort_prov}") + print( + f"[GPU{device_id}/W{worker_slot}] bound: " + f"CUDA_VISIBLE_DEVICES={_os.environ.get('CUDA_VISIBLE_DEVICES','')} " + f"pid={_os.getpid()} torch={_torch_name} ORT={_ort_prov}" + ) if _phys: - print(f"[GPU{device_id}] physical: {_phys}") + print(f"[GPU{device_id}/W{worker_slot}] physical: {_phys}") except Exception: pass except Exception: @@ -475,13 +482,15 @@ def _clear_current() -> None: _ensure_thread_caps() from glossapi import Corpus as _Corpus # type: ignore except Exception as _e: - print(f"[GPU{device_id}] Cannot import glossapi in worker: {_e}") + print(f"[{_worker_label}] Cannot import glossapi in worker: {_e}") if result_q is not None: try: result_q.put( { "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": 1, "pid": _os.getpid(), "error": str(_e), @@ -507,14 +516,16 @@ def _clear_current() -> None: phase1_backend=backend, ) except Exception as _e: - msg = f"[GPU{device_id}] Prime failed: {_e}" + msg = f"[{_worker_label}] Prime failed: {_e}" print(msg) if result_q is not None: try: result_q.put( { "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": 1, "pid": _os.getpid(), "error": str(_e), @@ -534,7 +545,9 @@ 
def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [str(x) for x in ok_list], "problematic": [str(x) for x in bad_list], "pid": _os.getpid(), @@ -553,15 +566,12 @@ def _report_batch(ok_list, bad_list): _batch_env = int(str(_os.environ.get("GLOSSAPI_GPU_BATCH_SIZE", "")).strip() or 0) except Exception: _batch_env = 0 - default_batch = 5 if not force else 1 + default_batch = 5 try: extractor = getattr(c, "extractor", None) if extractor is not None: configured = int(getattr(extractor, "max_batch_files", default_batch)) - if force: - default_batch = 1 - else: - default_batch = max(1, configured) + default_batch = max(1, configured) except Exception: pass BATCH_SIZE = max(1, _batch_env) if _batch_env else max(1, default_batch) @@ -605,7 +615,9 @@ def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [], "problematic": list(batch), "pid": _os.getpid(), @@ -653,7 +665,9 @@ def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [], "problematic": list(batch), "pid": _os.getpid(), @@ -667,7 +681,7 @@ def _report_batch(ok_list, bad_list): # Occasional heartbeat if _time.time() - last_progress > 30: try: - print(f"[GPU{device_id}] processed ~{processed} files…") + print(f"[{_worker_label}] processed ~{processed} files...") except Exception: pass last_progress = _time.time() @@ -692,7 +706,9 @@ def _report_batch(ok_list, bad_list): try: result_q.put({ "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": exit_code, "pid": _os.getpid(), }) diff --git a/src/glossapi/corpus/phase_extract.py 
b/src/glossapi/corpus/phase_extract.py index a748dcc..296429a 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -98,14 +98,14 @@ def prime_extractor( if force_ocr: self.logger.warning( - "Phase-1 Docling OCR is deprecated and no longer executes OCR. " + "Corpus.extract(force_ocr=True) is deprecated and no longer executes OCR. " "Use Corpus.ocr(backend='deepseek') for OCR remediation." ) # Hard GPU preflight before we attempt to build OCR/enrichment pipelines self._gpu_preflight( accel_type=accel_type, - require_ocr=bool(force_ocr), + require_ocr=False, require_math=bool(formula_enrichment or code_enrichment), require_backend_gpu=(backend_choice == "docling"), ) @@ -119,8 +119,8 @@ def prime_extractor( # Ensure converter exists (reuse when unchanged) self.extractor.ensure_extractor( - enable_ocr=bool(force_ocr), - force_full_page_ocr=bool(force_ocr), + enable_ocr=False, + force_full_page_ocr=False, formula_enrichment=bool(formula_enrichment), code_enrichment=bool(code_enrichment), images_scale=float(images_scale_env), @@ -142,12 +142,12 @@ def _resolve_phase1_backend( raise ValueError( f"Invalid phase1_backend='{requested}'. Expected one of: 'auto', 'safe', 'docling'." ) - needs_gpu = bool(force_ocr or formula_enrichment or code_enrichment) + needs_gpu = bool(formula_enrichment or code_enrichment) if choice == "auto": choice = "docling" if needs_gpu else "safe" if choice == "safe" and needs_gpu: self.logger.info( - "Phase-1 backend 'safe' overridden to 'docling' because OCR/math enrichment was requested." + "Phase-1 backend 'safe' overridden to 'docling' because math/code enrichment was requested." 
) choice = "docling" return choice @@ -227,6 +227,7 @@ def extract( export_doc_json: bool = True, emit_formula_index: bool = False, phase1_backend: str = "auto", + workers_per_device: int = 1, _prepared: bool = False, ) -> None: """ @@ -240,8 +241,9 @@ def extract( export_doc_json: When True (default), writes Docling layout JSON to `json/.docling.json(.zst)` emit_formula_index: Also emit `json/.formula_index.jsonl` (default: False) phase1_backend: Selects the Phase-1 backend. ``"auto"`` (default) keeps the safe backend unless - OCR/math is requested, ``"safe"`` forces the PyPDFium backend, and ``"docling"`` forces the - Docling backend. + math/code enrichment is requested, ``"safe"`` forces the PyPDFium backend, and ``"docling"`` + forces the Docling backend. + workers_per_device: Number of extraction workers to bind to each visible GPU when ``use_gpus='multi'``. """ if not file_paths: @@ -415,12 +417,14 @@ def extract( except Exception: threads_effective = int(num_threads) if isinstance(num_threads, int) else max(2, 2 * max(1, len(devs))) - batch_hint = 5 if backend_choice == "docling" and not force_ocr else 1 + workers_per_device = max(1, int(workers_per_device or 1)) + batch_hint = 1 self.logger.info( - "Phase-1 config: backend=%s batch_size=%s threads=%s skip_existing=%s benchmark=%s", + "Phase-1 config: backend=%s batch_size=%s threads=%s workers_per_device=%s skip_existing=%s benchmark=%s", backend_choice, batch_hint, threads_effective, + workers_per_device, bool(skip_existing), bool(benchmark_mode), ) @@ -454,6 +458,7 @@ def extract( return # Dynamic work queue across GPUs + from .corpus_orchestrator import gpu_extract_worker_queue from multiprocessing import get_context ctx = get_context("spawn") manager = ctx.Manager() @@ -484,14 +489,29 @@ def extract( marker_base.mkdir(parents=True, exist_ok=True) except Exception as exc: self.logger.debug("Unable to prepare marker directory %s: %s", marker_base, exc) - procs: List[Any] = [] - proc_gpu: Dict[int, int] = 
{} - marker_files: Dict[int, Path] = {dev_id: marker_base / f"gpu{dev_id}.current" for dev_id in devs} + worker_specs: List[Dict[str, Any]] = [] for dev_id in devs: + for worker_slot in range(workers_per_device): + worker_specs.append( + { + "device_id": int(dev_id), + "worker_slot": int(worker_slot), + "worker_key": f"gpu{dev_id}-w{worker_slot}", + } + ) + procs: List[Any] = [] + proc_specs: Dict[int, Dict[str, Any]] = {} + marker_files: Dict[str, Path] = { + spec["worker_key"]: marker_base / f"{spec['worker_key']}.current" + for spec in worker_specs + } + for spec in worker_specs: p = ctx.Process( target=gpu_extract_worker_queue, args=( - dev_id, + spec["device_id"], + spec["worker_slot"], + spec["worker_key"], str(self.input_dir), str(self.output_dir), task_q, @@ -514,7 +534,7 @@ def extract( p.start() procs.append(p) if p.pid is not None: - proc_gpu[p.pid] = dev_id + proc_specs[p.pid] = dict(spec) active = list(procs) any_fail = False last_summary = time.time() @@ -531,20 +551,21 @@ def extract( procs.remove(p) pid = p.pid or -1 heartbeat[pid] = time.time() - gpu_id = proc_gpu.pop(pid, None) + worker_spec = proc_specs.pop(pid, None) + worker_key = worker_spec["worker_key"] if worker_spec else None if p.exitcode not in (0, None): any_fail = True self.logger.warning("GPU worker pid=%s exited with code %s", p.pid, p.exitcode) current_paths: List[str] = [] stems_for_skip: List[str] = [] - if gpu_id is not None: - current_entry = status_map.pop(gpu_id, None) + if worker_key is not None: + current_entry = status_map.pop(worker_key, None) if current_entry: if not isinstance(current_entry, (list, tuple, set)): current_entry = [current_entry] current_paths = [str(x) for x in current_entry] stems_for_skip = [canonical_stem(path) for path in current_paths] - marker_path = marker_files.get(gpu_id) + marker_path = marker_files.get(worker_key) if marker_path: try: marker_path.unlink(missing_ok=True) @@ -555,12 +576,17 @@ def extract( state_mgr.save(processed_files, 
problematic_files) if stems_for_skip: skip_mgr.add(stems_for_skip) - if gpu_id is not None: - self.logger.info("Respawning GPU%s worker after crash.", gpu_id) + if worker_spec is not None: + self.logger.info( + "Respawning %s after crash.", + worker_spec["worker_key"], + ) replacement = ctx.Process( target=gpu_extract_worker_queue, args=( - gpu_id, + worker_spec["device_id"], + worker_spec["worker_slot"], + worker_spec["worker_key"], str(self.input_dir), str(self.output_dir), task_q, @@ -584,13 +610,13 @@ def extract( procs.append(replacement) active.append(replacement) if replacement.pid is not None: - proc_gpu[replacement.pid] = gpu_id + proc_specs[replacement.pid] = dict(worker_spec) heartbeat[replacement.pid] = time.time() continue else: - if gpu_id is not None: - status_map.pop(gpu_id, None) - marker_path = marker_files.get(gpu_id) + if worker_key is not None: + status_map.pop(worker_key, None) + marker_path = marker_files.get(worker_key) if marker_path: try: marker_path.unlink(missing_ok=True) @@ -618,7 +644,7 @@ def extract( skip_mgr.add(bad_stems) state_mgr.save(processed_files, problematic_files) self.logger.info( - "GPU%s batch complete: +%d processed, +%d problematic (totals: %d processed, %d problematic)", + "%s batch complete: +%d processed, +%d problematic (totals: %d processed, %d problematic)", result.get("worker"), len(ok_stems), len(bad_stems), @@ -632,25 +658,20 @@ def extract( if result.get("exitcode", 0) not in (0, None): any_fail = True self.logger.warning( - "GPU%s reported non-zero exit: %s", result.get("worker"), result.get("exitcode") + "%s reported non-zero exit: %s", result.get("worker"), result.get("exitcode") ) worker_pid = result.get("pid") if worker_pid is not None: heartbeat[worker_pid] = time.time() - worker_gpu = result.get("worker") - if worker_gpu is not None: - try: - worker_gpu_int = int(worker_gpu) - except Exception: - worker_gpu_int = None - else: - status_map.pop(worker_gpu_int, None) - marker_path = 
marker_files.get(worker_gpu_int) - if marker_path: - try: - marker_path.unlink(missing_ok=True) - except Exception: - pass + worker_key = result.get("worker") + if worker_key is not None: + status_map.pop(worker_key, None) + marker_path = marker_files.get(str(worker_key)) + if marker_path: + try: + marker_path.unlink(missing_ok=True) + except Exception: + pass now = time.time() if now - last_summary > 30: diff --git a/src/glossapi/gloss_extract.py b/src/glossapi/gloss_extract.py index 861f28b..1c21cf1 100644 --- a/src/glossapi/gloss_extract.py +++ b/src/glossapi/gloss_extract.py @@ -46,9 +46,9 @@ def _maybe_import_torch(*, force: bool = False): MarkdownFormatOption = None CsvFormatOption = None StandardPdfPipeline = None -DoclingParseV2DocumentBackend = None DoclingParseDocumentBackend = None PyPdfiumDocumentBackend = None +_DOCLING_PARSE_BACKEND_NAME = "docling_parse" class _NoOpOption: # minimal stand-ins for optional helpers @@ -83,19 +83,23 @@ def _ensure_docling_converter_loaded() -> None: def _ensure_docling_pipeline_loaded() -> None: global _DOC_PIPELINE_LOADED, StandardPdfPipeline - global DoclingParseV2DocumentBackend, DoclingParseDocumentBackend, PyPdfiumDocumentBackend + global DoclingParseDocumentBackend, PyPdfiumDocumentBackend, _DOCLING_PARSE_BACKEND_NAME if _DOC_PIPELINE_LOADED: return try: StandardPdfPipeline = importlib.import_module( "docling.pipeline.standard_pdf_pipeline" ).StandardPdfPipeline - DoclingParseV2DocumentBackend = importlib.import_module( - "docling.backend.docling_parse_v2_backend" - ).DoclingParseV2DocumentBackend - DoclingParseDocumentBackend = importlib.import_module( - "docling.backend.docling_parse_backend" - ).DoclingParseDocumentBackend + try: + DoclingParseDocumentBackend = importlib.import_module( + "docling.backend.docling_parse_backend" + ).DoclingParseDocumentBackend + _DOCLING_PARSE_BACKEND_NAME = "docling_parse" + except Exception: + DoclingParseDocumentBackend = importlib.import_module( + 
"docling.backend.docling_parse_v2_backend" + ).DoclingParseV2DocumentBackend + _DOCLING_PARSE_BACKEND_NAME = "docling_parse_v2" PyPdfiumDocumentBackend = importlib.import_module( "docling.backend.pypdfium2_backend" ).PyPdfiumDocumentBackend @@ -382,7 +386,7 @@ def _convert_all_with_timeout(self, files: Iterable[Path], timeout_s: int, **kwa timeout_kw = None backend_cls = getattr(self, "_active_pdf_backend", None) - is_native_backend = backend_cls is DoclingParseV2DocumentBackend if backend_cls else False + is_native_backend = backend_cls is DoclingParseDocumentBackend if backend_cls else False if timeout_kw and not is_native_backend and len(set(budgets)) == 1: kw = dict(raises_on_error=False) @@ -556,8 +560,8 @@ def create_extractor( pass # Record the PDF backend name for provenance (default to native backend) - self.pdf_backend_name = "docling_parse_v2" - self._active_pdf_backend = DoclingParseV2DocumentBackend + self.pdf_backend_name = _DOCLING_PARSE_BACKEND_NAME + self._active_pdf_backend = DoclingParseDocumentBackend # Best-effort Torch preflight only if Phase‑1 is asked to do enrichment try: @@ -582,7 +586,7 @@ def create_extractor( except Exception: pass - active_backend = DoclingParseV2DocumentBackend + active_backend = DoclingParseDocumentBackend device_str = self._current_device_str() or "cuda:0" _, opts = build_layout_pipeline( device=device_str, @@ -599,13 +603,13 @@ def create_extractor( self._active_pdf_options = opts self._current_ocr_enabled = False - pdf_backend = DoclingParseV2DocumentBackend + pdf_backend = DoclingParseDocumentBackend try: if getattr(self, "use_pypdfium_backend", False): pdf_backend = PyPdfiumDocumentBackend self.pdf_backend_name = "pypdfium" except Exception: - pdf_backend = DoclingParseV2DocumentBackend + pdf_backend = DoclingParseDocumentBackend active_backend = pdf_backend self.converter = DocumentConverter( @@ -1198,7 +1202,7 @@ def _update_extraction_metadata( if chunk_manifest_path is not None: data["chunk_manifest_path"] = 
str(chunk_manifest_path) # Backend and failure - backend_name = getattr(self, "pdf_backend_name", None) or ("docling_parse_v2" if getattr(self, "USE_V2", True) else "docling_parse") + backend_name = getattr(self, "pdf_backend_name", None) or _DOCLING_PARSE_BACKEND_NAME data["extraction_backend"] = backend_name if status in ("timeout", "error", "failure"): data["failure_mode"] = status diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 2568665..3005786 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -112,6 +112,7 @@ def run_for_files( return {} input_root = Path(getattr(self_ref, "input_dir", ".")).resolve() + pdf_root = (input_root / "downloads") if (input_root / "downloads").exists() else input_root out_root = Path(output_dir) if output_dir else Path(getattr(self_ref, "output_dir", input_root)) md_dir = out_root / "markdown" metrics_dir = out_root / "json" / "metrics" @@ -146,7 +147,7 @@ def run_for_files( raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") _run_cli( - input_dir=input_root, + input_dir=pdf_root, output_dir=out_root, files=file_list, model_dir=model_root, @@ -159,7 +160,7 @@ def run_for_files( results: Dict[str, Any] = {} for name in file_list: - pdf_path = (input_root / name).resolve() + pdf_path = (pdf_root / name).resolve() stem = Path(name).stem md_path = md_dir / f"{stem}.md" metrics_path = metrics_dir / f"{stem}.metrics.json" diff --git a/src/glossapi/ocr/docling/pipeline.py b/src/glossapi/ocr/docling/pipeline.py index aea64fd..8162e60 100644 --- a/src/glossapi/ocr/docling/pipeline.py +++ b/src/glossapi/ocr/docling/pipeline.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from typing import Tuple from docling.datamodel.pipeline_options import ( @@ -66,9 +67,55 @@ def _apply_common_pdf_options( setattr(opts, "images_scale", images_scale) except Exception: pass + _apply_runtime_overrides(opts) return opts +def 
_apply_runtime_overrides(opts: PdfPipelineOptions) -> None: + """Apply optional runtime tuning knobs exposed by newer Docling releases.""" + + int_env_map = { + "GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE": "layout_batch_size", + "GLOSSAPI_DOCLING_TABLE_BATCH_SIZE": "table_batch_size", + "GLOSSAPI_DOCLING_OCR_BATCH_SIZE": "ocr_batch_size", + "GLOSSAPI_DOCLING_QUEUE_MAX_SIZE": "queue_max_size", + "GLOSSAPI_DOCLING_DOCUMENT_TIMEOUT": "document_timeout", + } + float_env_map = { + "GLOSSAPI_DOCLING_BATCH_POLL_INTERVAL": "batch_polling_interval_seconds", + } + + for env_name, attr_name in int_env_map.items(): + raw = os.getenv(env_name) + if not raw: + continue + try: + value = int(raw) + except ValueError: + continue + if value <= 0 or not hasattr(opts, attr_name): + continue + try: + setattr(opts, attr_name, value) + except Exception: + pass + + for env_name, attr_name in float_env_map.items(): + raw = os.getenv(env_name) + if not raw: + continue + try: + value = float(raw) + except ValueError: + continue + if value <= 0 or not hasattr(opts, attr_name): + continue + try: + setattr(opts, attr_name, value) + except Exception: + pass + + def build_layout_pipeline( *, device: str = "cuda:0", diff --git a/src/glossapi/ocr/docling_pipeline.py b/src/glossapi/ocr/docling_pipeline.py index ef85950..4a96e09 100644 --- a/src/glossapi/ocr/docling_pipeline.py +++ b/src/glossapi/ocr/docling_pipeline.py @@ -1,82 +1,5 @@ -from __future__ import annotations - -from typing import Tuple - -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - PictureDescriptionApiOptions, - TableFormerMode, - TableStructureOptions, -) - - -def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: - """Return accelerator options and whether CUDA was requested.""" - dev = device or "cuda:0" - if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): - acc = AcceleratorOptions(device=dev) - 
want_cuda = dev.lower().startswith("cuda") - else: - want_cuda = str(dev).lower().startswith("cuda") - acc = AcceleratorOptions( - device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU - ) - return acc, want_cuda - - -def build_layout_pipeline( - *, - device: str = "cuda:0", - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Build the Docling PDF pipeline used for Phase-1 extraction.""" - - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - try: - if hasattr(table_opts, "do_cell_matching"): - table_opts.do_cell_matching = True - except Exception: - pass - - acc, _ = _resolve_accelerator(device) - opts = PdfPipelineOptions( - accelerator_options=acc, - layout_options=LayoutOptions(), - do_ocr=False, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - if hasattr(opts, "do_picture_description"): - opts.do_picture_description = False - if getattr(opts, "picture_description_options", None) is None: - opts.picture_description_options = PictureDescriptionApiOptions() - if hasattr(opts, "enable_remote_services"): - opts.enable_remote_services = False - except Exception: - pass - try: - setattr(opts, "images_scale", float(images_scale)) - except Exception: - pass - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] - return pipeline, opts +"""Compatibility wrapper for the canonical Docling pipeline builder.""" +from .docling.pipeline import build_layout_pipeline __all__ = ["build_layout_pipeline"] diff --git 
a/src/glossapi/scripts/ocr_gpu_batch.py b/src/glossapi/scripts/ocr_gpu_batch.py index 2183664..2646baa 100644 --- a/src/glossapi/scripts/ocr_gpu_batch.py +++ b/src/glossapi/scripts/ocr_gpu_batch.py @@ -115,15 +115,21 @@ def main(argv: Optional[List[str]] = None) -> int: "--force-ocr", dest="force_ocr", action="store_true", - help="Force GPU OCR during extraction (default).", + help="Deprecated no-op retained for compatibility; OCR now runs through Corpus.ocr(...).", ) parser.add_argument( "--no-force-ocr", dest="force_ocr", action="store_false", - help="Skip forced OCR (only run math/layout).", + help="Explicitly disable the deprecated Phase-1 OCR flag.", + ) + parser.set_defaults(force_ocr=False) + parser.add_argument( + "--workers-per-device", + type=int, + default=1, + help="Number of extraction workers to bind to each visible GPU (default: 1).", ) - parser.set_defaults(force_ocr=True) parser.add_argument( "--dry-run", action="store_true", @@ -182,6 +188,7 @@ def main(argv: Optional[List[str]] = None) -> int: export_doc_json=True, emit_formula_index=emit_formula_index, phase1_backend=args.phase1_backend, + workers_per_device=max(1, int(args.workers_per_device)), ) print("[ocr_gpu_batch] Extraction complete.") @@ -190,4 +197,3 @@ def main(argv: Optional[List[str]] = None) -> int: if __name__ == "__main__": # pragma: no cover - CLI entrypoint raise SystemExit(main()) - diff --git a/tests/test_corpus_guards.py b/tests/test_corpus_guards.py index 424d359..a5ea0b1 100644 --- a/tests/test_corpus_guards.py +++ b/tests/test_corpus_guards.py @@ -60,21 +60,23 @@ def set_torch_stub(monkeypatch, *, available: bool, device_count: int): return torch_ns -def test_prime_extractor_requires_cuda_for_ocr(tmp_path, monkeypatch): +def test_prime_extractor_force_ocr_is_ignored_for_backend_selection(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - with pytest.raises(RuntimeError) 
as exc: - corpus.prime_extractor( - input_format="pdf", - accel_type="CUDA", - force_ocr=True, - phase1_backend="docling", - ) + corpus.prime_extractor( + input_format="pdf", + accel_type="CPU", + force_ocr=True, + phase1_backend="auto", + ) - assert "Torch CUDA is not available" in str(exc.value) + assert corpus.extractor.last_policy == "safe" + ensure_kwargs = corpus.extractor.ensure_calls[0] + assert ensure_kwargs["enable_ocr"] is False + assert ensure_kwargs["force_full_page_ocr"] is False def test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch): @@ -109,7 +111,7 @@ def test_prime_extractor_configures_safe_backend_for_text_layer(tmp_path, monkey assert corpus.extractor.ensure_calls[0]["enable_ocr"] is False -def test_prime_extractor_configures_docling_backend_for_ocr(tmp_path, monkeypatch): +def test_prime_extractor_configures_docling_backend_explicitly(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() @@ -117,16 +119,15 @@ def test_prime_extractor_configures_docling_backend_for_ocr(tmp_path, monkeypatc corpus.prime_extractor( input_format="pdf", accel_type="CUDA", - force_ocr=True, - phase1_backend="auto", + phase1_backend="docling", ) assert corpus.extractor.last_policy == "docling" assert corpus.extractor.last_max_batch_files == 1 assert corpus.extractor.last_prefer_safe_backend is False ensure_kwargs = corpus.extractor.ensure_calls[0] - assert ensure_kwargs["enable_ocr"] is True - assert ensure_kwargs["force_full_page_ocr"] is True + assert ensure_kwargs["enable_ocr"] is False + assert ensure_kwargs["force_full_page_ocr"] is False def test_prime_extractor_requires_cuda_for_formula_enrichment(tmp_path, monkeypatch): @@ -188,6 +189,8 @@ def extract(self, *, file_paths=None, **kwargs): with pytest.raises(SystemExit) as exit_info: corpus_mod.gpu_extract_worker_queue( device_id=0, + worker_slot=0, + worker_key="gpu0-w0", in_dir=str(tmp_path), out_dir=str(tmp_path), work_q=work_q, diff --git 
a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 3779d07..e2198b7 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -54,4 +54,4 @@ def fail_math(*args, **kwargs): def test_invalid_backend_is_rejected(tmp_path): corpus = _mk_corpus(tmp_path) with pytest.raises(ValueError, match="backend must be 'deepseek'"): - corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False) + corpus.ocr(backend="bogus", fix_bad=True, math_enhance=False) diff --git a/tests/test_pipeline_smoke.py b/tests/test_pipeline_smoke.py index 7dae1b7..f673a83 100644 --- a/tests/test_pipeline_smoke.py +++ b/tests/test_pipeline_smoke.py @@ -126,7 +126,6 @@ def test_pipeline_smoke_and_artifacts(tmp_path, monkeypatch): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -244,7 +243,6 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -378,7 +376,6 @@ def test_clean_skips_files_with_successful_ocr(tmp_path, monkeypatch): accel_type="CUDA", num_threads=1, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -469,7 +466,6 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) From efd169835a414441b487e02948f73e09984d24e4 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sun, 29 Mar 2026 19:13:29 +0300 Subject: [PATCH 13/26] add multi-worker deepseek gpu sharding --- src/glossapi/corpus/phase_ocr_math.py | 12 ++ src/glossapi/ocr/deepseek/runner.py | 278 +++++++++++++++++++++++-- tests/test_deepseek_runner_contract.py | 88 ++++++++ tests/test_ocr_dispatch_backends.py | 39 ++++ 4 files changed, 405 insertions(+), 12 
deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 80afc7f..722f39a 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -48,6 +48,7 @@ def ocr( math_dpi_base: int = 220, use_gpus: str = "single", devices: Optional[List[int]] = None, + workers_per_gpu: int = 1, force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, @@ -74,6 +75,10 @@ def ocr( Docling layout/json remains Phase-1 infrastructure; OCR remediation itself is DeepSeek-only. - fix_bad: re-run OCR on documents marked bad by the cleaner (default True). - math_enhance: run math/code enrichment after OCR (default True). + - use_gpus/devices/workers_per_gpu: DeepSeek multi-worker controls. Use + ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. + Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers + per visible GPU. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. 
Defaults to False @@ -581,6 +586,13 @@ def _run_math(stems: List[str]) -> None: self, bad_files, model_dir=Path(model_dir) if model_dir else None, + max_pages=max_pages, + persist_engine=persist_engine, + precision=precision, + device=device, + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=workers_per_gpu, content_debug=bool(content_debug), ) except Exception as _e: diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 2568665..95cd2ae 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -2,6 +2,7 @@ from __future__ import annotations +from contextlib import ExitStack import json import logging import os @@ -30,7 +31,7 @@ def _page_count(pdf_path: Path) -> int: return 0 -def _run_cli( +def _build_cli_command( input_dir: Path, output_dir: Path, *, @@ -41,7 +42,7 @@ def _run_cli( max_pages: Optional[int], content_debug: bool, device: Optional[str], -) -> None: +) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ str(python_exe), @@ -61,8 +62,19 @@ def _run_cli( cmd.append("--content-debug") if device: cmd += ["--device", str(device)] + return cmd + +def _build_env(*, python_bin: Optional[Path], visible_device: Optional[int] = None) -> Dict[str, str]: env = os.environ.copy() + if python_bin: + python_path = Path(python_bin).expanduser() + venv_bin = str(python_path.parent) + env["PATH"] = f"{venv_bin}:{env.get('PATH', '')}" + env["VIRTUAL_ENV"] = str(python_path.parent.parent) + env.pop("PYTHONHOME", None) + if visible_device is not None: + env["CUDA_VISIBLE_DEVICES"] = str(visible_device) if shutil.which("cc1plus", path=env.get("PATH", "")) is None: for candidate in sorted(Path("/usr/lib/gcc/x86_64-linux-gnu").glob("*/cc1plus")): env["PATH"] = f"{candidate.parent}:{env.get('PATH', '')}" @@ -70,11 +82,228 @@ def _run_cli( ld_path = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") if ld_path: env["LD_LIBRARY_PATH"] = 
f"{ld_path}:{env.get('LD_LIBRARY_PATH', '')}" + return env + + +def _run_cli( + input_dir: Path, + output_dir: Path, + *, + files: List[str], + model_dir: Path, + python_bin: Optional[Path], + script: Path, + max_pages: Optional[int], + content_debug: bool, + device: Optional[str], + visible_device: Optional[int] = None, +) -> None: + cmd = _build_cli_command( + input_dir=input_dir, + output_dir=output_dir, + files=files, + model_dir=model_dir, + python_bin=python_bin, + script=script, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ) + env = _build_env(python_bin=python_bin, visible_device=visible_device) LOGGER.info("Running DeepSeek OCR CLI: %s", " ".join(cmd)) subprocess.run(cmd, check=True, env=env) # nosec: controlled arguments +def _parse_device_index(device: Optional[str]) -> Optional[int]: + if not device: + return None + value = str(device).strip().lower() + if value.startswith("cuda:"): + suffix = value.split(":", 1)[1] + if suffix.isdigit(): + return int(suffix) + return None + + +def _detect_visible_gpus() -> List[int]: + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + parsed = [piece.strip() for piece in visible.split(",") if piece.strip()] + if parsed and all(piece.isdigit() for piece in parsed): + return [int(piece) for piece in parsed] + torch_mod = None + try: # pragma: no cover - best effort + import torch as torch_mod # type: ignore + except Exception: # pragma: no cover - optional import + torch_mod = None + if torch_mod is not None: + try: + if torch_mod.cuda.is_available(): + return list(range(torch_mod.cuda.device_count())) + except Exception: + pass + try: # pragma: no cover - shell fallback + proc = subprocess.run( + ["nvidia-smi", "-L"], + check=False, + capture_output=True, + text=True, + timeout=5, + ) + devices: List[int] = [] + if proc.returncode == 0: + for line in proc.stdout.splitlines(): + if line.startswith("GPU "): + prefix = line.split(":", 1)[0] + idx = prefix.split()[1] + 
if idx.isdigit(): + devices.append(int(idx)) + return devices + except Exception: + return [] + + +def _resolve_lane_devices( + *, + use_gpus: Optional[str], + devices: Optional[List[int]], + workers_per_gpu: int, + device: Optional[str], +) -> List[int]: + if devices: + resolved = [int(dev) for dev in devices] + if resolved: + return resolved + if str(use_gpus or "single").strip().lower() == "multi": + resolved = _detect_visible_gpus() + if resolved: + return resolved + if workers_per_gpu > 1: + from_device = _parse_device_index(device) + if from_device is not None: + return [from_device] + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + first = visible.split(",", 1)[0].strip() + if first.isdigit(): + return [int(first)] + return [0] + return [] + + +def _effective_page_count(pdf_path: Path, max_pages: Optional[int]) -> int: + count = _page_count(pdf_path) + if max_pages is not None and count > 0: + return min(count, int(max_pages)) + return max(1, count) + + +def _plan_lanes( + *, + file_list: List[str], + input_root: Path, + lane_devices: List[int], + workers_per_gpu: int, + max_pages: Optional[int], +) -> List[Dict[str, Any]]: + lanes: List[Dict[str, Any]] = [] + lane_id = 0 + for visible_device in lane_devices: + for _ in range(max(1, int(workers_per_gpu))): + lanes.append( + { + "lane_id": lane_id, + "visible_device": int(visible_device), + "files": [], + "weight": 0, + } + ) + lane_id += 1 + if not lanes: + return [] + + weighted_files = [] + for name in file_list: + pdf_path = (input_root / name).resolve() + weighted_files.append((name, _effective_page_count(pdf_path, max_pages))) + weighted_files.sort(key=lambda item: (-item[1], item[0])) + + for name, weight in weighted_files: + lane = min(lanes, key=lambda item: int(item["weight"])) + lane["files"].append(name) + lane["weight"] = int(lane["weight"]) + int(weight) + return lanes + + +def _run_multi_cli( + *, + input_root: Path, + out_root: Path, + file_list: List[str], + 
lane_devices: List[int], + workers_per_gpu: int, + model_root: Path, + python_exe: Path, + script_path: Path, + max_pages: Optional[int], + content_debug: bool, + log_dir: Path, +) -> None: + lanes = _plan_lanes( + file_list=file_list, + input_root=input_root, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + max_pages=max_pages, + ) + if not lanes: + return + + log_dir.mkdir(parents=True, exist_ok=True) + failures: List[str] = [] + with ExitStack() as stack: + procs = [] + for lane in lanes: + lane_files = list(lane["files"]) + if not lane_files: + continue + visible_device = int(lane["visible_device"]) + log_path = log_dir / f"lane_{lane['lane_id']}_gpu{visible_device}.log" + fh = stack.enter_context(log_path.open("w", encoding="utf-8")) + cmd = _build_cli_command( + input_dir=input_root, + output_dir=out_root, + files=lane_files, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device="cuda", + ) + env = _build_env(python_bin=python_exe, visible_device=visible_device) + LOGGER.info( + "Running DeepSeek OCR lane=%s visible_gpu=%s files=%d weight=%d: %s", + lane["lane_id"], + visible_device, + len(lane_files), + lane["weight"], + " ".join(cmd), + ) + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + procs.append((lane, log_path, proc)) + + for lane, log_path, proc in procs: + rc = proc.wait() + if rc != 0: + failures.append( + f"lane={lane['lane_id']} gpu={lane['visible_device']} rc={rc} log={log_path}" + ) + if failures: + raise RuntimeError("DeepSeek OCR multi-worker failure(s): " + "; ".join(failures)) + + def run_for_files( self_ref: Any, files: Iterable[str], @@ -91,6 +320,9 @@ def run_for_files( persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved device: Optional[str] = None, + use_gpus: Optional[str] = None, + devices: Optional[List[int]] = None, + 
workers_per_gpu: int = 1, gpu_memory_utilization: Optional[float] = None, # reserved disable_fp8_kv: bool = False, # reserved **_: Any, @@ -98,7 +330,7 @@ def run_for_files( """Run DeepSeek OCR for the provided files.""" requested_stub = bool(allow_stub) - del log_dir, allow_stub, allow_cli, persist_engine, precision + del allow_stub, allow_cli, persist_engine, precision del gpu_memory_utilization, disable_fp8_kv if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": @@ -145,17 +377,39 @@ def run_for_files( if not python_exe.exists(): raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") - _run_cli( - input_dir=input_root, - output_dir=out_root, - files=file_list, - model_dir=model_root, - python_bin=python_exe, - script=script_path, - max_pages=max_pages, - content_debug=content_debug, + lane_devices = _resolve_lane_devices( + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=int(max(1, workers_per_gpu)), device=device, ) + multi_requested = str(use_gpus or "single").strip().lower() == "multi" or int(max(1, workers_per_gpu)) > 1 + if multi_requested and lane_devices: + _run_multi_cli( + input_root=input_root, + out_root=out_root, + file_list=file_list, + lane_devices=lane_devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + model_root=model_root, + python_exe=python_exe, + script_path=script_path, + max_pages=max_pages, + content_debug=content_debug, + log_dir=Path(log_dir) if log_dir else (out_root / "logs" / "deepseek_workers"), + ) + else: + _run_cli( + input_dir=input_root, + output_dir=out_root, + files=file_list, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ) results: Dict[str, Any] = {} for name in file_list: diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index a5a93e4..783f4e6 100644 --- a/tests/test_deepseek_runner_contract.py +++ 
b/tests/test_deepseek_runner_contract.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path import pandas as pd @@ -60,3 +61,90 @@ def test_progress_artifacts_stay_out_of_canonical_markdown(tmp_path): assert canonical_markdown.exists() assert canonical_markdown.read_text(encoding="utf-8") == "final\n" assert not progress_markdown.exists() + + +def test_deepseek_runner_multi_uses_visible_device_isolation(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + input_dir = tmp_path / "input" + output_dir = tmp_path / "output" + input_dir.mkdir() + output_dir.mkdir() + + files = ["a.pdf", "b.pdf", "c.pdf", "d.pdf"] + weights = {"a.pdf": 40, "b.pdf": 30, "c.pdf": 20, "d.pdf": 10} + for name in files: + (input_dir / name).write_bytes(b"%PDF-1.4\n%stub\n") + + class DummyCorpus: + def __init__(self, input_dir: Path, output_dir: Path): + self.input_dir = input_dir + self.output_dir = output_dir + + class FakePopen: + calls = [] + + def __init__(self, cmd, stdout=None, stderr=None, env=None): + self.cmd = list(cmd) + self.env = dict(env or {}) + self.returncode = 0 + FakePopen.calls.append(self) + + args = list(cmd) + out_root = Path(args[args.index("--output-dir") + 1]) + lane_files = [] + idx = args.index("--files") + 1 + while idx < len(args) and not args[idx].startswith("--"): + lane_files.append(args[idx]) + idx += 1 + md_dir = out_root / "markdown" + metrics_dir = out_root / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for name in lane_files: + stem = Path(name).stem + (md_dir / f"{stem}.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text( + "{\n \"page_count\": 1\n}\n", + encoding="utf-8", + ) + + def wait(self): + return self.returncode + + script = tmp_path / "run_pdf_ocr_transformers.py" + script.write_text("# stub\n", encoding="utf-8") + model_dir = tmp_path / "DeepSeek-OCR-2" + model_dir.mkdir() + + monkeypatch.setattr(runner, 
"_page_count", lambda path: weights[path.name]) + monkeypatch.setattr(runner.subprocess, "Popen", FakePopen) + + results = runner.run_for_files( + DummyCorpus(input_dir, output_dir), + files, + model_dir=model_dir, + python_bin=Path(sys.executable), + vllm_script=script, + use_gpus="multi", + devices=[2, 5], + workers_per_gpu=2, + ) + + assert sorted(results) == ["a", "b", "c", "d"] + assert len(FakePopen.calls) == 4 + + seen_files = [] + seen_visible_devices = [] + for call in FakePopen.calls: + args = call.cmd + assert "--device" in args + assert args[args.index("--device") + 1] == "cuda" + seen_visible_devices.append(call.env.get("CUDA_VISIBLE_DEVICES")) + idx = args.index("--files") + 1 + while idx < len(args) and not args[idx].startswith("--"): + seen_files.append(args[idx]) + idx += 1 + + assert sorted(seen_files) == sorted(files) + assert sorted(seen_visible_devices) == ["2", "2", "5", "5"] diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 3779d07..89ad4d0 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -55,3 +55,42 @@ def test_invalid_backend_is_rejected(tmp_path): corpus = _mk_corpus(tmp_path) with pytest.raises(ValueError, match="backend must be 'deepseek'"): corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False) + + +def test_deepseek_backend_forwards_parallelism_controls(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ).to_parquet(dl_dir / "download_results.parquet", index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") + + from glossapi.ocr.deepseek import runner + + calls = {} + + def fake_run_for_files(self_ref, files, **kwargs): + calls["files"] = list(files) + calls["kwargs"] = dict(kwargs) + return 
{"doc": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + corpus.ocr( + backend="deepseek", + fix_bad=True, + math_enhance=False, + use_gpus="multi", + devices=[1, 3], + workers_per_gpu=2, + max_pages=7, + ) + + assert calls["files"] == [fname] + assert calls["kwargs"]["use_gpus"] == "multi" + assert calls["kwargs"]["devices"] == [1, 3] + assert calls["kwargs"]["workers_per_gpu"] == 2 + assert calls["kwargs"]["max_pages"] == 7 From 8ed469b7f895499270680bcf0a5ff9ced340c593 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sun, 29 Mar 2026 23:25:21 +0300 Subject: [PATCH 14/26] add deepseek throughput tuning controls --- src/glossapi/corpus/phase_ocr_math.py | 15 +++ .../ocr/deepseek/run_pdf_ocr_transformers.py | 105 +++++++++++++++--- src/glossapi/ocr/deepseek/runner.py | 62 +++++++++++ tests/test_deepseek_runner_contract.py | 39 +++++++ tests/test_ocr_dispatch_backends.py | 12 ++ 5 files changed, 218 insertions(+), 15 deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 722f39a..719253f 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -49,6 +49,12 @@ def ocr( use_gpus: str = "single", devices: Optional[List[int]] = None, workers_per_gpu: int = 1, + ocr_profile: str = "markdown_grounded", + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, @@ -79,6 +85,9 @@ def ocr( ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers per visible GPU. 
+ - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: + DeepSeek throughput and quality controls for benchmarking lighter OCR + modes and more efficient attention backends. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False @@ -590,6 +599,12 @@ def _run_math(stems: List[str]) -> None: persist_engine=persist_engine, precision=precision, device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, use_gpus=use_gpus, devices=devices, workers_per_gpu=workers_per_gpu, diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 0e0e868..80912d0 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -14,10 +14,28 @@ from PIL import Image from transformers import AutoModel, AutoTokenizer -PROMPT = "\n<|grounding|>Convert the document to markdown. " +PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " +PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." 
PAGE_SPLIT = "\n<--- Page Split --->\n" +def _profile_defaults(profile: str) -> dict: + profile_norm = str(profile or "markdown_grounded").strip().lower() + if profile_norm == "plain_ocr": + return { + "prompt": PROMPT_PLAIN_OCR, + "base_size": 768, + "image_size": 512, + "crop_mode": False, + } + return { + "prompt": PROMPT_GROUNDED_MARKDOWN, + "base_size": 1024, + "image_size": 768, + "crop_mode": True, + } + + def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--input-dir", required=True) @@ -26,6 +44,14 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--files", nargs="*", default=[]) parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") + parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--attn-backend", default="auto", choices=["auto", "flash_attention_2", "sdpa", "eager"]) + parser.add_argument("--base-size", type=int, default=None) + parser.add_argument("--image-size", type=int, default=None) + parser.add_argument("--render-dpi", type=int, default=144) + parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") + parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") + parser.set_defaults(crop_mode=None) parser.add_argument("--content-debug", action="store_true") return parser.parse_args() @@ -36,12 +62,12 @@ def _iter_pdfs(input_dir: Path, files: List[str]) -> List[Path]: return sorted(input_dir.glob("*.pdf")) -def _render_pages(pdf_path: Path, max_pages: int | None) -> List[Image.Image]: +def _render_pages(pdf_path: Path, max_pages: int | None, render_dpi: int) -> List[Image.Image]: images: List[Image.Image] = [] doc = fitz.open(pdf_path) try: page_count = doc.page_count if max_pages is None else min(doc.page_count, max_pages) - zoom = 144 / 72.0 + zoom = float(render_dpi) / 72.0 matrix = 
fitz.Matrix(zoom, zoom) for idx in range(page_count): page = doc[idx] @@ -65,12 +91,19 @@ def _clean_markdown(text: str) -> str: return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() -def _load_model(model_dir: Path, device: str): - attn_impl = "flash_attention_2" +def _resolve_attn_backend(attn_backend: str) -> str: + requested = str(attn_backend or "auto").strip().lower() + if requested != "auto": + return requested try: import flash_attn # noqa: F401 + return "flash_attention_2" except Exception: - attn_impl = "eager" + return "sdpa" + + +def _load_model(model_dir: Path, device: str, attn_backend: str): + attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) model = AutoModel.from_pretrained( model_dir, @@ -82,18 +115,28 @@ def _load_model(model_dir: Path, device: str): model = model.eval().to(device).to(torch.bfloat16) else: model = model.eval().to(device) - return tokenizer, model + return tokenizer, model, attn_impl -def _infer_page(model, tokenizer, image_path: Path, output_dir: Path) -> str: +def _infer_page( + model, + tokenizer, + image_path: Path, + output_dir: Path, + *, + prompt: str, + base_size: int, + image_size: int, + crop_mode: bool, +) -> str: result = model.infer( tokenizer, - prompt=PROMPT, + prompt=prompt, image_file=str(image_path), output_path=str(output_dir), - base_size=1024, - image_size=768, - crop_mode=True, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, save_results=False, eval_mode=True, ) @@ -155,10 +198,16 @@ def main() -> int: if not pdfs: return 0 - tokenizer, model = _load_model(model_dir, args.device) + profile_defaults = _profile_defaults(args.ocr_profile) + prompt = profile_defaults["prompt"] + base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) + image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) + 
crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + tokenizer, model, attn_impl = _load_model(model_dir, args.device, args.attn_backend) for pdf_path in pdfs: - images = _render_pages(pdf_path, args.max_pages) + images = _render_pages(pdf_path, args.max_pages, args.render_dpi) page_outputs: List[str] = [] total_pages = len(images) _write_progress(output_dir, pdf_path.stem, page_outputs, total_pages, 0) @@ -167,7 +216,16 @@ def main() -> int: for idx, image in enumerate(images): page_png = tmp_dir / f"page_{idx + 1:04d}.png" image.save(page_png, format="PNG") - page_text = _infer_page(model, tokenizer, page_png, tmp_dir / f"page_{idx + 1:04d}") + page_text = _infer_page( + model, + tokenizer, + page_png, + tmp_dir / f"page_{idx + 1:04d}", + prompt=prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + ) if args.content_debug: page_text = f"\n{page_text}".strip() page_outputs.append(page_text) @@ -180,6 +238,23 @@ def main() -> int: ) markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" _write_outputs(output_dir, pdf_path.stem, markdown, len(images)) + metrics_path = output_dir / "json" / "metrics" / f"{pdf_path.stem}.metrics.json" + if metrics_path.exists(): + try: + metrics = json.loads(metrics_path.read_text(encoding="utf-8")) + metrics.update( + { + "ocr_profile": args.ocr_profile, + "attn_backend": attn_impl, + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + } + ) + metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8") + except Exception: + pass return 0 diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 95cd2ae..e2c677f 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -42,6 +42,12 @@ def _build_cli_command( max_pages: Optional[int], content_debug: bool, device: Optional[str], + 
ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -62,6 +68,20 @@ def _build_cli_command( cmd.append("--content-debug") if device: cmd += ["--device", str(device)] + if ocr_profile: + cmd += ["--ocr-profile", str(ocr_profile)] + if attn_backend: + cmd += ["--attn-backend", str(attn_backend)] + if base_size is not None: + cmd += ["--base-size", str(int(base_size))] + if image_size is not None: + cmd += ["--image-size", str(int(image_size))] + if crop_mode is True: + cmd.append("--crop-mode") + elif crop_mode is False: + cmd.append("--no-crop-mode") + if render_dpi is not None: + cmd += ["--render-dpi", str(int(render_dpi))] return cmd @@ -96,6 +116,12 @@ def _run_cli( max_pages: Optional[int], content_debug: bool, device: Optional[str], + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], visible_device: Optional[int] = None, ) -> None: cmd = _build_cli_command( @@ -108,6 +134,12 @@ def _run_cli( max_pages=max_pages, content_debug=content_debug, device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -249,6 +281,12 @@ def _run_multi_cli( max_pages: Optional[int], content_debug: bool, log_dir: Path, + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -281,6 +319,12 @@ def _run_multi_cli( max_pages=max_pages, content_debug=content_debug, device="cuda", + ocr_profile=ocr_profile, + attn_backend=attn_backend, + 
base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -320,6 +364,12 @@ def run_for_files( persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved device: Optional[str] = None, + ocr_profile: str = "markdown_grounded", + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, use_gpus: Optional[str] = None, devices: Optional[List[int]] = None, workers_per_gpu: int = 1, @@ -397,6 +447,12 @@ def run_for_files( max_pages=max_pages, content_debug=content_debug, log_dir=Path(log_dir) if log_dir else (out_root / "logs" / "deepseek_workers"), + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, ) else: _run_cli( @@ -409,6 +465,12 @@ def run_for_files( max_pages=max_pages, content_debug=content_debug, device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 783f4e6..c7c4d2f 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -148,3 +148,42 @@ def wait(self): assert sorted(seen_files) == sorted(files) assert sorted(seen_visible_devices) == ["2", "2", "5", "5"] + + +def test_deepseek_runner_builds_speed_control_flags(tmp_path): + from glossapi.ocr.deepseek import runner + + script = tmp_path / "run_pdf_ocr_transformers.py" + script.write_text("# stub\n", encoding="utf-8") + model_dir = tmp_path / "DeepSeek-OCR-2" + model_dir.mkdir() + + cmd = runner._build_cli_command( + input_dir=tmp_path / "input", 
+ output_dir=tmp_path / "output", + files=["doc.pdf"], + model_dir=model_dir, + python_bin=Path(sys.executable), + script=script, + max_pages=3, + content_debug=False, + device="cuda", + ocr_profile="plain_ocr", + attn_backend="sdpa", + base_size=640, + image_size=448, + crop_mode=False, + render_dpi=120, + ) + + assert "--ocr-profile" in cmd + assert cmd[cmd.index("--ocr-profile") + 1] == "plain_ocr" + assert "--attn-backend" in cmd + assert cmd[cmd.index("--attn-backend") + 1] == "sdpa" + assert "--base-size" in cmd + assert cmd[cmd.index("--base-size") + 1] == "640" + assert "--image-size" in cmd + assert cmd[cmd.index("--image-size") + 1] == "448" + assert "--no-crop-mode" in cmd + assert "--render-dpi" in cmd + assert cmd[cmd.index("--render-dpi") + 1] == "120" diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 89ad4d0..7774145 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -86,6 +86,12 @@ def fake_run_for_files(self_ref, files, **kwargs): use_gpus="multi", devices=[1, 3], workers_per_gpu=2, + ocr_profile="plain_ocr", + attn_backend="sdpa", + base_size=640, + image_size=448, + crop_mode=False, + render_dpi=120, max_pages=7, ) @@ -93,4 +99,10 @@ def fake_run_for_files(self_ref, files, **kwargs): assert calls["kwargs"]["use_gpus"] == "multi" assert calls["kwargs"]["devices"] == [1, 3] assert calls["kwargs"]["workers_per_gpu"] == 2 + assert calls["kwargs"]["ocr_profile"] == "plain_ocr" + assert calls["kwargs"]["attn_backend"] == "sdpa" + assert calls["kwargs"]["base_size"] == 640 + assert calls["kwargs"]["image_size"] == 448 + assert calls["kwargs"]["crop_mode"] is False + assert calls["kwargs"]["render_dpi"] == 120 assert calls["kwargs"]["max_pages"] == 7 From b749225384f24f3f6084804c3ca11a87316344dd Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sun, 29 Mar 2026 23:50:53 +0300 Subject: [PATCH 15/26] fallback to eager when deepseek sdpa is unsupported --- 
.../ocr/deepseek/run_pdf_ocr_transformers.py | 41 ++++++++++++++++--- tests/test_deepseek_runner_contract.py | 36 ++++++++++++++++ 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 80912d0..2ac927f 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -4,6 +4,7 @@ import argparse import json +import logging import re import tempfile from pathlib import Path @@ -14,6 +15,7 @@ from PIL import Image from transformers import AutoModel, AutoTokenizer +LOGGER = logging.getLogger(__name__) PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." PAGE_SPLIT = "\n<--- Page Split --->\n" @@ -102,15 +104,42 @@ def _resolve_attn_backend(attn_backend: str) -> str: return "sdpa" +def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: + if str(attn_impl) == "eager": + return False + message = str(exc) + markers = ( + "does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention", + 'load your model with the argument `attn_implementation="eager"` meanwhile', + ) + return any(marker in message for marker in markers) + + def _load_model(model_dir: Path, device: str, attn_backend: str): attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) - model = AutoModel.from_pretrained( - model_dir, - _attn_implementation=attn_impl, - trust_remote_code=True, - use_safetensors=True, - ) + try: + model = AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) + except ValueError as exc: + if not _supports_retry_with_eager(exc, attn_impl): + raise + LOGGER.warning( + "DeepSeek model rejected 
attention backend `%s`; retrying with eager attention: %s", + attn_impl, + exc, + ) + attn_impl = "eager" + model = AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) if device.startswith("cuda"): model = model.eval().to(device).to(torch.bfloat16) else: diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index c7c4d2f..bf30602 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -187,3 +187,39 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): assert "--no-crop-mode" in cmd assert "--render-dpi" in cmd assert cmd[cmd.index("--render-dpi") + 1] == "120" + + +def test_deepseek_model_load_falls_back_to_eager_when_sdpa_is_unsupported(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import run_pdf_ocr_transformers as cli + + class DummyModel: + def eval(self): + return self + + def to(self, *_args, **_kwargs): + return self + + monkeypatch.setattr( + cli.AutoTokenizer, + "from_pretrained", + lambda *args, **kwargs: "tokenizer", + ) + + calls: list[str] = [] + + def fake_from_pretrained(*_args, **kwargs): + attn = kwargs.get("_attn_implementation") + calls.append(attn) + if attn == "sdpa": + raise ValueError( + "DeepseekOCR2ForCausalLM does not support an attention implementation through " + "torch.nn.functional.scaled_dot_product_attention yet." 
+ ) + return DummyModel() + + monkeypatch.setattr(cli.AutoModel, "from_pretrained", fake_from_pretrained) + + _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto") + + assert calls == ["sdpa", "eager"] + assert attn_impl == "eager" From 864b0eaa10062a2a04787dde75b26c55c4a2dd37 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Mon, 30 Mar 2026 00:14:50 +0300 Subject: [PATCH 16/26] fix deepseek plain ocr crop defaults --- src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py | 2 +- tests/test_deepseek_runner_contract.py | 4 ++-- tests/test_ocr_dispatch_backends.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 2ac927f..7e9391b 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -28,7 +28,7 @@ def _profile_defaults(profile: str) -> dict: "prompt": PROMPT_PLAIN_OCR, "base_size": 768, "image_size": 512, - "crop_mode": False, + "crop_mode": True, } return { "prompt": PROMPT_GROUNDED_MARKDOWN, diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index bf30602..1c7d987 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -172,7 +172,7 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): attn_backend="sdpa", base_size=640, image_size=448, - crop_mode=False, + crop_mode=True, render_dpi=120, ) @@ -184,7 +184,7 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): assert cmd[cmd.index("--base-size") + 1] == "640" assert "--image-size" in cmd assert cmd[cmd.index("--image-size") + 1] == "448" - assert "--no-crop-mode" in cmd + assert "--crop-mode" in cmd assert "--render-dpi" in cmd assert cmd[cmd.index("--render-dpi") + 1] == "120" diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 
7774145..7b7fd15 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -90,7 +90,7 @@ def fake_run_for_files(self_ref, files, **kwargs): attn_backend="sdpa", base_size=640, image_size=448, - crop_mode=False, + crop_mode=True, render_dpi=120, max_pages=7, ) @@ -103,6 +103,6 @@ def fake_run_for_files(self_ref, files, **kwargs): assert calls["kwargs"]["attn_backend"] == "sdpa" assert calls["kwargs"]["base_size"] == 640 assert calls["kwargs"]["image_size"] == 448 - assert calls["kwargs"]["crop_mode"] is False + assert calls["kwargs"]["crop_mode"] is True assert calls["kwargs"]["render_dpi"] == 120 assert calls["kwargs"]["max_pages"] == 7 From b319ae5ae54689d30c4706fb3eb431d71e926812 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Mon, 30 Mar 2026 00:42:20 +0300 Subject: [PATCH 17/26] add deepseek max token cap control --- src/glossapi/corpus/phase_ocr_math.py | 5 ++++ .../ocr/deepseek/run_pdf_ocr_transformers.py | 30 +++++++++++++++++-- src/glossapi/ocr/deepseek/runner.py | 10 +++++++ tests/test_deepseek_runner_contract.py | 22 +++++++++++++- tests/test_ocr_dispatch_backends.py | 2 ++ 5 files changed, 66 insertions(+), 3 deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 719253f..f028a9a 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -55,6 +55,7 @@ def ocr( image_size: Optional[int] = None, crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = None, force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, @@ -88,6 +89,9 @@ def ocr( - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: DeepSeek throughput and quality controls for benchmarking lighter OCR modes and more efficient attention backends. + - max_new_tokens: optional cap for DeepSeek generation per page. 
Useful + for benchmarking and for containing long-tail pages with pathological + output lengths. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False @@ -605,6 +609,7 @@ def _run_math(stems: List[str]) -> None: image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, use_gpus=use_gpus, devices=devices, workers_per_gpu=workers_per_gpu, diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 7e9391b..cf380e8 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -51,6 +51,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--base-size", type=int, default=None) parser.add_argument("--image-size", type=int, default=None) parser.add_argument("--render-dpi", type=int, default=144) + parser.add_argument("--max-new-tokens", type=int, default=None) parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") parser.set_defaults(crop_mode=None) @@ -115,7 +116,26 @@ def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: return any(marker in message for marker in markers) -def _load_model(model_dir: Path, device: str, attn_backend: str): +def _cap_generate_tokens(model, max_new_tokens: int | None): + if max_new_tokens is None: + return + capped = int(max_new_tokens) + if capped <= 0: + raise ValueError("max_new_tokens must be > 0") + original_generate = model.generate + + def _wrapped_generate(*args, **kwargs): + current = kwargs.get("max_new_tokens") + if current is None: + kwargs["max_new_tokens"] = capped + else: + kwargs["max_new_tokens"] = min(int(current), capped) + return 
original_generate(*args, **kwargs) + + model.generate = _wrapped_generate + + +def _load_model(model_dir: Path, device: str, attn_backend: str, max_new_tokens: int | None): attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) try: @@ -144,6 +164,7 @@ def _load_model(model_dir: Path, device: str, attn_backend: str): model = model.eval().to(device).to(torch.bfloat16) else: model = model.eval().to(device) + _cap_generate_tokens(model, max_new_tokens) return tokenizer, model, attn_impl @@ -233,7 +254,12 @@ def main() -> int: image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) - tokenizer, model, attn_impl = _load_model(model_dir, args.device, args.attn_backend) + tokenizer, model, attn_impl = _load_model( + model_dir, + args.device, + args.attn_backend, + args.max_new_tokens, + ) for pdf_path in pdfs: images = _render_pages(pdf_path, args.max_pages, args.render_dpi) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index e2c677f..503d16d 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -48,6 +48,7 @@ def _build_cli_command( image_size: Optional[int], crop_mode: Optional[bool], render_dpi: Optional[int], + max_new_tokens: Optional[int], ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -82,6 +83,8 @@ def _build_cli_command( cmd.append("--no-crop-mode") if render_dpi is not None: cmd += ["--render-dpi", str(int(render_dpi))] + if max_new_tokens is not None: + cmd += ["--max-new-tokens", str(int(max_new_tokens))] return cmd @@ -122,6 +125,7 @@ def _run_cli( image_size: Optional[int], crop_mode: Optional[bool], render_dpi: Optional[int], + max_new_tokens: Optional[int], visible_device: Optional[int] = None, ) 
-> None: cmd = _build_cli_command( @@ -140,6 +144,7 @@ def _run_cli( image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -287,6 +292,7 @@ def _run_multi_cli( image_size: Optional[int], crop_mode: Optional[bool], render_dpi: Optional[int], + max_new_tokens: Optional[int], ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -325,6 +331,7 @@ def _run_multi_cli( image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -370,6 +377,7 @@ def run_for_files( image_size: Optional[int] = None, crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = None, use_gpus: Optional[str] = None, devices: Optional[List[int]] = None, workers_per_gpu: int = 1, @@ -453,6 +461,7 @@ def run_for_files( image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, ) else: _run_cli( @@ -471,6 +480,7 @@ def run_for_files( image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 1c7d987..62c9a98 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -174,6 +174,7 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): image_size=448, crop_mode=True, render_dpi=120, + max_new_tokens=2048, ) assert "--ocr-profile" in cmd @@ -187,6 +188,8 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): assert "--crop-mode" in cmd assert "--render-dpi" in cmd assert cmd[cmd.index("--render-dpi") + 1] == "120" + assert "--max-new-tokens" in cmd + assert cmd[cmd.index("--max-new-tokens") + 1] == "2048" def 
test_deepseek_model_load_falls_back_to_eager_when_sdpa_is_unsupported(tmp_path, monkeypatch): @@ -219,7 +222,24 @@ def fake_from_pretrained(*_args, **kwargs): monkeypatch.setattr(cli.AutoModel, "from_pretrained", fake_from_pretrained) - _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto") + _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto", None) assert calls == ["sdpa", "eager"] assert attn_impl == "eager" + + +def test_deepseek_generate_cap_applies_max_new_tokens(): + from glossapi.ocr.deepseek import run_pdf_ocr_transformers as cli + + seen = {} + + class DummyModel: + def generate(self, *args, **kwargs): + seen["kwargs"] = dict(kwargs) + return "ok" + + model = DummyModel() + cli._cap_generate_tokens(model, 2048) + model.generate(max_new_tokens=8192, foo="bar") + assert seen["kwargs"]["max_new_tokens"] == 2048 + assert seen["kwargs"]["foo"] == "bar" diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 7b7fd15..20efd77 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -93,6 +93,7 @@ def fake_run_for_files(self_ref, files, **kwargs): crop_mode=True, render_dpi=120, max_pages=7, + max_new_tokens=2048, ) assert calls["files"] == [fname] @@ -106,3 +107,4 @@ def fake_run_for_files(self_ref, files, **kwargs): assert calls["kwargs"]["crop_mode"] is True assert calls["kwargs"]["render_dpi"] == 120 assert calls["kwargs"]["max_pages"] == 7 + assert calls["kwargs"]["max_new_tokens"] == 2048 From 2635c0cdd8a42a8abc87a81df52a8951ac44a944 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Mon, 30 Mar 2026 02:52:00 +0300 Subject: [PATCH 18/26] add deepseek generation guards and page metrics --- src/glossapi/corpus/phase_ocr_math.py | 7 + .../ocr/deepseek/run_pdf_ocr_transformers.py | 166 ++++++++++++++---- src/glossapi/ocr/deepseek/runner.py | 20 +++ tests/test_deepseek_runner_contract.py | 35 +++- tests/test_ocr_dispatch_backends.py | 4 + 5 files 
changed, 197 insertions(+), 35 deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index f028a9a..420fe57 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -56,6 +56,8 @@ def ocr( crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, max_new_tokens: Optional[int] = None, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, @@ -92,6 +94,9 @@ def ocr( - max_new_tokens: optional cap for DeepSeek generation per page. Useful for benchmarking and for containing long-tail pages with pathological output lengths. + - repetition_penalty/no_repeat_ngram_size: optional generation guards + for DeepSeek. These are useful when OCR runs fall into repeated or + looping output on difficult pages. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. 
Defaults to False @@ -610,6 +615,8 @@ def _run_math(stems: List[str]) -> None: crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, use_gpus=use_gpus, devices=devices, workers_per_gpu=workers_per_gpu, diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index cf380e8..356fc38 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -6,7 +6,9 @@ import json import logging import re +import sys import tempfile +import time from pathlib import Path from typing import Iterable, List @@ -15,6 +17,17 @@ from PIL import Image from transformers import AutoModel, AutoTokenizer +SRC_ROOT = Path(__file__).resolve().parents[3] +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from glossapi.ocr.utils.cleaning import ( # noqa: E402 + apply_early_stop, + canonicalize_markdown, + clean_output, + strip_prompt_echo, +) + LOGGER = logging.getLogger(__name__) PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." 
@@ -52,6 +65,8 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--image-size", type=int, default=None) parser.add_argument("--render-dpi", type=int, default=144) parser.add_argument("--max-new-tokens", type=int, default=None) + parser.add_argument("--repetition-penalty", type=float, default=None) + parser.add_argument("--no-repeat-ngram-size", type=int, default=None) parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") parser.set_defaults(crop_mode=None) @@ -94,6 +109,21 @@ def _clean_markdown(text: str) -> str: return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() +def _postprocess_page_text( + text: str, + *, + prompt: str, + content_debug: bool, +) -> tuple[str, dict]: + metrics: dict = {} + cleaned = _clean_markdown(text) + cleaned = strip_prompt_echo(cleaned, prompt) + cleaned = clean_output(cleaned, keep_refdet=False, metrics=metrics) + cleaned = canonicalize_markdown(cleaned) + cleaned = apply_early_stop(cleaned, content_debug=content_debug, metrics=metrics) + return cleaned.strip(), metrics + + def _resolve_attn_backend(attn_backend: str) -> str: requested = str(attn_backend or "auto").strip().lower() if requested != "auto": @@ -116,26 +146,60 @@ def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: return any(marker in message for marker in markers) -def _cap_generate_tokens(model, max_new_tokens: int | None): - if max_new_tokens is None: +def _configure_generate( + model, + *, + max_new_tokens: int | None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): + if ( + max_new_tokens is None + and repetition_penalty is None + and no_repeat_ngram_size is None + ): return - capped = int(max_new_tokens) - if capped <= 0: - raise ValueError("max_new_tokens must be > 0") + capped = None + if max_new_tokens is not None: + capped = int(max_new_tokens) + if capped <= 0: + raise 
ValueError("max_new_tokens must be > 0") + repetition_penalty_value = None + if repetition_penalty is not None: + repetition_penalty_value = float(repetition_penalty) + if repetition_penalty_value <= 0: + raise ValueError("repetition_penalty must be > 0") + no_repeat_ngram_value = None + if no_repeat_ngram_size is not None: + no_repeat_ngram_value = int(no_repeat_ngram_size) + if no_repeat_ngram_value <= 0: + raise ValueError("no_repeat_ngram_size must be > 0") original_generate = model.generate def _wrapped_generate(*args, **kwargs): - current = kwargs.get("max_new_tokens") - if current is None: - kwargs["max_new_tokens"] = capped - else: - kwargs["max_new_tokens"] = min(int(current), capped) + if capped is not None: + current = kwargs.get("max_new_tokens") + if current is None: + kwargs["max_new_tokens"] = capped + else: + kwargs["max_new_tokens"] = min(int(current), capped) + if repetition_penalty_value is not None and kwargs.get("repetition_penalty") is None: + kwargs["repetition_penalty"] = repetition_penalty_value + if no_repeat_ngram_value is not None and kwargs.get("no_repeat_ngram_size") is None: + kwargs["no_repeat_ngram_size"] = no_repeat_ngram_value return original_generate(*args, **kwargs) model.generate = _wrapped_generate -def _load_model(model_dir: Path, device: str, attn_backend: str, max_new_tokens: int | None): +def _load_model( + model_dir: Path, + device: str, + attn_backend: str, + max_new_tokens: int | None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) try: @@ -164,7 +228,12 @@ def _load_model(model_dir: Path, device: str, attn_backend: str, max_new_tokens: model = model.eval().to(device).to(torch.bfloat16) else: model = model.eval().to(device) - _cap_generate_tokens(model, max_new_tokens) + _configure_generate( + model, + max_new_tokens=max_new_tokens, + 
repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) return tokenizer, model, attn_impl @@ -193,7 +262,13 @@ def _infer_page( return _clean_markdown(str(result)) -def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) -> None: +def _write_outputs( + output_dir: Path, + stem: str, + markdown: str, + page_count: int, + extra_metrics: dict | None = None, +) -> None: md_dir = output_dir / "markdown" metrics_dir = output_dir / "json" / "metrics" progress_dir = output_dir / "sidecars" / "ocr_progress" @@ -205,6 +280,8 @@ def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) "page_count": page_count, "model": "deepseek-ai/DeepSeek-OCR-2", } + if extra_metrics: + metrics.update(extra_metrics) (metrics_dir / f"{stem}.metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8") partial_path = progress_dir / f"{stem}.partial.md" if partial_path.exists(): @@ -259,11 +336,17 @@ def main() -> int: args.device, args.attn_backend, args.max_new_tokens, + args.repetition_penalty, + args.no_repeat_ngram_size, ) for pdf_path in pdfs: + doc_start = time.perf_counter() + render_start = time.perf_counter() images = _render_pages(pdf_path, args.max_pages, args.render_dpi) + render_sec = time.perf_counter() - render_start page_outputs: List[str] = [] + page_metrics: List[dict] = [] total_pages = len(images) _write_progress(output_dir, pdf_path.stem, page_outputs, total_pages, 0) with tempfile.TemporaryDirectory(prefix=f"{pdf_path.stem}_deepseek_") as tmp_dir_str: @@ -271,7 +354,8 @@ def main() -> int: for idx, image in enumerate(images): page_png = tmp_dir / f"page_{idx + 1:04d}.png" image.save(page_png, format="PNG") - page_text = _infer_page( + infer_start = time.perf_counter() + raw_page_text = _infer_page( model, tokenizer, page_png, @@ -281,9 +365,24 @@ def main() -> int: image_size=image_size, crop_mode=crop_mode, ) + infer_sec = time.perf_counter() - infer_start + page_text, 
postprocess_metrics = _postprocess_page_text( + raw_page_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) if args.content_debug: page_text = f"\n{page_text}".strip() page_outputs.append(page_text) + page_metrics.append( + { + "page_number": int(idx + 1), + "infer_sec": float(infer_sec), + "raw_chars": int(len(str(raw_page_text or "").strip())), + "final_chars": int(len(page_text.strip())), + **postprocess_metrics, + } + ) _write_progress( output_dir, pdf_path.stem, @@ -292,24 +391,27 @@ def main() -> int: idx + 1, ) markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" - _write_outputs(output_dir, pdf_path.stem, markdown, len(images)) - metrics_path = output_dir / "json" / "metrics" / f"{pdf_path.stem}.metrics.json" - if metrics_path.exists(): - try: - metrics = json.loads(metrics_path.read_text(encoding="utf-8")) - metrics.update( - { - "ocr_profile": args.ocr_profile, - "attn_backend": attn_impl, - "base_size": base_size, - "image_size": image_size, - "crop_mode": crop_mode, - "render_dpi": int(args.render_dpi), - } - ) - metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8") - except Exception: - pass + _write_outputs( + output_dir, + pdf_path.stem, + markdown, + len(images), + extra_metrics={ + "ocr_profile": args.ocr_profile, + "attn_backend": attn_impl, + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "repetition_penalty": args.repetition_penalty, + "no_repeat_ngram_size": args.no_repeat_ngram_size, + "render_sec": float(render_sec), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - doc_start), + "page_metrics": page_metrics, + }, + ) return 0 diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 503d16d..de52e24 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ 
b/src/glossapi/ocr/deepseek/runner.py @@ -49,6 +49,8 @@ def _build_cli_command( crop_mode: Optional[bool], render_dpi: Optional[int], max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -85,6 +87,10 @@ def _build_cli_command( cmd += ["--render-dpi", str(int(render_dpi))] if max_new_tokens is not None: cmd += ["--max-new-tokens", str(int(max_new_tokens))] + if repetition_penalty is not None: + cmd += ["--repetition-penalty", str(float(repetition_penalty))] + if no_repeat_ngram_size is not None: + cmd += ["--no-repeat-ngram-size", str(int(no_repeat_ngram_size))] return cmd @@ -126,6 +132,8 @@ def _run_cli( crop_mode: Optional[bool], render_dpi: Optional[int], max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], visible_device: Optional[int] = None, ) -> None: cmd = _build_cli_command( @@ -145,6 +153,8 @@ def _run_cli( crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -293,6 +303,8 @@ def _run_multi_cli( crop_mode: Optional[bool], render_dpi: Optional[int], max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -332,6 +344,8 @@ def _run_multi_cli( crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -378,6 +392,8 @@ def run_for_files( crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, max_new_tokens: Optional[int] = None, + repetition_penalty: 
Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, use_gpus: Optional[str] = None, devices: Optional[List[int]] = None, workers_per_gpu: int = 1, @@ -462,6 +478,8 @@ def run_for_files( crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, ) else: _run_cli( @@ -481,6 +499,8 @@ def run_for_files( crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 62c9a98..65d1d56 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -175,6 +175,8 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): crop_mode=True, render_dpi=120, max_new_tokens=2048, + repetition_penalty=1.05, + no_repeat_ngram_size=8, ) assert "--ocr-profile" in cmd @@ -190,6 +192,10 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): assert cmd[cmd.index("--render-dpi") + 1] == "120" assert "--max-new-tokens" in cmd assert cmd[cmd.index("--max-new-tokens") + 1] == "2048" + assert "--repetition-penalty" in cmd + assert cmd[cmd.index("--repetition-penalty") + 1] == "1.05" + assert "--no-repeat-ngram-size" in cmd + assert cmd[cmd.index("--no-repeat-ngram-size") + 1] == "8" def test_deepseek_model_load_falls_back_to_eager_when_sdpa_is_unsupported(tmp_path, monkeypatch): @@ -222,13 +228,13 @@ def fake_from_pretrained(*_args, **kwargs): monkeypatch.setattr(cli.AutoModel, "from_pretrained", fake_from_pretrained) - _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto", None) + _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto", None, None, None) assert calls == ["sdpa", "eager"] assert attn_impl == "eager" -def 
test_deepseek_generate_cap_applies_max_new_tokens(): +def test_deepseek_generate_controls_apply(): from glossapi.ocr.deepseek import run_pdf_ocr_transformers as cli seen = {} @@ -239,7 +245,30 @@ def generate(self, *args, **kwargs): return "ok" model = DummyModel() - cli._cap_generate_tokens(model, 2048) + cli._configure_generate( + model, + max_new_tokens=2048, + repetition_penalty=1.08, + no_repeat_ngram_size=12, + ) model.generate(max_new_tokens=8192, foo="bar") assert seen["kwargs"]["max_new_tokens"] == 2048 + assert seen["kwargs"]["repetition_penalty"] == 1.08 + assert seen["kwargs"]["no_repeat_ngram_size"] == 12 assert seen["kwargs"]["foo"] == "bar" + + +def test_postprocess_page_text_strips_prompt_and_truncates_repetition(): + from glossapi.ocr.deepseek import run_pdf_ocr_transformers as cli + + prompt = cli.PROMPT_PLAIN_OCR + raw = ( + "\nExtract the text from the document page in reading order.\n" + "Γραμμή 1\n" + + "\n".join(["ΕΠΑΝΑΛΗΨΗ"] * 12) + ) + cleaned, metrics = cli._postprocess_page_text(raw, prompt=prompt, content_debug=False) + + assert "Extract the text from the document page in reading order." 
not in cleaned + assert cleaned.splitlines().count("ΕΠΑΝΑΛΗΨΗ") <= 10 + assert metrics["early_stops"] == 1 diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 20efd77..2d075df 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -94,6 +94,8 @@ def fake_run_for_files(self_ref, files, **kwargs): render_dpi=120, max_pages=7, max_new_tokens=2048, + repetition_penalty=1.08, + no_repeat_ngram_size=12, ) assert calls["files"] == [fname] @@ -108,3 +110,5 @@ def fake_run_for_files(self_ref, files, **kwargs): assert calls["kwargs"]["render_dpi"] == 120 assert calls["kwargs"]["max_pages"] == 7 assert calls["kwargs"]["max_new_tokens"] == 2048 + assert calls["kwargs"]["repetition_penalty"] == 1.08 + assert calls["kwargs"]["no_repeat_ngram_size"] == 12 From 4536e0e1995b4729c41d502360174f56b55029a8 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 03:44:19 +0300 Subject: [PATCH 19/26] Add DeepSeek OCR speed controls and sharding --- src/glossapi/corpus/phase_ocr_math.py | 37 ++ .../ocr/deepseek/run_pdf_ocr_transformers.py | 281 +++++++++++-- src/glossapi/ocr/deepseek/runner.py | 370 +++++++++++++++++- tests/test_deepseek_runner_contract.py | 83 ++++ 4 files changed, 736 insertions(+), 35 deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 80afc7f..1e75a1b 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -41,6 +41,16 @@ def ocr( limit: Optional[int] = None, dpi: Optional[int] = None, # reserved for future use precision: Optional[str] = None, # reserved for future use ("fp16","bf16") + workers_per_gpu: int = 1, + ocr_profile: str = "markdown_grounded", + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = None, + repetition_penalty: 
Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, # Integrated math enrichment controls math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, @@ -74,6 +84,17 @@ def ocr( Docling layout/json remains Phase-1 infrastructure; OCR remediation itself is DeepSeek-only. - fix_bad: re-run OCR on documents marked bad by the cleaner (default True). - math_enhance: run math/code enrichment after OCR (default True). + - use_gpus/devices/workers_per_gpu: DeepSeek multi-worker controls. Use + ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. + Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers + per visible GPU. + - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: + DeepSeek rendering and attention controls used for throughput/quality + benchmarking. + - max_new_tokens/repetition_penalty/no_repeat_ngram_size: + Optional generation controls forwarded to DeepSeek. These are exposed + for runtime experiments; leave them unset unless a benchmark calls for + them explicitly. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. 
Defaults to False @@ -581,6 +602,22 @@ def _run_math(stems: List[str]) -> None: self, bad_files, model_dir=Path(model_dir) if model_dir else None, + max_pages=max_pages, + persist_engine=bool(persist_engine), + precision=precision, + device=device, + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, content_debug=bool(content_debug), ) except Exception as _e: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 0e0e868..e46fadf 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -4,8 +4,11 @@ import argparse import json +import logging import re +import sys import tempfile +import time from pathlib import Path from typing import Iterable, List @@ -14,10 +17,40 @@ from PIL import Image from transformers import AutoModel, AutoTokenizer -PROMPT = "\n<|grounding|>Convert the document to markdown. " +SRC_ROOT = Path(__file__).resolve().parents[3] +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from glossapi.ocr.utils.cleaning import ( # noqa: E402 + apply_early_stop, + canonicalize_markdown, + clean_output, + strip_prompt_echo, +) + +LOGGER = logging.getLogger(__name__) +PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " +PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." 
PAGE_SPLIT = "\n<--- Page Split --->\n" +def _profile_defaults(profile: str) -> dict: + profile_norm = str(profile or "markdown_grounded").strip().lower() + if profile_norm == "plain_ocr": + return { + "prompt": PROMPT_PLAIN_OCR, + "base_size": 768, + "image_size": 512, + "crop_mode": True, + } + return { + "prompt": PROMPT_GROUNDED_MARKDOWN, + "base_size": 1024, + "image_size": 768, + "crop_mode": True, + } + + def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--input-dir", required=True) @@ -26,6 +59,17 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--files", nargs="*", default=[]) parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") + parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--attn-backend", default="auto", choices=["auto", "flash_attention_2", "sdpa", "eager"]) + parser.add_argument("--base-size", type=int, default=None) + parser.add_argument("--image-size", type=int, default=None) + parser.add_argument("--render-dpi", type=int, default=144) + parser.add_argument("--max-new-tokens", type=int, default=None) + parser.add_argument("--repetition-penalty", type=float, default=None) + parser.add_argument("--no-repeat-ngram-size", type=int, default=None) + parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") + parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") + parser.set_defaults(crop_mode=None) parser.add_argument("--content-debug", action="store_true") return parser.parse_args() @@ -36,12 +80,12 @@ def _iter_pdfs(input_dir: Path, files: List[str]) -> List[Path]: return sorted(input_dir.glob("*.pdf")) -def _render_pages(pdf_path: Path, max_pages: int | None) -> List[Image.Image]: +def _render_pages(pdf_path: Path, max_pages: int | None, render_dpi: int) -> List[Image.Image]: images: 
List[Image.Image] = [] doc = fitz.open(pdf_path) try: page_count = doc.page_count if max_pages is None else min(doc.page_count, max_pages) - zoom = 144 / 72.0 + zoom = float(render_dpi) / 72.0 matrix = fitz.Matrix(zoom, zoom) for idx in range(page_count): page = doc[idx] @@ -65,42 +109,169 @@ def _clean_markdown(text: str) -> str: return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() -def _load_model(model_dir: Path, device: str): - attn_impl = "flash_attention_2" +def _postprocess_page_text( + text: str, + *, + prompt: str, + content_debug: bool, +) -> tuple[str, dict]: + metrics: dict = {} + cleaned = _clean_markdown(text) + cleaned = strip_prompt_echo(cleaned, prompt) + cleaned = clean_output(cleaned, keep_refdet=False, metrics=metrics) + cleaned = canonicalize_markdown(cleaned) + cleaned = apply_early_stop(cleaned, content_debug=content_debug, metrics=metrics) + return cleaned.strip(), metrics + + +def _resolve_attn_backend(attn_backend: str) -> str: + requested = str(attn_backend or "auto").strip().lower() + if requested != "auto": + return requested try: import flash_attn # noqa: F401 + return "flash_attention_2" except Exception: - attn_impl = "eager" - tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) - model = AutoModel.from_pretrained( - model_dir, - _attn_implementation=attn_impl, - trust_remote_code=True, - use_safetensors=True, + # DeepSeek-OCR-2's custom decoder path has not behaved reliably with SDPA + # on the stacks we have exercised; if FA2 is unavailable, prefer the known + # fallback instead of silently selecting a backend that then downgrades. 
+ return "eager" + + +def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: + if str(attn_impl) == "eager": + return False + message = str(exc) + markers = ( + "does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention", + 'load your model with the argument `attn_implementation="eager"` meanwhile', ) + return any(marker in message for marker in markers) + + +def _configure_generate( + model, + *, + max_new_tokens: int | None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): + if ( + max_new_tokens is None + and repetition_penalty is None + and no_repeat_ngram_size is None + ): + return + capped = None + if max_new_tokens is not None: + capped = int(max_new_tokens) + if capped <= 0: + raise ValueError("max_new_tokens must be > 0") + repetition_penalty_value = None + if repetition_penalty is not None: + repetition_penalty_value = float(repetition_penalty) + if repetition_penalty_value <= 0: + raise ValueError("repetition_penalty must be > 0") + no_repeat_ngram_value = None + if no_repeat_ngram_size is not None: + no_repeat_ngram_value = int(no_repeat_ngram_size) + if no_repeat_ngram_value <= 0: + raise ValueError("no_repeat_ngram_size must be > 0") + original_generate = model.generate + + def _wrapped_generate(*args, **kwargs): + if capped is not None: + current = kwargs.get("max_new_tokens") + if current is None: + kwargs["max_new_tokens"] = capped + else: + kwargs["max_new_tokens"] = min(int(current), capped) + if repetition_penalty_value is not None and kwargs.get("repetition_penalty") is None: + kwargs["repetition_penalty"] = repetition_penalty_value + if no_repeat_ngram_value is not None and kwargs.get("no_repeat_ngram_size") is None: + kwargs["no_repeat_ngram_size"] = no_repeat_ngram_value + return original_generate(*args, **kwargs) + + model.generate = _wrapped_generate + + +def _load_model( + model_dir: Path, + device: str, + attn_backend: str, + max_new_tokens: int | 
None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): + attn_impl = _resolve_attn_backend(attn_backend) + tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) + try: + model = AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) + except ValueError as exc: + if not _supports_retry_with_eager(exc, attn_impl): + raise + LOGGER.warning( + "DeepSeek model rejected attention backend `%s`; retrying with eager attention: %s", + attn_impl, + exc, + ) + attn_impl = "eager" + model = AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) if device.startswith("cuda"): model = model.eval().to(device).to(torch.bfloat16) else: model = model.eval().to(device) - return tokenizer, model + _configure_generate( + model, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) + return tokenizer, model, attn_impl -def _infer_page(model, tokenizer, image_path: Path, output_dir: Path) -> str: +def _infer_page( + model, + tokenizer, + image_path: Path, + output_dir: Path, + *, + prompt: str, + base_size: int, + image_size: int, + crop_mode: bool, +) -> str: result = model.infer( tokenizer, - prompt=PROMPT, + prompt=prompt, image_file=str(image_path), output_path=str(output_dir), - base_size=1024, - image_size=768, - crop_mode=True, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, save_results=False, eval_mode=True, ) return _clean_markdown(str(result)) -def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) -> None: +def _write_outputs( + output_dir: Path, + stem: str, + markdown: str, + page_count: int, + extra_metrics: dict | None = None, +) -> None: md_dir = output_dir / "markdown" metrics_dir = output_dir / "json" / "metrics" progress_dir = output_dir / "sidecars" / 
"ocr_progress" @@ -112,6 +283,8 @@ def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) "page_count": page_count, "model": "deepseek-ai/DeepSeek-OCR-2", } + if extra_metrics: + metrics.update(extra_metrics) (metrics_dir / f"{stem}.metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8") partial_path = progress_dir / f"{stem}.partial.md" if partial_path.exists(): @@ -155,11 +328,28 @@ def main() -> int: if not pdfs: return 0 - tokenizer, model = _load_model(model_dir, args.device) + profile_defaults = _profile_defaults(args.ocr_profile) + prompt = profile_defaults["prompt"] + base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) + image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) + crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + tokenizer, model, attn_impl = _load_model( + model_dir, + args.device, + args.attn_backend, + args.max_new_tokens, + args.repetition_penalty, + args.no_repeat_ngram_size, + ) for pdf_path in pdfs: - images = _render_pages(pdf_path, args.max_pages) + doc_start = time.perf_counter() + render_start = time.perf_counter() + images = _render_pages(pdf_path, args.max_pages, args.render_dpi) + render_sec = time.perf_counter() - render_start page_outputs: List[str] = [] + page_metrics: List[dict] = [] total_pages = len(images) _write_progress(output_dir, pdf_path.stem, page_outputs, total_pages, 0) with tempfile.TemporaryDirectory(prefix=f"{pdf_path.stem}_deepseek_") as tmp_dir_str: @@ -167,10 +357,35 @@ def main() -> int: for idx, image in enumerate(images): page_png = tmp_dir / f"page_{idx + 1:04d}.png" image.save(page_png, format="PNG") - page_text = _infer_page(model, tokenizer, page_png, tmp_dir / f"page_{idx + 1:04d}") + infer_start = time.perf_counter() + raw_page_text = _infer_page( + model, + tokenizer, + page_png, + tmp_dir / 
f"page_{idx + 1:04d}", + prompt=prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + ) + infer_sec = time.perf_counter() - infer_start + page_text, postprocess_metrics = _postprocess_page_text( + raw_page_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) if args.content_debug: page_text = f"\n{page_text}".strip() page_outputs.append(page_text) + page_metrics.append( + { + "page_number": int(idx + 1), + "infer_sec": float(infer_sec), + "raw_chars": int(len(str(raw_page_text or "").strip())), + "final_chars": int(len(page_text.strip())), + **postprocess_metrics, + } + ) _write_progress( output_dir, pdf_path.stem, @@ -179,7 +394,27 @@ def main() -> int: idx + 1, ) markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" - _write_outputs(output_dir, pdf_path.stem, markdown, len(images)) + _write_outputs( + output_dir, + pdf_path.stem, + markdown, + len(images), + extra_metrics={ + "ocr_profile": args.ocr_profile, + "attn_backend": attn_impl, + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "repetition_penalty": args.repetition_penalty, + "no_repeat_ngram_size": args.no_repeat_ngram_size, + "render_sec": float(render_sec), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - doc_start), + "page_metrics": page_metrics, + }, + ) return 0 diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 3005786..61ba307 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -2,6 +2,7 @@ from __future__ import annotations +from contextlib import ExitStack import json import logging import os @@ -30,7 +31,7 @@ def _page_count(pdf_path: Path) -> int: return 0 -def _run_cli( +def _build_cli_command( input_dir: Path, output_dir: Path, *, @@ -41,7 +42,16 @@ def 
_run_cli( max_pages: Optional[int], content_debug: bool, device: Optional[str], -) -> None: + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], +) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ str(python_exe), @@ -61,8 +71,39 @@ def _run_cli( cmd.append("--content-debug") if device: cmd += ["--device", str(device)] + if ocr_profile: + cmd += ["--ocr-profile", str(ocr_profile)] + if attn_backend: + cmd += ["--attn-backend", str(attn_backend)] + if base_size is not None: + cmd += ["--base-size", str(int(base_size))] + if image_size is not None: + cmd += ["--image-size", str(int(image_size))] + if crop_mode is True: + cmd.append("--crop-mode") + elif crop_mode is False: + cmd.append("--no-crop-mode") + if render_dpi is not None: + cmd += ["--render-dpi", str(int(render_dpi))] + if max_new_tokens is not None: + cmd += ["--max-new-tokens", str(int(max_new_tokens))] + if repetition_penalty is not None: + cmd += ["--repetition-penalty", str(float(repetition_penalty))] + if no_repeat_ngram_size is not None: + cmd += ["--no-repeat-ngram-size", str(int(no_repeat_ngram_size))] + return cmd + +def _build_env(*, python_bin: Optional[Path], visible_device: Optional[int] = None) -> Dict[str, str]: env = os.environ.copy() + if python_bin: + python_path = Path(python_bin).expanduser() + venv_bin = str(python_path.parent) + env["PATH"] = f"{venv_bin}:{env.get('PATH', '')}" + env["VIRTUAL_ENV"] = str(python_path.parent.parent) + env.pop("PYTHONHOME", None) + if visible_device is not None: + env["CUDA_VISIBLE_DEVICES"] = str(visible_device) if shutil.which("cc1plus", path=env.get("PATH", "")) is None: for candidate in sorted(Path("/usr/lib/gcc/x86_64-linux-gnu").glob("*/cc1plus")): env["PATH"] = 
f"{candidate.parent}:{env.get('PATH', '')}" @@ -70,11 +111,264 @@ def _run_cli( ld_path = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") if ld_path: env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH', '')}" + return env + + +def _run_cli( + input_dir: Path, + output_dir: Path, + *, + files: List[str], + model_dir: Path, + python_bin: Optional[Path], + script: Path, + max_pages: Optional[int], + content_debug: bool, + device: Optional[str], + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], + visible_device: Optional[int] = None, +) -> None: + cmd = _build_cli_command( + input_dir=input_dir, + output_dir=output_dir, + files=files, + model_dir=model_dir, + python_bin=python_bin, + script=script, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) + env = _build_env(python_bin=python_bin, visible_device=visible_device) LOGGER.info("Running DeepSeek OCR CLI: %s", " ".join(cmd)) subprocess.run(cmd, check=True, env=env) # nosec: controlled arguments +def _parse_device_index(device: Optional[str]) -> Optional[int]: + if not device: + return None + value = str(device).strip().lower() + if value.startswith("cuda:"): + suffix = value.split(":", 1)[1] + if suffix.isdigit(): + return int(suffix) + return None + + +def _detect_visible_gpus() -> List[int]: + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + parsed = [piece.strip() for piece in visible.split(",") if piece.strip()] + if parsed and all(piece.isdigit() for piece in parsed): + 
return [int(piece) for piece in parsed] + torch_mod = None + try: # pragma: no cover - best effort + import torch as torch_mod # type: ignore + except Exception: # pragma: no cover - optional import + torch_mod = None + if torch_mod is not None: + try: + if torch_mod.cuda.is_available(): + return list(range(torch_mod.cuda.device_count())) + except Exception: + pass + try: # pragma: no cover - shell fallback + proc = subprocess.run( + ["nvidia-smi", "-L"], + check=False, + capture_output=True, + text=True, + timeout=5, + ) + devices: List[int] = [] + if proc.returncode == 0: + for line in proc.stdout.splitlines(): + if line.startswith("GPU "): + prefix = line.split(":", 1)[0] + idx = prefix.split()[1] + if idx.isdigit(): + devices.append(int(idx)) + return devices + except Exception: + return [] + + +def _resolve_lane_devices( + *, + use_gpus: Optional[str], + devices: Optional[List[int]], + workers_per_gpu: int, + device: Optional[str], +) -> List[int]: + if devices: + resolved = [int(dev) for dev in devices] + if resolved: + return resolved + if str(use_gpus or "single").strip().lower() == "multi": + resolved = _detect_visible_gpus() + if resolved: + return resolved + if workers_per_gpu > 1: + from_device = _parse_device_index(device) + if from_device is not None: + return [from_device] + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + first = visible.split(",", 1)[0].strip() + if first.isdigit(): + return [int(first)] + return [0] + return [] + + +def _effective_page_count(pdf_path: Path, max_pages: Optional[int]) -> int: + count = _page_count(pdf_path) + if max_pages is not None and count > 0: + return min(count, int(max_pages)) + return max(1, count) + + +def _plan_lanes( + *, + file_list: List[str], + input_root: Path, + lane_devices: List[int], + workers_per_gpu: int, + max_pages: Optional[int], +) -> List[Dict[str, Any]]: + lanes: List[Dict[str, Any]] = [] + lane_id = 0 + for visible_device in lane_devices: + for _ in 
range(max(1, int(workers_per_gpu))): + lanes.append( + { + "lane_id": lane_id, + "visible_device": int(visible_device), + "files": [], + "weight": 0, + } + ) + lane_id += 1 + if not lanes: + return [] + + weighted_files = [] + for name in file_list: + pdf_path = (input_root / name).resolve() + weighted_files.append((name, _effective_page_count(pdf_path, max_pages))) + weighted_files.sort(key=lambda item: (-item[1], item[0])) + + for name, weight in weighted_files: + lane = min(lanes, key=lambda item: int(item["weight"])) + lane["files"].append(name) + lane["weight"] = int(lane["weight"]) + int(weight) + return lanes + + +def _run_multi_cli( + *, + input_root: Path, + out_root: Path, + file_list: List[str], + lane_devices: List[int], + workers_per_gpu: int, + model_root: Path, + python_exe: Path, + script_path: Path, + max_pages: Optional[int], + content_debug: bool, + log_dir: Path, + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], +) -> None: + lanes = _plan_lanes( + file_list=file_list, + input_root=input_root, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + max_pages=max_pages, + ) + if not lanes: + return + + log_dir.mkdir(parents=True, exist_ok=True) + failures: List[str] = [] + with ExitStack() as stack: + procs = [] + for lane in lanes: + lane_files = list(lane["files"]) + if not lane_files: + continue + visible_device = int(lane["visible_device"]) + log_path = log_dir / f"lane_{lane['lane_id']}_gpu{visible_device}.log" + fh = stack.enter_context(log_path.open("w", encoding="utf-8")) + cmd = _build_cli_command( + input_dir=input_root, + output_dir=out_root, + files=lane_files, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device="cuda", + 
ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) + env = _build_env(python_bin=python_exe, visible_device=visible_device) + LOGGER.info( + "Running DeepSeek OCR lane=%s visible_gpu=%s files=%d weight=%d: %s", + lane["lane_id"], + visible_device, + len(lane_files), + lane["weight"], + " ".join(cmd), + ) + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + procs.append((lane, log_path, proc)) + + for lane, log_path, proc in procs: + rc = proc.wait() + if rc != 0: + failures.append( + f"lane={lane['lane_id']} gpu={lane['visible_device']} rc={rc} log={log_path}" + ) + if failures: + raise RuntimeError("DeepSeek OCR multi-worker failure(s): " + "; ".join(failures)) + + def run_for_files( self_ref: Any, files: Iterable[str], @@ -91,6 +385,18 @@ def run_for_files( persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved device: Optional[str] = None, + ocr_profile: str = "markdown_grounded", + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = None, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + use_gpus: Optional[str] = None, + devices: Optional[List[int]] = None, + workers_per_gpu: int = 1, gpu_memory_utilization: Optional[float] = None, # reserved disable_fp8_kv: bool = False, # reserved **_: Any, @@ -98,7 +404,7 @@ def run_for_files( """Run DeepSeek OCR for the provided files.""" requested_stub = bool(allow_stub) - del log_dir, allow_stub, allow_cli, persist_engine, precision + del allow_stub, allow_cli, persist_engine, precision del 
gpu_memory_utilization, disable_fp8_kv if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": @@ -146,17 +452,57 @@ def run_for_files( if not python_exe.exists(): raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") - _run_cli( - input_dir=pdf_root, - output_dir=out_root, - files=file_list, - model_dir=model_root, - python_bin=python_exe, - script=script_path, - max_pages=max_pages, - content_debug=content_debug, + lane_devices = _resolve_lane_devices( + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=int(max(1, workers_per_gpu)), device=device, ) + multi_requested = str(use_gpus or "single").strip().lower() == "multi" or int(max(1, workers_per_gpu)) > 1 + if multi_requested and lane_devices: + _run_multi_cli( + input_root=pdf_root, + out_root=out_root, + file_list=file_list, + lane_devices=lane_devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + model_root=model_root, + python_exe=python_exe, + script_path=script_path, + max_pages=max_pages, + content_debug=content_debug, + log_dir=Path(log_dir) if log_dir else (out_root / "logs" / "deepseek_workers"), + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) + else: + _run_cli( + input_dir=pdf_root, + output_dir=out_root, + files=file_list, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) results: Dict[str, Any] = {} for name in file_list: diff --git 
a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index a5a93e4..81ec66f 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path import pandas as pd @@ -60,3 +61,85 @@ def test_progress_artifacts_stay_out_of_canonical_markdown(tmp_path): assert canonical_markdown.exists() assert canonical_markdown.read_text(encoding="utf-8") == "final\n" assert not progress_markdown.exists() + + +def test_auto_attn_backend_prefers_eager_when_flash_attn_is_unavailable(monkeypatch): + import builtins + + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _resolve_attn_backend + + original_import = builtins.__import__ + + def fake_import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "flash_attn": + raise ImportError("flash_attn unavailable") + return original_import(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, "__import__", fake_import) + assert _resolve_attn_backend("auto") == "eager" + + +def test_runner_uses_downloads_subdir_when_present(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + downloads_dir = corpus.input_dir / "downloads" + downloads_dir.mkdir(parents=True, exist_ok=True) + (downloads_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_cli(input_dir, output_dir, **kwargs): + calls["input_dir"] = input_dir + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv( + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + 
str(Path(runner.__file__).resolve().parent / "run_pdf_ocr_transformers.py"), + ) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files(corpus, ["doc.pdf"]) + + assert calls["input_dir"] == downloads_dir.resolve() + assert result["doc"]["page_count"] == 1 + + +def test_build_cli_command_includes_speed_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=["a.pdf"], + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run.py", + max_pages=1, + content_debug=False, + device="cuda", + ocr_profile="plain_ocr", + attn_backend="flash_attention_2", + base_size=768, + image_size=512, + crop_mode=True, + render_dpi=144, + max_new_tokens=1024, + repetition_penalty=1.05, + no_repeat_ngram_size=12, + ) + + assert "--ocr-profile" in cmd and "plain_ocr" in cmd + assert "--attn-backend" in cmd and "flash_attention_2" in cmd + assert "--base-size" in cmd and "768" in cmd + assert "--image-size" in cmd and "512" in cmd + assert "--crop-mode" in cmd + assert "--render-dpi" in cmd and "144" in cmd + assert "--max-new-tokens" in cmd and "1024" in cmd From 0ebabe7a774b39862e96e3d00f8ab2348f963139 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 04:05:01 +0300 Subject: [PATCH 20/26] Update DeepSeek runtime to Torch 2.9.1 cu130 --- dependency_setup/deepseek_uv/pyproject.toml | 16 +- dependency_setup/deepseek_uv/uv.lock | 280 +++++++++++++------- 2 files changed, 199 insertions(+), 97 deletions(-) diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml index a1caa65..0bfebb2 100644 --- a/dependency_setup/deepseek_uv/pyproject.toml +++ b/dependency_setup/deepseek_uv/pyproject.toml @@ -5,9 +5,9 @@ description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution" requires-python = ">=3.11,<3.13" dependencies = [ 
"glossapi[deepseek]", - "torch==2.6.0", - "torchvision==0.21.0", - "torchaudio==2.6.0", + "torch==2.9.1", + "torchvision==0.24.1", + "torchaudio==2.9.1", ] [dependency-groups] @@ -18,11 +18,11 @@ test = [ [tool.uv.sources] glossapi = { path = "../..", editable = true } -torch = { index = "pytorch-cu118" } -torchvision = { index = "pytorch-cu118" } -torchaudio = { index = "pytorch-cu118" } +torch = { index = "pytorch-cu130" } +torchvision = { index = "pytorch-cu130" } +torchaudio = { index = "pytorch-cu130" } [[tool.uv.index]] -name = "pytorch-cu118" -url = "https://download.pytorch.org/whl/cu118" +name = "pytorch-cu130" +url = "https://download.pytorch.org/whl/cu130" explicit = true diff --git a/dependency_setup/deepseek_uv/uv.lock b/dependency_setup/deepseek_uv/uv.lock index 4f99980..a136794 100644 --- a/dependency_setup/deepseek_uv/uv.lock +++ b/dependency_setup/deepseek_uv/uv.lock @@ -451,8 +451,10 @@ source = { virtual = "." } dependencies = [ { name = "glossapi", extra = ["deepseek"] }, { name = "torch" }, - { name = "torchaudio" }, - { name = "torchvision" }, + { name = "torchaudio", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchaudio", version = "2.9.1+cu130", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "torchvision", version = "0.24.1", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.24.1+cu130", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, ] [package.dev-dependencies] @@ -464,9 +466,9 @@ test = [ [package.metadata] requires-dist = [ { name = "glossapi", extras = ["deepseek"], editable = "../../" }, - { name = 
"torch", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, - { name = "torchaudio", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, - { name = "torchvision", specifier = "==0.21.0", index = "https://download.pytorch.org/whl/cu118" }, + { name = "torch", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu130" }, + { name = "torchaudio", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu130" }, + { name = "torchvision", specifier = "==0.24.1", index = "https://download.pytorch.org/whl/cu130" }, ] [package.metadata.requires-dev] @@ -743,106 +745,152 @@ wheels = [ ] [[package]] -name = "nvidia-cublas-cu11" -version = "11.11.3.6" +name = "nvidia-cublas" +version = "13.0.0.19" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/46/be/c222e33e60d28ecd496a46fc4d78ccae0ee28e1fd7dc705b6288b4cad27e/nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux1_x86_64.whl", hash = "sha256:39fb40e8f486dd8a2ddb8fdeefe1d5b28f5b99df01c87ab3676f057a74a5a6f3", size = 417870452, upload-time = "2022-10-18T21:17:48.638Z" }, - { url = "https://files.pythonhosted.org/packages/ea/2e/9d99c60771d275ecf6c914a612e9a577f740a615bc826bec132368e1d3ae/nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl", hash = "sha256:60252822adea5d0b10cd990a7dc7bedf7435f30ae40083c7a624a85a43225abc", size = 417870460, upload-time = "2024-08-17T00:00:26.889Z" }, + { url = "https://files.pythonhosted.org/packages/02/99/8447b9ee9f070522ee66604ee819d632ab4568c68b3134cebd3837a015cd/nvidia_cublas-13.0.0.19-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:381b1a0ca636fdcb6920a871e8fc89dbfd1f6157f421ed0a6f2673e14cffd3bd", size = 539001158, upload-time = "2025-08-04T10:19:50.761Z" }, + { url = "https://files.pythonhosted.org/packages/5a/99/210e113dde53955e97042bd76dc4ad927eca04c5b4645ec157cc59f4f3ae/nvidia_cublas-13.0.0.19-py3-none-manylinux_2_27_x86_64.whl", hash = 
"sha256:f6723af2e8e2600a11dc384037d90d9bf93070e346c24ef2e8f9001658c99896", size = 419392356, upload-time = "2025-08-04T10:20:19.449Z" }, ] [[package]] -name = "nvidia-cuda-cupti-cu11" -version = "11.8.87" +name = "nvidia-cuda-cupti" +version = "13.0.48" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/27/c9/b4b15f709a694ea9f84871c6c4fbeeb54bab225962d852665a2c6f77f90d/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux1_x86_64.whl", hash = "sha256:0e50c707df56c75a2c0703dc6b886f3c97a22f37d6f63839f75b7418ba672a8d", size = 13093657, upload-time = "2022-10-03T21:46:12.544Z" }, - { url = "https://files.pythonhosted.org/packages/74/42/9f5c5cc084ce6f3073048c4f6806f45ba4c8c73f227c9587215d9c372e05/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux2014_x86_64.whl", hash = "sha256:4191a17913a706b5098681280cd089cd7d8d3df209a6f5cb79384974a96d24f2", size = 13093662, upload-time = "2024-08-16T23:56:38.082Z" }, + { url = "https://files.pythonhosted.org/packages/72/63/e9c12c3ae07c1f3a0821536bc188d7bf76e1b633b3bcd2bd393b00bb3426/nvidia_cuda_cupti-13.0.48-py3-none-manylinux_2_25_aarch64.whl", hash = "sha256:67c22627ef436afcf080b48e4ad17b3f83d9e7c0d990ad0c6c0627b01fb92ccc", size = 10171189, upload-time = "2025-08-04T10:16:24.39Z" }, + { url = "https://files.pythonhosted.org/packages/ba/28/e37d62ff27b4462953fdd5713d8a78760578dfa12685c30b71b55fab57b1/nvidia_cuda_cupti-13.0.48-py3-none-manylinux_2_25_x86_64.whl", hash = "sha256:417699e216b23d81bc0bbcb7032352f81b9c5372ef73c097a01abb83125a3d09", size = 10718148, upload-time = "2025-08-04T10:16:33.605Z" }, ] [[package]] -name = "nvidia-cuda-nvrtc-cu11" -version = "11.8.89" +name = "nvidia-cuda-nvrtc" +version = "13.0.48" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/83/08/a9833e4e9f9165bedb7f36033b47aa399b053b9cb2eaf7b84d1e28705cf7/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl", hash = 
"sha256:1f27d67b0f72902e9065ae568b4f6268dfe49ba3ed269c9a3da99bb86d1d2008", size = 23173264, upload-time = "2022-10-03T21:47:00.705Z" }, - { url = "https://files.pythonhosted.org/packages/60/44/202e027c224c26e15a53f01c5c7604c7f6b4fd368882d3164ea08fead207/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a8d02f3cba345be56b1ffc3e74d8f61f02bb758dd31b0f20e12277a5a244f756", size = 23173745, upload-time = "2024-08-16T23:58:16.539Z" }, + { url = "https://files.pythonhosted.org/packages/be/5b/f7636b3d66caefade6a0a0dc5b705c259a2062c20ad18b432b3129d348e0/nvidia_cuda_nvrtc-13.0.48-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:87e13d186905a35e7c04ad553a2abded0fba22f93b43d02e5da6f6cf73fb4d0a", size = 90214268, upload-time = "2025-08-04T10:18:09.305Z" }, + { url = "https://files.pythonhosted.org/packages/c0/bd/eb18593b43dae42312612ffbac24b8e68149e590102c3b6cc2e3d3792069/nvidia_cuda_nvrtc-13.0.48-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6ccf1ef1b90a0763ac7536f3c17046659d89869d76b98ac358efc2e09b348365", size = 43013627, upload-time = "2025-08-04T10:17:57.338Z" }, ] [[package]] -name = "nvidia-cuda-runtime-cu11" -version = "11.8.89" +name = "nvidia-cuda-runtime" +version = "13.0.48" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/45/3e/84db02be49fe6d6df6e42f69fd64501c22d0f9ada9c9877f885612085d20/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl", hash = "sha256:f587bd726eb2f7612cf77ce38a2c1e65cf23251ff49437f6161ce0d647f64f7c", size = 875585, upload-time = "2022-10-03T21:46:03.05Z" }, - { url = "https://files.pythonhosted.org/packages/a6/ec/a540f28b31de7bc1ed49eecc72035d4cb77db88ead1d42f7bfa5ae407ac6/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl", hash = "sha256:92d04069a987e1fbc9213f8376d265df0f7bb42617d44f5eda1f496acea7f2d1", size = 875592, upload-time = "2024-08-16T23:56:18.774Z" }, + { url = 
"https://files.pythonhosted.org/packages/55/3b/c5e5d8aafd355e2ff9922472ba71251331af6cc866e5b04a3b1dc8f58977/nvidia_cuda_runtime-13.0.48-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b807c0bb925a307bfa667a24f24d253aef8eda3ac4be66b333f2c9d357557008", size = 2260687, upload-time = "2025-08-04T10:15:41.292Z" }, + { url = "https://files.pythonhosted.org/packages/cc/78/edb119083ca2ff0f09ab0cd597e97775ac3f575b8aa0caf10d68ed49e032/nvidia_cuda_runtime-13.0.48-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b54d12087a1abff81a4cbfa6556876e3afea1fc60da2e0816da374619810c89", size = 2242632, upload-time = "2025-08-04T10:15:49.339Z" }, ] [[package]] -name = "nvidia-cudnn-cu11" -version = "9.1.0.70" +name = "nvidia-cudnn-cu13" +version = "9.13.0.50" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu11", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-cublas", marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/00/3b/0b776f04e364cd99e4cf152c2a9eadb5934c67c9a91429da55169a9447fd/nvidia_cudnn_cu11-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e6135ac63fe9d5b0b89cfb35c3fc1c1349f2b995becadf2e9dc21bca89d9633d", size = 663919573, upload-time = "2024-04-22T15:20:24.839Z" }, + { url = "https://files.pythonhosted.org/packages/8a/9c/9e99c00dc23db324244ec257d1e84d79539202ee2f185dee2c1fa97c9549/nvidia_cudnn_cu13-9.13.0.50-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:33f0aa0b64230101b348648fd0693342188071d3f8a137c0cf50051c24b3584b", size = 412337597, upload-time = "2025-09-04T20:22:31.535Z" }, + { url = "https://files.pythonhosted.org/packages/cf/68/2712854561170b2a81bea7b6b35cc1ae264d9794c0c218986e5c685d45f7/nvidia_cudnn_cu13-9.13.0.50-py3-none-manylinux_2_27_x86_64.whl", hash = 
"sha256:2150b4850725d30653ec3e365f0732e3e2e3eb8633cf3bd2d3117628dea8b4f9", size = 348571624, upload-time = "2025-09-04T20:23:26.544Z" }, ] [[package]] -name = "nvidia-cufft-cu11" -version = "10.9.0.58" +name = "nvidia-cufft" +version = "12.0.0.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/e9/4e49b1baf6899e42eeec324a49d7aa2219fec42076327c4e468000dd375a/nvidia_cufft-12.0.0.15-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1885731254835797572ff075f3daf43a2a0a2801210dea26971940dae7e1a367", size = 214053580, upload-time = "2025-08-04T10:20:45.781Z" }, + { url = "https://files.pythonhosted.org/packages/9b/9f/e298b66e584ad25bd78ad4a45b061fe7bb57a1ec011128089404ce3fcc7d/nvidia_cufft-12.0.0.15-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9f160b1f018e80bcb0d7c0fa50564b042fa26b13edc1b1ff14b6375a9edd2812", size = 214085489, upload-time = "2025-08-04T10:21:02.975Z" }, +] + +[[package]] +name = "nvidia-cufile" +version = "1.15.0.42" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/0a/4adf0c9bb1241cd1314fc923fde00f3749c7fc785b1e3b3f4a104cd3090c/nvidia_cufile-1.15.0.42-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8f9813eff24d61586699c615e39817e2b4e4f642cace32733c2ab6f663a7eab", size = 1223104, upload-time = "2025-08-04T10:21:31.131Z" }, + { url = "https://files.pythonhosted.org/packages/bf/a5/636baa43399ea10d22b63e7454f22a92ace4a7eaa3c45b94607250857e2d/nvidia_cufile-1.15.0.42-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:bced4036b5a8dbf57e4d78cd4fafefec58ad754b784a9eaa272b011896754c62", size = 1136527, upload-time = "2025-08-04T10:21:22.441Z" }, +] + +[[package]] +name = "nvidia-curand" +version = "10.4.0.35" source = { registry = "https://pypi.org/simple" } 
wheels = [ - { url = "https://files.pythonhosted.org/packages/74/79/b912a77e38e41f15a0581a59f5c3548d1ddfdda3225936fb67c342719e7a/nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl", hash = "sha256:222f9da70c80384632fd6035e4c3f16762d64ea7a843829cb278f98b3cb7dd81", size = 168405414, upload-time = "2022-10-03T23:29:47.505Z" }, - { url = "https://files.pythonhosted.org/packages/64/c8/133717b43182ba063803e983e7680a94826a9f4ff5734af0ca315803f1b3/nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e21037259995243cc370dd63c430d77ae9280bedb68d5b5a18226bfc92e5d748", size = 168405419, upload-time = "2024-08-17T00:02:03.562Z" }, + { url = "https://files.pythonhosted.org/packages/1e/72/7c2ae24fb6b63a32e6ae5d241cc65263ea18d08802aaae087d9f013335a2/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:133df5a7509c3e292aaa2b477afd0194f06ce4ea24d714d616ff36439cee349a", size = 61962106, upload-time = "2025-08-04T10:21:41.128Z" }, + { url = "https://files.pythonhosted.org/packages/a5/9f/be0a41ca4a4917abf5cb9ae0daff1a6060cc5de950aec0396de9f3b52bc5/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:1aee33a5da6e1db083fe2b90082def8915f30f3248d5896bcec36a579d941bfc", size = 59544258, upload-time = "2025-08-04T10:22:03.992Z" }, ] [[package]] -name = "nvidia-curand-cu11" -version = "10.3.0.86" +name = "nvidia-cusolver" +version = "12.0.3.29" source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-cusparse", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] wheels = [ - { url = "https://files.pythonhosted.org/packages/49/28/c47f8e2439ddbcbeae3cf74d43ed572b651d630ea72863d5357f3759eb66/nvidia_curand_cu11-10.3.0.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:ac439548c88580269a1eb6aeb602a5aed32f0dbb20809a31d9ed7d01d77f6bf5", size = 58124493, upload-time 
= "2022-10-03T23:30:05.413Z" }, - { url = "https://files.pythonhosted.org/packages/58/e5/ce5806afc48a6e4e0dddd25316ac60b6fa94fd1791bdbf4ca17bf52696ea/nvidia_curand_cu11-10.3.0.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:cd4cffbf78bb06580206b4814d5dc696d1161c902aae37b2bba00056832379e6", size = 58124497, upload-time = "2024-08-17T00:03:01.833Z" }, + { url = "https://files.pythonhosted.org/packages/a7/bb/2e60de9bb1f0c3395eabd91ccad00f4ba3ef736dc9190a158a9d268419f5/nvidia_cusolver-12.0.3.29-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:3bb6e65ce0beaeafdd069b320246e8f17c1cd30ddb27a0539143a3706733a4d8", size = 193104180, upload-time = "2025-08-04T10:22:19.821Z" }, + { url = "https://files.pythonhosted.org/packages/a5/87/e3c9ee227b750e5b61572e7509f586cc8d494a4f7874b5163e734ed852c2/nvidia_cusolver-12.0.3.29-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:6f54c2eed5edab54c224dd1852dde80ba76b2b78e6d3ce7344fef5dfc66d16ab", size = 193474165, upload-time = "2025-08-04T10:22:47.976Z" }, ] [[package]] -name = "nvidia-cusolver-cu11" -version = "11.4.1.48" +name = "nvidia-cusparse" +version = "12.6.2.49" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu11", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/55/ee/939ff0104991dd7bdabb4c9767994c612ba0e1c9a55672a1ddd42f5e5b16/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux1_x86_64.whl", hash = "sha256:ca538f545645b7e6629140786d3127fe067b3d5a085bd794cde5bfe877c8926f", size = 128240842, upload-time = "2022-10-03T23:30:24.348Z" }, - { url = "https://files.pythonhosted.org/packages/52/fe/866e87e6e6a1b0a5fcf8524a058042656702f2057e22bfdb8899a7c38e10/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:ea9fb1ad8c644ca9ed55af13cc39af3b7ba4c3eb5aef18471fe1fe77d94383cb", size = 128246438, upload-time = "2024-08-17T00:03:52.432Z" }, + { url = "https://files.pythonhosted.org/packages/fc/30/f32023427f2ef4ec27e8293dfddb5068de566912cd0a45eccfd400017a62/nvidia_cusparse-12.6.2.49-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5d3269c19283a0057fb5ebfb003ae2a10c97a28a6958f4238354826b055827c7", size = 155888587, upload-time = "2025-08-04T10:23:04.091Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e8/b3f7a87cc719dca926c7baee92f2544de8909573a4126c85a9f1625431e8/nvidia_cusparse-12.6.2.49-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efcf0b01e3a0827c144feff5391456b8a06e9ce63dcd51c0943e32e605251952", size = 140247612, upload-time = "2025-08-04T10:23:29.844Z" }, ] [[package]] -name = "nvidia-cusparse-cu11" -version = "11.7.5.86" +name = "nvidia-cusparselt-cu13" +version = "0.8.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/e0/21b829c535d569831835a4ca5d049a19ba00d3e91f3e12ab4ad27bd7385f/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:4ae709fe78d3f23f60acaba8c54b8ad556cf16ca486e0cc1aa92dca7555d2d2b", size = 204126221, upload-time = "2022-10-18T21:19:28.04Z" }, - { url = "https://files.pythonhosted.org/packages/ed/5c/b0333b07c51ced77397c2fb0d9826072cea0da9d421aa7e792aa0f8ecc72/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8d7cf1628fd8d462b5d2ba6678fae34733a48ecb80495b9c68672ec6a6dde5ef", size = 204126227, upload-time = "2024-08-17T00:05:20.798Z" }, + { url = "https://files.pythonhosted.org/packages/46/10/8dcd1175260706a2fc92a16a52e306b71d4c1ea0b0cc4a9484183399818a/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:400c6ed1cf6780fc6efedd64ec9f1345871767e6a1a0a552a1ea0578117ea77c", size = 220791277, upload-time = "2025-08-13T19:22:40.982Z" }, + { 
url = "https://files.pythonhosted.org/packages/fd/53/43b0d71f4e702fa9733f8b4571fdca50a8813f1e450b656c239beff12315/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:25e30a8a7323935d4ad0340b95a0b69926eee755767e8e0b1cf8dd85b197d3fd", size = 169884119, upload-time = "2025-08-13T19:23:41.967Z" }, ] [[package]] -name = "nvidia-nccl-cu11" -version = "2.21.5" +name = "nvidia-nccl-cu13" +version = "2.27.7" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/9a/8b6a28b3b87d5fddab0e92cd835339eb8fbddaa71ae67518c8c1b3d05bae/nvidia_nccl_cu11-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:49d8350629c7888701d1fd200934942671cb5c728f49acc5a0b3a768820bed29", size = 147811630, upload-time = "2024-04-03T15:33:12.879Z" }, + { url = "https://files.pythonhosted.org/packages/49/61/2c7762da6febee96341ea17d1f7309ac7559ac3cab00f3f7e1e7bd0e5d00/nvidia_nccl_cu13-2.27.7-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5e3cc863e52bf9dd1e3ab1941bddb414098f489ae7342f6b3a274602303da123", size = 194014855, upload-time = "2025-09-23T16:30:27.56Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3a/dabb10684e60edfaf1a1c9984d12a668bc1091582099d4e03ac5b9983b51/nvidia_nccl_cu13-2.27.7-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b28a524abd8389b76a4a3f133c76a7aaa7005e47fcaa9d9603b90103927a3f93", size = 193901479, upload-time = "2025-09-23T16:30:41.165Z" }, ] [[package]] -name = "nvidia-nvtx-cu11" -version = "11.8.86" +name = "nvidia-nvjitlink" +version = "13.0.39" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d5/a2/23214c23118784dc2189ac2d2e48190df3e4206e2f73eb17d47140797a2b/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:890656d8bd9b4e280231c832e1f0d03459200ba4824ddda3dcb59b1e1989b9f5", size = 99125, upload-time = "2022-10-03T21:47:19.565Z" }, - { url = 
"https://files.pythonhosted.org/packages/b5/ad/973a187b137a3d45dc3faac421ef1275fb41fc169fd3889e2d5ceb0daa54/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:979f5b2aef5da164c5c53c64c85c3dfa61b8b4704f4f963bb568bf98fa8472e8", size = 99130, upload-time = "2024-08-16T23:58:33.479Z" }, + { url = "https://files.pythonhosted.org/packages/95/39/726edebeb76f3efc25c79f885429fa1227c9d200e20ea219bf724b382e19/nvidia_nvjitlink-13.0.39-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:bc3179be558329ef9687884c6faa27cdc0659bdbc642432ec8cc6cc00d182627", size = 40709605, upload-time = "2025-08-04T10:25:04.129Z" }, + { url = "https://files.pythonhosted.org/packages/bc/7a/0fb4c4413b3b14519f8934edd4dcd9f411c4e14e2a2c0ae58709e4dda255/nvidia_nvjitlink-13.0.39-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ce0d63fa5ebedf542056e7491c49feed2297c900980aa6269b6a55f478056ad7", size = 38767126, upload-time = "2025-08-04T10:24:53.05Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu13" +version = "3.3.24" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/7e/b8797780e442eabd9046cd6eb54100b8d0cb047ebc2f70931710cb03bcfe/nvidia_nvshmem_cu13-3.3.24-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:28ae82a4d14b322b93409535de62df6b7b83f4f7672ca97fc89107c2d40ce2c2", size = 60168129, upload-time = "2025-08-22T19:56:28.818Z" }, + { url = "https://files.pythonhosted.org/packages/6f/e9/8530afb8ed38d16bbc89cec80a4dd6a52dbf59bc93e546c3658cfa8b1f9b/nvidia_nvshmem_cu13-3.3.24-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c14d09571697d2e57cb079c8daec88ab1c68cb3586532bfbd4886125a08339b7", size = 60390470, upload-time = "2025-08-22T19:56:49.848Z" }, +] + +[[package]] +name = "nvidia-nvtx" +version = "13.0.39" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/38/37/0d103c84e7884382a79a569b720965141f83dd1c5df9e3e00cbc02d7099c/nvidia_nvtx-13.0.39-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cc113127785c96db8a0fe715df92db9788777b4b3d1bd713d42f75969201b5ce", size = 147197, upload-time = "2025-08-04T10:18:39.829Z" }, + { url = "https://files.pythonhosted.org/packages/86/91/8b486ba85f71a2859dd705a4ec6aab38c37a389b8b7f94343db027732999/nvidia_nvtx-13.0.39-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cddd2e08b35144f1000631c3880c9ebbcb8a2863d762e76f92d47d30ecaf87cc", size = 148037, upload-time = "2025-08-04T10:18:31.763Z" }, ] [[package]] @@ -1324,14 +1372,14 @@ wheels = [ [[package]] name = "sympy" -version = "1.13.1" +version = "1.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "mpmath" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ca/99/5a5b6f19ff9f083671ddf7b9632028436167cd3d33e11015754e41b249a4/sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f", size = 7533040, upload-time = "2024-07-19T09:26:51.238Z" } +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177, upload-time = "2024-07-19T09:26:48.863Z" }, + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, 
upload-time = "2025-04-27T18:04:59.103Z" }, ] [[package]] @@ -1398,64 +1446,116 @@ wheels = [ [[package]] name = "torch" -version = "2.6.0+cu118" -source = { registry = "https://download.pytorch.org/whl/cu118" } +version = "2.9.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, { name = "jinja2" }, { name = "networkx" }, - { name = "nvidia-cublas-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufft", marker = "sys_platform == 'linux'" }, + { name = 
"nvidia-cufile", marker = "sys_platform == 'linux'" }, + { name = "nvidia-curand", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusolver", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvtx", marker = "sys_platform == 'linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "sys_platform == 'linux'" }, { name = "typing-extensions" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:3e73419aab6dbcd888a3cc6a00d1f52f5950d918d7289ea6aeae751346613edc" }, - { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:6ab0417ce9b78ab0a34721a99734b5fd4cc3d7b62ff1c068a7d636fd829772db" }, - { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:9f7d170d6c78726945d95fcc3a3d7601f36aed0e6e0dc9ca377a64d6a8fd7b3a" }, - { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:6c040e4181c5dae73b965b61394ec431c93b2018165e2be8f15fc68d44444cb3" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fd6c7d297e21758a7fa07624f2b5bb15607ee3b1dcc52519e8e796c6d4fcf960" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = 
"sha256:f40778951ca1533dc634b3842392641fa0b641181ff2f71d62728ef33cc36a5c" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:8db2814e63f2b365bda88526587ca75a6083a0b957a24b2b0d45ddc5ee350176" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e7f84cb10c7e7d9f862c318f056d64840544ab4f0bcbf8cf7ed6047fe04051f" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e70e1b18881e6b3c1ce402d0a989da39f956a3a057526e03c354df23d704ce9b" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:cd3232a562ad2a2699d48130255e1b24c07dfe694a40dcd24fad683c752de121" }, ] [[package]] name = "torchaudio" -version = "2.6.0+cu118" -source = { registry = "https://download.pytorch.org/whl/cu118" } +version = "2.9.1" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] dependencies = [ - { name = "torch" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:493421d061375074ce84840ca619605f625892e16dead63ec97181ef02da3357" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b3c75f87e325946276c952864dbce2c8fabc88a00d86730c3d5bc0999ebf7789" }, +] + +[[package]] +name = "torchaudio" +version = "2.9.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform 
== 'darwin'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "torch", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:089b54fb6d4f8348a07d4c460cf2da4da2de57f068154c1401b385626917d434" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:065ea2e015ef6d02ec289e0a5ecc4c8e7acd4b30a8612879637395e7e16217e4" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:e77fe770130b54fdbcecda829024fbd4235075e905f5c6019c19664577c70e1d" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:885bdd94f19f0dbad81e08c54f85ffbf10f00af8452c25d2b3b533cf2884d6b8" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1023bb6598fa6312e1990fdc78660f4b4ef128d8942a1f10c5827aea23d6bd7e" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:817e2660d35a3c9a2638dd80d63c7a488cbbe87446ddbb564a5cf88b9de632f7" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a6c58d5e846da5a90d50bd425e2c24368747cd04297d95c6dd51d3f7f85fea26" }, + { url = 
"https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:7533a17bed21e5b86b8c49fd79656779779f2c991aef2804af6f318d2022ea6a" }, ] [[package]] name = "torchvision" -version = "0.21.0+cu118" -source = { registry = "https://download.pytorch.org/whl/cu118" } +version = "0.24.1" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] dependencies = [ - { name = "numpy" }, - { name = "pillow" }, - { name = "torch" }, + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d4ba2532440a93c23a99c41423a765a0cdd47556afa3acf7c318dd1d3d6793e9" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:89743dcee13e943f58b37c7647aff14b5bb24c11c84826376d457acf97586fec" }, +] + +[[package]] +name = "torchvision" +version = "0.24.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and 
sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "numpy", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "pillow", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "torch", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:5ebe0267c872ac55b387008f772052bbf1f2fdfdd8afb011d4751e124759295e" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:4e1325aa1189f97c89ae008cf645b7de8f283853193bf68ea7750856c194b6cc" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:5d3679e0df9ab1725eaa7300d550cf8fe0a477119483bef12673957f30c768dc" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:301eefd1d4df6619fab94cae539cb0cdcb029cc992e4686ef97c8366f77cf6a4" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:b0cc84c57c1fd54644698a70a74d1ea1eddfa44ee2df3354b7bb2c619a5d2923" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:f564b9fdbc336ac187780931331fb4253f8511deae914dde12dca5bf17b3045f" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6939dd403cc28ab0a46f53e6c86e2e852cf65771c1b0ddd09c44c541a1cdbad9" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:d31ceaded0d9b737471fa680ccd9e1acb6d5f0f70f03ef3a8d786a99c79da7cf" }, ] [[package]] @@ -1493,11 +1593,13 @@ wheels = [ [[package]] name = 
"triton" -version = "3.2.0" +version = "3.5.1" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/2e/757d2280d4fefe7d33af7615124e7e298ae7b8e3bc4446cdb8e88b0f9bab/triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220", size = 253157636, upload-time = "2025-01-22T19:12:51.322Z" }, - { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365, upload-time = "2025-01-22T19:13:24.648Z" }, + { url = "https://files.pythonhosted.org/packages/dc/dc/6ce44d055f2fc2403c4ec6b3cfd3a9b25f57b7d95efadccdea91497f8e81/triton-3.5.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da47169e30a779bade679ce78df4810fca6d78a955843d2ddb11f226adc517dc", size = 159928005, upload-time = "2025-11-11T17:51:50.008Z" }, + { url = "https://files.pythonhosted.org/packages/b0/72/ec90c3519eaf168f22cb1757ad412f3a2add4782ad3a92861c9ad135d886/triton-3.5.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61413522a48add32302353fdbaaf92daaaab06f6b5e3229940d21b5207f47579", size = 170425802, upload-time = "2025-11-11T17:40:53.209Z" }, + { url = "https://files.pythonhosted.org/packages/db/53/2bcc46879910991f09c063eea07627baef2bc62fe725302ba8f46a2c1ae5/triton-3.5.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:275a045b6ed670dd1bd005c3e6c2d61846c74c66f4512d6f33cc027b11de8fd4", size = 159940689, upload-time = "2025-11-11T17:51:55.938Z" }, + { url = "https://files.pythonhosted.org/packages/f2/50/9a8358d3ef58162c0a415d173cfb45b67de60176e1024f71fbc4d24c0b6d/triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:d2c6b915a03888ab931a9fd3e55ba36785e1fe70cbea0b40c6ef93b20fc85232", size = 170470207, upload-time = "2025-11-11T17:41:00.253Z" }, ] [[package]] From 502f8bc065ce660e4b222d606b95cc8642d63f3d Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 12:24:46 +0300 Subject: [PATCH 21/26] Add vLLM DeepSeek OCR runtime --- src/glossapi/corpus/phase_ocr_math.py | 11 + src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py | 239 ++++++++++++++++++ src/glossapi/ocr/deepseek/runner.py | 54 +++- tests/test_deepseek_runner_contract.py | 66 +++++ 4 files changed, 365 insertions(+), 5 deletions(-) create mode 100644 src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 1e75a1b..1897aa2 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -42,6 +42,7 @@ def ocr( dpi: Optional[int] = None, # reserved for future use precision: Optional[str] = None, # reserved for future use ("fp16","bf16") workers_per_gpu: int = 1, + runtime_backend: str = "transformers", ocr_profile: str = "markdown_grounded", attn_backend: str = "auto", base_size: Optional[int] = None, @@ -51,6 +52,9 @@ def ocr( max_new_tokens: Optional[int] = None, repetition_penalty: Optional[float] = None, no_repeat_ngram_size: Optional[int] = None, + vllm_batch_size: Optional[int] = None, + gpu_memory_utilization: Optional[float] = None, + disable_fp8_kv: bool = False, # Integrated math enrichment controls math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, @@ -88,6 +92,7 @@ def ocr( ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers per visible GPU. + - runtime_backend: ``transformers`` (default) or ``vllm``. - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: DeepSeek rendering and attention controls used for throughput/quality benchmarking. 
@@ -95,6 +100,8 @@ def ocr( Optional generation controls forwarded to DeepSeek. These are exposed for runtime experiments; leave them unset unless a benchmark calls for them explicitly. + - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv: + Optional vLLM controls. These are ignored by the transformers runtime. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False @@ -609,6 +616,7 @@ def _run_math(stems: List[str]) -> None: use_gpus=use_gpus, devices=devices, workers_per_gpu=int(max(1, workers_per_gpu)), + runtime_backend=runtime_backend, ocr_profile=ocr_profile, attn_backend=attn_backend, base_size=base_size, @@ -618,6 +626,9 @@ def _run_math(stems: List[str]) -> None: max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, content_debug=bool(content_debug), ) except Exception as _e: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py new file mode 100644 index 0000000..2c547b3 --- /dev/null +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -0,0 +1,239 @@ +"""CLI wrapper for DeepSeek-OCR-2 inference over PDF files using vLLM.""" + +from __future__ import annotations + +import argparse +import logging +import tempfile +import time +from pathlib import Path +from typing import Dict, List + +from PIL import Image + +from glossapi.ocr.deepseek.run_pdf_ocr_transformers import ( + PAGE_SPLIT, + _iter_pdfs, + _postprocess_page_text, + _profile_defaults, + _render_pages, + _write_outputs, + _write_progress, +) + +LOGGER = logging.getLogger(__name__) + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) 
+ parser.add_argument("--input-dir", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--model-dir", required=True) + parser.add_argument("--files", nargs="*", default=[]) + parser.add_argument("--max-pages", type=int, default=None) + parser.add_argument("--device", default="cuda") + parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--attn-backend", default="vllm") + parser.add_argument("--base-size", type=int, default=None) + parser.add_argument("--image-size", type=int, default=None) + parser.add_argument("--render-dpi", type=int, default=144) + parser.add_argument("--max-new-tokens", type=int, default=None) + parser.add_argument("--repetition-penalty", type=float, default=None) + parser.add_argument("--no-repeat-ngram-size", type=int, default=None) + parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") + parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") + parser.set_defaults(crop_mode=None) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument("--gpu-memory-utilization", type=float, default=0.9) + parser.add_argument("--disable-fp8-kv", action="store_true") + parser.add_argument("--content-debug", action="store_true") + return parser.parse_args() + + +def _load_vllm(model_dir: Path, gpu_memory_utilization: float, disable_fp8_kv: bool): + from vllm import LLM + + logits_processors = None + try: + from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor + + logits_processors = [NGramPerReqLogitsProcessor] + except Exception as exc: # pragma: no cover - environment dependent + LOGGER.warning("DeepSeek OCR logits processor unavailable in vLLM; continuing without it: %s", exc) + + engine_kwargs = { + "model": str(model_dir), + "tokenizer": str(model_dir), + "trust_remote_code": True, + "dtype": "bfloat16", + "enable_prefix_caching": False, + 
"mm_processor_cache_gb": 0, + "gpu_memory_utilization": float(gpu_memory_utilization), + "tensor_parallel_size": 1, + } + if disable_fp8_kv: + engine_kwargs["kv_cache_dtype"] = "auto" + if logits_processors: + engine_kwargs["logits_processors"] = logits_processors + return LLM(**engine_kwargs) + + +def _sampling_params(max_new_tokens: int | None): + from vllm import SamplingParams + + return SamplingParams( + temperature=0.0, + max_tokens=int(max_new_tokens or 8192), + skip_special_tokens=False, + extra_args={ + "ngram_size": 30, + "window_size": 90, + "whitelist_token_ids": {128821, 128822}, + }, + ) + + +def _batched(items: List[dict], batch_size: int) -> List[List[dict]]: + size = max(1, int(batch_size)) + return [items[idx : idx + size] for idx in range(0, len(items), size)] + + +def main() -> int: + args = _parse_args() + input_dir = Path(args.input_dir).resolve() + output_dir = Path(args.output_dir).resolve() + model_dir = Path(args.model_dir).resolve() + pdfs = _iter_pdfs(input_dir, args.files) + if not pdfs: + return 0 + + profile_defaults = _profile_defaults(args.ocr_profile) + prompt = profile_defaults["prompt"] + base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) + image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) + crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + llm = _load_vllm( + model_dir, + gpu_memory_utilization=float(args.gpu_memory_utilization), + disable_fp8_kv=bool(args.disable_fp8_kv), + ) + sampling_params = _sampling_params(args.max_new_tokens) + + with tempfile.TemporaryDirectory(prefix="deepseek_vllm_") as tmp_dir_str: + tmp_dir = Path(tmp_dir_str) + doc_states: Dict[str, dict] = {} + jobs: List[dict] = [] + + for pdf_path in pdfs: + doc_start = time.perf_counter() + render_start = time.perf_counter() + images = _render_pages(pdf_path, args.max_pages, args.render_dpi) + 
render_sec = time.perf_counter() - render_start + total_pages = len(images) + state = { + "stem": pdf_path.stem, + "page_outputs": [""] * total_pages, + "page_metrics": [], + "render_sec": float(render_sec), + "doc_start": float(doc_start), + "completed_pages": 0, + "total_pages": total_pages, + } + doc_states[pdf_path.stem] = state + _write_progress(output_dir, pdf_path.stem, [], total_pages, 0) + for idx, image in enumerate(images): + page_path = tmp_dir / f"{pdf_path.stem}_page_{idx + 1:04d}.png" + image.save(page_path, format="PNG") + image.close() + jobs.append( + { + "stem": pdf_path.stem, + "page_number": int(idx + 1), + "image_path": page_path, + } + ) + + for batch in _batched(jobs, args.batch_size): + prompt_batch = [] + images: List[Image.Image] = [] + for item in batch: + image = Image.open(item["image_path"]).convert("RGB") + images.append(image) + prompt_batch.append( + { + "prompt": prompt, + "multi_modal_data": {"image": image}, + } + ) + infer_start = time.perf_counter() + outputs = llm.generate(prompt_batch, sampling_params=sampling_params) + infer_sec = time.perf_counter() - infer_start + per_item_sec = infer_sec / max(1, len(batch)) + for image in images: + image.close() + + for item, output in zip(batch, outputs): + state = doc_states[item["stem"]] + raw_text = "" + if getattr(output, "outputs", None): + raw_text = str(output.outputs[0].text) + page_text, postprocess_metrics = _postprocess_page_text( + raw_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + page_text = f"\n{page_text}".strip() + state["page_outputs"][item["page_number"] - 1] = page_text + state["page_metrics"].append( + { + "page_number": int(item["page_number"]), + "infer_sec": float(per_item_sec), + "raw_chars": int(len(raw_text.strip())), + "final_chars": int(len(page_text.strip())), + **postprocess_metrics, + } + ) + state["completed_pages"] = int(state["completed_pages"]) + 1 + progress_pages = [page for page in 
state["page_outputs"] if page] + _write_progress( + output_dir, + item["stem"], + progress_pages, + int(state["total_pages"]), + int(state["completed_pages"]), + ) + + for stem, state in doc_states.items(): + markdown = PAGE_SPLIT.join(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]" + page_metrics = sorted(state["page_metrics"], key=lambda item: int(item["page_number"])) + _write_outputs( + output_dir, + stem, + markdown, + int(state["total_pages"]), + extra_metrics={ + "ocr_profile": args.ocr_profile, + "attn_backend": "vllm", + "runtime_backend": "vllm", + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "batch_size": int(args.batch_size), + "gpu_memory_utilization": float(args.gpu_memory_utilization), + "disable_fp8_kv": bool(args.disable_fp8_kv), + "render_sec": float(state["render_sec"]), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - float(state["doc_start"])), + "page_metrics": page_metrics, + }, + ) + + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 61ba307..fe60390 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -20,6 +20,7 @@ LOGGER = logging.getLogger(__name__) REPO_ROOT = Path(__file__).resolve().parents[4] DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" +DEFAULT_VLLM_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_vllm.py" def _page_count(pdf_path: Path) -> int: @@ -51,6 +52,10 @@ def _build_cli_command( max_new_tokens: Optional[int], repetition_penalty: Optional[float], no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: 
Optional[float], + disable_fp8_kv: bool, ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -91,6 +96,14 @@ def _build_cli_command( cmd += ["--repetition-penalty", str(float(repetition_penalty))] if no_repeat_ngram_size is not None: cmd += ["--no-repeat-ngram-size", str(int(no_repeat_ngram_size))] + runtime_backend_norm = str(runtime_backend or "transformers").strip().lower() + if runtime_backend_norm == "vllm": + if vllm_batch_size is not None: + cmd += ["--batch-size", str(int(vllm_batch_size))] + if gpu_memory_utilization is not None: + cmd += ["--gpu-memory-utilization", str(float(gpu_memory_utilization))] + if disable_fp8_kv: + cmd.append("--disable-fp8-kv") return cmd @@ -134,6 +147,10 @@ def _run_cli( max_new_tokens: Optional[int], repetition_penalty: Optional[float], no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, visible_device: Optional[int] = None, ) -> None: cmd = _build_cli_command( @@ -155,6 +172,10 @@ def _run_cli( max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -305,6 +326,10 @@ def _run_multi_cli( max_new_tokens: Optional[int], repetition_penalty: Optional[float], no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -346,6 +371,10 @@ def _run_multi_cli( max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + 
vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -385,6 +414,7 @@ def run_for_files( persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved device: Optional[str] = None, + runtime_backend: str = "transformers", ocr_profile: str = "markdown_grounded", attn_backend: str = "auto", base_size: Optional[int] = None, @@ -397,22 +427,27 @@ def run_for_files( use_gpus: Optional[str] = None, devices: Optional[List[int]] = None, workers_per_gpu: int = 1, - gpu_memory_utilization: Optional[float] = None, # reserved - disable_fp8_kv: bool = False, # reserved + gpu_memory_utilization: Optional[float] = None, + disable_fp8_kv: bool = False, + vllm_batch_size: Optional[int] = None, **_: Any, ) -> Dict[str, Any]: """Run DeepSeek OCR for the provided files.""" requested_stub = bool(allow_stub) del allow_stub, allow_cli, persist_engine, precision - del gpu_memory_utilization, disable_fp8_kv - if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": raise RuntimeError( "DeepSeek stub execution has been removed. " "Unset GLOSSAPI_DEEPSEEK_ALLOW_STUB and configure the real DeepSeek runtime." ) + runtime_backend_norm = str( + runtime_backend or os.environ.get("GLOSSAPI_DEEPSEEK_RUNTIME_BACKEND", "transformers") + ).strip().lower() + if runtime_backend_norm not in {"transformers", "vllm"}: + raise ValueError("runtime_backend must be 'transformers' or 'vllm'") + file_list = [str(f) for f in files or []] if not file_list: return {} @@ -435,10 +470,11 @@ def run_for_files( "DeepSeek model directory not found. Set model_dir or GLOSSAPI_DEEPSEEK_MODEL_DIR." 
) + default_script = DEFAULT_VLLM_SCRIPT if runtime_backend_norm == "vllm" else DEFAULT_SCRIPT script_path = Path( vllm_script or os.environ.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", "") - or DEFAULT_SCRIPT + or default_script ) if not script_path.exists(): raise FileNotFoundError(f"DeepSeek OCR runner script not found: {script_path}") @@ -481,6 +517,10 @@ def run_for_files( max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend_norm, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, ) else: _run_cli( @@ -502,6 +542,10 @@ def run_for_files( max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend_norm, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 81ec66f..4629d2f 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -134,6 +134,10 @@ def test_build_cli_command_includes_speed_flags(tmp_path): max_new_tokens=1024, repetition_penalty=1.05, no_repeat_ngram_size=12, + runtime_backend="transformers", + vllm_batch_size=None, + gpu_memory_utilization=None, + disable_fp8_kv=False, ) assert "--ocr-profile" in cmd and "plain_ocr" in cmd @@ -143,3 +147,65 @@ def test_build_cli_command_includes_speed_flags(tmp_path): assert "--crop-mode" in cmd assert "--render-dpi" in cmd and "144" in cmd assert "--max-new-tokens" in cmd and "1024" in cmd + + +def test_build_cli_command_includes_vllm_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=["a.pdf"], + model_dir=tmp_path / 
"model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run_vllm.py", + max_pages=1, + content_debug=False, + device="cuda", + ocr_profile="markdown_grounded", + attn_backend="auto", + base_size=None, + image_size=None, + crop_mode=None, + render_dpi=110, + max_new_tokens=768, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend="vllm", + vllm_batch_size=16, + gpu_memory_utilization=0.92, + disable_fp8_kv=True, + ) + + assert "--batch-size" in cmd and "16" in cmd + assert "--gpu-memory-utilization" in cmd and "0.92" in cmd + assert "--disable-fp8-kv" in cmd + + +def test_runner_selects_vllm_script_when_requested(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_cli(input_dir, output_dir, **kwargs): + calls["script"] = kwargs["script"] + calls["runtime_backend"] = kwargs["runtime_backend"] + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files(corpus, ["doc.pdf"], runtime_backend="vllm") + + assert calls["runtime_backend"] == "vllm" + assert Path(calls["script"]).name == "run_pdf_ocr_vllm.py" + assert result["doc"]["page_count"] == 1 From cbeb638913f43df13f4704e3646b6b6b75961101 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 18:11:17 +0300 Subject: [PATCH 22/26] Add DeepSeek markdown repair pipeline --- docs/ocr_and_math_enhancement.md | 50 +++ src/glossapi/corpus/phase_ocr_math.py | 13 
+- .../ocr/deepseek/run_pdf_ocr_transformers.py | 3 +- src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py | 404 ++++++++++++++++-- src/glossapi/ocr/deepseek/runner.py | 20 + tests/test_deepseek_runner_contract.py | 21 + 6 files changed, 460 insertions(+), 51 deletions(-) diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index 1c2b630..ac2a5b7 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -82,6 +82,40 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma If you need Phase‑2 math on files that do not require OCR, run `math_only` after Docling extraction with JSON enabled. +### DeepSeek fast path + +The current recommended high-throughput DeepSeek configuration is: + +- `runtime_backend='vllm'` +- `ocr_profile='markdown_grounded'` +- `repair_mode='auto'` to keep markdown as the primary output while selectively rerunning suspicious pages +- large `vllm_batch_size` chosen to keep `sec/page/GPU` at or below the best validated floor for the target hardware + +Example: + +```python +c.ocr( + backend='deepseek', + fix_bad=True, + math_enhance=False, + runtime_backend='vllm', + ocr_profile='markdown_grounded', + vllm_batch_size=160, + gpu_memory_utilization=0.9, + repair_mode='auto', + use_gpus='multi', +) +``` + +`repair_mode='auto'` runs the pipeline in distinct phases inside the vLLM runner: + +1. markdown first pass over all rendered pages +2. cheap per-page triage using output quality plus simple image density statistics +3. plain-text rerun bucket for garbage markdown pages +4. tiled markdown rerun bucket for short coverage failures + +This keeps the fast path batched while avoiding per-page sequential fallback overhead. + ## Multi‑GPU Phase‑1 (extract): @@ -105,9 +139,25 @@ Spawns math workers; each binds to its GPU using `CUDA_VISIBLE_DEVICES` and runs ## Performance & Tuning +### Validated benchmark floor + +The current non-regression metric is `sec/page/GPU`. 
+ +Validated on 2026-03-30: + +- Host: AWS `g7e.48xlarge` +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Render DPI: `144` +- GPU memory utilization: `0.9` +- Best large-batch single-GPU floor observed: `0.3109 sec/page/GPU` + +That number is the floor to preserve or beat when tuning the full markdown pipeline. Faster raw runs that change the effective output mode or bypass repair logic do not replace it as the production baseline. + - Batch sizes - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula throughput. - Phase‑2: `batch_size` / `math_batch_size` parameter (typ. 8–16) balances VRAM and speed. + - DeepSeek vLLM: push `vllm_batch_size` as high as the hardware allows while tracking `sec/page/GPU`; on the validated `g7e.48xlarge` path, larger batches continued improving throughput through `batch_size=160`. - Images scale for OCR: `GLOSSAPI_IMAGES_SCALE` (~1.1–1.25) can improve detection on thin glyphs. - CPU threads: cap `OMP_NUM_THREADS` / `MKL_NUM_THREADS` to avoid CPU oversubscription on multi‑GPU nodes. diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 1897aa2..cd261ed 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -44,6 +44,7 @@ def ocr( workers_per_gpu: int = 1, runtime_backend: str = "transformers", ocr_profile: str = "markdown_grounded", + prompt_override: Optional[str] = None, attn_backend: str = "auto", base_size: Optional[int] = None, image_size: Optional[int] = None, @@ -55,6 +56,7 @@ def ocr( vllm_batch_size: Optional[int] = None, gpu_memory_utilization: Optional[float] = None, disable_fp8_kv: bool = False, + repair_mode: str = "auto", # Integrated math enrichment controls math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, @@ -93,15 +95,18 @@ def ocr( Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers per visible GPU. 
- runtime_backend: ``transformers`` (default) or ``vllm``. - - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: + - ocr_profile/prompt_override/attn_backend/base_size/image_size/crop_mode/render_dpi: DeepSeek rendering and attention controls used for throughput/quality benchmarking. - max_new_tokens/repetition_penalty/no_repeat_ngram_size: Optional generation controls forwarded to DeepSeek. These are exposed for runtime experiments; leave them unset unless a benchmark calls for them explicitly. - - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv: - Optional vLLM controls. These are ignored by the transformers runtime. + - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv/repair_mode: + Optional vLLM controls. ``repair_mode='auto'`` enables the markdown-first + repair pipeline (plain fallback for garbage pages, tiled fallback for + short coverage failures). These are ignored by the transformers runtime + except for ``prompt_override``. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. 
Defaults to False @@ -618,6 +623,7 @@ def _run_math(stems: List[str]) -> None: workers_per_gpu=int(max(1, workers_per_gpu)), runtime_backend=runtime_backend, ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, base_size=base_size, image_size=image_size, @@ -629,6 +635,7 @@ def _run_math(stems: List[str]) -> None: vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, content_debug=bool(content_debug), ) except Exception as _e: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index e46fadf..071b3b5 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -60,6 +60,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--prompt-override", default=None) parser.add_argument("--attn-backend", default="auto", choices=["auto", "flash_attention_2", "sdpa", "eager"]) parser.add_argument("--base-size", type=int, default=None) parser.add_argument("--image-size", type=int, default=None) @@ -329,7 +330,7 @@ def main() -> int: return 0 profile_defaults = _profile_defaults(args.ocr_profile) - prompt = profile_defaults["prompt"] + prompt = str(args.prompt_override) if args.prompt_override else profile_defaults["prompt"] base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py 
b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py index 2c547b3..56870e5 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -7,7 +7,7 @@ import tempfile import time from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Tuple from PIL import Image @@ -22,6 +22,18 @@ ) LOGGER = logging.getLogger(__name__) +REPAIR_TILE_SPECS: Tuple[Tuple[str, float, float], ...] = ( + ("top", 0.0, 0.5), + ("mid", 0.35, 0.8), + ("bottom", 0.65, 1.0), +) +REPAIR_DARK_THRESHOLD = 235 +REPAIR_SHORT_CHARS = 700 +REPAIR_EXTREME_SHORT_CHARS = 120 +REPAIR_PUA_THRESHOLD = 64 +REPAIR_MIN_HALF_DARK = 0.08 +REPAIR_MAX_OVERALL_DARK = 0.25 +REPAIR_MIN_OVERALL_DARK = 0.04 def _parse_args() -> argparse.Namespace: @@ -33,6 +45,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--prompt-override", default=None) parser.add_argument("--attn-backend", default="vllm") parser.add_argument("--base-size", type=int, default=None) parser.add_argument("--image-size", type=int, default=None) @@ -46,6 +59,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--batch-size", type=int, default=8) parser.add_argument("--gpu-memory-utilization", type=float, default=0.9) parser.add_argument("--disable-fp8-kv", action="store_true") + parser.add_argument("--repair-mode", default="auto", choices=["auto", "off"]) parser.add_argument("--content-debug", action="store_true") return parser.parse_args() @@ -98,6 +112,161 @@ def _batched(items: List[dict], batch_size: int) -> List[List[dict]]: return [items[idx : idx + size] for idx in range(0, len(items), size)] +def _image_content_stats(image: Image.Image) -> dict: + sample = image.convert("L") + sample.thumbnail((256, 256)) + width, height 
= sample.size + pixels = list(sample.getdata()) + + def _dark_ratio(y0: int, y1: int) -> float: + values = [] + for row in range(y0, y1): + start = row * width + values.extend(pixels[start : start + width]) + total = len(values) + if total <= 0: + return 0.0 + dark = sum(1 for value in values if value < REPAIR_DARK_THRESHOLD) + return float(dark) / float(total) + + half = max(1, height // 2) + dark_total = sum(1 for value in pixels if value < REPAIR_DARK_THRESHOLD) + return { + "top_dark_ratio": _dark_ratio(0, half), + "bottom_dark_ratio": _dark_ratio(half, height), + "overall_dark_ratio": float(dark_total) / float(max(1, len(pixels))), + } + + +def _count_private_use_chars(text: str) -> int: + return sum( + 1 + for ch in str(text or "") + if 0xE000 <= ord(ch) <= 0xF8FF + or 0xF0000 <= ord(ch) <= 0xFFFFD + or 0x100000 <= ord(ch) <= 0x10FFFD + ) + + +def _text_quality_metrics(text: str) -> dict: + stripped = str(text or "").strip() + letters = sum(1 for ch in stripped if ch.isalpha()) + digits = sum(1 for ch in stripped if ch.isdigit()) + pua_chars = _count_private_use_chars(stripped) + score = float(letters) + (0.10 * float(len(stripped))) + (0.05 * float(digits)) - (20.0 * float(pua_chars)) + return { + "chars": int(len(stripped)), + "letters": int(letters), + "digits": int(digits), + "pua_chars": int(pua_chars), + "quality_score": float(score), + } + + +def _classify_repair(text: str, image_stats: dict, repair_mode: str) -> tuple[str, str | None]: + if str(repair_mode or "off").strip().lower() != "auto": + return "none", None + quality = _text_quality_metrics(text) + chars = int(quality["chars"]) + pua_chars = int(quality["pua_chars"]) + pua_ratio = float(pua_chars) / float(max(1, chars)) + if pua_chars >= REPAIR_PUA_THRESHOLD or pua_ratio >= 0.10: + return "plain", "markdown_garbage" + if chars <= REPAIR_EXTREME_SHORT_CHARS: + return "plain", "extreme_short" + top_dark = float(image_stats.get("top_dark_ratio", 0.0)) + bottom_dark = 
float(image_stats.get("bottom_dark_ratio", 0.0)) + overall_dark = float(image_stats.get("overall_dark_ratio", 0.0)) + if ( + chars <= REPAIR_SHORT_CHARS + and top_dark >= REPAIR_MIN_HALF_DARK + and bottom_dark >= REPAIR_MIN_HALF_DARK + and REPAIR_MIN_OVERALL_DARK <= overall_dark <= REPAIR_MAX_OVERALL_DARK + ): + return "tile", "short_coverage" + return "none", None + + +def _load_job_image(item: dict) -> Image.Image: + image = Image.open(item["image_path"]).convert("RGB") + crop_box = item.get("crop_box") + if not crop_box: + return image + width, height = image.size + x0_norm, y0_norm, x1_norm, y1_norm = crop_box + crop_pixels = ( + int(round(float(x0_norm) * width)), + int(round(float(y0_norm) * height)), + int(round(float(x1_norm) * width)), + int(round(float(y1_norm) * height)), + ) + cropped = image.crop(crop_pixels) + image.close() + return cropped + + +def _generate_batch_outputs( + llm, + *, + jobs: List[dict], + prompt: str, + batch_size: int, + sampling_params, +) -> List[dict]: + outputs_by_key: Dict[tuple[str, int, str], dict] = {} + for batch in _batched(jobs, batch_size): + prompt_batch = [] + opened_images: List[Image.Image] = [] + keys: List[tuple[str, int, str]] = [] + for item in batch: + image = _load_job_image(item) + opened_images.append(image) + keys.append((str(item["stem"]), int(item["page_number"]), str(item.get("variant", "page")))) + prompt_batch.append( + { + "prompt": prompt, + "multi_modal_data": {"image": image}, + } + ) + infer_start = time.perf_counter() + batch_outputs = llm.generate(prompt_batch, sampling_params=sampling_params) + infer_sec = time.perf_counter() - infer_start + per_item_sec = infer_sec / max(1, len(batch)) + for image in opened_images: + image.close() + for item, key, output in zip(batch, keys, batch_outputs): + raw_text = "" + if getattr(output, "outputs", None): + raw_text = str(output.outputs[0].text) + outputs_by_key[key] = { + "item": item, + "raw_text": raw_text, + "infer_sec": float(per_item_sec), + } + 
ordered = [] + for item in jobs: + ordered.append(outputs_by_key[(str(item["stem"]), int(item["page_number"]), str(item.get("variant", "page")))]) + return ordered + + +def _stitch_tiled_markdown(parts: List[str]) -> str: + stitched: List[str] = [] + previous_lines: List[str] = [] + for part in parts: + lines = [line.rstrip() for line in str(part or "").splitlines() if line.strip()] + if not lines: + continue + overlap = 0 + max_overlap = min(len(previous_lines), len(lines), 12) + for size in range(max_overlap, 0, -1): + if previous_lines[-size:] == lines[:size]: + overlap = size + break + stitched.extend(lines[overlap:]) + previous_lines = lines + return "\n".join(stitched).strip() + + def main() -> int: args = _parse_args() input_dir = Path(args.input_dir).resolve() @@ -108,7 +277,8 @@ def main() -> int: return 0 profile_defaults = _profile_defaults(args.ocr_profile) - prompt = profile_defaults["prompt"] + prompt = str(args.prompt_override) if args.prompt_override else profile_defaults["prompt"] + plain_prompt = _profile_defaults("plain_ocr")["prompt"] base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) @@ -134,7 +304,7 @@ def main() -> int: state = { "stem": pdf_path.stem, "page_outputs": [""] * total_pages, - "page_metrics": [], + "page_metrics": [None] * total_pages, "render_sec": float(render_sec), "doc_start": float(doc_start), "completed_pages": 0, @@ -144,6 +314,7 @@ def main() -> int: _write_progress(output_dir, pdf_path.stem, [], total_pages, 0) for idx, image in enumerate(images): page_path = tmp_dir / f"{pdf_path.stem}_page_{idx + 1:04d}.png" + image_stats = _image_content_stats(image) image.save(page_path, format="PNG") image.close() jobs.append( @@ -151,63 +322,200 @@ def main() -> int: "stem": 
pdf_path.stem, "page_number": int(idx + 1), "image_path": page_path, + "image_stats": image_stats, + "variant": "page", } ) - for batch in _batched(jobs, args.batch_size): - prompt_batch = [] - images: List[Image.Image] = [] - for item in batch: - image = Image.open(item["image_path"]).convert("RGB") - images.append(image) - prompt_batch.append( - { - "prompt": prompt, - "multi_modal_data": {"image": image}, - } - ) - infer_start = time.perf_counter() - outputs = llm.generate(prompt_batch, sampling_params=sampling_params) - infer_sec = time.perf_counter() - infer_start - per_item_sec = infer_sec / max(1, len(batch)) - for image in images: - image.close() + plain_repair_jobs: List[dict] = [] + tile_repair_requests: List[dict] = [] + first_pass_outputs = _generate_batch_outputs( + llm, + jobs=jobs, + prompt=prompt, + batch_size=int(args.batch_size), + sampling_params=sampling_params, + ) + for result in first_pass_outputs: + item = result["item"] + state = doc_states[item["stem"]] + raw_text = str(result["raw_text"]) + image_stats = dict(item.get("image_stats", {})) + page_text, postprocess_metrics = _postprocess_page_text( + raw_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + page_text = f"\n{page_text}".strip() + state["page_outputs"][item["page_number"] - 1] = page_text + quality = _text_quality_metrics(page_text) + repair_strategy, repair_reason = _classify_repair( + page_text, + image_stats=image_stats, + repair_mode=args.repair_mode, + ) + metric = { + "page_number": int(item["page_number"]), + "infer_sec": float(result["infer_sec"]), + "raw_chars": int(len(raw_text.strip())), + "final_chars": int(len(page_text.strip())), + "first_pass_quality_score": float(quality["quality_score"]), + "first_pass_letters": int(quality["letters"]), + "first_pass_digits": int(quality["digits"]), + "first_pass_pua_chars": int(quality["pua_chars"]), + "repair_strategy": repair_strategy, + "repair_reason": repair_reason, + 
"repair_attempted": False, + "repair_applied": False, + **image_stats, + **postprocess_metrics, + } + state["page_metrics"][item["page_number"] - 1] = metric + if repair_strategy == "plain": + plain_repair_jobs.append(item) + elif repair_strategy == "tile": + tile_repair_requests.append(item) + state["completed_pages"] = int(state["completed_pages"]) + 1 + progress_pages = [page for page in state["page_outputs"] if page] + _write_progress( + output_dir, + item["stem"], + progress_pages, + int(state["total_pages"]), + int(state["completed_pages"]), + ) - for item, output in zip(batch, outputs): + if plain_repair_jobs: + plain_repair_outputs = _generate_batch_outputs( + llm, + jobs=plain_repair_jobs, + prompt=plain_prompt, + batch_size=int(args.batch_size), + sampling_params=sampling_params, + ) + for result in plain_repair_outputs: + item = result["item"] state = doc_states[item["stem"]] - raw_text = "" - if getattr(output, "outputs", None): - raw_text = str(output.outputs[0].text) - page_text, postprocess_metrics = _postprocess_page_text( - raw_text, - prompt=prompt, + metric = state["page_metrics"][item["page_number"] - 1] + original_text = state["page_outputs"][item["page_number"] - 1] + repair_text, repair_postprocess = _postprocess_page_text( + str(result["raw_text"]), + prompt=plain_prompt, content_debug=bool(args.content_debug), ) if args.content_debug: - page_text = f"\n{page_text}".strip() - state["page_outputs"][item["page_number"] - 1] = page_text - state["page_metrics"].append( - { - "page_number": int(item["page_number"]), - "infer_sec": float(per_item_sec), - "raw_chars": int(len(raw_text.strip())), - "final_chars": int(len(page_text.strip())), - **postprocess_metrics, - } + repair_text = f"\n{repair_text}".strip() + original_quality = _text_quality_metrics(original_text) + repair_quality = _text_quality_metrics(repair_text) + apply_repair = bool(repair_text.strip()) and ( + float(repair_quality["quality_score"]) >= 
float(original_quality["quality_score"]) + or str(metric.get("repair_reason")) in {"markdown_garbage", "extreme_short"} + ) + metric["repair_attempted"] = True + metric["repair_infer_sec"] = float(result["infer_sec"]) + metric["repair_raw_chars"] = int(len(str(result["raw_text"]).strip())) + metric["repair_final_chars"] = int(len(repair_text.strip())) + metric["repair_quality_score"] = float(repair_quality["quality_score"]) + metric["repair_profile"] = "plain_ocr" + metric.update({f"repair_{key}": value for key, value in repair_postprocess.items()}) + metric["infer_sec"] = float(metric["infer_sec"]) + float(result["infer_sec"]) + if apply_repair: + state["page_outputs"][item["page_number"] - 1] = repair_text + metric["repair_applied"] = True + metric["final_chars"] = int(len(repair_text.strip())) + _write_progress( + output_dir, + item["stem"], + [page for page in state["page_outputs"] if page], + int(state["total_pages"]), + int(state["completed_pages"]), + ) + + if tile_repair_requests: + tile_jobs: List[dict] = [] + for item in tile_repair_requests: + for tile_name, y0, y1 in REPAIR_TILE_SPECS: + tile_jobs.append( + { + "stem": item["stem"], + "page_number": int(item["page_number"]), + "image_path": item["image_path"], + "variant": tile_name, + "crop_box": (0.0, y0, 1.0, y1), + } + ) + tile_outputs = _generate_batch_outputs( + llm, + jobs=tile_jobs, + prompt=prompt, + batch_size=int(args.batch_size), + sampling_params=sampling_params, + ) + grouped_tile_outputs: Dict[tuple[str, int], List[dict]] = {} + for result in tile_outputs: + key = (str(result["item"]["stem"]), int(result["item"]["page_number"])) + grouped_tile_outputs.setdefault(key, []).append(result) + for item in tile_repair_requests: + key = (str(item["stem"]), int(item["page_number"])) + state = doc_states[item["stem"]] + metric = state["page_metrics"][item["page_number"] - 1] + original_text = state["page_outputs"][item["page_number"] - 1] + grouped = sorted( + grouped_tile_outputs.get(key, []), + 
key=lambda value: {"top": 0, "mid": 1, "bottom": 2}.get(str(value["item"].get("variant")), 99), ) - state["completed_pages"] = int(state["completed_pages"]) + 1 - progress_pages = [page for page in state["page_outputs"] if page] - _write_progress( - output_dir, - item["stem"], - progress_pages, - int(state["total_pages"]), - int(state["completed_pages"]), + tile_parts: List[str] = [] + repair_infer_sec = 0.0 + for result in grouped: + repair_infer_sec += float(result["infer_sec"]) + tile_text, _ = _postprocess_page_text( + str(result["raw_text"]), + prompt=prompt, + content_debug=bool(args.content_debug), + ) + tile_parts.append(tile_text) + stitched = _stitch_tiled_markdown(tile_parts) + if args.content_debug: + stitched = f"\n{stitched}".strip() + original_quality = _text_quality_metrics(original_text) + stitched_quality = _text_quality_metrics(stitched) + apply_repair = bool(stitched.strip()) and ( + float(stitched_quality["quality_score"]) > float(original_quality["quality_score"]) + and int(stitched_quality["chars"]) >= int(original_quality["chars"]) ) + metric["repair_attempted"] = True + metric["repair_infer_sec"] = float(metric.get("repair_infer_sec", 0.0)) + float(repair_infer_sec) + metric["repair_final_chars"] = int(len(stitched.strip())) + metric["repair_quality_score"] = float(stitched_quality["quality_score"]) + metric["repair_tile_count"] = int(len(grouped)) + metric["repair_profile"] = "markdown_grounded_tiled" + metric["infer_sec"] = float(metric["infer_sec"]) + float(repair_infer_sec) + if apply_repair: + state["page_outputs"][item["page_number"] - 1] = stitched + metric["repair_applied"] = True + metric["final_chars"] = int(len(stitched.strip())) + _write_progress( + output_dir, + item["stem"], + [page for page in state["page_outputs"] if page], + int(state["total_pages"]), + int(state["completed_pages"]), + ) for stem, state in doc_states.items(): markdown = PAGE_SPLIT.join(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]" - 
page_metrics = sorted(state["page_metrics"], key=lambda item: int(item["page_number"])) + page_metrics = sorted( + [item for item in state["page_metrics"] if item], + key=lambda item: int(item["page_number"]), + ) + repair_summary = { + "repair_mode": str(args.repair_mode), + "pages_flagged": int(sum(1 for item in page_metrics if str(item.get("repair_strategy")) != "none")), + "pages_repaired": int(sum(1 for item in page_metrics if bool(item.get("repair_applied")))), + "plain_repairs": int(sum(1 for item in page_metrics if str(item.get("repair_profile")) == "plain_ocr" and bool(item.get("repair_applied")))), + "tiled_repairs": int(sum(1 for item in page_metrics if str(item.get("repair_profile")) == "markdown_grounded_tiled" and bool(item.get("repair_applied")))), + } _write_outputs( output_dir, stem, @@ -225,9 +533,11 @@ def main() -> int: "batch_size": int(args.batch_size), "gpu_memory_utilization": float(args.gpu_memory_utilization), "disable_fp8_kv": bool(args.disable_fp8_kv), + "repair_mode": str(args.repair_mode), "render_sec": float(state["render_sec"]), "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), "wall_time_sec": float(time.perf_counter() - float(state["doc_start"])), + "repair_summary": repair_summary, "page_metrics": page_metrics, }, ) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index fe60390..8959e25 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -44,6 +44,7 @@ def _build_cli_command( content_debug: bool, device: Optional[str], ocr_profile: str, + prompt_override: Optional[str], attn_backend: str, base_size: Optional[int], image_size: Optional[int], @@ -56,6 +57,7 @@ def _build_cli_command( vllm_batch_size: Optional[int], gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, + repair_mode: Optional[str], ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -78,6 +80,8 @@ 
def _build_cli_command( cmd += ["--device", str(device)] if ocr_profile: cmd += ["--ocr-profile", str(ocr_profile)] + if prompt_override: + cmd += ["--prompt-override", str(prompt_override)] if attn_backend: cmd += ["--attn-backend", str(attn_backend)] if base_size is not None: @@ -104,6 +108,8 @@ def _build_cli_command( cmd += ["--gpu-memory-utilization", str(float(gpu_memory_utilization))] if disable_fp8_kv: cmd.append("--disable-fp8-kv") + if repair_mode: + cmd += ["--repair-mode", str(repair_mode)] return cmd @@ -139,6 +145,7 @@ def _run_cli( content_debug: bool, device: Optional[str], ocr_profile: str, + prompt_override: Optional[str], attn_backend: str, base_size: Optional[int], image_size: Optional[int], @@ -151,6 +158,7 @@ def _run_cli( vllm_batch_size: Optional[int], gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, + repair_mode: Optional[str], visible_device: Optional[int] = None, ) -> None: cmd = _build_cli_command( @@ -164,6 +172,7 @@ def _run_cli( content_debug=content_debug, device=device, ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, base_size=base_size, image_size=image_size, @@ -176,6 +185,7 @@ def _run_cli( vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -318,6 +328,7 @@ def _run_multi_cli( content_debug: bool, log_dir: Path, ocr_profile: str, + prompt_override: Optional[str], attn_backend: str, base_size: Optional[int], image_size: Optional[int], @@ -330,6 +341,7 @@ def _run_multi_cli( vllm_batch_size: Optional[int], gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, + repair_mode: Optional[str], ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -363,6 +375,7 @@ def _run_multi_cli( content_debug=content_debug, device="cuda", ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, 
base_size=base_size, image_size=image_size, @@ -375,6 +388,7 @@ def _run_multi_cli( vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -416,6 +430,7 @@ def run_for_files( device: Optional[str] = None, runtime_backend: str = "transformers", ocr_profile: str = "markdown_grounded", + prompt_override: Optional[str] = None, attn_backend: str = "auto", base_size: Optional[int] = None, image_size: Optional[int] = None, @@ -430,6 +445,7 @@ def run_for_files( gpu_memory_utilization: Optional[float] = None, disable_fp8_kv: bool = False, vllm_batch_size: Optional[int] = None, + repair_mode: str = "auto", **_: Any, ) -> Dict[str, Any]: """Run DeepSeek OCR for the provided files.""" @@ -509,6 +525,7 @@ def run_for_files( content_debug=content_debug, log_dir=Path(log_dir) if log_dir else (out_root / "logs" / "deepseek_workers"), ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, base_size=base_size, image_size=image_size, @@ -521,6 +538,7 @@ def run_for_files( vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, ) else: _run_cli( @@ -534,6 +552,7 @@ def run_for_files( content_debug=content_debug, device=device, ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, base_size=base_size, image_size=image_size, @@ -546,6 +565,7 @@ def run_for_files( vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 4629d2f..bc20acd 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -126,6 +126,7 @@ def 
test_build_cli_command_includes_speed_flags(tmp_path): content_debug=False, device="cuda", ocr_profile="plain_ocr", + prompt_override="custom prompt", attn_backend="flash_attention_2", base_size=768, image_size=512, @@ -138,9 +139,11 @@ def test_build_cli_command_includes_speed_flags(tmp_path): vllm_batch_size=None, gpu_memory_utilization=None, disable_fp8_kv=False, + repair_mode=None, ) assert "--ocr-profile" in cmd and "plain_ocr" in cmd + assert "--prompt-override" in cmd and "custom prompt" in cmd assert "--attn-backend" in cmd and "flash_attention_2" in cmd assert "--base-size" in cmd and "768" in cmd assert "--image-size" in cmd and "512" in cmd @@ -163,6 +166,7 @@ def test_build_cli_command_includes_vllm_flags(tmp_path): content_debug=False, device="cuda", ocr_profile="markdown_grounded", + prompt_override=None, attn_backend="auto", base_size=None, image_size=None, @@ -175,11 +179,28 @@ def test_build_cli_command_includes_vllm_flags(tmp_path): vllm_batch_size=16, gpu_memory_utilization=0.92, disable_fp8_kv=True, + repair_mode="auto", ) assert "--batch-size" in cmd and "16" in cmd assert "--gpu-memory-utilization" in cmd and "0.92" in cmd assert "--disable-fp8-kv" in cmd + assert "--repair-mode" in cmd and "auto" in cmd + + +def test_vllm_repair_classifier_routes_garbage_and_short_pages(): + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _classify_repair + + dense_page = { + "top_dark_ratio": 0.16, + "bottom_dark_ratio": 0.16, + "overall_dark_ratio": 0.15, + } + assert _classify_repair("\uf0b7" * 80, dense_page, "auto") == ("plain", "markdown_garbage") + assert _classify_repair("42", dense_page, "auto") == ("plain", "extreme_short") + assert _classify_repair("Α" * 300, dense_page, "auto") == ("tile", "short_coverage") + assert _classify_repair("Α" * 1200, dense_page, "auto") == ("none", None) + assert _classify_repair("Α" * 300, dense_page, "off") == ("none", None) def test_runner_selects_vllm_script_when_requested(tmp_path, monkeypatch): From 
5ad862043d628aa1871eabf9b851e702914a543a Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 18:25:11 +0300 Subject: [PATCH 23/26] Add DeepSeek pipeline benchmark harness --- src/glossapi/ocr/deepseek/runner.py | 43 +- .../scripts/deepseek_pipeline_benchmark.py | 435 ++++++++++++++++++ 2 files changed, 476 insertions(+), 2 deletions(-) create mode 100644 src/glossapi/scripts/deepseek_pipeline_benchmark.py diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 8959e25..906f30d 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -21,6 +21,7 @@ REPO_ROOT = Path(__file__).resolve().parents[4] DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" DEFAULT_VLLM_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_vllm.py" +AUTO_VLLM_BATCH_PAGE_CAP = 160 def _page_count(pdf_path: Path) -> int: @@ -314,6 +315,24 @@ def _plan_lanes( return lanes +def _auto_vllm_batch_size( + *, + runtime_backend: str, + file_list: List[str], + input_root: Path, + max_pages: Optional[int], +) -> Optional[int]: + if str(runtime_backend or "").strip().lower() != "vllm": + return None + total_pages = 0 + for name in file_list: + pdf_path = (input_root / name).resolve() + total_pages += int(_effective_page_count(pdf_path, max_pages)) + if total_pages <= 0: + return 1 + return min(int(total_pages), int(AUTO_VLLM_BATCH_PAGE_CAP)) + + def _run_multi_cli( *, input_root: Path, @@ -362,6 +381,16 @@ def _run_multi_cli( if not lane_files: continue visible_device = int(lane["visible_device"]) + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size( + runtime_backend=runtime_backend, + file_list=lane_files, + input_root=input_root, + max_pages=max_pages, + ) + ) log_path = log_dir / f"lane_{lane['lane_id']}_gpu{visible_device}.log" fh = stack.enter_context(log_path.open("w", 
encoding="utf-8")) cmd = _build_cli_command( @@ -385,7 +414,7 @@ def _run_multi_cli( repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, runtime_backend=runtime_backend, - vllm_batch_size=vllm_batch_size, + vllm_batch_size=resolved_vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, @@ -541,6 +570,16 @@ def run_for_files( repair_mode=repair_mode, ) else: + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size( + runtime_backend=runtime_backend_norm, + file_list=file_list, + input_root=pdf_root, + max_pages=max_pages, + ) + ) _run_cli( input_dir=pdf_root, output_dir=out_root, @@ -562,7 +601,7 @@ def run_for_files( repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, runtime_backend=runtime_backend_norm, - vllm_batch_size=vllm_batch_size, + vllm_batch_size=resolved_vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, diff --git a/src/glossapi/scripts/deepseek_pipeline_benchmark.py b/src/glossapi/scripts/deepseek_pipeline_benchmark.py new file mode 100644 index 0000000..b9a0c94 --- /dev/null +++ b/src/glossapi/scripts/deepseek_pipeline_benchmark.py @@ -0,0 +1,435 @@ +from __future__ import annotations + +import argparse +import json +import os +import random +import shutil +import subprocess +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + + +def _parse_devices(spec: str) -> List[int]: + tokens = [piece.strip() for piece in str(spec or "").split(",") if piece.strip()] + if not tokens: + raise argparse.ArgumentTypeError("--devices must contain at least one GPU id") + try: + return [int(token) for token in tokens] + except ValueError as exc: + raise argparse.ArgumentTypeError(f"Invalid GPU list: {spec}") from exc + + +def _parse_args() -> argparse.Namespace: + p = 
argparse.ArgumentParser( + prog="python -m glossapi.scripts.deepseek_pipeline_benchmark", + description="Benchmark DeepSeek OCR pipeline throughput for static and streaming-style scheduling.", + ) + p.add_argument("--repo", required=True) + p.add_argument("--input-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--python-bin", required=True) + p.add_argument("--model-dir", required=True) + p.add_argument("--label", required=True) + p.add_argument("--mode", default="static", choices=["static", "streaming"]) + p.add_argument("--devices", default="0,1,2,3,4,5,6,7") + p.add_argument("--workers-per-gpu", type=int, default=1) + p.add_argument("--max-docs", type=int, default=None) + p.add_argument("--doc-order", default="name", choices=["name", "random", "largest_first"]) + p.add_argument("--seed", type=int, default=20260330) + p.add_argument("--stream-batch-pages", type=int, default=160) + p.add_argument("--runtime-backend", default="vllm", choices=["transformers", "vllm"]) + p.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + p.add_argument("--prompt-override", default=None) + p.add_argument("--repair-mode", default="auto", choices=["auto", "off"]) + p.add_argument("--attn-backend", default="auto") + p.add_argument("--base-size", type=int, default=None) + p.add_argument("--image-size", type=int, default=None) + p.add_argument("--render-dpi", type=int, default=144) + p.add_argument("--max-new-tokens", type=int, default=None) + p.add_argument("--vllm-batch-size", type=int, default=None) + p.add_argument("--gpu-memory-utilization", type=float, default=0.9) + p.add_argument("--disable-fp8-kv", action="store_true") + p.add_argument("--clean", action="store_true") + return p.parse_args() + + +def _weighted_files( + *, + input_dir: Path, + max_docs: Optional[int], + doc_order: str, + seed: int, +) -> List[Dict[str, Any]]: + from glossapi.ocr.deepseek import runner as deepseek_runner + 
+ weighted = [] + for path in sorted(input_dir.glob("*.pdf")): + pages = int(deepseek_runner._effective_page_count(path, None)) + weighted.append({"name": path.name, "pages": pages}) + if doc_order == "largest_first": + weighted.sort(key=lambda item: (-int(item["pages"]), str(item["name"]))) + elif doc_order == "random": + rng = random.Random(int(seed)) + rng.shuffle(weighted) + if max_docs is not None: + weighted = weighted[: max(0, int(max_docs))] + return weighted + + +def _empty_lanes(devices: List[int], workers_per_gpu: int) -> List[Dict[str, Any]]: + lanes: List[Dict[str, Any]] = [] + lane_id = 0 + for visible_device in devices: + for _ in range(max(1, int(workers_per_gpu))): + lanes.append( + { + "lane_id": lane_id, + "visible_device": int(visible_device), + "batches": [], + "assigned_pages": 0, + } + ) + lane_id += 1 + return lanes + + +def _plan_static( + weighted_files: List[Dict[str, Any]], + devices: List[int], + workers_per_gpu: int, + input_dir: Path, +) -> List[Dict[str, Any]]: + from glossapi.ocr.deepseek import runner as deepseek_runner + + lanes = deepseek_runner._plan_lanes( + file_list=[str(item["name"]) for item in weighted_files], + input_root=input_dir, + lane_devices=devices, + workers_per_gpu=max(1, int(workers_per_gpu)), + max_pages=None, + ) + weights = {str(item["name"]): int(item["pages"]) for item in weighted_files} + planned: List[Dict[str, Any]] = [] + for lane in lanes: + files = list(lane["files"]) + if not files: + continue + weight = sum(int(weights.get(name, 0)) for name in files) + planned.append( + { + "lane_id": int(lane["lane_id"]), + "visible_device": int(lane["visible_device"]), + "assigned_pages": int(weight), + "batches": [ + { + "batch_id": 0, + "files": files, + "pages": int(weight), + } + ], + } + ) + return planned + + +def _plan_streaming( + weighted_files: List[Dict[str, Any]], + devices: List[int], + workers_per_gpu: int, + stream_batch_pages: int, +) -> List[Dict[str, Any]]: + lanes = _empty_lanes(devices, 
workers_per_gpu) + batch_target = max(1, int(stream_batch_pages)) + current: Dict[int, Dict[str, Any]] = { + int(lane["lane_id"]): {"files": [], "pages": 0} + for lane in lanes + } + + def flush(lane: Dict[str, Any]) -> None: + lane_id = int(lane["lane_id"]) + state = current[lane_id] + if not state["files"]: + return + lane["batches"].append( + { + "batch_id": len(lane["batches"]), + "files": list(state["files"]), + "pages": int(state["pages"]), + } + ) + state["files"] = [] + state["pages"] = 0 + + for item in weighted_files: + lane = min(lanes, key=lambda value: (int(value["assigned_pages"]) + int(current[int(value["lane_id"])]["pages"]), int(value["lane_id"]))) + lane_id = int(lane["lane_id"]) + current[lane_id]["files"].append(str(item["name"])) + current[lane_id]["pages"] = int(current[lane_id]["pages"]) + int(item["pages"]) + lane["assigned_pages"] = int(lane["assigned_pages"]) + int(item["pages"]) + if int(current[lane_id]["pages"]) >= batch_target: + flush(lane) + + for lane in lanes: + flush(lane) + return [lane for lane in lanes if lane["batches"]] + + +def _collect_repair_metrics(run_dir: Path) -> Dict[str, int]: + metrics_dir = run_dir / "json" / "metrics" + totals = { + "docs_with_metrics": 0, + "pages_flagged": 0, + "pages_repaired": 0, + "plain_repairs": 0, + "tiled_repairs": 0, + } + if not metrics_dir.exists(): + return totals + for path in metrics_dir.glob("*.metrics.json"): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + totals["docs_with_metrics"] += 1 + summary = data.get("repair_summary") or {} + totals["pages_flagged"] += int(summary.get("pages_flagged", 0)) + totals["pages_repaired"] += int(summary.get("pages_repaired", 0)) + totals["plain_repairs"] += int(summary.get("plain_repairs", 0)) + totals["tiled_repairs"] += int(summary.get("tiled_repairs", 0)) + return totals + + +def main() -> int: + args = _parse_args() + repo = Path(args.repo).resolve() + input_dir = Path(args.input_dir).resolve() + 
output_root = Path(args.output_dir).resolve() + python_bin = Path(args.python_bin).expanduser() + model_dir = Path(args.model_dir).resolve() + devices = _parse_devices(args.devices) + + from glossapi.ocr.deepseek import runner as deepseek_runner + + weighted_files = _weighted_files( + input_dir=input_dir, + max_docs=args.max_docs, + doc_order=args.doc_order, + seed=int(args.seed), + ) + if not weighted_files: + raise SystemExit("No PDFs found for benchmark input set.") + + if str(args.mode) == "streaming": + lanes = _plan_streaming( + weighted_files=weighted_files, + devices=devices, + workers_per_gpu=max(1, int(args.workers_per_gpu)), + stream_batch_pages=max(1, int(args.stream_batch_pages)), + ) + else: + lanes = _plan_static( + weighted_files=weighted_files, + devices=devices, + workers_per_gpu=max(1, int(args.workers_per_gpu)), + input_dir=input_dir, + ) + + run_dir = output_root / args.label + if args.clean and run_dir.exists(): + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True, exist_ok=True) + logs_dir = run_dir / "logs" + logs_dir.mkdir(parents=True, exist_ok=True) + (run_dir / "lane_plan.json").write_text(json.dumps(lanes, indent=2), encoding="utf-8") + + script_path = ( + deepseek_runner.DEFAULT_VLLM_SCRIPT + if str(args.runtime_backend) == "vllm" + else deepseek_runner.DEFAULT_SCRIPT + ) + + py_env = {"PYTHONPATH": str(repo / "src")} + + def start_batch(lane: Dict[str, Any], batch: Dict[str, Any]) -> Dict[str, Any]: + lane_id = int(lane["lane_id"]) + visible_device = int(lane["visible_device"]) + batch_id = int(batch["batch_id"]) + files = list(batch["files"]) + pages = int(batch["pages"]) + resolved_vllm_batch_size = ( + int(args.vllm_batch_size) + if args.vllm_batch_size is not None + else deepseek_runner._auto_vllm_batch_size( + runtime_backend=str(args.runtime_backend), + file_list=files, + input_root=input_dir, + max_pages=None, + ) + ) + log_path = logs_dir / f"lane_{lane_id:02d}_batch_{batch_id:03d}_gpu{visible_device}.log" + fh = 
log_path.open("w", encoding="utf-8") + cmd = deepseek_runner._build_cli_command( + input_dir=input_dir, + output_dir=run_dir, + files=files, + model_dir=model_dir, + python_bin=python_bin, + script=script_path, + max_pages=None, + content_debug=False, + device="cuda", + ocr_profile=str(args.ocr_profile), + prompt_override=args.prompt_override, + attn_backend=str(args.attn_backend), + base_size=args.base_size, + image_size=args.image_size, + crop_mode=None, + render_dpi=int(args.render_dpi), + max_new_tokens=args.max_new_tokens, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend=str(args.runtime_backend), + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=float(args.gpu_memory_utilization), + disable_fp8_kv=bool(args.disable_fp8_kv), + repair_mode=str(args.repair_mode), + ) + env = deepseek_runner._build_env(python_bin=python_bin, visible_device=visible_device) + if env.get("PYTHONPATH"): + env["PYTHONPATH"] = f"{py_env['PYTHONPATH']}:{env['PYTHONPATH']}" + else: + env["PYTHONPATH"] = py_env["PYTHONPATH"] + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + return { + "lane_id": lane_id, + "visible_device": visible_device, + "batch_id": batch_id, + "pages": pages, + "files": files, + "resolved_vllm_batch_size": resolved_vllm_batch_size, + "log_path": str(log_path), + "fh": fh, + "proc": proc, + "start_ts": time.perf_counter(), + "cmd": cmd, + } + + pending_batches: Dict[int, List[Dict[str, Any]]] = { + int(lane["lane_id"]): list(lane["batches"]) + for lane in lanes + } + active: List[Dict[str, Any]] = [] + global_start = time.perf_counter() + for lane in lanes: + lane_id = int(lane["lane_id"]) + if pending_batches[lane_id]: + first_batch = pending_batches[lane_id].pop(0) + active.append(start_batch(lane, first_batch)) + + batch_results: List[Dict[str, Any]] = [] + while active: + time.sleep(0.2) + for item in list(active): + rc = item["proc"].poll() + if rc is None: + 
continue + end_ts = time.perf_counter() + item["fh"].close() + elapsed = max(0.000001, float(end_ts - item["start_ts"])) + batch_results.append( + { + "lane_id": int(item["lane_id"]), + "visible_device": int(item["visible_device"]), + "batch_id": int(item["batch_id"]), + "pages": int(item["pages"]), + "files": list(item["files"]), + "return_code": int(rc), + "resolved_vllm_batch_size": item["resolved_vllm_batch_size"], + "start_offset_sec": float(item["start_ts"] - global_start), + "end_offset_sec": float(end_ts - global_start), + "elapsed_sec": float(elapsed), + "sec_per_page": float(elapsed / max(1, int(item["pages"]))), + "log_path": str(item["log_path"]), + "cmd": item["cmd"], + } + ) + active.remove(item) + lane = next(lane for lane in lanes if int(lane["lane_id"]) == int(item["lane_id"])) + if pending_batches[int(item["lane_id"])]: + next_batch = pending_batches[int(item["lane_id"])].pop(0) + active.append(start_batch(lane, next_batch)) + + total_elapsed = max(0.000001, time.perf_counter() - global_start) + total_pages = sum(int(item["pages"]) for item in weighted_files) + failures = [item for item in batch_results if int(item["return_code"]) != 0] + + lane_results: List[Dict[str, Any]] = [] + for lane in lanes: + lane_batches = [item for item in batch_results if int(item["lane_id"]) == int(lane["lane_id"])] + if not lane_batches: + continue + lane_start = min(float(item["start_offset_sec"]) for item in lane_batches) + lane_end = max(float(item["end_offset_sec"]) for item in lane_batches) + lane_elapsed = max(0.000001, lane_end - lane_start) + lane_pages = sum(int(item["pages"]) for item in lane_batches) + lane_results.append( + { + "lane_id": int(lane["lane_id"]), + "visible_device": int(lane["visible_device"]), + "batch_count": len(lane_batches), + "pages": int(lane_pages), + "active_elapsed_sec": float(lane_elapsed), + "sec_per_page": float(lane_elapsed / max(1, lane_pages)), + "all_return_codes_zero": all(int(item["return_code"]) == 0 for item in 
lane_batches), + } + ) + + gpu_results: List[Dict[str, Any]] = [] + for visible_device in sorted({int(item["visible_device"]) for item in batch_results}): + gpu_batches = [item for item in batch_results if int(item["visible_device"]) == visible_device] + gpu_start = min(float(item["start_offset_sec"]) for item in gpu_batches) + gpu_end = max(float(item["end_offset_sec"]) for item in gpu_batches) + gpu_elapsed = max(0.000001, gpu_end - gpu_start) + gpu_pages = sum(int(item["pages"]) for item in gpu_batches) + gpu_results.append( + { + "visible_device": visible_device, + "batch_count": len(gpu_batches), + "pages": int(gpu_pages), + "active_elapsed_sec": float(gpu_elapsed), + "sec_per_page": float(gpu_elapsed / max(1, gpu_pages)), + "all_return_codes_zero": all(int(item["return_code"]) == 0 for item in gpu_batches), + } + ) + + repair_metrics = _collect_repair_metrics(run_dir) + summary = { + "label": str(args.label), + "status": "pass" if not failures else "fail", + "mode": str(args.mode), + "runtime_backend": str(args.runtime_backend), + "ocr_profile": str(args.ocr_profile), + "repair_mode": str(args.repair_mode), + "devices": devices, + "workers_per_gpu": int(args.workers_per_gpu), + "doc_order": str(args.doc_order), + "stream_batch_pages": int(args.stream_batch_pages), + "docs": len(weighted_files), + "pages": int(total_pages), + "wall_time_sec": float(total_elapsed), + "sec_per_page": float(total_elapsed / max(1, total_pages)), + "batch_results": batch_results, + "lane_results": lane_results, + "gpu_results": gpu_results, + "repair_metrics": repair_metrics, + "failures": failures, + } + (run_dir / "pipeline_benchmark_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") + print(json.dumps(summary, indent=2)) + return 1 if failures else 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) From 41b983e4ac925f9f419ce687b3d78980df80fe29 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 18:52:32 +0300 
Subject: [PATCH 24/26] Document DeepSeek pipeline benchmark results --- docs/ocr_and_math_enhancement.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index ac2a5b7..3c9f584 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -152,6 +152,30 @@ Validated on 2026-03-30: - GPU memory utilization: `0.9` - Best large-batch single-GPU floor observed: `0.3109 sec/page/GPU` +Production markdown+repair benchmark on the same host: + +- Corpus: `43` OA PDFs, `7,624` pages +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Repair mode: `auto` +- Max new tokens: `2048` +- GPUs: `8` +- Static sharding (`1` shard/GPU): `574.87s` wall, `0.0754 sec/page` overall, `0.4971` to `0.5484 sec/page/GPU` +- Streaming admission (`stream_batch_pages=160`): `928.81s` wall, `0.1218 sec/page` overall, `0.5469` to `0.6856 sec/page/GPU` +- Peak VRAM in both runs stayed at about `88,953 MiB` per active GPU +- Static active-lane GPU utilization averaged about `65%` to `75%`; streaming active-lane utilization stayed similar while whole-run occupancy got worse because more lanes sat idle between batches + +Decision: + +- Keep static sharding as the default large-run pipeline shape for now +- Do not enable streaming admission by default yet; on this benchmark it regressed badly versus static sharding +- Treat the earlier `0.3109 sec/page/GPU` result as the raw floor, and the static repaired-markdown result above as the current production-like baseline on this hardware + +Attention/runtime note: + +- The production fast path is `vllm`; logs on this stack show `flashinfer` autotuning plus CUDA graph capture +- Transformers remain the fallback path; prefer `flash_attention_2` there and do not optimize around `sdpa` + That number is the floor to preserve or beat when tuning the full markdown pipeline. 
Faster raw runs that change the effective output mode or bypass repair logic do not replace it as the production baseline. - Batch sizes From 0a863238261fb402c32687161daa78218a8f946d Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 21:37:39 +0300 Subject: [PATCH 25/26] Harden DeepSeek repair classification --- .../ocr/deepseek/run_pdf_ocr_transformers.py | 8 +-- src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py | 54 ++++++++++++++++++- tests/test_deepseek_runner_contract.py | 11 ++++ 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 071b3b5..d7b0387 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -12,10 +12,7 @@ from pathlib import Path from typing import Iterable, List -import fitz -import torch from PIL import Image -from transformers import AutoModel, AutoTokenizer SRC_ROOT = Path(__file__).resolve().parents[3] if str(SRC_ROOT) not in sys.path: @@ -82,6 +79,8 @@ def _iter_pdfs(input_dir: Path, files: List[str]) -> List[Path]: def _render_pages(pdf_path: Path, max_pages: int | None, render_dpi: int) -> List[Image.Image]: + import fitz + images: List[Image.Image] = [] doc = fitz.open(pdf_path) try: @@ -204,6 +203,9 @@ def _load_model( repetition_penalty: float | None, no_repeat_ngram_size: int | None, ): + import torch + from transformers import AutoModel, AutoTokenizer + attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) try: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py index 56870e5..6d8354c 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -32,8 +32,12 @@ REPAIR_EXTREME_SHORT_CHARS = 120 REPAIR_PUA_THRESHOLD = 64 REPAIR_MIN_HALF_DARK = 0.08 
+REPAIR_MIN_THIRD_DARK = 0.07 REPAIR_MAX_OVERALL_DARK = 0.25 REPAIR_MIN_OVERALL_DARK = 0.04 +REPAIR_FOOTNOTE_SHORT_CHARS = 1100 +REPAIR_MIN_FOOTNOTE_LINES = 2 +REPAIR_FOOTNOTE_RATIO = 0.40 def _parse_args() -> argparse.Namespace: @@ -130,10 +134,16 @@ def _dark_ratio(y0: int, y1: int) -> float: return float(dark) / float(total) half = max(1, height // 2) + third = max(1, height // 3) + top_third_end = min(height, third) + middle_third_end = min(height, third * 2) dark_total = sum(1 for value in pixels if value < REPAIR_DARK_THRESHOLD) return { "top_dark_ratio": _dark_ratio(0, half), "bottom_dark_ratio": _dark_ratio(half, height), + "top_third_dark_ratio": _dark_ratio(0, top_third_end), + "middle_third_dark_ratio": _dark_ratio(top_third_end, middle_third_end), + "bottom_third_dark_ratio": _dark_ratio(middle_third_end, height), "overall_dark_ratio": float(dark_total) / float(max(1, len(pixels))), } @@ -153,34 +163,74 @@ def _text_quality_metrics(text: str) -> dict: letters = sum(1 for ch in stripped if ch.isalpha()) digits = sum(1 for ch in stripped if ch.isdigit()) pua_chars = _count_private_use_chars(stripped) + lines = [line.strip() for line in stripped.splitlines() if line.strip()] + footnote_like_lines = sum(1 for line in lines if _is_footnote_like_line(line)) + avg_line_length = (sum(len(line) for line in lines) / float(len(lines))) if lines else 0.0 score = float(letters) + (0.10 * float(len(stripped))) + (0.05 * float(digits)) - (20.0 * float(pua_chars)) return { "chars": int(len(stripped)), "letters": int(letters), "digits": int(digits), "pua_chars": int(pua_chars), + "line_count": int(len(lines)), + "footnote_like_lines": int(footnote_like_lines), + "avg_line_length": float(avg_line_length), "quality_score": float(score), } +def _is_footnote_like_line(line: str) -> bool: + stripped = str(line or "").strip() + if not stripped: + return False + if len(stripped) <= 2: + return False + if stripped[0].isdigit(): + if len(stripped) > 1 and stripped[1] in {".", 
")", "]"}: + return True + if len(stripped) > 2 and stripped[1].isspace(): + return True + if stripped[0] in {"*", "•", "-", "†", "‡"}: + return True + return False + + def _classify_repair(text: str, image_stats: dict, repair_mode: str) -> tuple[str, str | None]: if str(repair_mode or "off").strip().lower() != "auto": return "none", None quality = _text_quality_metrics(text) chars = int(quality["chars"]) pua_chars = int(quality["pua_chars"]) + line_count = int(quality["line_count"]) + footnote_like_lines = int(quality["footnote_like_lines"]) + footnote_ratio = float(footnote_like_lines) / float(max(1, line_count)) pua_ratio = float(pua_chars) / float(max(1, chars)) if pua_chars >= REPAIR_PUA_THRESHOLD or pua_ratio >= 0.10: return "plain", "markdown_garbage" - if chars <= REPAIR_EXTREME_SHORT_CHARS: - return "plain", "extreme_short" top_dark = float(image_stats.get("top_dark_ratio", 0.0)) bottom_dark = float(image_stats.get("bottom_dark_ratio", 0.0)) + top_third_dark = float(image_stats.get("top_third_dark_ratio", top_dark)) + middle_third_dark = float(image_stats.get("middle_third_dark_ratio", 0.0)) + bottom_third_dark = float(image_stats.get("bottom_third_dark_ratio", bottom_dark)) overall_dark = float(image_stats.get("overall_dark_ratio", 0.0)) + if ( + chars <= REPAIR_FOOTNOTE_SHORT_CHARS + and footnote_like_lines >= REPAIR_MIN_FOOTNOTE_LINES + and footnote_ratio >= REPAIR_FOOTNOTE_RATIO + and top_third_dark >= REPAIR_MIN_THIRD_DARK + and middle_third_dark >= REPAIR_MIN_THIRD_DARK + and REPAIR_MIN_OVERALL_DARK <= overall_dark <= REPAIR_MAX_OVERALL_DARK + ): + return "tile", "footnote_dominant" + if chars <= REPAIR_EXTREME_SHORT_CHARS: + return "plain", "extreme_short" if ( chars <= REPAIR_SHORT_CHARS and top_dark >= REPAIR_MIN_HALF_DARK and bottom_dark >= REPAIR_MIN_HALF_DARK + and top_third_dark >= REPAIR_MIN_THIRD_DARK + and middle_third_dark >= REPAIR_MIN_THIRD_DARK + and bottom_third_dark >= REPAIR_MIN_THIRD_DARK and REPAIR_MIN_OVERALL_DARK <= overall_dark 
<= REPAIR_MAX_OVERALL_DARK ): return "tile", "short_coverage" diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index bc20acd..d58472d 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -194,11 +194,22 @@ def test_vllm_repair_classifier_routes_garbage_and_short_pages(): dense_page = { "top_dark_ratio": 0.16, "bottom_dark_ratio": 0.16, + "top_third_dark_ratio": 0.15, + "middle_third_dark_ratio": 0.15, + "bottom_third_dark_ratio": 0.15, "overall_dark_ratio": 0.15, } assert _classify_repair("\uf0b7" * 80, dense_page, "auto") == ("plain", "markdown_garbage") assert _classify_repair("42", dense_page, "auto") == ("plain", "extreme_short") assert _classify_repair("Α" * 300, dense_page, "auto") == ("tile", "short_coverage") + footnote_only = "\n".join( + [ + "1. υποσημείωση πρώτη γραμμή", + "2. υποσημείωση δεύτερη γραμμή", + "3. υποσημείωση τρίτη γραμμή", + ] + ) + assert _classify_repair(footnote_only, dense_page, "auto") == ("tile", "footnote_dominant") assert _classify_repair("Α" * 1200, dense_page, "auto") == ("none", None) assert _classify_repair("Α" * 300, dense_page, "off") == ("none", None) From 3038fa8e4d71f5cd68ab6e215bec7c2dcbedef77 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 23:45:19 +0300 Subject: [PATCH 26/26] Update DeepSeek benchmark note --- docs/ocr_and_math_enhancement.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index 3c9f584..dd26569 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -160,7 +160,7 @@ Production markdown+repair benchmark on the same host: - Repair mode: `auto` - Max new tokens: `2048` - GPUs: `8` -- Static sharding (`1` shard/GPU): `574.87s` wall, `0.0754 sec/page` overall, `0.4971` to `0.5484 sec/page/GPU` +- Static sharding (`1` shard/GPU), validated rerun after classifier hardening: `558.88s` wall, 
`0.0733 sec/page` overall, `0.4912` to `0.5475 sec/page/GPU` - Streaming admission (`stream_batch_pages=160`): `928.81s` wall, `0.1218 sec/page` overall, `0.5469` to `0.6856 sec/page/GPU` - Peak VRAM in both runs stayed at about `88,953 MiB` per active GPU - Static active-lane GPU utilization averaged about `65%` to `75%`; streaming active-lane utilization stayed similar while whole-run occupancy got worse because more lanes sat idle between batches