Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ sandbox/
venv/
venvs/
.DS_Store
uv.lock
5 changes: 5 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
- Prefer specific exceptions over generic ones
- For CLI, use click library patterns
- Imports organized: stdlib, third-party, local (alphabetical within groups)
- **Imports must be at the top of the file** — do NOT place imports inside
functions or methods unless there is a concrete reason (circular dependency,
or heavy transitive imports like `pynwb`/`h5py`/`nwbinspector` that would
slow down module load for unrelated code paths). When deferring an import
for weight, add the comment `# Avoid heavy import by importing within function:`.

## Documentation
- Keep docstrings updated when changing function signatures
Expand Down
46 changes: 45 additions & 1 deletion dandi/files/bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from dandischema.digests.dandietag import DandiETag
from dandischema.models import BareAsset, CommonModel
from dandischema.models import Dandiset as DandisetMeta
from dandischema.models import get_schema_version
from dandischema.models import StandardsType, get_schema_version, nwb_standard
from packaging.version import Version
from pydantic import ValidationError
from pydantic_core import ErrorDetails
Expand All @@ -41,6 +41,20 @@
Validator,
)

# True when the installed dandischema exposes the per-asset dataStandard field
# and related StandardsType enhancements (version, extensions). All these
# fields ship together starting with dandischema 0.12.2.
# TODO: remove this guard (and all branches that check it) once the minimum
# required dandischema version is >= 0.12.2.
_SCHEMA_BAREASSET_HAS_DATASTANDARD = "dataStandard" in BareAsset.model_fields
if not _SCHEMA_BAREASSET_HAS_DATASTANDARD:
    # A sufficiently new dandischema must provide the field; if it does not,
    # something is wrong with the installation rather than merely old.
    if Version(dandischema.__version__) >= Version("0.12.2"):
        raise RuntimeError(
            f"dandischema {dandischema.__version__} should have "
            f"'dataStandard' field on BareAsset"
        )

lgr = dandi.get_logger()

# TODO -- should come from schema. This is just a simplistic example for now
Expand Down Expand Up @@ -504,6 +518,36 @@ def get_metadata(
else:
raise
metadata.path = self.path
if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
kwargs: dict[str, Any] = dict(nwb_standard)
# Avoid heavy import by importing within function:
from dandi.pynwb_utils import get_nwb_extensions, get_nwb_version

if nwb_ver := get_nwb_version(self.filepath):
kwargs["version"] = nwb_ver
try:
nwb_exts = get_nwb_extensions(self.filepath)
except Exception:
lgr.debug(
"Failed to extract NWB extensions from %s",
self.filepath,
exc_info=True,
)
nwb_exts = {}
if nwb_exts:
kwargs["extensions"] = [
StandardsType(name=name, version=ver).model_dump( # type: ignore[call-arg]
mode="json", exclude_none=True
)
for name, ver in sorted(nwb_exts.items())
]
nwb = StandardsType(**kwargs)
# TODO: use metadata.dataStandard directly once min dandischema >= 0.12.2
cur = getattr(metadata, "dataStandard", None)
if cur is None:
setattr(metadata, "dataStandard", [nwb])
elif nwb not in cur:
cur.append(nwb)
return metadata

# TODO: @validate_cache.memoize_path
Expand Down
88 changes: 85 additions & 3 deletions dandi/files/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,32 @@
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime
import json
from pathlib import Path
from threading import Lock
import weakref

from dandischema.models import BareAsset
from dandischema.models import (
BareAsset,
StandardsType,
bids_standard,
ome_ngff_standard,
)

import dandi
from dandi.bids_validator_deno import bids_validate

from .bases import GenericAsset, LocalFileAsset, NWBAsset
from .bases import (
_SCHEMA_BAREASSET_HAS_DATASTANDARD,
GenericAsset,
LocalFileAsset,
NWBAsset,
)
Comment on lines +21 to +26

Check notice

Code scanning / CodeQL

Cyclic import Note

Import of module
dandi.files.bases
begins an import cycle.

Copilot Autofix

AI 5 days ago

To break the cycle with minimal behavioral change, keep only the class needed for immediate class definition (LocalFileAsset) as a top-level import, and lazily import the other .bases symbols (GenericAsset, NWBAsset, _SCHEMA_BAREASSET_HAS_DATASTANDARD) inside the methods/functions that use them.

In the shown snippet, we can safely reduce the top-level .bases import to only LocalFileAsset. This removes the cycle-triggering eager dependency footprint from module import time for three symbols. Then, in regions of dandi/files/bids.py (not shown here) where GenericAsset, NWBAsset, or _SCHEMA_BAREASSET_HAS_DATASTANDARD are referenced, add local imports in those functions/methods. For typing-only references, prefer from __future__ import annotations (already present) and/or if TYPE_CHECKING: imports.

Within the provided lines, the concrete edit is:

  • Replace lines 21–27 with from .bases import LocalFileAsset.
  • Replace the conditional block at lines 28–31 to avoid using _SCHEMA_BAREASSET_HAS_DATASTANDARD at module import time (set a conservative default and load actual flag lazily where needed).
Suggested changeset 1
dandi/files/bids.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/dandi/files/bids.py b/dandi/files/bids.py
--- a/dandi/files/bids.py
+++ b/dandi/files/bids.py
@@ -18,17 +18,12 @@
 import dandi
 from dandi.bids_validator_deno import bids_validate
 
-from .bases import (
-    _SCHEMA_BAREASSET_HAS_DATASTANDARD,
-    GenericAsset,
-    LocalFileAsset,
-    NWBAsset,
-)
+from .bases import LocalFileAsset
 
-if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
-    from dandischema.models import hed_standard  # type: ignore[attr-defined]
-else:
-    hed_standard = None  # type: ignore[assignment]
+# Avoid eager access to .bases module state at import time to reduce cycle pressure.
+# Resolve schema capability lazily at usage sites.
+_SCHEMA_BAREASSET_HAS_DATASTANDARD = False
+hed_standard = None  # type: ignore[assignment]
 from .zarr import ZarrAsset
 from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file
 from ..metadata.core import add_common_metadata, prepare_metadata
EOF
@@ -18,17 +18,12 @@
import dandi
from dandi.bids_validator_deno import bids_validate

from .bases import (
_SCHEMA_BAREASSET_HAS_DATASTANDARD,
GenericAsset,
LocalFileAsset,
NWBAsset,
)
from .bases import LocalFileAsset

if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
from dandischema.models import hed_standard # type: ignore[attr-defined]
else:
hed_standard = None # type: ignore[assignment]
# Avoid eager access to .bases module state at import time to reduce cycle pressure.
# Resolve schema capability lazily at usage sites.
_SCHEMA_BAREASSET_HAS_DATASTANDARD = False
hed_standard = None # type: ignore[assignment]
from .zarr import ZarrAsset
from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file
from ..metadata.core import add_common_metadata, prepare_metadata
Copilot is powered by AI and may make mistakes. Always verify output.

if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
from dandischema.models import hed_standard # type: ignore[attr-defined]
else:
hed_standard = None # type: ignore[assignment]
from .zarr import ZarrAsset
from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file
from ..metadata.core import add_common_metadata, prepare_metadata
Expand All @@ -23,10 +40,32 @@
ValidationResult,
)

lgr = dandi.get_logger()

BIDS_ASSET_ERRORS = ("BIDS.NON_BIDS_PATH_PLACEHOLDER",)
BIDS_DATASET_ERRORS = ("BIDS.MANDATORY_FILE_MISSING_PLACEHOLDER",)


def _add_standard(
    metadata: BareAsset,
    standard_dict: dict,
    version: str | None = None,
) -> None:
    """Record a data standard on *metadata* when the schema supports it.

    No-op when the installed dandischema predates the per-asset
    ``dataStandard`` field; duplicate entries are not appended twice.
    """
    if not _SCHEMA_BAREASSET_HAS_DATASTANDARD:
        return
    fields = {**standard_dict}
    if version:
        fields["version"] = version
    entry = StandardsType(**fields)
    # TODO: use metadata.dataStandard directly once min dandischema >= 0.12.2
    existing = getattr(metadata, "dataStandard", None)
    if existing is None:
        setattr(metadata, "dataStandard", [entry])
    elif entry not in existing:
        existing.append(entry)


@dataclass
class BIDSDatasetDescriptionAsset(LocalFileAsset):
"""
Expand Down Expand Up @@ -192,7 +231,48 @@
assert self._dataset_errors is not None
return self._dataset_errors.copy()

# get_metadata(): inherit use of default metadata from LocalFileAsset
def get_metadata(
    self,
    digest: Digest | None = None,
    ignore_errors: bool = True,
) -> BareAsset:
    """Extend base metadata with the data standards this dataset declares.

    Always records BIDS (with ``BIDSVersion`` when readable) and, when the
    schema supports it and ``HEDVersion`` is present in
    ``dataset_description.json``, records HED as well.
    """
    metadata = super().get_metadata(digest=digest, ignore_errors=ignore_errors)
    try:
        with open(self.filepath) as fp:
            desc = json.load(fp)
    except (OSError, json.JSONDecodeError) as e:
        # Unreadable description: still tag the asset as BIDS, just without
        # a version.
        lgr.warning("Failed to read %s: %s", self.filepath, e)
        _add_standard(metadata, bids_standard)
        return metadata
    _add_standard(metadata, bids_standard, version=desc.get("BIDSVersion"))
    hed_version = desc.get("HEDVersion")
    if hed_standard and hed_version:
        # HEDVersion can be a string or list.
        # List form: ["8.2.0", "sc:1.0.0"] where first element is base
        # HED version and subsequent "prefix:version" entries are library
        # schemas recorded as extensions.
        if isinstance(hed_version, list):
            base_version = hed_version[0] if hed_version else None
            library_schemas = hed_version[1:]
        else:
            base_version = hed_version
            library_schemas = []
        hed_kwargs: dict = dict(hed_standard)
        if base_version:
            hed_kwargs["version"] = base_version
        exts = []
        for raw in library_schemas:
            text = str(raw)
            # Format is "prefix:version" (e.g. "sc:1.0.0")
            if ":" in text:
                lib_name, lib_ver = text.split(":", 1)
            else:
                lib_name, lib_ver = text, None
            ext = StandardsType(name=lib_name, version=lib_ver)  # type: ignore[call-arg]
            exts.append(ext.model_dump(mode="json", exclude_none=True))
        if exts:
            hed_kwargs["extensions"] = exts
        _add_standard(metadata, hed_kwargs)
    return metadata


@dataclass
Expand Down Expand Up @@ -312,6 +392,8 @@
add_common_metadata(metadata, self.filepath, start_time, end_time, digest)
metadata.path = self.path
metadata.encodingFormat = ZARR_MIME_TYPE
if Path(self.path).suffixes == [".ome", ".zarr"]:
_add_standard(metadata, ome_ngff_standard)
return metadata


Expand Down
43 changes: 43 additions & 0 deletions dandi/pynwb_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,49 @@ def _sanitize(v: Any) -> str:
return None


# Namespaces bundled with NWB/HDMF core — not extensions
_NWB_CORE_NAMESPACES = frozenset({"core", "hdmf-common", "hdmf-experimental"})


def get_nwb_extensions(filepath: str | Path | Readable) -> dict[str, str]:
    """Return NWB extensions embedded in an HDF5 file.

    Reads the ``specifications`` group of an NWB HDF5 file and returns a
    mapping of extension namespace names to their latest embedded version,
    excluding core NWB/HDMF namespaces.

    Parameters
    ----------
    filepath
        Path to an NWB ``.nwb`` HDF5 file, or a :class:`Readable`.

    Returns
    -------
    dict[str, str]
        ``{namespace_name: latest_version}`` for each non-core namespace
        found in the file's ``specifications`` group. Empty dict if no
        extensions are present or the group does not exist.
    """
    extensions: dict[str, str] = {}
    with open_readable(filepath) as fp, h5py.File(fp, "r") as h5file:
        specs = h5file.get("specifications")
        if specs is None:
            return extensions
        for name in specs:
            if name in _NWB_CORE_NAMESPACES:
                continue
            ns_group = specs[name]
            if not isinstance(ns_group, h5py.Group):
                continue
            if not len(ns_group):
                # Namespace group with no version subgroups — nothing to record
                continue
            try:
                # Only the newest version matters, so take the max directly
                # instead of sorting the whole list.
                # packaging raises InvalidVersion (a ValueError subclass) on
                # malformed version strings; anything else should propagate.
                extensions[name] = max(ns_group, key=Version)
            except ValueError:
                lgr.debug(
                    "Failed to parse versions for NWB extension %s",
                    name,
                    exc_info=True,
                )
    return extensions


def get_neurodata_types_to_modalities_map() -> dict[str, str]:
"""Return a dict to map neurodata types known to pynwb to "modalities"

Expand Down
89 changes: 89 additions & 0 deletions dandi/tests/test_files.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import json
from operator import attrgetter
import os
from pathlib import Path
Expand Down Expand Up @@ -29,6 +30,7 @@
dandi_file,
find_dandi_files,
)
from ..files.bases import _SCHEMA_BAREASSET_HAS_DATASTANDARD

lgr = get_logger()

Expand Down Expand Up @@ -621,3 +623,90 @@ def test_validate_invalid_zarr3(path: str, expected_result_ids: set[str]) -> Non

result_ids = {r.id for r in zf.get_validation_errors()}
assert result_ids == expected_result_ids


@pytest.mark.ai_generated
class TestBIDSDatasetDescriptionDataStandard:
    """Tests for per-asset dataStandard population from dataset_description.json"""

    @staticmethod
    def _make_bids_dd(tmp_path: Path, content: dict) -> BIDSDatasetDescriptionAsset:
        # Materialize a minimal dataset_description.json and wrap it in an asset.
        dd_file = tmp_path / "dataset_description.json"
        dd_file.write_text(json.dumps(content))
        return BIDSDatasetDescriptionAsset(
            filepath=dd_file,
            path="dataset_description.json",
            dandiset_path=tmp_path,
        )

    @staticmethod
    def _standard_names(metadata):  # type: ignore[no-untyped-def]
        if not _SCHEMA_BAREASSET_HAS_DATASTANDARD:
            pytest.skip("dandischema too old, no dataStandard on BareAsset")
        return [std.name for std in (metadata.dataStandard or [])]

    def test_bids_always_set(self, tmp_path: Path) -> None:
        dd = self._make_bids_dd(tmp_path, {"Name": "Test", "BIDSVersion": "1.9.0"})
        assert "Brain Imaging Data Structure (BIDS)" in self._standard_names(
            dd.get_metadata()
        )

    def test_hed_detected_when_hedversion_present(self, tmp_path: Path) -> None:
        dd = self._make_bids_dd(
            tmp_path,
            {"Name": "Test", "BIDSVersion": "1.9.0", "HEDVersion": "8.2.0"},
        )
        found = self._standard_names(dd.get_metadata())
        assert "Hierarchical Event Descriptors (HED)" in found
        assert "Brain Imaging Data Structure (BIDS)" in found

    def test_hed_not_detected_when_hedversion_absent(self, tmp_path: Path) -> None:
        dd = self._make_bids_dd(tmp_path, {"Name": "Test", "BIDSVersion": "1.9.0"})
        found = self._standard_names(dd.get_metadata())
        assert "Hierarchical Event Descriptors (HED)" not in found

    def test_hed_detected_with_list_hedversion(self, tmp_path: Path) -> None:
        """HEDVersion can be a list of strings per BIDS spec."""
        dd = self._make_bids_dd(
            tmp_path,
            {
                "Name": "Test",
                "BIDSVersion": "1.9.0",
                "HEDVersion": ["8.2.0", "sc:1.0.0"],
            },
        )
        assert "Hierarchical Event Descriptors (HED)" in self._standard_names(
            dd.get_metadata()
        )

    def test_hed_library_schemas_as_extensions(self, tmp_path: Path) -> None:
        """HED library schemas in list HEDVersion populate extensions."""
        if not _SCHEMA_BAREASSET_HAS_DATASTANDARD:
            pytest.skip("dandischema too old, no dataStandard on BareAsset")
        dd = self._make_bids_dd(
            tmp_path,
            {
                "Name": "Test",
                "BIDSVersion": "1.9.0",
                "HEDVersion": ["8.2.0", "sc:1.0.0", "lang:1.1.0"],
            },
        )
        meta = dd.get_metadata()
        # TODO: use metadata.dataStandard directly once min dandischema >= 0.12.2
        matches = [
            std
            for std in (getattr(meta, "dataStandard", None) or [])
            if std.name == "Hierarchical Event Descriptors (HED)"
        ]
        assert len(matches) == 1
        (hed,) = matches
        assert hed.version == "8.2.0"
        assert hed.extensions is not None
        assert {ext.name for ext in hed.extensions} == {"sc", "lang"}
        assert {ext.name: ext.version for ext in hed.extensions} == {
            "sc": "1.0.0",
            "lang": "1.1.0",
        }
Loading
Loading