From 755f72ac0898f6bc1061210a43da13a1ae7b2b5f Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 14:54:19 -0500 Subject: [PATCH 1/4] Add waterdata.get_combined_metadata for combined location + time-series inventory Wraps the Water Data API's combined-metadata collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. Each row carries every column from both source endpoints, so any location attribute (state, HUC, site type, drainage area, well depth, ...) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, ...) in a single query. Mirrors R's read_waterdata_combined_meta. Implementation re-uses the existing get_ogc_data infrastructure: the function is a thin parameter declaration plus a service / output_id pair (combined-metadata, combined_meta_id), since _switch_arg_id, _switch_properties_id, _construct_api_requests, and _walk_pages are all already service-agnostic. Closes #263. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 + dataretrieval/waterdata/__init__.py | 2 + dataretrieval/waterdata/api.py | 213 ++++++++++++++++++++++++++++ tests/waterdata_test.py | 35 +++++ 4 files changed, 252 insertions(+) diff --git a/NEWS.md b/NEWS.md index 4d7780de..beabe9d8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/05/2026:** Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. 
Mirrors R's `read_waterdata_combined_meta`. + **05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`. **05/01/2026:** The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after **2026-11-01**. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/. diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index 519ba6ff..22fb7d38 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -13,6 +13,7 @@ from .api import ( get_channel, get_codes, + get_combined_metadata, get_continuous, get_daily, get_field_measurements, @@ -43,6 +44,7 @@ "SERVICES", "get_channel", "get_codes", + "get_combined_metadata", "get_continuous", "get_daily", "get_field_measurements", diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 583b512c..9d25aa16 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -932,6 +932,219 @@ def get_time_series_metadata( return get_ogc_data(args, output_id, service) +def get_combined_metadata( + monitoring_location_id: str | list[str] | None = None, + parameter_code: str | list[str] | None = None, + parameter_name: str | list[str] | None = None, + parameter_description: str | list[str] | None = None, + unit_of_measure: str | list[str] | None = None, + statistic_id: str | list[str] | None = None, + data_type: str | list[str] | None = None, + computation_identifier: str | list[str] | None = None, + thresholds: int | None = None, + 
sublocation_identifier: str | list[str] | None = None, + primary: str | list[str] | None = None, + parent_time_series_id: str | list[str] | None = None, + web_description: str | list[str] | None = None, + last_modified: str | list[str] | None = None, + begin: str | list[str] | None = None, + end: str | list[str] | None = None, + agency_code: str | list[str] | None = None, + agency_name: str | list[str] | None = None, + monitoring_location_number: str | list[str] | None = None, + monitoring_location_name: str | list[str] | None = None, + district_code: str | list[str] | None = None, + country_code: str | list[str] | None = None, + country_name: str | list[str] | None = None, + state_code: str | list[str] | None = None, + state_name: str | list[str] | None = None, + county_code: str | list[str] | None = None, + county_name: str | list[str] | None = None, + minor_civil_division_code: str | list[str] | None = None, + site_type_code: str | list[str] | None = None, + site_type: str | list[str] | None = None, + hydrologic_unit_code: str | list[str] | None = None, + basin_code: str | list[str] | None = None, + altitude: str | list[str] | None = None, + altitude_accuracy: str | list[str] | None = None, + altitude_method_code: str | list[str] | None = None, + altitude_method_name: str | list[str] | None = None, + vertical_datum: str | list[str] | None = None, + vertical_datum_name: str | list[str] | None = None, + horizontal_positional_accuracy_code: str | list[str] | None = None, + horizontal_positional_accuracy: str | list[str] | None = None, + horizontal_position_method_code: str | list[str] | None = None, + horizontal_position_method_name: str | list[str] | None = None, + original_horizontal_datum: str | list[str] | None = None, + original_horizontal_datum_name: str | list[str] | None = None, + drainage_area: str | list[str] | None = None, + contributing_drainage_area: str | list[str] | None = None, + time_zone_abbreviation: str | list[str] | None = None, + 
uses_daylight_savings: str | list[str] | None = None, + construction_date: str | list[str] | None = None, + aquifer_code: str | list[str] | None = None, + national_aquifer_code: str | list[str] | None = None, + aquifer_type_code: str | list[str] | None = None, + well_constructed_depth: str | list[str] | None = None, + hole_constructed_depth: str | list[str] | None = None, + depth_source_code: str | list[str] | None = None, + properties: str | list[str] | None = None, + skip_geometry: bool | None = None, + bbox: list[float] | None = None, + limit: int | None = None, + filter: str | None = None, + filter_lang: FILTER_LANG | None = None, + convert_type: bool = True, +) -> tuple[pd.DataFrame, BaseMetadata]: + """Get combined monitoring-location and time-series metadata. + + The ``combined-metadata`` collection joins the monitoring-locations + catalog with the time-series-metadata catalog so that one row is + returned per (location, parameter, statistic) inventory entry, + carrying every column from both source endpoints. This makes it the + most flexible "what data is available" endpoint in the Water Data + API: any monitoring-location attribute (state, HUC, site type, + drainage area, well-construction depth, …) can be combined with any + time-series attribute (parameter code, statistic, data type, period + of record, …) in a single query. + + See the OpenAPI reference for the full list of supported fields: + https://api.waterdata.usgs.gov/ogcapi/v0/openapi?f=html#/combined-metadata + The R analogue is ``read_waterdata_combined_meta`` in + https://github.com/DOI-USGS/dataRetrieval/. + + Parameters + ---------- + monitoring_location_id : string or list of strings, optional + A unique identifier representing a single monitoring location. + Created by combining the agency code (e.g. ``USGS``) with the ID + number (e.g. ``02238500``), separated by a hyphen + (e.g. ``"USGS-02238500"``). 
+ parameter_code : string or list of strings, optional + 5-digit codes used to identify the constituent measured and the + units of measure. See + https://help.waterdata.usgs.gov/codes-and-parameters/parameters. + parameter_name : string or list of strings, optional + A human-understandable name corresponding to ``parameter_code``. + parameter_description : string or list of strings, optional + A human-readable description of what is being measured. + unit_of_measure : string or list of strings, optional + A human-readable description of the units of measurement + associated with an observation. + statistic_id : string or list of strings, optional + A code corresponding to the statistic an observation represents + (e.g. ``00001`` max, ``00002`` min, ``00003`` mean). Full list at + https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. + data_type : string or list of strings, optional + The type of data the time series represents, e.g. + ``"Continuous"``, ``"Daily"``, ``"Field measurements"``. + computation_identifier : string or list of strings, optional + Indicates whether the data from this time series represent a + specific statistical computation. + thresholds : numeric or list of numbers, optional + Numeric limits known for a time series (e.g. a historic maximum + or a level below which the sensor is non-operative). + sublocation_identifier : string or list of strings, optional + primary : string or list of strings, optional + A flag identifying whether the time series is "primary". Primary + time series are standard observations that have undergone Bureau + review and approval. Non-primary (provisional) time series have a + missing ``primary`` value, are produced for timely best-science + use, and are retained by this system for only 120 days. 
+ parent_time_series_id : string or list of strings, optional + web_description : string or list of strings, optional + A description of what this time series represents, as used by + WDFN and other USGS data dissemination products. + last_modified, begin, end : string, optional + Datetime fields that accept either an RFC 3339 datetime, an + interval (``"start/end"``, optionally half-bounded with ``..``), + or an ISO 8601 duration (e.g. ``"P1M"``, ``"PT36H"``). See + :func:`get_time_series_metadata` for the full grammar. + agency_code, agency_name, monitoring_location_number, \ +monitoring_location_name, district_code, country_code, country_name, \ +state_code, state_name, county_code, county_name, \ +minor_civil_division_code, site_type_code, site_type, \ +hydrologic_unit_code, basin_code : string or list of strings, optional + Location-catalog filters carried over from the + ``monitoring-locations`` collection. + altitude, altitude_accuracy, altitude_method_code, \ +altitude_method_name, vertical_datum, vertical_datum_name, \ +horizontal_positional_accuracy_code, horizontal_positional_accuracy, \ +horizontal_position_method_code, horizontal_position_method_name, \ +original_horizontal_datum, original_horizontal_datum_name, \ +drainage_area, contributing_drainage_area, time_zone_abbreviation, \ +uses_daylight_savings, construction_date : string or list of strings, optional + Spatial / datum / construction attributes carried over from the + ``monitoring-locations`` collection. + aquifer_code, national_aquifer_code, aquifer_type_code, \ +well_constructed_depth, hole_constructed_depth, depth_source_code : \ +string or list of strings, optional + Groundwater-well attributes (only populated for well sites). + properties : string or list of strings, optional + Subset of columns to return. Defaults to every available + property. 
+ skip_geometry : boolean, optional + Skip per-feature geometries; the returned object will be a plain + ``DataFrame`` with no spatial information. The Water Data APIs + use camelCase ``skipGeometry`` in CQL2 queries. + bbox : list of numbers, optional + Only features whose geometry intersects the bounding box are + selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326 + (longitude/latitude, west-south-east-north). + limit : numeric, optional + Page size; the maximum allowable value is 50000. Default + (``None``) requests the maximum allowable limit. + filter, filter_lang : optional + Server-side CQL filter passed through as the OGC ``filter`` / + ``filter-lang`` query parameters. See + :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + and the lexicographic-comparison pitfall. + convert_type : boolean, optional + If True, converts columns to appropriate types. + + Returns + ------- + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` + Formatted data returned from the API query. + md : :obj:`dataretrieval.utils.BaseMetadata` + A custom metadata object pertaining to the query. + + Examples + -------- + .. code:: + + >>> # All time series and field measurements at a single site + >>> df, md = dataretrieval.waterdata.get_combined_metadata( + ... monitoring_location_id="USGS-05407000" + ... ) + + >>> # Inventory across multiple HUCs, restricted to streams and springs + >>> df, md = dataretrieval.waterdata.get_combined_metadata( + ... hydrologic_unit_code=["11010008", "11010009"], + ... site_type=["Stream", "Spring"], + ... ) + + >>> # Discharge time series at three sites with at least one + >>> # observation in the past month + >>> df, md = dataretrieval.waterdata.get_combined_metadata( + ... monitoring_location_id=[ + ... "USGS-07069000", + ... "USGS-07064000", + ... "USGS-07068000", + ... ], + ... end="P1M", + ... parameter_code="00060", + ... 
) + + """ + service = "combined-metadata" + output_id = "combined_meta_id" + + args = _get_args(locals()) + + return get_ogc_data(args, output_id, service) + + def get_latest_continuous( monitoring_location_id: str | list[str] | None = None, parameter_code: str | list[str] | None = None, diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 0a7ec3c5..f34bfae2 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -9,6 +9,7 @@ from dataretrieval.waterdata import ( get_channel, + get_combined_metadata, get_continuous, get_daily, get_field_measurements, @@ -335,6 +336,40 @@ def test_get_time_series_metadata(): assert hasattr(md, "query_time") +def test_get_combined_metadata(): + df, md = get_combined_metadata( + monitoring_location_id="USGS-05407000", + skip_geometry=True, + ) + # Combined metadata returns one row per (parameter, statistic, data_type), + # carrying both location-catalog and time-series-catalog columns. + assert "monitoring_location_id" in df.columns + assert "parameter_code" in df.columns + assert "data_type" in df.columns + assert "drainage_area" in df.columns + assert (df["monitoring_location_id"] == "USGS-05407000").all() + assert hasattr(md, "url") + assert hasattr(md, "query_time") + + +def test_get_combined_metadata_multi_site_post(): + df, md = get_combined_metadata( + monitoring_location_id=[ + "USGS-07069000", + "USGS-07064000", + "USGS-07068000", + ], + parameter_code="00060", + skip_geometry=True, + ) + assert set(df["monitoring_location_id"].unique()) == { + "USGS-07069000", + "USGS-07064000", + "USGS-07068000", + } + assert (df["parameter_code"] == "00060").all() + + def test_get_reference_table(): df, md = get_reference_table("agency-codes") assert "agency_code" in df.columns From f12deff78bf9c3f2f1075a4c0b7fd063ef67e192 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 15:01:51 -0500 Subject: [PATCH 2/4] Address /simplify findings on get_combined_metadata - thresholds: int | None -> int | 
list[int] | None to match the docstring's "numeric or list of numbers" promise. - Replace the backslash-line-continued multi-parameter docstring group with a short numpydoc-valid entry that documents the most-used location filters (state_name, county_name, hydrologic_unit_code, site_type, site_type_code) and points the reader at get_monitoring_locations for the long tail. The previous form was not valid numpydoc syntax, and a single 800+ char one-line group fails ruff E501. - Drop a WHAT-narrating comment from test_get_combined_metadata; the assertions speak for themselves. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 35 ++++++++++++++-------------------- tests/waterdata_test.py | 2 -- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 9d25aa16..feb2f6a0 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -941,7 +941,7 @@ def get_combined_metadata( statistic_id: str | list[str] | None = None, data_type: str | list[str] | None = None, computation_identifier: str | list[str] | None = None, - thresholds: int | None = None, + thresholds: int | list[int] | None = None, sublocation_identifier: str | list[str] | None = None, primary: str | list[str] | None = None, parent_time_series_id: str | list[str] | None = None, @@ -1013,6 +1013,11 @@ def get_combined_metadata( The R analogue is ``read_waterdata_combined_meta`` in https://github.com/DOI-USGS/dataRetrieval/. + All ~40 location-catalog kwargs are accepted (``agency_code``, + ``state_name``, ``drainage_area``, ``aquifer_code``, …) but only + the most-used ones are documented below; see + :func:`get_monitoring_locations` for per-field descriptions. 
+ Parameters ---------- monitoring_location_id : string or list of strings, optional @@ -1060,26 +1065,14 @@ def get_combined_metadata( interval (``"start/end"``, optionally half-bounded with ``..``), or an ISO 8601 duration (e.g. ``"P1M"``, ``"PT36H"``). See :func:`get_time_series_metadata` for the full grammar. - agency_code, agency_name, monitoring_location_number, \ -monitoring_location_name, district_code, country_code, country_name, \ -state_code, state_name, county_code, county_name, \ -minor_civil_division_code, site_type_code, site_type, \ -hydrologic_unit_code, basin_code : string or list of strings, optional - Location-catalog filters carried over from the - ``monitoring-locations`` collection. - altitude, altitude_accuracy, altitude_method_code, \ -altitude_method_name, vertical_datum, vertical_datum_name, \ -horizontal_positional_accuracy_code, horizontal_positional_accuracy, \ -horizontal_position_method_code, horizontal_position_method_name, \ -original_horizontal_datum, original_horizontal_datum_name, \ -drainage_area, contributing_drainage_area, time_zone_abbreviation, \ -uses_daylight_savings, construction_date : string or list of strings, optional - Spatial / datum / construction attributes carried over from the - ``monitoring-locations`` collection. - aquifer_code, national_aquifer_code, aquifer_type_code, \ -well_constructed_depth, hole_constructed_depth, depth_source_code : \ -string or list of strings, optional - Groundwater-well attributes (only populated for well sites). + state_name, county_name, hydrologic_unit_code, site_type, \ +site_type_code : string or list of strings, optional + Common location-catalog filters carried over from the + ``monitoring-locations`` collection. The function also accepts + the full list of location-catalog kwargs (agency, district, + altitude, vertical/horizontal datum, drainage area, aquifer, + well construction, …); see :func:`get_monitoring_locations` for + descriptions of each. 
properties : string or list of strings, optional Subset of columns to return. Defaults to every available property. diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index f34bfae2..f0914959 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -341,8 +341,6 @@ def test_get_combined_metadata(): monitoring_location_id="USGS-05407000", skip_geometry=True, ) - # Combined metadata returns one row per (parameter, statistic, data_type), - # carrying both location-catalog and time-series-catalog columns. assert "monitoring_location_id" in df.columns assert "parameter_code" in df.columns assert "data_type" in df.columns From 403bdd8a269f77cc410220873cd10684f8d9e9e2 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 15:13:56 -0500 Subject: [PATCH 3/4] Adapt R doc examples for get_combined_metadata, fix data_type values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ported three additional examples from R's read_waterdata_combined_meta that aren't redundant with the ones we already had: - Groundwater well — surfaces water-level and aquifer columns that the surface-water example shows as nulls. - State + county — common area-of-interest workflow. - Two-step "inventory then fetch" chain — get_combined_metadata to find what's available in a HUC, then get_continuous to pull the actual observations at every site found. Also corrected the data_type description: the live API returns "Continuous values" and "Daily values" (with the word "values"), not "Continuous" / "Daily" as the docstring previously claimed. Verified against api.waterdata.usgs.gov. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index feb2f6a0..0a3e16e7 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -1042,7 +1042,8 @@ def get_combined_metadata( https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. data_type : string or list of strings, optional The type of data the time series represents, e.g. - ``"Continuous"``, ``"Daily"``, ``"Field measurements"``. + ``"Continuous values"``, ``"Daily values"``, + ``"Field measurements"``. computation_identifier : string or list of strings, optional Indicates whether the data from this time series represent a specific statistical computation. @@ -1106,11 +1107,22 @@ def get_combined_metadata( -------- .. code:: - >>> # All time series and field measurements at a single site + >>> # All time series and field measurements at a single surface-water site >>> df, md = dataretrieval.waterdata.get_combined_metadata( ... monitoring_location_id="USGS-05407000" ... ) + >>> # Same, for a groundwater well — water-level and aquifer columns + >>> # are populated where the surface-water example has nulls + >>> df, md = dataretrieval.waterdata.get_combined_metadata( + ... monitoring_location_id="USGS-375907091432201" + ... ) + + >>> # Every series in a single county, useful for area-of-interest workflows + >>> df, md = dataretrieval.waterdata.get_combined_metadata( + ... state_name="Wisconsin", county_name="Dane County" + ... ) + >>> # Inventory across multiple HUCs, restricted to streams and springs >>> df, md = dataretrieval.waterdata.get_combined_metadata( ... hydrologic_unit_code=["11010008", "11010009"], @@ -1129,6 +1141,20 @@ def get_combined_metadata( ... parameter_code="00060", ... ) + >>> # Two-step "what's available?" → "fetch it" workflow: + >>> # 1. 
inventory the sites in two HUCs + >>> hucs, _ = dataretrieval.waterdata.get_combined_metadata( + ... hydrologic_unit_code=["11010008", "11010009"], + ... site_type="Stream", + ... ) + >>> # 2. pull continuous discharge at every distinct site found + >>> sites = hucs["monitoring_location_id"].unique().tolist() + >>> df, md = dataretrieval.waterdata.get_continuous( + ... monitoring_location_id=sites, + ... parameter_code="00060", + ... time="P1D", + ... ) + """ service = "combined-metadata" output_id = "combined_meta_id" From 8c7a20655330a0e468c7c6d7b493e6437e3f41c6 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 15:36:01 -0500 Subject: [PATCH 4/4] Address Copilot review on get_combined_metadata - Widen thresholds: int | list[int] | None -> float | list[float] | None to match the docstring's "numeric or list of numbers" promise. The Water Data API treats threshold values as floats, so the previous int-only annotation was misleading downstream type-checked callers. - In test_get_combined_metadata_multi_site_post, swap the unused `md` binding for `_` to match the convention used by the other live waterdata tests in this file (`df, _ = get_*(...)`). The companion test_get_combined_metadata still binds `md` because it asserts on metadata attributes. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 2 +- tests/waterdata_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 0a3e16e7..9dd083f7 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -941,7 +941,7 @@ def get_combined_metadata( statistic_id: str | list[str] | None = None, data_type: str | list[str] | None = None, computation_identifier: str | list[str] | None = None, - thresholds: int | list[int] | None = None, + thresholds: float | list[float] | None = None, sublocation_identifier: str | list[str] | None = None, primary: str | list[str] | None = None, parent_time_series_id: str | list[str] | None = None, diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index f0914959..a77afeaa 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -351,7 +351,7 @@ def test_get_combined_metadata(): def test_get_combined_metadata_multi_site_post(): - df, md = get_combined_metadata( + df, _ = get_combined_metadata( monitoring_location_id=[ "USGS-07069000", "USGS-07064000",