diff --git a/NEWS.md b/NEWS.md index 31299d58..4d7780de 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`. + **05/01/2026:** The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after **2026-11-01**. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/. **04/23/2026:** Added `waterdata.get_nearest_continuous(targets, ...)` — for each of N target timestamps, fetches the single continuous observation closest to that timestamp in one HTTP round-trip (auto-chunked when the resulting CQL filter is long, via the facility added in #238). The helper is designed for workflows that pair many discrete-measurement timestamps with surrounding instantaneous data, which the OGC `time` parameter can't express since it only accepts one instant or one interval per request. Ties at window midpoints are resolved per a configurable `on_tie` ∈ {`"first"`, `"last"`, `"mean"`}; the default `window="PT7M30S"` matches a 15-minute continuous gauge. 
def get_samples_summary(
    monitoringLocationIdentifier: str,
    ssl_check: bool = True,
) -> tuple[pd.DataFrame, BaseMetadata]:
    """Get a summary of discrete water-quality samples at a single monitoring location.

    Wraps the Samples database summary service described at
    https://api.waterdata.usgs.gov/samples-data/docs. The service returns one
    row per (characteristic group, characteristic, user-supplied characteristic)
    combination with result and activity counts and the first / most recent
    activity dates — useful for taking inventory of what discrete-sample data
    exists at a site before pulling the underlying observations with
    :func:`get_samples`.

    The summary service is single-site only: it accepts exactly one monitoring
    location per request.

    Parameters
    ----------
    monitoringLocationIdentifier : string
        A monitoring location identifier has two parts, separated by a dash
        (``-``): the agency code and the location number. Examples:
        ``"USGS-040851385"``, ``"AZ014-320821110580701"``,
        ``"CAX01-15304600"``. Bare location numbers without an agency prefix
        are accepted by the service but return an empty result, so a prefix
        is effectively required. Must be a non-empty string.
    ssl_check : bool, optional
        Check the SSL certificate. Default is True.

    Returns
    -------
    df : ``pandas.DataFrame``
        Formatted data returned from the API query. If the response body
        contains no CSV content at all, an empty DataFrame is returned.
    md : :obj:`dataretrieval.utils.Metadata`
        Custom ``dataretrieval`` metadata object pertaining to the query.

    Raises
    ------
    TypeError
        If ``monitoringLocationIdentifier`` is not a string (for example a
        list of sites).
    ValueError
        If ``monitoringLocationIdentifier`` is empty or only whitespace.
    requests.HTTPError
        If the service responds with an error status code.

    Examples
    --------
    .. code::

        >>> # What discrete-sample data is available at this site?
        >>> df, md = dataretrieval.waterdata.get_samples_summary(
        ...     monitoringLocationIdentifier="USGS-04074950"
        ... )

    """
    if not isinstance(monitoringLocationIdentifier, str):
        raise TypeError(
            "monitoringLocationIdentifier must be a string; the Samples "
            "summary service accepts exactly one monitoring location per "
            f"request, got {type(monitoringLocationIdentifier).__name__}."
        )
    # A blank identifier would collapse the path to ".../summary/", which is
    # not the per-site route; fail early with a clear message instead of
    # surfacing an opaque HTTP error.
    if not monitoringLocationIdentifier.strip():
        raise ValueError(
            "monitoringLocationIdentifier must be a non-empty string such as "
            "'USGS-04074950'."
        )

    # safe="" percent-encodes "/" as well, so the identifier always stays a
    # single path segment.
    url = f"{SAMPLES_URL}/summary/{quote(monitoringLocationIdentifier, safe='')}"
    params = {"mimeType": "text/csv"}

    # Prepare the request separately so the fully-encoded URL is logged
    # before any network traffic happens.
    req = PreparedRequest()
    req.prepare_url(url, params=params)
    logger.info("Request: %s", req.url)

    response = requests.get(
        url, params=params, verify=ssl_check, headers=_default_headers()
    )

    response.raise_for_status()

    try:
        df = pd.read_csv(StringIO(response.text), delimiter=",")
    except pd.errors.EmptyDataError:
        # The body had no header line at all; report "no data" as an empty
        # frame rather than failing the call.
        df = pd.DataFrame()

    return df, BaseMetadata(response)
def test_mock_get_samples_summary(requests_mock):
    """Round-trip a mocked Samples summary response for a single site."""
    fixture_path = "tests/data/samples_summary.txt"
    expected_url = (
        "https://api.waterdata.usgs.gov/samples-data/summary/USGS-04183500"
        "?mimeType=text%2Fcsv"
    )
    mock_request(requests_mock, expected_url, fixture_path)

    df, md = get_samples_summary(monitoringLocationIdentifier="USGS-04183500")

    # Shape of the returned table.
    assert type(df) is DataFrame
    required_columns = {
        "monitoringLocationIdentifier",
        "characteristicGroup",
        "characteristic",
        "characteristicUserSupplied",
        "resultCount",
        "activityCount",
        "firstActivity",
        "mostRecentActivity",
    }
    assert required_columns <= set(df.columns)
    assert df["monitoringLocationIdentifier"].eq("USGS-04183500").all()

    # Metadata captured from the (mocked) response.
    assert md.url == expected_url
    assert isinstance(md.query_time, datetime.timedelta)
    assert md.header == {"mock_header": "value"}
    assert md.comment is None


def test_get_samples_summary_rejects_list():
    """A list of sites is rejected with TypeError: the endpoint is single-site."""
    several_sites = ["USGS-04183500"]
    with pytest.raises(TypeError, match="exactly one monitoring location"):
        get_samples_summary(monitoringLocationIdentifier=several_sites)