From 5dc014690ab3d47dc2fb0959141a87bb1d9a6bb0 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 12:59:51 -0500 Subject: [PATCH 1/4] Add waterdata.get_samples_summary for per-location sample inventory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps the Samples database /summary/{monitoringLocationIdentifier} endpoint, mirroring the R package's summarize_waterdata_samples. Returns per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location — useful for taking inventory of what discrete-sample data exists at a site before pulling observations with get_samples. The Samples summary endpoint accepts only a single monitoring location per request, so the function takes a string (not a list). Closes #261. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 + dataretrieval/waterdata/__init__.py | 2 + dataretrieval/waterdata/api.py | 58 +++++++++++++++++++++++++++++ tests/data/samples_summary.txt | 6 +++ tests/waterdata_test.py | 28 ++++++++++++++ 5 files changed, 96 insertions(+) create mode 100644 tests/data/samples_summary.txt diff --git a/NEWS.md b/NEWS.md index 31299d58..4d7780de 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`. + **05/01/2026:** The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after **2026-11-01**. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/. **04/23/2026:** Added `waterdata.get_nearest_continuous(targets, ...)` — for each of N target timestamps, fetches the single continuous observation closest to that timestamp in one HTTP round-trip (auto-chunked when the resulting CQL filter is long, via the facility added in #238). The helper is designed for workflows that pair many discrete-measurement timestamps with surrounding instantaneous data, which the OGC `time` parameter can't express since it only accepts one instant or one interval per request. Ties at window midpoints are resolved per a configurable `on_tie` ∈ {`"first"`, `"last"`, `"mean"`}; the default `window="PT7M30S"` matches a 15-minute continuous gauge. diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index f0df1f1d..519ba6ff 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -21,6 +21,7 @@ get_monitoring_locations, get_reference_table, get_samples, + get_samples_summary, get_stats_date_range, get_stats_por, get_time_series_metadata, @@ -51,6 +52,7 @@ "get_nearest_continuous", "get_reference_table", "get_samples", + "get_samples_summary", "get_stats_date_range", "get_stats_por", "get_time_series_metadata", diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index d4bb647a..500c5b47 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -1800,6 +1800,64 @@ def get_samples( return df, BaseMetadata(response) +def get_samples_summary( + monitoringLocationIdentifier: str, + ssl_check: bool = True, +) -> tuple[pd.DataFrame, BaseMetadata]: + """Get a summary of samples available at a single monitoring location. + + Wraps the Samples database ``/summary/{monitoringLocationIdentifier}`` + endpoint, which returns one row per (characteristic group, characteristic, + user-supplied characteristic) combination with result and activity counts + and the first / most recent activity dates. This is useful for taking an + inventory of what discrete-sample data exists at a site before pulling + the underlying observations with :func:`get_samples`. + + The Samples summary endpoint only accepts a single monitoring location + per request. + + See https://api.waterdata.usgs.gov/samples-data/docs#/summaries for the + full API reference. + + Parameters + ---------- + monitoringLocationIdentifier : string + A monitoring location identifier in ``AGENCY-ID`` format, e.g. + ``"USGS-04183500"``. + ssl_check : bool, optional + Check the SSL certificate. Default is True. + + Returns + ------- + df : ``pandas.DataFrame`` + Formatted data returned from the API query. + md : :obj:`dataretrieval.utils.Metadata` + Custom ``dataretrieval`` metadata object pertaining to the query. + + Examples + -------- + .. code:: + + >>> # What discrete-sample data is available at this site? + >>> df, md = dataretrieval.waterdata.get_samples_summary( + ... monitoringLocationIdentifier="USGS-04183500" + ... ) + + """ + url = f"{SAMPLES_URL}/summary/{monitoringLocationIdentifier}" + params = {"mimeType": "text/csv"} + + response = requests.get( + url, params=params, verify=ssl_check, headers=_default_headers() + ) + + response.raise_for_status() + + df = pd.read_csv(StringIO(response.text), delimiter=",") + + return df, BaseMetadata(response) + + def get_stats_por( approval_status: str | None = None, computation_type: str | list[str] | None = None, diff --git a/tests/data/samples_summary.txt b/tests/data/samples_summary.txt new file mode 100644 index 00000000..afb376cd --- /dev/null +++ b/tests/data/samples_summary.txt @@ -0,0 +1,6 @@ +monitoringLocationIdentifier,characteristicGroup,characteristic,characteristicUserSupplied,resultCount,activityCount,firstActivity,mostRecentActivity +USGS-04183500,Information,Bottle or bag sampler material (construction),Bottle or bag sampler material (construction),893,893,2017-01-02,2026-04-28 +USGS-04183500,Information,NWIS lot number,"NWIS lot number, sulfuric acid, 4.5 normal (1:7), 1 milliliter, National Field Supply Service (NFSS) stock number Q438FLD",893,893,2017-01-02,2026-04-28 +USGS-04183500,Information,NWIS lot number,"NWIS lot number, vacuum tube, 10.5 milliliters, FCCVT (filtered, chilled, vacuum tube)",877,877,2017-01-02,2026-04-28 +USGS-04183500,Information,Number of sampling points,Number of sampling points,136,136,2013-10-23,2026-04-28 +USGS-04183500,Information,Sampler nozzle diameter,Sampler nozzle diameter,97,97,2017-01-24,2026-04-28 diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 195441e5..1d07e03e 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -17,6 +17,7 @@ get_monitoring_locations, get_reference_table, get_samples, + get_samples_summary, get_stats_date_range, get_stats_por, get_time_series_metadata, @@ -57,6 +58,33 @@ def test_mock_get_samples(requests_mock): assert md.comment is None +def test_mock_get_samples_summary(requests_mock): + """Tests USGS Samples summary query""" + request_url = ( + "https://api.waterdata.usgs.gov/samples-data/summary/USGS-04183500" + "?mimeType=text%2Fcsv" + ) + response_file_path = "tests/data/samples_summary.txt" + mock_request(requests_mock, request_url, response_file_path) + df, md = get_samples_summary(monitoringLocationIdentifier="USGS-04183500") + assert type(df) is DataFrame + assert list(df.columns) == [ + "monitoringLocationIdentifier", + "characteristicGroup", + "characteristic", + "characteristicUserSupplied", + "resultCount", + "activityCount", + "firstActivity", + "mostRecentActivity", + ] + assert (df["monitoringLocationIdentifier"] == "USGS-04183500").all() + assert md.url == request_url + assert isinstance(md.query_time, datetime.timedelta) + assert md.header == {"mock_header": "value"} + assert md.comment is None + + def test_check_profiles(): """Tests that correct errors are raised for invalid profiles.""" with pytest.raises(ValueError): From 491db88e95f11359cdc02c9a0f7c4c35350c9e52 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 13:03:47 -0500 Subject: [PATCH 2/4] Address /simplify findings on get_samples_summary - URL-encode the path-segment monitoringLocationIdentifier so values containing /, ?, # or whitespace cannot break URL composition. - Log the resolved request URL via PreparedRequest, matching get_samples. - Loosen the test column assertion from exact-list to subset so a non-breaking server-side column addition does not flake the test. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 7 ++++++- tests/waterdata_test.py | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 500c5b47..67df7aaf 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -10,6 +10,7 @@ import logging from io import StringIO from typing import get_args +from urllib.parse import quote import pandas as pd import requests @@ -1844,9 +1845,13 @@ def get_samples_summary( ... ) """ - url = f"{SAMPLES_URL}/summary/{monitoringLocationIdentifier}" + url = f"{SAMPLES_URL}/summary/{quote(monitoringLocationIdentifier, safe='')}" params = {"mimeType": "text/csv"} + req = PreparedRequest() + req.prepare_url(url, params=params) + logger.info("Request: %s", req.url) + response = requests.get( url, params=params, verify=ssl_check, headers=_default_headers() ) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 1d07e03e..feec681b 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -68,16 +68,16 @@ def test_mock_get_samples_summary(requests_mock): mock_request(requests_mock, request_url, response_file_path) df, md = get_samples_summary(monitoringLocationIdentifier="USGS-04183500") assert type(df) is DataFrame - assert list(df.columns) == [ + expected_columns = { "monitoringLocationIdentifier", "characteristicGroup", "characteristic", - "characteristicUserSupplied", "resultCount", "activityCount", "firstActivity", "mostRecentActivity", - ] + } + assert expected_columns.issubset(df.columns) assert (df["monitoringLocationIdentifier"] == "USGS-04183500").all() assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) From bbbb3652392defb5928e304de90a144d001faef5 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 13:10:13 -0500 Subject: [PATCH 3/4] Adapt R doc for get_samples_summary, drop two doc claims that are wrong for this endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adapted the wording from R's summarize_waterdata_samples (in the develop branch of DOI-USGS/dataRetrieval) to match the Python module's docstring style. Picked up the variety-of-agencies example IDs from the R doc. Two claims from the R doc were corrected rather than copied: - The R doc says "Location identifiers should be separated with commas" with a multi-ID example. That contradicts the function's own one-site check and is wrong for the summary service (which accepts exactly one ID). Dropped. - The R doc says "Location numbers without an agency prefix are assumed to have the prefix USGS." That's not true for this endpoint at the API level — bare IDs return an empty result with a different column shape. Documented the actual behavior instead. Also switched the example to USGS-04074950 (the site used by the R doc's example) so the two repos line up. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 67df7aaf..17582a0b 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -1805,26 +1805,28 @@ def get_samples_summary( monitoringLocationIdentifier: str, ssl_check: bool = True, ) -> tuple[pd.DataFrame, BaseMetadata]: - """Get a summary of samples available at a single monitoring location. + """Get a summary of discrete water-quality samples at a single monitoring location. - Wraps the Samples database ``/summary/{monitoringLocationIdentifier}`` - endpoint, which returns one row per (characteristic group, characteristic, - user-supplied characteristic) combination with result and activity counts - and the first / most recent activity dates. This is useful for taking an - inventory of what discrete-sample data exists at a site before pulling - the underlying observations with :func:`get_samples`. + Wraps the Samples database summary service described at + https://api.waterdata.usgs.gov/samples-data/docs. The service returns one + row per (characteristic group, characteristic, user-supplied characteristic) + combination with result and activity counts and the first / most recent + activity dates — useful for taking inventory of what discrete-sample data + exists at a site before pulling the underlying observations with + :func:`get_samples`. - The Samples summary endpoint only accepts a single monitoring location - per request. - - See https://api.waterdata.usgs.gov/samples-data/docs#/summaries for the - full API reference. + The summary service is single-site only: it accepts exactly one monitoring + location per request. Parameters ---------- monitoringLocationIdentifier : string - A monitoring location identifier in ``AGENCY-ID`` format, e.g. - ``"USGS-04183500"``. + A monitoring location identifier has two parts, separated by a dash + (``-``): the agency code and the location number. Examples: + ``"USGS-040851385"``, ``"AZ014-320821110580701"``, + ``"CAX01-15304600"``. Bare location numbers without an agency prefix + are accepted by the service but return an empty result, so a prefix + is effectively required. ssl_check : bool, optional Check the SSL certificate. Default is True. @@ -1841,7 +1843,7 @@ def get_samples_summary( >>> # What discrete-sample data is available at this site? >>> df, md = dataretrieval.waterdata.get_samples_summary( - ... monitoringLocationIdentifier="USGS-04183500" + ... monitoringLocationIdentifier="USGS-04074950" ... ) """ From 96acd0c44b31d424fc8db3b6f0d14cfdb770639c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 13:15:42 -0500 Subject: [PATCH 4/4] Address Copilot review on get_samples_summary - Reject non-str monitoringLocationIdentifier with a TypeError that explains the constraint, instead of letting urllib.parse.quote raise a low-level TypeError. This matches R's summarize_waterdata_samples, which guards with `if (length(monitoringLocationIdentifier) > 1) stop(...)`. - Restore characteristicUserSupplied in the column-subset assertion; /simplify's "loosen exact-list to subset" was applied too aggressively and dropped a real schema column that disambiguates grouping. - Add a regression test that a list input raises the new TypeError. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 7 +++++++ tests/waterdata_test.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 17582a0b..583b512c 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -1847,6 +1847,13 @@ def get_samples_summary( ... ) """ + if not isinstance(monitoringLocationIdentifier, str): + raise TypeError( + "monitoringLocationIdentifier must be a string; the Samples " + "summary service accepts exactly one monitoring location per " + f"request, got {type(monitoringLocationIdentifier).__name__}." + ) + url = f"{SAMPLES_URL}/summary/{quote(monitoringLocationIdentifier, safe='')}" params = {"mimeType": "text/csv"} diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index feec681b..0a7ec3c5 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -72,6 +72,7 @@ def test_mock_get_samples_summary(requests_mock): "monitoringLocationIdentifier", "characteristicGroup", "characteristic", + "characteristicUserSupplied", "resultCount", "activityCount", "firstActivity", @@ -85,6 +86,12 @@ def test_mock_get_samples_summary(requests_mock): assert md.comment is None +def test_get_samples_summary_rejects_list(): + """The summary endpoint accepts only one site; a list must raise TypeError.""" + with pytest.raises(TypeError, match="exactly one monitoring location"): + get_samples_summary(monitoringLocationIdentifier=["USGS-04183500"]) + + def test_check_profiles(): """Tests that correct errors are raised for invalid profiles.""" with pytest.raises(ValueError):