From 5dc014690ab3d47dc2fb0959141a87bb1d9a6bb0 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 5 May 2026 12:59:51 -0500
Subject: [PATCH 1/4] Add waterdata.get_samples_summary for per-location sample
 inventory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wraps the Samples database /summary/{monitoringLocationIdentifier}
endpoint, mirroring the R package's summarize_waterdata_samples. Returns
per-characteristic result and activity counts plus first / most recent
activity dates for a single monitoring location — useful for taking
inventory of what discrete-sample data exists at a site before pulling
observations with get_samples.

The Samples summary endpoint accepts only a single monitoring location
per request, so the function takes a string (not a list).

Closes #261.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NEWS.md                             |  2 +
 dataretrieval/waterdata/__init__.py |  2 +
 dataretrieval/waterdata/api.py      | 58 +++++++++++++++++++++++++++++
 tests/data/samples_summary.txt      |  6 +++
 tests/waterdata_test.py             | 28 ++++++++++++++
 5 files changed, 96 insertions(+)
 create mode 100644 tests/data/samples_summary.txt

diff --git a/NEWS.md b/NEWS.md
index 31299d58..4d7780de 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,5 @@
+**05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.
+
 **05/01/2026:** The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after **2026-11-01**. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/.
 
 **04/23/2026:** Added `waterdata.get_nearest_continuous(targets, ...)` — for each of N target timestamps, fetches the single continuous observation closest to that timestamp in one HTTP round-trip (auto-chunked when the resulting CQL filter is long, via the facility added in #238). The helper is designed for workflows that pair many discrete-measurement timestamps with surrounding instantaneous data, which the OGC `time` parameter can't express since it only accepts one instant or one interval per request. Ties at window midpoints are resolved per a configurable `on_tie` ∈ {`"first"`, `"last"`, `"mean"`}; the default `window="PT7M30S"` matches a 15-minute continuous gauge.
diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py
index f0df1f1d..519ba6ff 100644
--- a/dataretrieval/waterdata/__init__.py
+++ b/dataretrieval/waterdata/__init__.py
@@ -21,6 +21,7 @@
     get_monitoring_locations,
     get_reference_table,
     get_samples,
+    get_samples_summary,
     get_stats_date_range,
     get_stats_por,
     get_time_series_metadata,
@@ -51,6 +52,7 @@
     "get_nearest_continuous",
     "get_reference_table",
     "get_samples",
+    "get_samples_summary",
     "get_stats_date_range",
     "get_stats_por",
     "get_time_series_metadata",
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index d4bb647a..500c5b47 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -1800,6 +1800,64 @@ def get_samples(
     return df, BaseMetadata(response)
 
 
+def get_samples_summary(
+    monitoringLocationIdentifier: str,
+    ssl_check: bool = True,
+) -> tuple[pd.DataFrame, BaseMetadata]:
+    """Get a summary of samples available at a single monitoring location.
+
+    Wraps the Samples database ``/summary/{monitoringLocationIdentifier}``
+    endpoint, which returns one row per (characteristic group, characteristic,
+    user-supplied characteristic) combination with result and activity counts
+    and the first / most recent activity dates. This is useful for taking an
+    inventory of what discrete-sample data exists at a site before pulling
+    the underlying observations with :func:`get_samples`.
+
+    The Samples summary endpoint only accepts a single monitoring location
+    per request.
+
+    See https://api.waterdata.usgs.gov/samples-data/docs#/summaries for the
+    full API reference.
+
+    Parameters
+    ----------
+    monitoringLocationIdentifier : string
+        A monitoring location identifier in ``AGENCY-ID`` format, e.g.
+        ``"USGS-04183500"``.
+    ssl_check : bool, optional
+        Check the SSL certificate. Default is True.
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        Formatted data returned from the API query.
+    md : :obj:`dataretrieval.utils.BaseMetadata`
+        Custom ``dataretrieval`` metadata object pertaining to the query.
+
+    Examples
+    --------
+    .. code::
+
+        >>> # What discrete-sample data is available at this site?
+        >>> df, md = dataretrieval.waterdata.get_samples_summary(
+        ...     monitoringLocationIdentifier="USGS-04183500"
+        ... )
+
+    """
+    url = f"{SAMPLES_URL}/summary/{monitoringLocationIdentifier}"
+    params = {"mimeType": "text/csv"}
+
+    response = requests.get(
+        url, params=params, verify=ssl_check, headers=_default_headers()
+    )
+
+    response.raise_for_status()
+
+    df = pd.read_csv(StringIO(response.text), delimiter=",")
+
+    return df, BaseMetadata(response)
+
+
 def get_stats_por(
     approval_status: str | None = None,
     computation_type: str | list[str] | None = None,
diff --git a/tests/data/samples_summary.txt b/tests/data/samples_summary.txt
new file mode 100644
index 00000000..afb376cd
--- /dev/null
+++ b/tests/data/samples_summary.txt
@@ -0,0 +1,6 @@
+monitoringLocationIdentifier,characteristicGroup,characteristic,characteristicUserSupplied,resultCount,activityCount,firstActivity,mostRecentActivity
+USGS-04183500,Information,Bottle or bag sampler material (construction),Bottle or bag sampler material (construction),893,893,2017-01-02,2026-04-28
+USGS-04183500,Information,NWIS lot number,"NWIS lot number, sulfuric acid, 4.5 normal (1:7), 1 milliliter, National Field Supply Service (NFSS) stock number Q438FLD",893,893,2017-01-02,2026-04-28
+USGS-04183500,Information,NWIS lot number,"NWIS lot number, vacuum tube, 10.5 milliliters, FCCVT (filtered, chilled, vacuum tube)",877,877,2017-01-02,2026-04-28
+USGS-04183500,Information,Number of sampling points,Number of sampling points,136,136,2013-10-23,2026-04-28
+USGS-04183500,Information,Sampler nozzle diameter,Sampler nozzle diameter,97,97,2017-01-24,2026-04-28
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index 195441e5..1d07e03e 100644
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -17,6 +17,7 @@
     get_monitoring_locations,
     get_reference_table,
     get_samples,
+    get_samples_summary,
     get_stats_date_range,
     get_stats_por,
     get_time_series_metadata,
@@ -57,6 +58,33 @@ def test_mock_get_samples(requests_mock):
     assert md.comment is None
 
 
+def test_mock_get_samples_summary(requests_mock):
+    """Tests USGS Samples summary query"""
+    request_url = (
+        "https://api.waterdata.usgs.gov/samples-data/summary/USGS-04183500"
+        "?mimeType=text%2Fcsv"
+    )
+    response_file_path = "tests/data/samples_summary.txt"
+    mock_request(requests_mock, request_url, response_file_path)
+    df, md = get_samples_summary(monitoringLocationIdentifier="USGS-04183500")
+    assert type(df) is DataFrame
+    assert list(df.columns) == [
+        "monitoringLocationIdentifier",
+        "characteristicGroup",
+        "characteristic",
+        "characteristicUserSupplied",
+        "resultCount",
+        "activityCount",
+        "firstActivity",
+        "mostRecentActivity",
+    ]
+    assert (df["monitoringLocationIdentifier"] == "USGS-04183500").all()
+    assert md.url == request_url
+    assert isinstance(md.query_time, datetime.timedelta)
+    assert md.header == {"mock_header": "value"}
+    assert md.comment is None
+
+
 def test_check_profiles():
     """Tests that correct errors are raised for invalid profiles."""
     with pytest.raises(ValueError):

From 491db88e95f11359cdc02c9a0f7c4c35350c9e52 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 5 May 2026 13:03:47 -0500
Subject: [PATCH 2/4] Address /simplify findings on get_samples_summary

- URL-encode the path-segment monitoringLocationIdentifier so values
  containing /, ?, # or whitespace cannot break URL composition.
- Log the resolved request URL via PreparedRequest, matching get_samples.
- Loosen the test column assertion from exact-list to subset so a
  backward-compatible server-side column addition does not fail the test.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/api.py | 7 ++++++-
 tests/waterdata_test.py        | 6 +++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index 500c5b47..67df7aaf 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -10,6 +10,7 @@
 import logging
 from io import StringIO
 from typing import get_args
+from urllib.parse import quote
 
 import pandas as pd
 import requests
@@ -1844,9 +1845,13 @@ def get_samples_summary(
         ... )
 
     """
-    url = f"{SAMPLES_URL}/summary/{monitoringLocationIdentifier}"
+    url = f"{SAMPLES_URL}/summary/{quote(monitoringLocationIdentifier, safe='')}"
     params = {"mimeType": "text/csv"}
 
+    req = PreparedRequest()
+    req.prepare_url(url, params=params)
+    logger.info("Request: %s", req.url)
+
     response = requests.get(
         url, params=params, verify=ssl_check, headers=_default_headers()
     )
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index 1d07e03e..feec681b 100644
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -68,16 +68,16 @@ def test_mock_get_samples_summary(requests_mock):
     mock_request(requests_mock, request_url, response_file_path)
     df, md = get_samples_summary(monitoringLocationIdentifier="USGS-04183500")
     assert type(df) is DataFrame
-    assert list(df.columns) == [
+    expected_columns = {
         "monitoringLocationIdentifier",
         "characteristicGroup",
         "characteristic",
-        "characteristicUserSupplied",
         "resultCount",
         "activityCount",
         "firstActivity",
         "mostRecentActivity",
-    ]
+    }
+    assert expected_columns.issubset(df.columns)
     assert (df["monitoringLocationIdentifier"] == "USGS-04183500").all()
     assert md.url == request_url
     assert isinstance(md.query_time, datetime.timedelta)

From bbbb3652392defb5928e304de90a144d001faef5 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 5 May 2026 13:10:13 -0500
Subject: [PATCH 3/4] Adapt R doc for get_samples_summary, drop two doc claims
 that are wrong for this endpoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adapted the wording from R's summarize_waterdata_samples (in the develop
branch of DOI-USGS/dataRetrieval) to match the Python module's docstring
style. Picked up the variety-of-agencies example IDs from the R doc.

Two claims from the R doc were corrected rather than copied:

- The R doc says "Location identifiers should be separated with commas"
  with a multi-ID example. That contradicts the function's own one-site
  check and is wrong for the summary service (which accepts exactly one
  ID). Dropped.
- The R doc says "Location numbers without an agency prefix are assumed
  to have the prefix USGS." That's not true for this endpoint at the API
  level — bare IDs return an empty result with a different column shape.
  Documented the actual behavior instead.

Also switched the example to USGS-04074950 (the site used by the R doc's
example) so the two repos line up.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/api.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index 67df7aaf..17582a0b 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -1805,26 +1805,28 @@ def get_samples_summary(
     monitoringLocationIdentifier: str,
     ssl_check: bool = True,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
-    """Get a summary of samples available at a single monitoring location.
+    """Get a summary of discrete water-quality samples at a single monitoring location.
 
-    Wraps the Samples database ``/summary/{monitoringLocationIdentifier}``
-    endpoint, which returns one row per (characteristic group, characteristic,
-    user-supplied characteristic) combination with result and activity counts
-    and the first / most recent activity dates. This is useful for taking an
-    inventory of what discrete-sample data exists at a site before pulling
-    the underlying observations with :func:`get_samples`.
+    Wraps the Samples database summary service described at
+    https://api.waterdata.usgs.gov/samples-data/docs. The service returns one
+    row per (characteristic group, characteristic, user-supplied characteristic)
+    combination with result and activity counts and the first / most recent
+    activity dates — useful for taking inventory of what discrete-sample data
+    exists at a site before pulling the underlying observations with
+    :func:`get_samples`.
 
-    The Samples summary endpoint only accepts a single monitoring location
-    per request.
-
-    See https://api.waterdata.usgs.gov/samples-data/docs#/summaries for the
-    full API reference.
+    The summary service is single-site only: it accepts exactly one monitoring
+    location per request.
 
     Parameters
     ----------
     monitoringLocationIdentifier : string
-        A monitoring location identifier in ``AGENCY-ID`` format, e.g.
-        ``"USGS-04183500"``.
+        A monitoring location identifier has two parts, separated by a dash
+        (``-``): the agency code and the location number. Examples:
+        ``"USGS-040851385"``, ``"AZ014-320821110580701"``,
+        ``"CAX01-15304600"``. Bare location numbers without an agency prefix
+        are accepted by the service but return an empty result, so a prefix
+        is effectively required.
     ssl_check : bool, optional
         Check the SSL certificate. Default is True.
 
@@ -1841,7 +1843,7 @@ def get_samples_summary(
 
         >>> # What discrete-sample data is available at this site?
         >>> df, md = dataretrieval.waterdata.get_samples_summary(
-        ...     monitoringLocationIdentifier="USGS-04183500"
+        ...     monitoringLocationIdentifier="USGS-04074950"
         ... )
 
     """

From 96acd0c44b31d424fc8db3b6f0d14cfdb770639c Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 5 May 2026 13:15:42 -0500
Subject: [PATCH 4/4] Address Copilot review on get_samples_summary

- Reject non-str monitoringLocationIdentifier with a TypeError that
  explains the constraint, instead of letting urllib.parse.quote raise
  a low-level TypeError. This matches R's summarize_waterdata_samples,
  which guards with `if (length(monitoringLocationIdentifier) > 1) stop(...)`.
- Restore characteristicUserSupplied in the column-subset assertion;
  /simplify's "loosen exact-list to subset" was applied too aggressively
  and dropped a real schema column that disambiguates grouping.
- Add a regression test that a list input raises the new TypeError.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/api.py | 7 +++++++
 tests/waterdata_test.py        | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index 17582a0b..583b512c 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -1847,6 +1847,13 @@ def get_samples_summary(
         ... )
 
     """
+    if not isinstance(monitoringLocationIdentifier, str):
+        raise TypeError(
+            "monitoringLocationIdentifier must be a string; the Samples "
+            "summary service accepts exactly one monitoring location per "
+            f"request, got {type(monitoringLocationIdentifier).__name__}."
+        )
+
     url = f"{SAMPLES_URL}/summary/{quote(monitoringLocationIdentifier, safe='')}"
     params = {"mimeType": "text/csv"}
 
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index feec681b..0a7ec3c5 100644
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -72,6 +72,7 @@ def test_mock_get_samples_summary(requests_mock):
         "monitoringLocationIdentifier",
         "characteristicGroup",
         "characteristic",
+        "characteristicUserSupplied",
         "resultCount",
         "activityCount",
         "firstActivity",
@@ -85,6 +86,12 @@ def test_mock_get_samples_summary(requests_mock):
     assert md.comment is None
 
 
+def test_get_samples_summary_rejects_list():
+    """The summary endpoint accepts only one site; a list must raise TypeError."""
+    with pytest.raises(TypeError, match="exactly one monitoring location"):
+        get_samples_summary(monitoringLocationIdentifier=["USGS-04183500"])
+
+
 def test_check_profiles():
     """Tests that correct errors are raised for invalid profiles."""
     with pytest.raises(ValueError):