Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
**05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.

**05/01/2026:** The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after **2026-11-01**. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/.

**04/23/2026:** Added `waterdata.get_nearest_continuous(targets, ...)` — for each of N target timestamps, fetches the single continuous observation closest to that timestamp in one HTTP round-trip (auto-chunked when the resulting CQL filter is long, via the facility added in #238). The helper is designed for workflows that pair many discrete-measurement timestamps with surrounding instantaneous data, which the OGC `time` parameter can't express since it only accepts one instant or one interval per request. Ties at window midpoints are resolved per a configurable `on_tie` ∈ {`"first"`, `"last"`, `"mean"`}; the default `window="PT7M30S"` matches a 15-minute continuous gauge.
Expand Down
2 changes: 2 additions & 0 deletions dataretrieval/waterdata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
get_monitoring_locations,
get_reference_table,
get_samples,
get_samples_summary,
get_stats_date_range,
get_stats_por,
get_time_series_metadata,
Expand Down Expand Up @@ -51,6 +52,7 @@
"get_nearest_continuous",
"get_reference_table",
"get_samples",
"get_samples_summary",
"get_stats_date_range",
"get_stats_por",
"get_time_series_metadata",
Expand Down
72 changes: 72 additions & 0 deletions dataretrieval/waterdata/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import logging
from io import StringIO
from typing import get_args
from urllib.parse import quote

import pandas as pd
import requests
Expand Down Expand Up @@ -1800,6 +1801,77 @@ def get_samples(
return df, BaseMetadata(response)


def get_samples_summary(
    monitoringLocationIdentifier: str,
    ssl_check: bool = True,
) -> tuple[pd.DataFrame, BaseMetadata]:
    """Get a summary of discrete water-quality samples at a single monitoring location.

    Wraps the Samples database summary service described at
    https://api.waterdata.usgs.gov/samples-data/docs. The service returns one
    row per (characteristic group, characteristic, user-supplied characteristic)
    combination with result and activity counts and the first / most recent
    activity dates — useful for taking inventory of what discrete-sample data
    exists at a site before pulling the underlying observations with
    :func:`get_samples`.

    The summary service is single-site only: it accepts exactly one monitoring
    location per request.

    Parameters
    ----------
    monitoringLocationIdentifier : string
        A monitoring location identifier has two parts, separated by a dash
        (``-``): the agency code and the location number. Examples:
        ``"USGS-040851385"``, ``"AZ014-320821110580701"``,
        ``"CAX01-15304600"``. Bare location numbers without an agency prefix
        are accepted by the service but return an empty result, so a prefix
        is effectively required.
    ssl_check : bool, optional
        Check the SSL certificate. Default is True.

    Returns
    -------
    df : ``pandas.DataFrame``
        Formatted data returned from the API query.
    md : :obj:`dataretrieval.utils.Metadata`
        Custom ``dataretrieval`` metadata object pertaining to the query.

    Raises
    ------
    TypeError
        If ``monitoringLocationIdentifier`` is not a string (for example a
        list of sites).
    requests.HTTPError
        If the service responds with an HTTP error status.

    Examples
    --------
    .. code::

        >>> # What discrete-sample data is available at this site?
        >>> df, md = dataretrieval.waterdata.get_samples_summary(
        ...     monitoringLocationIdentifier="USGS-04074950"
        ... )

    """
    # Reject lists/tuples up front: the identifier is interpolated into the
    # URL path, so anything other than a single string cannot be expressed.
    if not isinstance(monitoringLocationIdentifier, str):
        raise TypeError(
            "monitoringLocationIdentifier must be a string; the Samples "
            "summary service accepts exactly one monitoring location per "
            f"request, got {type(monitoringLocationIdentifier).__name__}."
        )

    # safe='' percent-encodes every reserved character (including "/"), so
    # the identifier always stays a single path segment.
    url = f"{SAMPLES_URL}/summary/{quote(monitoringLocationIdentifier, safe='')}"
    params = {"mimeType": "text/csv"}

    # Prepared only so the fully-encoded URL can be logged; the request
    # itself is issued separately below.
    req = PreparedRequest()
    req.prepare_url(url, params=params)
    logger.info("Request: %s", req.url)

    response = requests.get(
        url, params=params, verify=ssl_check, headers=_default_headers()
    )

    response.raise_for_status()

    df = pd.read_csv(StringIO(response.text), delimiter=",")

    return df, BaseMetadata(response)


def get_stats_por(
approval_status: str | None = None,
computation_type: str | list[str] | None = None,
Expand Down
6 changes: 6 additions & 0 deletions tests/data/samples_summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
monitoringLocationIdentifier,characteristicGroup,characteristic,characteristicUserSupplied,resultCount,activityCount,firstActivity,mostRecentActivity
USGS-04183500,Information,Bottle or bag sampler material (construction),Bottle or bag sampler material (construction),893,893,2017-01-02,2026-04-28
USGS-04183500,Information,NWIS lot number,"NWIS lot number, sulfuric acid, 4.5 normal (1:7), 1 milliliter, National Field Supply Service (NFSS) stock number Q438FLD",893,893,2017-01-02,2026-04-28
USGS-04183500,Information,NWIS lot number,"NWIS lot number, vacuum tube, 10.5 milliliters, FCCVT (filtered, chilled, vacuum tube)",877,877,2017-01-02,2026-04-28
USGS-04183500,Information,Number of sampling points,Number of sampling points,136,136,2013-10-23,2026-04-28
USGS-04183500,Information,Sampler nozzle diameter,Sampler nozzle diameter,97,97,2017-01-24,2026-04-28
35 changes: 35 additions & 0 deletions tests/waterdata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
get_monitoring_locations,
get_reference_table,
get_samples,
get_samples_summary,
get_stats_date_range,
get_stats_por,
get_time_series_metadata,
Expand Down Expand Up @@ -57,6 +58,40 @@ def test_mock_get_samples(requests_mock):
assert md.comment is None


def test_mock_get_samples_summary(requests_mock):
    """Tests USGS Samples summary query"""
    # The mocked URL must match the exact request, including the
    # percent-encoded mimeType query parameter.
    request_url = (
        "https://api.waterdata.usgs.gov/samples-data/summary/USGS-04183500"
        "?mimeType=text%2Fcsv"
    )
    response_file_path = "tests/data/samples_summary.txt"
    mock_request(requests_mock, request_url, response_file_path)
    df, md = get_samples_summary(monitoringLocationIdentifier="USGS-04183500")
    assert type(df) is DataFrame
    # Subset check rather than equality, so extra columns in the parsed
    # response do not fail the test.
    expected_columns = {
        "monitoringLocationIdentifier",
        "characteristicGroup",
        "characteristic",
        "characteristicUserSupplied",
        "resultCount",
        "activityCount",
        "firstActivity",
        "mostRecentActivity",
    }
    assert expected_columns.issubset(df.columns)
    assert (df["monitoringLocationIdentifier"] == "USGS-04183500").all()
    assert md.url == request_url
    assert isinstance(md.query_time, datetime.timedelta)
    assert md.header == {"mock_header": "value"}
    assert md.comment is None


def test_get_samples_summary_rejects_list():
    """A list of sites must raise TypeError: the endpoint takes a single site."""
    site_list = ["USGS-04183500"]
    with pytest.raises(TypeError, match="exactly one monitoring location"):
        get_samples_summary(monitoringLocationIdentifier=site_list)


def test_check_profiles():
"""Tests that correct errors are raised for invalid profiles."""
with pytest.raises(ValueError):
Expand Down
Loading