From 20daa0020b92325e35200cdf74b42e82c72b7b2c Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Fri, 29 May 2026 09:01:18 -0400
Subject: [PATCH 1/7] feat(waterdata): add waterdata.xarray module returning CF
 datasets

Add dataretrieval.waterdata.xarray, optional-dependency wrappers that
mirror the Water Data time-series getters but return CF-conventions
xarray.Dataset objects instead of bare DataFrames.

- Ragged (CF contiguous ragged array) layout by default; pass dense=True
  for the NaN-filled (monitoring_location_id, time) grid with one named
  variable per parameter.
- CF metadata is derived from columns the getters already return
  (unit_of_measure -> units, statistic_id -> cell_methods,
  parameter_code -> standard_name/vertical_datum), plus a cached
  parameter-name lookup; sites carry cf_role=timeseries_id with lon/lat.
- Coverage: get_daily, get_continuous, get_latest_continuous,
  get_latest_daily, get_nearest_continuous, get_peaks,
  get_field_measurements, get_samples, and preliminary
  get_stats_por / get_stats_date_range.
- xarray is an optional extra (pip install dataretrieval[xarray]); the
  core package never imports it. Hash-valued ID columns are dropped
  inside the xarray builders, so the plain getters are left untouched.

CF vocabulary maps live in waterdata.types (xarray-free, plain data).
Adds a demo notebook + docs entry and offline converter unit tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/types.py              |   59 +
 dataretrieval/waterdata/xarray.py             | 1081 +++++++++++++++++
 demos/waterdata_xarray_demo.ipynb             |  382 ++++++
 docs/source/examples/index.rst                |   12 +
 .../examples/waterdata_xarray_demo.nblink     |    3 +
 pyproject.toml                                |    8 +
 tests/waterdata_xarray_test.py                |  803 ++++++++++++
 7 files changed, 2348 insertions(+)
 create mode 100644 dataretrieval/waterdata/xarray.py
 create mode 100644 demos/waterdata_xarray_demo.ipynb
 create mode 100644 docs/source/examples/waterdata_xarray_demo.nblink
 create mode 100644 tests/waterdata_xarray_test.py

diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py
index f5e1496b..93b8d19b 100644
--- a/dataretrieval/waterdata/types.py
+++ b/dataretrieval/waterdata/types.py
@@ -74,3 +74,62 @@
         "count",
     ],
 }
+
+
+# --- CF / xarray vocabulary mappings ---------------------------------------
+# Lookup tables used by :mod:`dataretrieval.waterdata.xarray` to translate
+# USGS terms into CF-conventions metadata. Each is intentionally partial:
+# anything not listed falls back to a sensible default (raw unit string kept
+# verbatim; no standard_name emitted) rather than guessing a wrong CF term.
+# They are plain data, so they live here rather than in the (xarray-optional)
+# converter module and can be extended without importing xarray.
+
+# USGS unit strings -> UDUNITS / CF-canonical form.
+CF_UNIT_MAP = {
+    "ft^3/s": "ft3 s-1",
+    "ft3/s": "ft3 s-1",
+    "ft": "ft",
+    "in": "in",
+    "degC": "degC",
+    "deg C": "degC",
+    "uS/cm": "uS/cm",
+    "mg/l": "mg L-1",
+    "mg/L": "mg L-1",
+    # UDUNITS 'ton' is the US short ton; 'short_ton' is not a valid UDUNITS name.
+    "tons/day": "ton day-1",
+    "%": "percent",
+}
+
+# USGS statistic_id -> the operator in a CF ``cell_methods`` string.
+CF_CELL_METHODS = {
+    "00001": "maximum",
+    "00002": "minimum",
+    "00003": "mean",
+    "00006": "sum",
+    "00008": "median",
+    "00011": "point",  # instantaneous
+}
+
+# USGS 5-digit parameter code -> CF standard_name. Deliberately conservative;
+# codes without a confident match are left without a standard_name.
+CF_STANDARD_NAMES = {
+    "00060": "water_volume_transport_in_river_channel",
+    # 00010 (water temperature) is intentionally omitted: ``water_temperature``
+    # is NOT a CF standard name, and the only valid CF water-temperature name,
+    # ``sea_water_temperature``, is wrong-domain for USGS freshwater/groundwater.
+    # Leaving it unmapped keeps the variable's ``long_name`` without emitting an
+    # invalid or misleading ``standard_name``.
+    "00065": "water_surface_height_above_reference_datum",
+    "63160": "water_surface_height_above_reference_datum",
+    "00045": "lwe_thickness_of_precipitation_amount",
+}
+
+# USGS parameter code -> vertical reference datum, attached as a
+# ``vertical_datum`` attribute. The two water-surface-height parameters share
+# the CF standard_name water_surface_height_above_reference_datum, so the datum
+# distinguishes them: gage height (00065) is measured from a local site (gage)
+# datum, while stream water level (63160) is referenced to NAVD88.
+CF_VERTICAL_DATUM = {
+    "00065": "local site datum",
+    "63160": "NAVD88",
+}
diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
new file mode 100644
index 00000000..0eef2c2e
--- /dev/null
+++ b/dataretrieval/waterdata/xarray.py
@@ -0,0 +1,1081 @@
+"""xarray-returning wrappers for the Water Data getters.
+
+Each public function here mirrors the same-named function in
+:mod:`dataretrieval.waterdata`, but returns a CF-conventions
+:class:`xarray.Dataset` instead of a :class:`pandas.DataFrame`.
+
+By default the data is returned as a CF *contiguous ragged array*
+(``featureType = "timeSeries"``): every observation is concatenated along a
+single ``obs`` dimension, and each (monitoring location, parameter,
+statistic) series is one instance along a ``timeseries`` dimension whose
+``row_size`` records how many observations it contributes. This stores only
+real observations -- no NaN fill -- so it scales to large, very ragged
+multi-site pulls where record lengths differ by decades. Parameter,
+statistic, unit, and location identity become per-instance metadata, and
+``time`` is a 1-D coordinate along ``obs``. The trade-off is that ``time``
+is no longer a dimension you can index directly: to select by time you
+first regroup the flat ``obs`` back into per-series views (e.g. with
+``cf-xarray``, or via the offsets implied by ``row_size``).
+
+Pass ``dense=True`` for the alternative gridded layout: one named data
+variable per parameter on a ``(monitoring_location_id, time)`` grid, NaN
+where a series has no observation. This is ergonomic for a few overlapping
+series (``ds["discharge"].sel(time=...)`` just works) but the union time
+axis and NaN fill make it memory-costly for large ragged collections.
+
+Either way the CF metadata is derived from columns the getter already
+returns (``unit_of_measure`` -> ``units``, ``statistic_id`` ->
+``cell_methods``, ``parameter_code`` -> ``standard_name``), plus the
+human-readable parameter name from a small cached metadata lookup. The
+timeseries identity carries ``cf_role = "timeseries_id"`` -- the synthesized
+``timeseries_id`` coordinate when ragged, ``monitoring_location_id`` when
+dense -- each site has ``longitude`` / ``latitude`` (and
+``hydrologic_unit_code`` / ``state_name`` when the metadata call already
+provides them), and dataset attributes carry ``Conventions``, provenance,
+the request URL, and ``date_modified``.
+
+Internally the conversion is organized around three small object roles:
+
+* a :class:`_Schema` value object describes a service's column dialect;
+* a :class:`_DatasetBuilder` strategy turns a values frame into a Dataset --
+  the time-series :class:`_RaggedBuilder` / :class:`_DenseBuilder` (sharing a
+  :class:`_SeriesBuilder` base) and the flat :class:`_StatsBuilder`;
+* a :class:`_MetadataCache` memoizes the supplementary metadata lookup.
+
+A :class:`_Service` record wires those together for each public getter.
+
+This module requires the optional ``xarray`` dependency::
+
+    pip install dataretrieval[xarray]
+"""
+
+from __future__ import annotations
+
+import datetime as _dt
+import inspect as _inspect
+import re as _re
+import threading as _threading
+import warnings as _warnings
+from collections.abc import Callable
+from dataclasses import dataclass, field, replace
+from functools import wraps as _wraps
+
+import pandas as _pd
+
+try:
+    import xarray as _xr
+except ModuleNotFoundError as _exc:  # pragma: no cover - exercised only sans xarray
+    raise ModuleNotFoundError(
+        "dataretrieval.waterdata.xarray requires the optional 'xarray' "
+        "dependency. Install it with:  pip install dataretrieval[xarray]"
+    ) from _exc
+
+from . import api as _api
+from .nearest import get_nearest_continuous as _get_nearest_continuous
+from .types import (
+    CF_CELL_METHODS,
+    CF_STANDARD_NAMES,
+    CF_UNIT_MAP,
+    CF_VERTICAL_DATUM,
+)
+
+__all__ = [
+    "clear_metadata_cache",
+    "get_continuous",
+    "get_daily",
+    "get_field_measurements",
+    "get_latest_continuous",
+    "get_latest_daily",
+    "get_nearest_continuous",
+    "get_peaks",
+    "get_samples",
+    "get_stats_date_range",
+    "get_stats_por",
+]
+
+
+# === constants =============================================================
+
+# The CF vocabulary lookups (USGS units -> UDUNITS, statistic_id ->
+# cell_methods operator, parameter_code -> standard_name) are plain data and
+# live in ``types`` -- imported as CF_UNIT_MAP / CF_CELL_METHODS /
+# CF_STANDARD_NAMES at the top of this module.
+
+# Columns kept off the value pivot but surfaced as ancillary (flag) variables.
+_ANCILLARY = ("qualifier", "approval_status")
+
+# The monitoring-location ("site") column. A site is the DSG location; a
+# timeseries *instance* is a site plus its group columns (parameter/statistic),
+# so one site can host several instances.
+_SITE = "monitoring_location_id"
+
+# Only the human-readable name is sourced from the metadata endpoint; units,
+# statistic, and parameter code all come from the values frame itself.
+_NAME_DESCRIPTORS = ("parameter_name", "parameter_description")
+
+# Per-site descriptors that ride along on the same metadata call (no extra
+# request) and are surfaced as per-site auxiliary coordinates. Only
+# fields the endpoint already returns are used -- station name/altitude live on
+# the monitoring-locations endpoint and would need a separate call, so they are
+# intentionally not fetched here.
+_SITE_DESCRIPTORS = ("hydrologic_unit_code", "state_name")
+
+# CF attributes for the per-site coordinates built from the descriptors above.
+_SITE_COORD_ATTRS = {
+    "hydrologic_unit_code": {"long_name": "hydrologic unit code (HUC)"},
+    "state_name": {"long_name": "state name"},
+}
+
+# Upper bound on the per-process metadata cache (see :class:`_MetadataCache`).
+_CACHE_MAXSIZE = 4096
+
+
+# === stateless helpers =====================================================
+
+
+def _slug(name) -> str:
+    """A lower_snake_case, identifier-safe variable name."""
+    s = _re.sub(r"[^0-9a-zA-Z]+", "_", str(name).strip().lower()).strip("_")
+    return s or "value"
+
+
+def _first(series):
+    """First non-null value of a column, or None."""
+    nonnull = series.dropna()
+    return nonnull.iloc[0] if len(nonnull) else None
+
+
+def _first_present(frame, col):
+    """First non-null value of ``col`` if the column is present, else None."""
+    return _first(frame[col]) if col in frame else None
+
+
+def _unique_present(frame, col):
+    """Distinct non-null values of ``col`` if present, else an empty list."""
+    return frame[col].dropna().unique() if col in frame else []
+
+
+def _sites(df):
+    """Unique monitoring-location ids present in a values frame."""
+    if _SITE in df:
+        return df[_SITE].unique()
+    return []
+
+
+def _date_modified(df):
+    """Most recent upstream record-refresh time in the frame, ISO-8601 or None.
+
+    ``last_modified`` reflects when each record was last refreshed in the USGS
+    database; the maximum is surfaced as the dataset's ``date_modified`` (ACDD)
+    so a reader knows how current the pull is.
+    """
+    if df is None or "last_modified" not in getattr(df, "columns", ()):
+        return None
+    ts = _pd.to_datetime(df["last_modified"], errors="coerce", utc=True).dropna()
+    return ts.max().isoformat() if len(ts) else None
+
+
+def _lonlat(geom):
+    """``(lon, lat)`` from a geometry, or None if it isn't a point.
+
+    Geometry comes back as a shapely ``Point`` when geopandas is installed but
+    as a plain ``[lon, lat]`` list when it is not (the OGC GeoJSON coordinates
+    are flattened into a list). Handle both so the spatial coordinates survive
+    either way; anything else (a polygon, a malformed cell) is skipped rather
+    than guessed.
+    """
+    x, y = getattr(geom, "x", None), getattr(geom, "y", None)
+    if x is not None and y is not None:
+        return x, y
+    if isinstance(geom, (list, tuple)) and len(geom) >= 2:
+        try:
+            return float(geom[0]), float(geom[1])
+        except (TypeError, ValueError):
+            return None
+    return None
+
+
+def _point_coords(df, site):
+    """lon/lat dicts keyed by site from point geometry, or None."""
+    if "geometry" not in df.columns:
+        return None
+    geo = df.dropna(subset=["geometry"]).drop_duplicates(site)
+    if geo.empty:
+        return None
+    lon, lat = {}, {}
+    for _, row in geo.iterrows():
+        xy = _lonlat(row["geometry"])
+        if xy is not None:
+            lon[row[site]], lat[row[site]] = xy
+    if not lon:
+        return None  # no point geometry; skip rather than guess
+    return lon, lat
+
+
+def _prepare_values(df, group_cols, ancillary_cols):
+    """Slim the frame and coerce types, shared by the dense / ragged builders.
+
+    Keeps only the columns we convert, parses ``time`` to naive-UTC
+    ``datetime64`` (xarray has no tz dtype), coerces ``value`` to numeric, and
+    drops rows whose ``time`` is unparseable/missing (with a warning, so
+    observations are never lost without a trace). Returns
+    ``(work, group_cols, ancillary, has_unit)`` filtered to columns present.
+    """
+    group_cols = [c for c in group_cols if c in df.columns]
+    ancillary = [c for c in ancillary_cols if c in df.columns]
+    has_unit = "unit_of_measure" in df.columns
+    cols = [_SITE, "time", "value", *group_cols, *ancillary]
+    if has_unit:
+        cols.append("unit_of_measure")
+    present = [c for c in dict.fromkeys(cols) if c in df.columns]
+    work = df.loc[:, present].copy()
+    # Instance id, time, and value are mandatory to build a series. A response
+    # that lacks any of them (e.g. a non-result Samples profile) has nothing to
+    # convert, so return an empty frame -> empty Dataset rather than KeyError.
+    if not {_SITE, "time", "value"}.issubset(work.columns):
+        return work.iloc[0:0], group_cols, ancillary, has_unit
+    # ``format="ISO8601"`` matches the Water Data API timestamps and avoids
+    # pandas' slow per-element ``dateutil`` fallback (and its warning) on large
+    # frames; the tz is normalized to UTC then dropped (xarray has no tz dtype).
+    work["time"] = _pd.to_datetime(
+        work["time"], format="ISO8601", errors="coerce", utc=True
+    ).dt.tz_localize(None)
+    work["value"] = _pd.to_numeric(work["value"], errors="coerce")
+    n_before = len(work)
+    work = work[work["time"].notna()]
+    dropped = n_before - len(work)
+    if dropped:
+        _warnings.warn(
+            f"dropped {dropped} row(s) with an unparseable or missing time.",
+            stacklevel=3,
+        )
+    return work, group_cols, ancillary, has_unit
+
+
+def _cf_units(unit):
+    """Map a USGS unit string to its CF / UDUNITS form, or pass it through."""
+    return CF_UNIT_MAP.get(str(unit), str(unit))
+
+
+def _var_attrs(desc, *, unit, pcode, stat, default_cell_method):
+    """Build the CF attribute dict for one data variable.
+
+    ``unit``, ``pcode`` and ``stat`` are read from the values frame; ``desc``
+    supplies only the human-readable name from the metadata lookup. Linking the
+    ``ancillary_variables`` is left to each builder, since the flag-variable
+    names follow the (layout-specific) data-variable name.
+    """
+    attrs: dict[str, str] = {}
+    long_name = desc.get("parameter_description") or desc.get("parameter_name")
+    if long_name and _pd.notna(long_name):
+        attrs["long_name"] = str(long_name)
+
+    if unit is not None and _pd.notna(unit):
+        attrs["units"] = _cf_units(unit)
+
+    op = (
+        CF_CELL_METHODS.get(str(stat)) if stat is not None and _pd.notna(stat) else None
+    )
+    op = op or default_cell_method
+    if op:
+        attrs["cell_methods"] = f"time: {op}"
+
+    if pcode is not None and _pd.notna(pcode):
+        sn = CF_STANDARD_NAMES.get(str(pcode))
+        if sn:
+            attrs["standard_name"] = sn
+        datum = CF_VERTICAL_DATUM.get(str(pcode))
+        if datum:
+            attrs["vertical_datum"] = datum
+        attrs["usgs_parameter_code"] = str(pcode)
+
+    if stat is not None and _pd.notna(stat):
+        attrs["usgs_statistic_id"] = str(stat)
+    return attrs
+
+
+def _dataset_attrs(service, base_meta):
+    """Dataset-level provenance (CF + ACDD) attributes."""
+    attrs = {
+        "Conventions": "CF-1.11",
+        "institution": "U.S. Geological Survey",
+        "source": f"USGS Water Data API ({service})",
+        "featureType": "timeSeries",
+        "history": (
+            f"{_dt.datetime.now(_dt.timezone.utc).isoformat(timespec='seconds')} "
+            "created by dataretrieval.waterdata.xarray"
+        ),
+    }
+    url = getattr(base_meta, "url", None)
+    if url:
+        attrs["references"] = str(url)
+    return attrs
+
+
+def _empty_dataset(service, base_meta):
+    """An attribute-only Dataset for an empty / unconvertible response."""
+    ds = _xr.Dataset()
+    ds.attrs = _dataset_attrs(service, base_meta)
+    return ds
+
+
+# === metadata cache ========================================================
+
+
+class _MetadataCache:
+    """Per-process, bounded, thread-safe cache of supplementary metadata.
+
+    Keyed by monitoring location, this memoizes the human-readable parameter
+    name and a couple of site descriptors that the values getters don't return.
+    ``getter`` is called as ``getter(monitoring_location_id=[...])`` and must
+    return ``(DataFrame, metadata)``; one fetch is issued per not-yet-cached
+    batch of sites and reused thereafter.
+
+    The metadata is supplementary, so a fetch failure degrades to "no metadata"
+    with a warning rather than discarding the already-retrieved observations,
+    and the failed sites are left uncached so a later, recovered call retries
+    them. The cache is bounded (oldest-inserted entries are evicted past
+    ``maxsize``) and its mutation is serialized, so a long-running, many-site,
+    multi-threaded process stays both correct and bounded.
+    """
+
+    def __init__(self, getter, *, maxsize=_CACHE_MAXSIZE):
+        self._getter = getter
+        self._maxsize = maxsize
+        self._entries: dict[str, dict] = {}
+        self._lock = _threading.Lock()
+
+    def lookup(self, site_ids):
+        """``(param_meta, site_meta)`` for ``site_ids``, fetching any misses.
+
+        ``param_meta`` is ``{parameter_code: {name descriptors}}`` (keyed by the
+        stable parameter code, so no hash id is needed) and ``site_meta`` is
+        ``{monitoring_location_id: {site descriptors present in the response}}``.
+        """
+        sites = sorted({str(s) for s in site_ids if _pd.notna(s)})
+        # Racy read of the keys is fine: a concurrent miss just re-fetches (the
+        # fetch is idempotent); only the writes in _ingest take the lock.
+        todo = [s for s in sites if s not in self._entries]
+        if todo:
+            try:
+                meta, _ = self._getter(monitoring_location_id=todo)
+            except Exception as exc:  # supplementary lookup; never lose the data
+                _warnings.warn(
+                    f"metadata lookup failed ({exc!r}); returning the dataset "
+                    "without parameter names / site descriptors.",
+                    stacklevel=2,
+                )
+            else:
+                self._ingest(meta, todo)
+        param_meta: dict[str, dict] = {}
+        site_meta: dict[str, dict] = {}
+        with self._lock:
+            for s in sites:
+                entry = self._entries.get(s, {})
+                param_meta.update(entry.get("params", {}))
+                if entry.get("site"):
+                    site_meta[s] = entry["site"]
+        return param_meta, site_meta
+
+    def clear(self):
+        """Empty the cache (release memory / force a refresh)."""
+        with self._lock:
+            self._entries.clear()
+
+    def __len__(self):
+        return len(self._entries)
+
+    def _ingest(self, meta, todo):
+        """Parse ``meta`` into per-site entries, then merge + evict under lock.
+
+        The parsing runs lock-free on a local dict; only the (cheap) merge into
+        the shared cache and the FIFO eviction past ``maxsize`` hold the lock.
+        """
+        fresh = {s: {"params": {}, "site": {}} for s in todo}
+        if not meta.empty:
+            name_cols = [c for c in _NAME_DESCRIPTORS if c in meta.columns]
+            site_cols = [c for c in _SITE_DESCRIPTORS if c in meta.columns]
+            has_pcode = "parameter_code" in meta.columns
+            # to_dict("records") is markedly faster than iterrows() (no per-row
+            # Series boxing) and the dict access below is identical.
+            for row in meta.to_dict("records"):
+                site = row.get("monitoring_location_id")
+                if site not in fresh:
+                    continue
+                if has_pcode:
+                    pcode = row.get("parameter_code")
+                    if _pd.notna(pcode):
+                        fresh[site]["params"][str(pcode)] = {
+                            c: row.get(c) for c in name_cols
+                        }
+                if not fresh[site]["site"]:
+                    desc = {c: row.get(c) for c in site_cols if _pd.notna(row.get(c))}
+                    if desc:
+                        fresh[site]["site"] = desc
+        with self._lock:
+            self._entries.update(fresh)
+            while len(self._entries) > self._maxsize:
+                self._entries.pop(next(iter(self._entries)))
+
+
+# One cache per metadata endpoint, shared across getters for the process life.
+_TS_CACHE = _MetadataCache(_api.get_time_series_metadata)
+_FIELD_CACHE = _MetadataCache(_api.get_field_measurements_metadata)
+
+
+def clear_metadata_cache():
+    """Empty the per-process metadata caches (parameter names, site descriptors).
+
+    The xarray getters cache the supplementary metadata lookup per monitoring
+    location for the life of the process. Long-running services can call this to
+    release memory or to force a refresh after the upstream metadata changed.
+    """
+    _TS_CACHE.clear()
+    _FIELD_CACHE.clear()
+
+
+# === column schemas ========================================================
+
+# Water-quality samples (Samples DB / WQX) speak a different column vocabulary
+# than the time-series getters; map their tidy backbone onto the canonical
+# names the builders understand.
+_SAMPLES_RENAME = {
+    "Location_Identifier": _SITE,
+    "Activity_StartDateTime": "time",
+    "Result_Measure": "value",
+    "Result_MeasureUnit": "unit_of_measure",
+    "Result_Characteristic": "characteristic",
+    "Result_SampleFraction": "sample_fraction",
+    "Result_ResultDetectionCondition": "detection_condition",
+    "Result_MeasureStatusIdentifier": "status",
+}
+_CANONICAL_COORD_ATTRS = {
+    "parameter_code": {"long_name": "USGS parameter code"},
+    "statistic_id": {"long_name": "USGS statistic code"},
+    "unit_of_measure": {"long_name": "unit of measurement"},
+}
+_SAMPLES_COORD_ATTRS = {
+    "characteristic": {"long_name": "characteristic name"},
+    "sample_fraction": {"long_name": "sample fraction"},
+    "unit_of_measure": {"long_name": "unit of measurement"},
+    "detection_condition": {"long_name": "result detection condition"},
+    "status": {"long_name": "result status"},
+}
+
+
+@dataclass(frozen=True)
+class _Schema:
+    """How a service's columns map onto the canonical builder vocabulary.
+
+    One object describes a column dialect so a single set of builders serves
+    both the time-series getters and the (differently-named) Samples data:
+
+    * ``rename`` -- source -> canonical column names (empty for the getters
+      that already use the canonical names);
+    * ``group_cols`` -- columns that identify one series (an instance);
+    * ``ancillary`` -- per-observation flag columns carried alongside ``value``;
+    * ``label_col`` -- the column whose value names the variable / ``long_name``
+      (``parameter_code`` for the getters, ``characteristic`` for samples);
+    * ``infer_standard_name`` -- whether ``label_col`` is a USGS parameter code
+      eligible for a CF ``standard_name`` lookup (False for free-text
+      characteristics);
+    * ``coord_attrs`` -- ``long_name`` etc. for the per-instance metadata vars.
+    """
+
+    rename: dict = field(default_factory=dict)
+    group_cols: tuple = ("parameter_code", "statistic_id")
+    ancillary: tuple = _ANCILLARY
+    label_col: str = "parameter_code"
+    infer_standard_name: bool = True
+    coord_attrs: dict = field(default_factory=dict)
+
+
+_CANONICAL = _Schema(coord_attrs=_CANONICAL_COORD_ATTRS)
+_FIELD = replace(_CANONICAL, group_cols=("parameter_code",))
+_SAMPLES = _Schema(
+    rename=_SAMPLES_RENAME,
+    group_cols=("characteristic", "sample_fraction"),
+    ancillary=("detection_condition", "status"),
+    label_col="characteristic",
+    infer_standard_name=False,
+    coord_attrs=_SAMPLES_COORD_ATTRS,
+)
+
+
+# === dataset builders ======================================================
+
+
+class _DatasetBuilder:
+    """Strategy base: convert one values frame into a CF ``xarray.Dataset``.
+
+    Holds only what every layout needs -- the frame, the request metadata, and
+    the dataset-level provenance scaffolding -- and leaves :meth:`build` to the
+    subclass. The series layouts share more state and live under
+    :class:`_SeriesBuilder`; :class:`_StatsBuilder` extends this base directly.
+    Construct via the :func:`_build_ragged` / :func:`_build_dense` /
+    :func:`_build_stats` helpers rather than directly.
+    """
+
+    def __init__(self, df, base_meta, *, service):
+        self.df = df
+        self.base_meta = base_meta
+        self.service = service
+
+    def build(self):
+        raise NotImplementedError
+
+    def _is_empty(self):
+        return self.df is None or len(self.df) == 0
+
+    def _empty(self):
+        return _empty_dataset(self.service, self.base_meta)
+
+    def _apply_provenance(self, ds):
+        """Set the dataset-level CF/ACDD attributes and ``date_modified``."""
+        ds.attrs = _dataset_attrs(self.service, self.base_meta)
+        dm = _date_modified(self.df)
+        if dm:
+            ds.attrs["date_modified"] = dm
+        return ds
+
+
+class _SeriesBuilder(_DatasetBuilder):
+    """Base for the time-series layouts (ragged / dense).
+
+    Adds the column ``schema``, the metadata lookups, and the prepare /
+    spatial-coordinate scaffolding the two series layouts share. :meth:`build`
+    is a template: it renames the dialect, slims and type-checks the frame,
+    short-circuits an empty result, and hands the cleaned frame to the subclass
+    via :meth:`_build_series`.
+    """
+
+    def __init__(
+        self,
+        df,
+        base_meta,
+        *,
+        service,
+        schema=_CANONICAL,
+        series_meta=None,
+        site_meta=None,
+        default_cell_method=None,
+    ):
+        # Rename the source dialect onto the canonical names once, up front, so
+        # every downstream step speaks one vocabulary.
+        if df is not None and schema.rename:
+            df = df.rename(columns=schema.rename)
+        super().__init__(df, base_meta, service=service)
+        self.schema = schema
+        self.series_meta = series_meta or {}
+        self.site_meta = site_meta or {}
+        self.default_cell_method = default_cell_method
+
+    def build(self):
+        if self._is_empty():
+            return self._empty()
+        work, group_cols, ancillary, has_unit = _prepare_values(
+            self.df, self.schema.group_cols, self.schema.ancillary
+        )
+        if work.empty:
+            return self._empty()
+        return self._build_series(work, group_cols, ancillary, has_unit)
+
+    def _build_series(self, work, group_cols, ancillary, has_unit):
+        raise NotImplementedError
+
+    def _add_spatial_coords(self, ds, dim, order):
+        """Attach lon/lat and per-site descriptor coords along ``dim``.
+
+        ``order`` is the list of site ids in ``dim`` order (one per instance),
+        so the same logic serves both the dense (the site dimension) and ragged
+        (the timeseries dimension) layouts.
+        """
+        coords = _point_coords(self.df, _SITE)
+        if coords is not None:
+            lon, lat = coords
+            ds = ds.assign_coords(
+                longitude=(dim, [lon.get(k) for k in order]),
+                latitude=(dim, [lat.get(k) for k in order]),
+            )
+            ds["longitude"].attrs = {
+                "standard_name": "longitude",
+                "units": "degrees_east",
+            }
+            ds["latitude"].attrs = {
+                "standard_name": "latitude",
+                "units": "degrees_north",
+            }
+        # Instance-indexed site descriptors carried along on the metadata call
+        # (HUC, state); added only where present so absent fields don't appear.
+        if self.site_meta:
+            for col, col_attrs in _SITE_COORD_ATTRS.items():
+                vals = [self.site_meta.get(str(k), {}).get(col) for k in order]
+                if any(v is not None for v in vals):
+                    ds = ds.assign_coords({col: (dim, vals)})
+                    ds[col].attrs.update(col_attrs)
+        return ds
+
+
+class _RaggedBuilder(_SeriesBuilder):
+    """CF *contiguous ragged array* layout (the default).
+
+    All observations are concatenated into one ``obs`` dimension; each series (a
+    ``schema.group_cols`` combination at a site) is an instance along
+    ``timeseries`` with ``row_size`` giving its length. Only real observations
+    are stored (no NaN fill), so this scales to large, very ragged multi-site
+    pulls. Per-series parameter / statistic / unit are instance coordinates;
+    descriptors homogeneous across instances are also written onto ``value``.
+    """
+
+    def _build_series(self, work, group_cols, ancillary, has_unit):
+        inst_cols = [_SITE, *group_cols]
+        ds, inst_frame = self._assemble(work, inst_cols, ancillary, has_unit)
+        value_attrs = self._value_attrs(ds, inst_frame)
+        ds = self._finalize(ds, inst_frame, ancillary, value_attrs)
+
+        for col, col_attrs in self.schema.coord_attrs.items():
+            if col in ds.variables:
+                ds[col].attrs.update(col_attrs)
+        order = inst_frame[_SITE].to_numpy()
+        return self._add_spatial_coords(ds, "timeseries", order)
+
+    def _assemble(self, work, inst_cols, ancillary, has_unit):
+        """Cleaned values frame -> CF contiguous-ragged Dataset skeleton.
+
+        Observations are concatenated along a single ``obs`` dimension, sorted
+        so each instance's rows are contiguous; ``row_size`` records how many
+        obs each instance contributes (the CF ``sample_dimension`` link).
+        ``unit_of_measure`` (when present) is carried as one value per instance.
+        Returns ``(ds, inst_frame)`` where ``inst_frame`` is one row/instance.
+        """
+        inst_first = ["unit_of_measure"] if has_unit else []
+        work = work.sort_values([*inst_cols, "time"], kind="stable")
+        grp = work.groupby(inst_cols, dropna=False, sort=False)
+        row_size = grp.size()
+        idx = row_size.index
+        inst_frame = (
+            idx.to_frame(index=False)
+            if isinstance(idx, _pd.MultiIndex)
+            else _pd.DataFrame({inst_cols[0]: idx})
+        )
+        data_vars = {
+            "value": ("obs", work["value"].to_numpy()),
+            "row_size": ("timeseries", row_size.to_numpy().astype("int32")),
+        }
+        for c in ancillary:
+            data_vars[c] = ("obs", work[c].to_numpy())
+        coords = {"time": ("obs", work["time"].to_numpy())}
+        for c in inst_cols:
+            coords[c] = ("timeseries", inst_frame[c].to_numpy())
+        for c in inst_first:
+            coords[c] = ("timeseries", grp[c].first().to_numpy())
+        return _xr.Dataset(data_vars, coords=coords), inst_frame
+
+    def _value_attrs(self, ds, inst_frame):
+        """CF attributes for ``value``.
+
+        Homogeneous descriptors (a single parameter / statistic / unit across
+        all instances) are written onto ``value``; otherwise ``value`` stays
+        generic and the per-instance coordinates carry each series' identity.
+        """
+        schema = self.schema
+        labels = _unique_present(inst_frame, schema.label_col)
+        stats = _unique_present(inst_frame, "statistic_id")
+        units = (
+            ds["unit_of_measure"].to_series().dropna().unique()
+            if "unit_of_measure" in ds.coords
+            else []
+        )
+        unit = units[0] if len(units) == 1 else None
+        if len(labels) == 1:
+            if schema.infer_standard_name:
+                desc, pcode = self.series_meta.get(str(labels[0]), {}), labels[0]
+            else:  # free-text label (e.g. a characteristic): it *is* the name
+                desc, pcode = {"parameter_name": str(labels[0])}, None
+            return _var_attrs(
+                desc,
+                unit=unit,
+                pcode=pcode,
+                stat=stats[0] if len(stats) == 1 else None,
+                default_cell_method=self.default_cell_method,
+            )
+        value_attrs = {
+            "long_name": "measured value",
+            "comment": (
+                "multiple series with differing metadata are stacked here; see "
+                "the per-timeseries coordinates for each series' identity"
+            ),
+        }
+        if unit is not None:
+            value_attrs["units"] = _cf_units(unit)
+        # A service-wide cell method (e.g. samples are instantaneous grabs)
+        # still applies when the statistic doesn't vary; the time-series getters
+        # leave per-parameter cell methods to the per-instance coordinates.
+        if self.default_cell_method and not schema.infer_standard_name:
+            value_attrs["cell_methods"] = f"time: {self.default_cell_method}"
+        return value_attrs
+
+    def _finalize(self, ds, inst_frame, ancillary, value_attrs):
+        """Attach the structural + provenance CF metadata for a ragged Dataset.
+
+        Sets the ``value`` attrs (with ``ancillary_variables`` linked), dataset
+        provenance, the ``row_size`` ``sample_dimension`` link, and a
+        per-instance ``timeseries_id`` carrying ``cf_role`` (a site alone isn't
+        unique once it has several series).
+        """
+        if ancillary:
+            value_attrs = {**value_attrs, "ancillary_variables": " ".join(ancillary)}
+        ds["value"].attrs = value_attrs
+        ds = self._apply_provenance(ds)
+        ds["time"].attrs.setdefault("standard_name", "time")
+        ds["row_size"].attrs = {
+            "long_name": "number of observations per time series",
+            "sample_dimension": "obs",
+        }
+        # Join the instance keys into a cf_role id, skipping null parts so a
+        # missing key (e.g. a characteristic with no sample fraction) doesn't
+        # render as a literal "nan" token.
+        ts_id = inst_frame.apply(
+            lambda r: ":".join(str(x) for x in r if _pd.notna(x)), axis=1
+        ).to_numpy()
+        ds = ds.assign_coords(timeseries_id=("timeseries", ts_id))
+        ds["timeseries_id"].attrs["cf_role"] = "timeseries_id"
+        ds[_SITE].attrs.setdefault("long_name", "monitoring location identifier")
+        return ds
+
+
+class _DenseBuilder(_SeriesBuilder):
+    """Dense ``(monitoring_location_id, time)`` grid: one named variable per
+    parameter, NaN where a series has no observation. Ergonomic for a few
+    overlapping series but memory-costly for large ragged collections; see
+    :class:`_RaggedBuilder` for the default.
+    """
+
+    def _build_series(self, work, group_cols, ancillary, has_unit):
+        # Outer join on time: parameters sampled on different clocks share a
+        # union time axis, NaN where a given parameter has no observation.
+        ds = _xr.merge(
+            self._variable_datasets(work, group_cols, ancillary, has_unit),
+            combine_attrs="drop_conflicts",
+            join="outer",
+        )
+        ds = self._apply_provenance(ds)
+        ds["time"].attrs.setdefault("standard_name", "time")
+        if _SITE in ds.coords:
+            ds[_SITE].attrs.setdefault("cf_role", "timeseries_id")
+        order = list(ds[_SITE].values) if _SITE in ds.coords else []
+        return self._add_spatial_coords(ds, _SITE, order)
+
+    def _variable_datasets(self, work, group_cols, ancillary, has_unit):
+        """One pivoted ``(site, time)`` Dataset per (parameter, statistic)."""
+        datasets, used = [], set()
+        for _, group in work.groupby(group_cols, dropna=False):
+            pcode = _first_present(group, "parameter_code")
+            stat = _first_present(group, "statistic_id")
+            group_units = group["unit_of_measure"].dropna().unique() if has_unit else ()
+            unit = group_units[0] if len(group_units) else None
+            desc = self.series_meta.get(str(pcode), {}) if pcode is not None else {}
+
+            name = self._variable_name(desc, pcode, stat, used)
+            used.add(name)
+
+            if len(group_units) > 1:
+                # One variable can carry only one ``units`` attr; surface the
+                # mix instead of silently labeling every value with the first.
+                _warnings.warn(
+                    f"'{name}' spans multiple units {list(group_units)}; labeling "
+                    f"with '{unit}'. Filter by site/parameter to avoid mixing "
+                    "units in one variable.",
+                    stacklevel=3,
+                )
+
+            sub = group.set_index([_SITE, "time"])[["value", *ancillary]]
+            if not sub.index.is_unique:
+                _warnings.warn(
+                    f"'{name}' has multiple values per (site, time) -- two series "
+                    "share this (site, parameter, statistic); keeping the first. "
+                    "Filter the query to separate them.",
+                    stacklevel=3,
+                )
+                sub = sub[~sub.index.duplicated(keep="first")]
+            ds_g = sub.to_xarray().rename(
+                {"value": name, **{c: f"{name}_{c}" for c in ancillary}}
+            )
+            ds_g[name].attrs = _var_attrs(
+                desc,
+                unit=unit,
+                pcode=pcode,
+                stat=stat,
+                default_cell_method=self.default_cell_method,
+            )
+            if ancillary:
+                ds_g[name].attrs["ancillary_variables"] = " ".join(
+                    f"{name}_{c}" for c in ancillary
+                )
+            datasets.append(ds_g)
+        return datasets
+
+    @staticmethod
+    def _variable_name(desc, pcode, stat, used):
+        """A unique slug for a variable; disambiguate same-parameter series."""
+        name = _slug(desc.get("parameter_name") or pcode or "value")
+        if name in used:  # same parameter, different statistic -> distinct var
+            op = CF_CELL_METHODS.get(str(stat)) or (str(stat) if stat else None)
+            name = f"{name}_{_slug(op)}" if op else name
+        while name in used:
+            name += "_x"
+        return name
+
+
+class _StatsBuilder(_DatasetBuilder):
+    """Best-effort, preliminary conversion of the statistics tables.
+
+    The statistics service returns percentile tables keyed by time-of-year
+    rather than a (time, value) series, so this produces a flat Dataset (one
+    variable per column over an ``index`` dimension) with dataset-level
+    provenance only. A richer percentile / day-of-year layout is future work.
+    """
+
+    def build(self):
+        if self._is_empty():
+            return self._empty()
+        # The series builders surface only the columns they convert, so opaque
+        # hash IDs never reach those datasets. This flat path keeps every column,
+        # so drop the stats service's hash-valued IDs (and geometry) here to keep
+        # the CF dataset free of per-record UUID coordinates.
+        drop = ("geometry", "computation_id", "parent_time_series_id", "time_series_id")
+        flat = self.df.drop(columns=[c for c in drop if c in self.df.columns])
+        ds = _xr.Dataset.from_dataframe(flat.reset_index(drop=True))
+        ds = self._apply_provenance(ds)
+        ds.attrs["comment"] = "preliminary flat conversion; see module docs"
+        return ds
+
+
+def _build_ragged(df, base_meta, **kwargs):
+    """Functional entry point for the ragged layout (see :class:`_RaggedBuilder`)."""
+    return _RaggedBuilder(df, base_meta, **kwargs).build()
+
+
+def _build_dense(df, base_meta, **kwargs):
+    """Functional entry point for the dense layout (see :class:`_DenseBuilder`)."""
+    return _DenseBuilder(df, base_meta, **kwargs).build()
+
+
+def _build_stats(df, base_meta, service):
+    """Functional entry point for the stats layout (see :class:`_StatsBuilder`)."""
+    return _StatsBuilder(df, base_meta, service=service).build()
+
+
+# === public getters ========================================================
+
+
+def _fetch(func, args, kwargs):
+    """Call the underlying getter, dropping a stray ``include_hash`` kwarg.
+
+    The xarray builders surface only the columns they convert, so the opaque
+    hash-valued ID columns (per-record UUIDs, per-series join keys) never reach
+    the dataset regardless of what the getter returns. ``include_hash`` is not a
+    parameter of the plain getters, so it is swallowed here to keep passing it
+    to an xarray wrapper harmless.
+    """
+    kwargs.pop("include_hash", None)
+    return func(*args, **kwargs)
+
+
+def _xr_doc(func, *, cf_metadata=True, allow_dense=True):
+    """Prepend an xarray note to the wrapped getter's docstring.
+
+    ``cf_metadata=False`` describes the preliminary stats path (a flat Dataset
+    without per-variable CF attributes); ``allow_dense=False`` describes a
+    ragged-only path (samples), where ``dense=`` does not apply.
+    """
+    if not cf_metadata:
+        returns = (
+            "a preliminary, flat ``xarray.Dataset`` (dataset-level provenance "
+            "only; per-variable CF metadata is not yet populated)"
+        )
+    elif allow_dense:
+        returns = (
+            "a CF-conventions ``xarray.Dataset`` as a contiguous ragged array "
+            "(pass ``dense=True`` for the NaN-filled (site, time) grid with one "
+            "named variable per parameter)"
+        )
+    else:
+        returns = (
+            "a CF-conventions ``xarray.Dataset`` as a contiguous ragged array "
+            "(always ragged; discrete samples are too sparse for a dense grid, "
+            "so ``dense=`` does not apply)"
+        )
+    # A column-0 summary paragraph + a cleandoc'd body keeps the combined
+    # docstring valid numpydoc (the wrapped getter's first line is unindented
+    # but its body is source-indented, which would otherwise break dedent and
+    # over-indent the Parameters/Returns sections).
+    note = (
+        "xarray wrapper: same arguments as "
+        f"``dataretrieval.waterdata.{func.__name__}``, but returns {returns}. "
+        "Hash-valued ID columns are always omitted here; the ``include_hash`` "
+        "flag does not apply."
+    )
+    body = _inspect.cleandoc(func.__doc__) if func.__doc__ else ""
+    return f"{note}\n\n{body}" if body else note
+
+
+def _public_signature(wrapped, *, has_dense):
+    """The signature ``help()`` / IDEs should show for a wrapper.
+
+    Keeps the wrapped getter's own parameters (so its real arguments stay
+    discoverable), adds the keyword-only ``dense`` toggle where the layout
+    offers it, and declares the ``xarray.Dataset`` return -- correcting the
+    DataFrame signature that ``functools.wraps`` would otherwise carry over.
+    """
+    sig = _inspect.signature(wrapped)
+    params = list(sig.parameters.values())
+    if has_dense:
+        cut = next(
+            (
+                i
+                for i, p in enumerate(params)
+                if p.kind is _inspect.Parameter.VAR_KEYWORD
+            ),
+            len(params),
+        )
+        params.insert(
+            cut,
+            _inspect.Parameter(
+                "dense",
+                _inspect.Parameter.KEYWORD_ONLY,
+                default=False,
+                annotation="bool",
+            ),
+        )
+    return sig.replace(parameters=params, return_annotation="xarray.Dataset")
+
+
+@dataclass(frozen=True)
+class _Service:
+    """Per-service configuration that drives one public xarray getter.
+
+    The variation across services is data, not behaviour: which getter to call,
+    which :class:`_MetadataCache` (if any) supplies the parameter names, the
+    column ``schema``, the fallback ``cell_methods`` operator, whether the
+    result is a time series or the preliminary stats table, and whether a dense
+    grid is offered.
+    """
+
+    getter: Callable
+    service: str
+    metadata_cache: _MetadataCache | None = None
+    schema: _Schema = _CANONICAL
+    default_cell_method: str | None = None
+    layout: str = "series"  # "series" | "stats"
+    allow_dense: bool = True
+
+
+def _make_getter(spec):
+    """Build the public getter for one ``_Service`` spec (Factory)."""
+
+    has_dense = spec.layout != "stats" and spec.allow_dense
+
+    @_wraps(spec.getter)  # carry __name__/__wrapped__ through to help()/IDEs
+    def getter(*args, dense=False, **kwargs):
+        if dense and not has_dense:
+            _warnings.warn(
+                f"{spec.getter.__name__} has no dense layout; ignoring dense=True.",
+                stacklevel=2,
+            )
+            dense = False
+        df, base_meta = _fetch(spec.getter, args, kwargs)
+        if spec.layout == "stats":
+            return _build_stats(df, base_meta, spec.service)
+        if spec.metadata_cache is not None:
+            series_meta, site_meta = spec.metadata_cache.lookup(_sites(df))
+        else:
+            series_meta, site_meta = {}, {}
+        build = _build_dense if dense else _build_ragged
+        return build(
+            df,
+            base_meta,
+            service=spec.service,
+            schema=spec.schema,
+            series_meta=series_meta,
+            site_meta=site_meta,
+            default_cell_method=spec.default_cell_method,
+        )
+
+    # ``@_wraps`` copied the getter's docstring, signature and annotations
+    # verbatim -- which would advertise a DataFrame return and hide ``dense``.
+    # Replace the docstring with the xarray note and publish an accurate
+    # signature: the getter's own args, plus ``dense`` where it applies,
+    # returning an ``xarray.Dataset``.
+    getter.__doc__ = _xr_doc(
+        spec.getter,
+        cf_metadata=(spec.layout != "stats"),
+        allow_dense=spec.allow_dense,
+    )
+    getter.__signature__ = _public_signature(spec.getter, has_dense=has_dense)
+    getter.__annotations__ = {
+        **getter.__annotations__,
+        "return": "xarray.Dataset",
+        **({"dense": "bool"} if has_dense else {}),
+    }
+    return getter
+
+
+get_daily = _make_getter(_Service(_api.get_daily, "daily", metadata_cache=_TS_CACHE))
+get_continuous = _make_getter(
+    _Service(_api.get_continuous, "continuous", metadata_cache=_TS_CACHE)
+)
+get_latest_continuous = _make_getter(
+    _Service(
+        _api.get_latest_continuous,
+        "latest-continuous",
+        metadata_cache=_TS_CACHE,
+        default_cell_method="point",
+    )
+)
+get_latest_daily = _make_getter(
+    _Service(
+        _api.get_latest_daily,
+        "latest-daily",
+        metadata_cache=_TS_CACHE,
+        default_cell_method="point",
+    )
+)
+get_nearest_continuous = _make_getter(
+    _Service(
+        _get_nearest_continuous,
+        "continuous",
+        metadata_cache=_TS_CACHE,
+        default_cell_method="point",
+    )
+)
+get_peaks = _make_getter(
+    _Service(
+        _api.get_peaks,
+        "peaks",
+        metadata_cache=_TS_CACHE,
+        default_cell_method="maximum",
+    )
+)
+get_field_measurements = _make_getter(
+    _Service(
+        _api.get_field_measurements,
+        "field-measurements",
+        metadata_cache=_FIELD_CACHE,
+        schema=_FIELD,
+        default_cell_method="point",
+    )
+)
+get_stats_por = _make_getter(_Service(_api.get_stats_por, "statistics", layout="stats"))
+get_stats_date_range = _make_getter(
+    _Service(_api.get_stats_date_range, "statistics", layout="stats")
+)
+get_samples = _make_getter(
+    _Service(
+        _api.get_samples,
+        "samples",
+        schema=_SAMPLES,
+        default_cell_method="point",
+        allow_dense=False,
+    )
+)
diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb
new file mode 100644
index 00000000..3aa28246
--- /dev/null
+++ b/demos/waterdata_xarray_demo.ipynb
@@ -0,0 +1,382 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# CF-conventions `xarray` datasets from the `waterdata` module\n",
+    "\n",
+    "The `dataretrieval.waterdata.xarray` module mirrors the `waterdata` getters\n",
+    "(`get_daily`, `get_continuous`, `get_peaks`, `get_samples`, …) but returns a\n",
+    "[CF-conventions](https://cfconventions.org/) [`xarray.Dataset`](https://docs.xarray.dev/)\n",
+    "instead of a `pandas.DataFrame`. Units, statistics, and station coordinates come\n",
+    "straight from the data, and parameter names from a small cached lookup; all are\n",
+    "written onto the dataset as CF attributes, so the result is self-describing and\n",
+    "ready to write to netCDF.\n",
+    "\n",
+    "This notebook covers:\n",
+    "\n",
+    "1. the default **ragged** layout and how to read it;\n",
+    "2. **why** ragged is the default (and the `dense=True` opt-out);\n",
+    "3. the one trade-off you need to know — **selecting by time**;\n",
+    "4. multiple parameters in one pull;\n",
+    "5. water-quality samples (`get_samples`);\n",
+    "6. writing CF netCDF."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "The xarray helpers are an optional extra:\n",
+    "\n",
+    "```bash\n",
+    "pip install dataretrieval[xarray]\n",
+    "```\n",
+    "\n",
+    "As with the rest of `waterdata`, signing up for a free\n",
+    "[API key](https://api.waterdata.usgs.gov/signup/) gives you higher rate limits.\n",
+    "Import the xarray wrappers under a short alias so they don't shadow the plain\n",
+    "`DataFrame`-returning getters:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "from dataretrieval.waterdata import xarray as wdx"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. A single time series\n",
+    "\n",
+    "Pull a year of daily mean discharge (parameter `00060`, statistic `00003`) at one\n",
+    "gage. The wrapper takes the same arguments as `waterdata.get_daily`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = wdx.get_daily(\n",
+    "    monitoring_location_id=\"USGS-05407000\",\n",
+    "    parameter_code=\"00060\",\n",
+    "    statistic_id=\"00003\",\n",
+    "    time=\"2023-01-01/2024-01-01\",\n",
+    ")\n",
+    "ds"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note the shape of the result — this is a CF **contiguous ragged array**\n",
+    "(`featureType = \"timeSeries\"`):\n",
+    "\n",
+    "* every observation lives along a single `obs` dimension;\n",
+    "* each *series* — one `(monitoring location, parameter, statistic)` — is one\n",
+    "  instance along the `timeseries` dimension;\n",
+    "* `row_size` records how many observations each series contributes (the CF\n",
+    "  `sample_dimension` link), and `timeseries_id` carries `cf_role`;\n",
+    "* because there is a single, homogeneous parameter here, the descriptors land\n",
+    "  directly on `value` (`long_name`, `units`, `cell_methods`, `standard_name`).\n",
+    "\n",
+    "The flag columns (`qualifier`, `approval_status`) are linked as\n",
+    "`ancillary_variables`, and dataset attributes carry provenance:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(ds[\"value\"].attrs)\n",
+    "print({k: ds.attrs[k] for k in (\"Conventions\", \"featureType\", \"references\", \"date_modified\")})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Why ragged is the default\n",
+    "\n",
+    "Real collections are *ragged*: some gages have a century of record, others a few\n",
+    "years. Pull discharge at two gages with very different start dates and look at\n",
+    "`row_size`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sites = [\"USGS-05407000\", \"USGS-02238500\"]  # records since 1913 vs 2008\n",
+    "# Bound the window so the docs build stays light; the start-date gap still\n",
+    "# shows up as different row_size (~24 vs ~16 years of daily values).\n",
+    "ragged = wdx.get_daily(\n",
+    "    monitoring_location_id=sites,\n",
+    "    parameter_code=\"00060\",\n",
+    "    statistic_id=\"00003\",\n",
+    "    time=\"2000-01-01/2024-01-01\",\n",
+    ")\n",
+    "print(\"dims      :\", dict(ragged.sizes))\n",
+    "print(\"row_size  :\", dict(zip(ragged[\"monitoring_location_id\"].values, ragged[\"row_size\"].values)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The alternative is a **dense** `(monitoring_location_id, time)` grid — one named\n",
+    "variable per parameter, NaN where a series has no observation. It is convenient\n",
+    "(see the next section) but pays for a union time axis and NaN fill. Pass\n",
+    "`dense=True` to get it, and compare the in-memory size:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dense = wdx.get_daily(\n",
+    "    monitoring_location_id=sites,\n",
+    "    parameter_code=\"00060\",\n",
+    "    statistic_id=\"00003\",\n",
+    "    time=\"2000-01-01/2024-01-01\",\n",
+    "    dense=True,\n",
+    ")\n",
+    "print(\"dense dims :\", dict(dense.sizes))\n",
+    "print(\"dense var  :\", list(dense.data_vars))\n",
+    "print(f\"ragged {ragged.nbytes/1e6:6.2f} MB   dense {dense.nbytes/1e6:6.2f} MB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With only two gages the gap is modest, but it grows fast: a whole state's daily\n",
+    "discharge can be 15–30% dense, where the gridded layout balloons to hundreds of\n",
+    "megabytes (mostly NaN) while the ragged array stores only real observations.\n",
+    "That is why ragged is the default."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. The trade-off: selecting by time\n",
+    "\n",
+    "This is the one thing to internalize about the ragged layout.\n",
+    "\n",
+    "In the **dense** dataset, `time` is a real dimension with an index, so\n",
+    "label-based selection just works:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# all sites on a given day, addressed as a (site, time) grid\n",
+    "dense[\"discharge\"].sel(time=\"2020-06-01\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the **ragged** dataset, `value` is one flat list along `obs`, and `time` is\n",
+    "*just another variable* riding along `obs` — not a dimension. So `value` cannot\n",
+    "be addressed as a `(site, time)` grid, and a time selection returns whatever\n",
+    "observations happen to match, **mixed across sites with no labels** (and\n",
+    "identical dates across sites collide onto the same axis):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"value dims:\", ragged[\"value\"].dims)  # ('obs',) -- not (site, time)\n",
+    "# .sel(time=...) on the flat obs axis can't give you a per-series slice\n",
+    "ragged[\"value\"].sel(time=\"2020-06-01\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To recover the convenient, time-indexed view you first **regroup** the flat\n",
+    "`obs` back into per-series pieces using the offsets implied by `row_size`. A\n",
+    "tiny helper:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def series(ds, i):\n",
+    "    \"\"\"Instance ``i`` of a ragged dataset as a time-indexed DataArray.\"\"\"\n",
+    "    starts = np.concatenate([[0], np.cumsum(ds[\"row_size\"].values)])\n",
+    "    sl = slice(int(starts[i]), int(starts[i + 1]))\n",
+    "    da = ds[\"value\"].isel(obs=sl)\n",
+    "    return da.assign_coords(time=ds[\"time\"].isel(obs=sl)).swap_dims(obs=\"time\")\n",
+    "\n",
+    "\n",
+    "s0 = series(ragged, 0)\n",
+    "print(ragged[\"timeseries_id\"].values[0])\n",
+    "s0.sel(time=\"2020-06-01\")  # now .sel(time=...) works on a single series"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For anything beyond a one-off slice, the [`cf-xarray`](https://cf-xarray.readthedocs.io/)\n",
+    "package understands the CF ragged encoding (`cf_role`, `sample_dimension`) and\n",
+    "will regroup/decode for you. Or, when you know the pull is small and overlapping,\n",
+    "just ask for `dense=True` up front and use `.sel(time=...)` directly.\n",
+    "\n",
+    "**Rule of thumb:** ragged for storage and large multi-site pulls; `dense=True`\n",
+    "for ergonomic time-based slicing of a few overlapping series."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Multiple parameters in one pull\n",
+    "\n",
+    "When a pull mixes parameters (and units), the ragged layout keeps a single\n",
+    "`value` and records the parameter/unit per *instance* — so nothing is\n",
+    "mislabeled. The homogeneous descriptors drop off `value` and live on the\n",
+    "`parameter_code` / `unit_of_measure` coordinates instead:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "multi = wdx.get_daily(\n",
+    "    monitoring_location_id=\"USGS-05407000\",\n",
+    "    parameter_code=[\"00060\", \"00045\"],  # discharge + precipitation\n",
+    "    time=\"2023-06-01/2023-07-01\",\n",
+    ")\n",
+    "print(\"value long_name:\", multi[\"value\"].attrs.get(\"long_name\"))\n",
+    "print(\"per-instance parameter_code:\", multi[\"parameter_code\"].values)\n",
+    "print(\"per-instance unit_of_measure:\", multi[\"unit_of_measure\"].values)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the `dense=True` form each parameter instead becomes its own named variable\n",
+    "(`discharge`, `precipitation`, …) on the shared time grid."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Water-quality samples\n",
+    "\n",
+    "`get_samples` returns discrete water-quality results in the same ragged shape:\n",
+    "one instance per `(monitoring location, characteristic, sample fraction)`, with\n",
+    "the result value plus `detection_condition` and `status` as ancillary\n",
+    "(censoring) variables. Characteristics are free text, so no CF `standard_name`\n",
+    "is inferred and non-numeric results coerce to NaN (the `detection_condition`\n",
+    "variable preserves non-detects)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wq = wdx.get_samples(\n",
+    "    service=\"results\",\n",
+    "    profile=\"basicphyschem\",\n",
+    "    monitoringLocationIdentifier=\"USGS-05406500\",\n",
+    "    activityStartDateLower=\"2019-01-01\",\n",
+    "    activityStartDateUpper=\"2020-01-01\",\n",
+    ")\n",
+    "print(\"dims:\", dict(wq.sizes))\n",
+    "print(\"characteristics:\", sorted(set(wq[\"characteristic\"].values))[:8], \"...\")\n",
+    "print(\"ancillary:\", wq[\"value\"].attrs.get(\"ancillary_variables\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Writing CF netCDF\n",
+    "\n",
+    "Because the dataset already carries CF metadata in the standard ragged-array\n",
+    "encoding, it serializes straight to a self-describing netCDF file that\n",
+    "CF-aware tools (THREDDS, `cf-xarray`, Panoply, …) can read back (requires a\n",
+    "netCDF backend, e.g. `pip install netCDF4`):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import tempfile\n",
+    "\n",
+    "path = os.path.join(tempfile.mkdtemp(), \"daily_discharge.nc\")\n",
+    "ragged.to_netcdf(path)  # CF-1.11 contiguous ragged array\n",
+    "print(\"wrote\", path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## References\n",
+    "\n",
+    "* CF conventions — [Discrete Sampling Geometries](https://cfconventions.org/cf-conventions/cf-conventions.html#discrete-sampling-geometries)\n",
+    "  (contiguous ragged array representation)\n",
+    "* [`cf-xarray`](https://cf-xarray.readthedocs.io/) — decode/group CF DSG datasets\n",
+    "* [`xarray`](https://docs.xarray.dev/) documentation"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst
index 91d7bd1f..c86f6f4d 100644
--- a/docs/source/examples/index.rst
+++ b/docs/source/examples/index.rst
@@ -32,6 +32,18 @@ functions and is executed against the live USGS Water Data API.
     USGS_WaterData_ContinuousData_Examples
     USGS_WaterData_ReferenceLists_Examples
 
+CF-conventions ``xarray`` datasets from the ``waterdata`` module
+----------------------------------------------------------------
+The ``waterdata.xarray`` wrappers return CF-conventions ``xarray.Dataset``
+objects (a ragged array by default, with a ``dense=True`` gridded opt-out).
+This notebook demonstrates the layouts, the time-selection trade-off, and
+writing CF netCDF.
+
+.. toctree::
+    :maxdepth: 1
+
+    waterdata_xarray_demo
+
 Simple uses of the ``dataretrieval`` package
 --------------------------------------------
 
diff --git a/docs/source/examples/waterdata_xarray_demo.nblink b/docs/source/examples/waterdata_xarray_demo.nblink
new file mode 100644
index 00000000..9c886545
--- /dev/null
+++ b/docs/source/examples/waterdata_xarray_demo.nblink
@@ -0,0 +1,3 @@
+{
+    "path": "../../../demos/waterdata_xarray_demo.ipynb"
+}
diff --git a/pyproject.toml b/pyproject.toml
index 62ac7478..2c90f806 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,10 +57,18 @@ doc = [
   # bound as the [nldi] extra.
   "geopandas>=0.10",
   "mapclassify",
+  # waterdata_xarray_demo.ipynb imports dataretrieval.waterdata.xarray and
+  # writes a CF netCDF file; nbsphinx executes it at build time, so the doc
+  # environment needs xarray and a netCDF backend.
+  "xarray>=2023.1.0",
+  "netCDF4",
 ]
 nldi = [
   'geopandas>=0.10'
 ]
+xarray = [
+  'xarray>=2023.1.0'
+]
 
 [project.urls]
 homepage = "https://github.com/DOI-USGS/dataretrieval-python"
diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py
new file mode 100644
index 00000000..d74fc64c
--- /dev/null
+++ b/tests/waterdata_xarray_test.py
@@ -0,0 +1,803 @@
+"""Offline unit tests for dataretrieval.waterdata.xarray converters.
+
+These exercise the pure DataFrame -> xarray.Dataset converters with synthetic
+frames, so they run without network access. Live end-to-end behavior is
+covered by the waterdata getters' own tests.
+"""
+
+from types import SimpleNamespace
+
+import pandas as pd
+import pytest
+
+xr = pytest.importorskip("xarray")
+from dataretrieval.waterdata import xarray as wdx  # noqa: E402
+
+
+def _meta(url="https://example.test/items"):
+    return SimpleNamespace(url=url)
+
+
+def _daily_frame(
+    time_series_id="A",
+    site="USGS-1",
+    values=(100, 110),
+    times=("2024-06-01", "2024-06-02"),
+):
+    n = len(values)
+    return pd.DataFrame(
+        {
+            "time": list(times),
+            "value": list(values),
+            "monitoring_location_id": [site] * n,
+            "parameter_code": ["00060"] * n,
+            "statistic_id": ["00003"] * n,
+            "unit_of_measure": ["ft^3/s"] * n,
+            "qualifier": [None] * n,
+            "approval_status": ["Approved"] * n,
+            "time_series_id": [time_series_id] * n,
+        }
+    )
+
+
+# series_meta is keyed by parameter_code and supplies only the readable name;
+# units/statistic/parameter_code come from the values frame itself.
+_DISCHARGE_META = {
+    "00060": {
+        "parameter_name": "Discharge",
+        "parameter_description": "Discharge, cubic feet per second",
+    }
+}
+
+
+def _temp_frame(times=("2024-06-02", "2024-06-03"), values=(18.0, 19.0)):
+    """A water-temperature (00010, instantaneous) frame, parallel to _daily_frame."""
+    n = len(values)
+    return pd.DataFrame(
+        {
+            "time": list(times),
+            "value": list(values),
+            "monitoring_location_id": ["USGS-1"] * n,
+            "parameter_code": ["00010"] * n,
+            "statistic_id": ["00011"] * n,
+            "unit_of_measure": ["degC"] * n,
+            "qualifier": [None] * n,
+            "approval_status": ["Approved"] * n,
+            "time_series_id": ["B"] * n,
+        }
+    )
+
+
+_TEMP_META = {
+    "00010": {
+        "parameter_name": "Temperature, water",
+        "parameter_description": "Temperature, water, degrees Celsius",
+    }
+}
+
+
+def test_build_timeseries_cf_attributes():
+    ds = wdx._build_dense(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert isinstance(ds, xr.Dataset)
+    assert "discharge" in ds.data_vars
+    v = ds["discharge"]
+    assert v.attrs["long_name"] == "Discharge, cubic feet per second"
+    assert v.attrs["units"] == "ft3 s-1"  # UDUNITS-normalized from "ft^3/s"
+    assert v.attrs["cell_methods"] == "time: mean"
+    assert v.attrs["standard_name"] == "water_volume_transport_in_river_channel"
+    assert v.attrs["usgs_parameter_code"] == "00060"
+    assert v.attrs["usgs_statistic_id"] == "00003"
+    # dataset-level provenance
+    assert ds.attrs["Conventions"] == "CF-1.11"
+    assert ds.attrs["featureType"] == "timeSeries"
+    assert ds.attrs["references"] == "https://example.test/items"
+    # site is the DSG instance dimension
+    assert ds["monitoring_location_id"].attrs.get("cf_role") == "timeseries_id"
+    assert ds.sizes["time"] == 2
+
+
+def test_ancillary_variables_linked():
+    ds = wdx._build_dense(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert "discharge_qualifier" in ds.data_vars
+    assert "discharge_approval_status" in ds.data_vars
+    assert ds["discharge"].attrs["ancillary_variables"] == (
+        "discharge_qualifier discharge_approval_status"
+    )
+
+
+def test_unknown_unit_passes_through_verbatim():
+    df = _daily_frame()
+    df["unit_of_measure"] = "widgets/s"  # units are read from the frame
+    ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+    assert ds["discharge"].attrs["units"] == "widgets/s"
+
+
+def test_missing_standard_name_is_omitted():
+    # parameter_code with no curated CF mapping -> no standard_name key
+    meta = {"99999": {"parameter_name": "Mystery", "parameter_description": "Mystery"}}
+    df = _daily_frame()
+    df["parameter_code"] = "99999"
+    ds = wdx._build_dense(df, _meta(), service="daily", series_meta=meta)
+    assert "standard_name" not in ds["mystery"].attrs
+    assert ds["mystery"].attrs["usgs_parameter_code"] == "99999"
+
+
+def test_vertical_datum_distinguishes_stage_parameters():
+    # 00065 (gage height) and 63160 (water level above NAVD88) share the CF
+    # standard_name water_surface_height_above_reference_datum; the vertical_datum
+    # attribute records the differing reference datum so they're distinguishable.
+    for pcode, datum in (("00065", "local site datum"), ("63160", "NAVD88")):
+        df = _daily_frame()
+        df["parameter_code"] = pcode
+        ds = wdx._build_ragged(df, _meta(), service="continuous", series_meta={})
+        v = ds["value"]
+        assert v.attrs["standard_name"] == "water_surface_height_above_reference_datum"
+        assert v.attrs["vertical_datum"] == datum
+    # a parameter with no datum mapping gets no vertical_datum attr
+    ds = wdx._build_ragged(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert "vertical_datum" not in ds["value"].attrs
+
+
+def test_multiple_parameters_outer_join_on_time():
+    # discharge at t1,t2 ; temperature at t2,t3 -> union time, NaN fill
+    q = _daily_frame(values=(100, 110), times=("2024-06-01", "2024-06-02"))
+    t = _temp_frame()
+    meta = {**_DISCHARGE_META, **_TEMP_META}
+    ds = wdx._build_dense(
+        pd.concat([q, t]), _meta(), service="continuous", series_meta=meta
+    )
+    assert {"discharge", "temperature_water"} <= set(ds.data_vars)
+    assert ds.sizes["time"] == 3  # union of {t1,t2,t3}
+    # temperature has no value at t1 -> NaN
+    assert pd.isna(ds["temperature_water"].sel(time="2024-06-01").item())
+    # cell_methods derived from statistic_id 00011 (instantaneous) -> point
+    assert ds["temperature_water"].attrs["cell_methods"] == "time: point"
+
+
+def test_collision_dedups_with_warning():
+    # two values for the same (site, parameter, statistic, time) are ambiguous
+    # without the hash key -> keep the first and warn; site stays the dim.
+    a = _daily_frame(values=(100,), times=("2024-06-01",))
+    b = _daily_frame(values=(200,), times=("2024-06-01",))
+    with pytest.warns(UserWarning, match="multiple values per"):
+        ds = wdx._build_dense(
+            pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META
+        )
+    assert "monitoring_location_id" in ds.dims
+    assert ds.sizes["time"] == 1
+    assert (
+        ds["discharge"].sel(monitoring_location_id="USGS-1", time="2024-06-01").item()
+        == 100
+    )
+
+
+def test_empty_frame_returns_dataset_with_conventions():
+    ds = wdx._build_dense(pd.DataFrame(), _meta(), service="daily", series_meta={})
+    assert isinstance(ds, xr.Dataset)
+    assert list(ds.data_vars) == []
+    assert ds.attrs["Conventions"] == "CF-1.11"
+
+
+def test_build_stats_flat_dataset():
+    df = pd.DataFrame(
+        {
+            "monitoring_location_id": ["USGS-1", "USGS-1"],
+            "parameter_code": ["00060", "00060"],
+            "month": [1, 2],
+            "p50_va": [120.0, 130.0],
+        }
+    )
+    ds = wdx._build_stats(df, _meta(), "statistics")
+    assert isinstance(ds, xr.Dataset)
+    assert "p50_va" in ds.data_vars
+    assert ds.attrs["Conventions"] == "CF-1.11"
+
+
+def test_build_stats_drops_hash_columns():
+    # The plain getters no longer drop hash IDs, so the flat stats builder must
+    # drop the stats service's per-record / per-series UUIDs itself to keep the
+    # CF dataset free of opaque coordinates.
+    df = pd.DataFrame(
+        {
+            "monitoring_location_id": ["USGS-1"],
+            "parameter_code": ["00060"],
+            "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"],
+            "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"],
+            "time_series_id": ["b026-24dfa11f8503"],
+            "p50_va": [120.0],
+        }
+    )
+    ds = wdx._build_stats(df, _meta(), "statistics")
+    for hash_col in ("computation_id", "parent_time_series_id", "time_series_id"):
+        assert hash_col not in ds.variables
+    assert "p50_va" in ds.data_vars
+
+
+def test_ragged_omits_hash_columns():
+    # The synthetic daily frame carries a time_series_id hash column; the ragged
+    # builder whitelists the columns it converts, so the hash never surfaces in
+    # the dataset (the timeseries path stays hash-free without the getter's
+    # help).
+    ds = wdx._build_ragged(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert "time_series_id" not in ds.variables
+
+
+def test_public_wrappers_exist_and_are_documented():
+    for name in [n for n in wdx.__all__ if n.startswith("get_")]:
+        fn = getattr(wdx, name)
+        assert callable(fn)
+        # _make_getter carries the wrapped getter's name through, so the public
+        # symbol, its __name__, and the docstring reference all agree.
+        assert fn.__name__ == name
+        assert "xarray wrapper" in (fn.__doc__ or "")
+        assert f"dataretrieval.waterdata.{name}" in (fn.__doc__ or "")
+
+
+def test_fetch_strips_include_hash():
+    # The xarray path never surfaces hash columns, so include_hash must be
+    # dropped before the underlying getter is called (no wasted fetch).
+    captured = {}
+
+    def fake_getter(*args, **kwargs):
+        captured.update(kwargs)
+        return "df", "meta"
+
+    df, meta = wdx._fetch(
+        fake_getter, (), {"include_hash": True, "parameter_code": "00060"}
+    )
+    assert (df, meta) == ("df", "meta")
+    assert "include_hash" not in captured
+    assert captured == {"parameter_code": "00060"}
+
+
+def test_every_wrapper_routes_through_fetch(monkeypatch):
+    # Pin the wiring: each public wrapper must delegate the underlying fetch to
+    # _fetch (which is what strips include_hash). Guards against a wrapper
+    # quietly reverting to calling the getter directly and leaking the flag.
+    seen = []
+
+    def spy(func, args, kwargs):
+        seen.append(dict(kwargs))
+        return pd.DataFrame(), SimpleNamespace(url=None)  # empty -> no network
+
+    monkeypatch.setattr(wdx, "_fetch", spy)
+    for name in [n for n in wdx.__all__ if n.startswith("get_")]:
+        seen.clear()
+        ds = getattr(wdx, name)(monitoring_location_id="USGS-1", include_hash=True)
+        assert isinstance(ds, xr.Dataset)
+        assert len(seen) == 1, f"{name} did not route through _fetch"
+        # the wrapper hands include_hash to _fetch; _fetch is what drops it
+        # (asserted in test_fetch_strips_include_hash)
+        assert seen[0].get("include_hash") is True
+
+
+def test_unparseable_time_dropped_with_warning():
+    # A bad/missing time must be dropped explicitly (with a warning), not
+    # silently swallowed by to_xarray.
+    df = _daily_frame(
+        values=(100, 110, 120),
+        times=("2024-06-01", "not-a-date", "2024-06-03"),
+    )
+    with pytest.warns(UserWarning, match="unparseable or missing time"):
+        ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+    assert ds.sizes["time"] == 2  # the bad-time row is gone, the good ones stay
+    assert 110 not in ds["discharge"].values  # the dropped value did not survive
+
+
+def test_all_unparseable_time_returns_empty_dataset():
+    df = _daily_frame(values=(1, 2), times=("bad-a", "bad-b"))
+    with pytest.warns(UserWarning, match="unparseable or missing time"):
+        ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+    assert list(ds.data_vars) == []
+    assert ds.attrs["Conventions"] == "CF-1.11"
+
+
+def test_mixed_units_in_one_variable_warns():
+    # Same (parameter, statistic) across two sites but different units -> one
+    # variable can hold only one units attr; warn instead of silently mislabeling.
+    a = _daily_frame(site="USGS-1", values=(100,), times=("2024-06-01",))
+    b = _daily_frame(site="USGS-2", values=(3,), times=("2024-06-01",))
+    b["unit_of_measure"] = "m3 s-1"
+    with pytest.warns(UserWarning, match="spans multiple units"):
+        ds = wdx._build_dense(
+            pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META
+        )
+    assert ds.sizes["monitoring_location_id"] == 2
+    assert ds["discharge"].attrs["units"] == "ft3 s-1"  # first unit (ft^3/s)
+
+
+def test_point_coords_from_list_geometry():
+    # Without geopandas the OGC geometry is a plain [lon, lat] list, not a
+    # shapely Point. lon/lat must still be extracted (regression: the old
+    # ``.x``/``.y`` access raised AttributeError and silently dropped them).
+    df = _daily_frame()
+    df["geometry"] = [[-90.44, 43.19]] * len(df)
+    ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+    assert "longitude" in ds.coords and "latitude" in ds.coords
+    assert ds["longitude"].sel(monitoring_location_id="USGS-1").item() == -90.44
+    assert ds["latitude"].sel(monitoring_location_id="USGS-1").item() == 43.19
+    assert ds["longitude"].attrs["units"] == "degrees_east"
+    assert ds["latitude"].attrs["standard_name"] == "latitude"
+
+
+def test_point_coords_from_pointlike_geometry():
+    # The shapely-Point path (geopandas installed) still works: any object
+    # exposing .x/.y is read directly.
+    df = _daily_frame()
+    df["geometry"] = [SimpleNamespace(x=-90.44, y=43.19)] * len(df)
+    ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+    assert ds["longitude"].sel(monitoring_location_id="USGS-1").item() == -90.44
+    assert ds["latitude"].sel(monitoring_location_id="USGS-1").item() == 43.19
+
+
+def test_non_point_geometry_skipped():
+    # A non-point geometry (no .x/.y, not a [lon, lat] pair) is skipped, not
+    # guessed -- no lon/lat coords are added.
+    df = _daily_frame()
+    df["geometry"] = [object()] * len(df)
+    ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+    assert "longitude" not in ds.coords and "latitude" not in ds.coords
+
+
+def test_date_modified_from_last_modified():
+    # The newest last_modified becomes the dataset-level date_modified (ACDD).
+    df = _daily_frame()
+    df["last_modified"] = ["2024-06-01T00:00:00Z", "2024-06-10T12:00:00Z"]
+    ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+    assert ds.attrs["date_modified"].startswith("2024-06-10")
+
+
+def test_no_date_modified_without_last_modified():
+    # The default frame has no last_modified column -> no date_modified attr.
+    ds = wdx._build_dense(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert "date_modified" not in ds.attrs
+
+
+def test_site_metadata_coordinates():
+    # HUC / state_name ride along on the metadata call and become
+    # instance-indexed auxiliary coordinates.
+    site_meta = {
+        "USGS-1": {"hydrologic_unit_code": "07070005", "state_name": "Wisconsin"}
+    }
+    ds = wdx._build_dense(
+        _daily_frame(),
+        _meta(),
+        service="daily",
+        series_meta=_DISCHARGE_META,
+        site_meta=site_meta,
+    )
+    huc = ds["hydrologic_unit_code"].sel(monitoring_location_id="USGS-1").item()
+    assert huc == "07070005"
+    assert ds["state_name"].sel(monitoring_location_id="USGS-1").item() == "Wisconsin"
+    assert ds["hydrologic_unit_code"].attrs["long_name"] == "hydrologic unit code (HUC)"
+
+
+def test_site_metadata_absent_adds_no_coords():
+    # No site_meta -> no HUC/state coords (back-compat with the old signature).
+    ds = wdx._build_dense(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert "hydrologic_unit_code" not in ds.coords
+    assert "state_name" not in ds.coords
+
+
+# --- ragged layout (the default) -------------------------------------------
+
+
+def test_ragged_structure_and_cf_attrs():
+    # Single (site, parameter, statistic) -> one instance; value attrs are set
+    # because the descriptors are homogeneous across the (one) instance.
+    ds = wdx._build_ragged(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert set(ds.sizes) == {"obs", "timeseries"}
+    assert ds.sizes == {"obs": 2, "timeseries": 1}
+    assert ds["value"].dims == ("obs",)
+    assert list(ds["value"].values) == [100, 110]
+    assert ds["row_size"].dims == ("timeseries",)
+    assert int(ds["row_size"][0]) == 2
+    assert ds["row_size"].attrs["sample_dimension"] == "obs"
+    # per-instance identity + cf_role on the synthesized timeseries_id
+    assert ds["monitoring_location_id"].dims == ("timeseries",)
+    assert ds["timeseries_id"].attrs["cf_role"] == "timeseries_id"
+    assert ds["timeseries_id"].values[0] == "USGS-1:00060:00003"
+    # homogeneous descriptors land on value
+    v = ds["value"]
+    assert v.attrs["long_name"] == "Discharge, cubic feet per second"
+    assert v.attrs["units"] == "ft3 s-1"
+    assert v.attrs["cell_methods"] == "time: mean"
+    assert v.attrs["standard_name"] == "water_volume_transport_in_river_channel"
+    assert v.attrs["ancillary_variables"] == "qualifier approval_status"
+    assert ds.attrs["featureType"] == "timeSeries"
+    # ancillary flags are per-observation; metadata is per-instance
+    assert ds["qualifier"].dims == ("obs",)
+    assert ds["approval_status"].dims == ("obs",)
+    assert ds["parameter_code"].dims == ("timeseries",)
+    assert ds["parameter_code"].values.tolist() == ["00060"]
+    assert ds["statistic_id"].values.tolist() == ["00003"]
+    assert ds["unit_of_measure"].values.tolist() == ["ft^3/s"]
+    assert ds["parameter_code"].attrs["long_name"] == "USGS parameter code"
+
+
+def test_ragged_stores_only_real_observations():
+    # Two sites of very different length: obs == sum of lengths, no NaN fill.
+    a = _daily_frame(
+        site="USGS-A",
+        values=(1, 2, 3),
+        times=("2024-06-01", "2024-06-02", "2024-06-03"),
+    )
+    b = _daily_frame(site="USGS-B", values=(9,), times=("2024-06-03",))
+    ds = wdx._build_ragged(
+        pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert ds.sizes == {"obs": 4, "timeseries": 2}  # 3 + 1, not 2 x 3 grid
+    assert not pd.isna(ds["value"].values).any()
+    assert sorted(ds["row_size"].values.tolist()) == [1, 3]
+
+
+def test_ragged_mixed_parameters_value_is_generic():
+    # Mixed parameters/units -> value carries no single units/standard_name;
+    # the per-instance parameter_code coordinate disambiguates.
+    q = _daily_frame(values=(100, 110), times=("2024-06-01", "2024-06-02"))
+    t = _temp_frame()
+    meta = {**_DISCHARGE_META, **_TEMP_META}
+    ds = wdx._build_ragged(
+        pd.concat([q, t]), _meta(), service="continuous", series_meta=meta
+    )
+    assert ds.sizes == {"obs": 4, "timeseries": 2}
+    assert ds["value"].attrs["long_name"] == "measured value"
+    assert "units" not in ds["value"].attrs  # ft^3/s vs degC -> not homogeneous
+    assert "standard_name" not in ds["value"].attrs
+    assert set(ds["parameter_code"].values) == {"00060", "00010"}
+
+
+def test_ragged_keeps_duplicate_observations_without_warning():
+    # The dense path warns + dedups colliding (site, time); ragged just keeps
+    # both observations -- no grid, so no ambiguity. Assert specifically that
+    # neither dense-path diagnostic fires (not "no warning at all", so an
+    # unrelated pandas warning can't fail the test for the wrong reason).
+    a = _daily_frame(values=(100,), times=("2024-06-01",))
+    b = _daily_frame(values=(200,), times=("2024-06-01",))
+    import warnings as _w
+
+    with _w.catch_warnings(record=True) as caught:
+        _w.simplefilter("always")
+        ds = wdx._build_ragged(
+            pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META
+        )
+    msgs = [str(w.message) for w in caught]
+    assert not any(
+        "multiple values per" in m or "spans multiple units" in m for m in msgs
+    ), msgs
+    assert ds.sizes["obs"] == 2
+    assert sorted(ds["value"].values.tolist()) == [100, 200]
+
+
+def test_ragged_lonlat_and_date_modified_per_instance():
+    df = _daily_frame()
+    df["geometry"] = [[-90.44, 43.19]] * len(df)
+    df["last_modified"] = ["2024-06-01T00:00:00Z", "2024-06-10T12:00:00Z"]
+    ds = wdx._build_ragged(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+    assert ds["longitude"].dims == ("timeseries",)
+    assert float(ds["longitude"][0]) == -90.44
+    assert float(ds["latitude"][0]) == 43.19
+    assert ds.attrs["date_modified"].startswith("2024-06-10")
+
+
+def _instance_blocks(ds):
+    """Map each instance's (parameter, statistic) -> (time-sorted values, unit)
+    by walking the row_size offsets -- the reader's view of a ragged array."""
+    starts, acc = [], 0
+    for n in ds["row_size"].values.tolist():
+        starts.append(acc)
+        acc += int(n)
+    starts.append(acc)
+    blocks = {}
+    for i in range(ds.sizes["timeseries"]):
+        sl = slice(starts[i], starts[i + 1])
+        key = (str(ds["parameter_code"].values[i]), str(ds["statistic_id"].values[i]))
+        blocks[key] = (
+            ds["value"].values[sl].tolist(),
+            str(ds["unit_of_measure"].values[i]),
+        )
+    return blocks
+
+
+def test_ragged_alignment_survives_interleaved_input():
+    # The critical invariant: row_size, the contiguous obs blocks, and the
+    # per-instance metadata must all stay aligned even when the input rows are
+    # interleaved across instances and out of time order. A regression in the
+    # sort/group ordering would silently map values to the wrong series.
+    cols = [
+        "monitoring_location_id",
+        "parameter_code",
+        "statistic_id",
+        "unit_of_measure",
+        "time",
+        "value",
+    ]
+    # Values are deliberately NON-monotonic with time within each instance, so
+    # a regression that ordered obs by value (instead of time) would produce a
+    # different sequence and fail -- "sorted values" alone wouldn't catch that.
+    rows = [
+        ("USGS-1", "00060", "00003", "ft^3/s", "2024-06-02", 100),  # later, smaller
+        ("USGS-1", "00010", "00011", "degC", "2024-06-01", 19),
+        ("USGS-1", "00060", "00001", "ft^3/s", "2024-06-03", 500),
+        ("USGS-1", "00060", "00003", "ft^3/s", "2024-06-01", 150),  # earlier, larger
+        ("USGS-1", "00010", "00011", "degC", "2024-06-03", 18),
+        ("USGS-1", "00060", "00001", "ft^3/s", "2024-06-01", 480),
+    ]
+    df = pd.DataFrame(rows, columns=cols)
+    ds = wdx._build_ragged(df, _meta(), service="daily", series_meta={})
+
+    assert ds.sizes == {"obs": 6, "timeseries": 3}
+    assert int(ds["row_size"].sum()) == ds.sizes["obs"]
+    blocks = _instance_blocks(ds)
+    # each instance's values + unit are the ones that belong to it, in TIME order
+    # (not value order: 150 precedes 100, and 19 precedes 18)
+    assert blocks[("00060", "00003")] == ([150, 100], "ft^3/s")
+    assert blocks[("00060", "00001")] == ([480, 500], "ft^3/s")
+    assert blocks[("00010", "00011")] == ([19, 18], "degC")
+
+
+def test_ragged_field_schema_without_statistic():
+    # The field-measurements schema groups by parameter_code only (no
+    # statistic_id): the instance has no statistic segment, and the service
+    # default cell method fills in.
+    df = pd.DataFrame(
+        {
+            "monitoring_location_id": ["USGS-1", "USGS-1"],
+            "parameter_code": ["00060", "00060"],
+            "time": ["2024-06-01", "2024-06-02"],
+            "value": [100, 110],
+            "unit_of_measure": ["ft^3/s", "ft^3/s"],
+        }
+    )
+    ds = wdx._build_ragged(
+        df,
+        _meta(),
+        service="field-measurements",
+        schema=wdx._FIELD,
+        series_meta=_DISCHARGE_META,
+        default_cell_method="point",
+    )
+    assert ds.sizes == {"obs": 2, "timeseries": 1}
+    assert "statistic_id" not in ds.coords
+    assert ds["timeseries_id"].values[0] == "USGS-1:00060"  # no stat segment
+    assert ds["value"].attrs["cell_methods"] == "time: point"
+    assert ds["value"].attrs["long_name"] == "Discharge, cubic feet per second"
+
+
+# --- dense opt-out wiring ---------------------------------------------------
+
+
+def test_wrapper_defaults_to_ragged_and_dense_opt_out(monkeypatch):
+    # Default wrapper output is ragged; dense=True returns the gridded layout.
+    monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_daily_frame(), _meta()))
+    monkeypatch.setattr(wdx._TS_CACHE, "lookup", lambda site_ids: (_DISCHARGE_META, {}))
+
+    ragged = wdx.get_daily(monitoring_location_id="USGS-1")
+    assert "obs" in ragged.sizes and "value" in ragged.data_vars
+    assert "discharge" not in ragged.data_vars
+
+    dense = wdx.get_daily(monitoring_location_id="USGS-1", dense=True)
+    assert "discharge" in dense.data_vars
+    assert "obs" not in dense.sizes
+
+
+def test_series_wrapper_signature_advertises_dense_and_dataset_return():
+    # functools.wraps would otherwise carry over the getter's DataFrame
+    # signature and hide ``dense``; _make_getter publishes an accurate one.
+    import inspect
+
+    sig = inspect.signature(wdx.get_daily)
+    assert "monitoring_location_id" in sig.parameters  # real args stay visible
+    dense = sig.parameters["dense"]
+    assert dense.kind is inspect.Parameter.KEYWORD_ONLY
+    assert dense.default is False
+    assert "Dataset" in str(sig.return_annotation)
+    assert wdx.get_daily.__annotations__["return"] == "xarray.Dataset"
+
+
+def test_no_dense_wrappers_omit_dense_from_signature():
+    # stats + samples have no dense layout, so ``dense`` must not be advertised
+    # (the wrapper still accept-and-ignores it; see the dense-warning tests).
+    import inspect
+
+    for name in ("get_stats_por", "get_samples"):
+        sig = inspect.signature(getattr(wdx, name))
+        assert "dense" not in sig.parameters, name
+        assert "Dataset" in str(sig.return_annotation), name
+
+
+# --- water-quality samples --------------------------------------------------
+
+
+def _samples_frame(characteristics=("Temperature, water",), units=("deg C",)):
+    rows = []
+    for ch, u in zip(characteristics, units):
+        rows.append(
+            {
+                "Location_Identifier": "USGS-1",
+                "Activity_StartDateTime": "2020-07-10T12:00:00Z",
+                "Result_Characteristic": ch,
+                "Result_SampleFraction": "Total",
+                "Result_Measure": 12.5,
+                "Result_MeasureUnit": u,
+                "Result_ResultDetectionCondition": None,
+                "Result_MeasureStatusIdentifier": "Provisional",
+            }
+        )
+    return pd.DataFrame(rows)
+
+
+def _samples_ds(frame):
+    """Build a samples ragged Dataset the way the get_samples service does."""
+    return wdx._build_ragged(
+        frame,
+        _meta(),
+        service="samples",
+        schema=wdx._SAMPLES,
+        default_cell_method="point",
+    )
+
+
+def test_build_samples_single_characteristic():
+    ds = _samples_ds(_samples_frame())
+    assert set(ds.sizes) == {"obs", "timeseries"}
+    assert ds["value"].dims == ("obs",)
+    assert ds["characteristic"].values[0] == "Temperature, water"
+    assert ds["value"].attrs["long_name"] == "Temperature, water"
+    assert ds["value"].attrs["cell_methods"] == "time: point"
+    # censoring columns are ancillary, linked from value
+    assert "detection_condition" in ds.variables and "status" in ds.variables
+    assert ds["value"].attrs["ancillary_variables"] == "detection_condition status"
+    assert ds["timeseries_id"].attrs["cf_role"] == "timeseries_id"
+
+
+def test_build_samples_mixed_characteristics_generic_value():
+    ds = _samples_ds(
+        _samples_frame(("Temperature, water", "pH"), ("deg C", "std units"))
+    )
+    assert ds.sizes["timeseries"] == 2
+    assert ds["value"].attrs["long_name"] == "measured value"
+    assert set(ds["characteristic"].values) == {"Temperature, water", "pH"}
+
+
+def test_get_samples_wrapper_builds_ragged(monkeypatch):
+    monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_samples_frame(), _meta()))
+    ds = wdx.get_samples(monitoringLocationIdentifier="USGS-1", include_hash=True)
+    assert "obs" in ds.sizes and "value" in ds.data_vars
+    assert ds["characteristic"].values[0] == "Temperature, water"
+
+
+def test_prepare_values_missing_required_column_returns_empty():
+    # A frame lacking a mandatory column (here: no value) must degrade to an
+    # empty Dataset, not raise KeyError from the column slim.
+    df = _daily_frame().drop(columns=["value"])
+    ds = wdx._build_ragged(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+    assert isinstance(ds, xr.Dataset)
+    assert list(ds.data_vars) == []
+    assert ds.attrs["Conventions"] == "CF-1.11"
+
+
+def test_build_samples_missing_value_returns_empty():
+    # A non-result Samples profile (no Result_Measure -> no "value") must not
+    # crash; it has nothing to convert.
+    df = pd.DataFrame(
+        {
+            "Location_Identifier": ["USGS-1"],
+            "Activity_StartDateTime": ["2020-07-10T12:00:00Z"],
+            "Result_Characteristic": ["pH"],
+        }
+    )
+    ds = _samples_ds(df)
+    assert list(ds.data_vars) == []
+    assert ds.attrs["Conventions"] == "CF-1.11"
+
+
+def test_get_samples_ignores_dense_with_warning(monkeypatch):
+    # dense=True is advertised generically; get_samples is always ragged, so it
+    # must accept-and-ignore (with a warning) rather than leak dense= to the
+    # underlying getter (which would TypeError).
+    monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_samples_frame(), _meta()))
+    with pytest.warns(UserWarning, match="no dense layout"):
+        ds = wdx.get_samples(monitoringLocationIdentifier="USGS-1", dense=True)
+    assert "obs" in ds.sizes  # still ragged
+
+
+def test_get_stats_ignores_dense_with_warning(monkeypatch):
+    # The stats layout has no dense grid either; dense=True is ignored with the
+    # same warning (consistent with samples), not silently swallowed.
+    frame = pd.DataFrame({"monitoring_location_id": ["USGS-1"], "p50_va": [120.0]})
+    monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (frame, _meta()))
+    with pytest.warns(UserWarning, match="no dense layout"):
+        ds = wdx.get_stats_por(monitoring_location_id="USGS-1", dense=True)
+    assert isinstance(ds, xr.Dataset)
+
+
+def test_timeseries_id_skips_missing_instance_key():
+    # A NaN instance key (e.g. a characteristic with no sample fraction) must
+    # not render as a literal "nan" token in the cf_role timeseries_id.
+    df = _samples_frame()
+    df["Result_SampleFraction"] = None
+    ds = _samples_ds(df)
+    assert all("nan" not in tid for tid in ds["timeseries_id"].values)
+    assert ds["timeseries_id"].values[0] == "USGS-1:Temperature, water"
+
+
+# --- metadata-lookup resilience + cache control -----------------------------
+
+
+def test_metadata_lookup_failure_degrades_with_warning():
+    # The metadata lookup is supplementary (only the readable name + site
+    # descriptors); a fetch failure must warn and return empty rather than
+    # discard already-fetched data -- and must NOT cache the failure, so a
+    # later recovered call retries.
+    def boom(monitoring_location_id):
+        raise RuntimeError("network down")
+
+    cache = wdx._MetadataCache(boom)
+    with pytest.warns(UserWarning, match="metadata lookup failed"):
+        param_meta, site_meta = cache.lookup(["USGS-1"])
+    assert param_meta == {} and site_meta == {}
+    assert len(cache) == 0  # failure not cached -> retryable
+
+
+def test_wrapper_survives_metadata_failure(monkeypatch):
+    # End to end: a failing metadata endpoint still yields a CF dataset built
+    # from the data; only the metadata-sourced long_name is missing.
+    monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_daily_frame(), _meta()))
+
+    def boom(monitoring_location_id):
+        raise RuntimeError("network down")
+
+    monkeypatch.setattr(wdx._TS_CACHE, "_getter", boom)
+    wdx._TS_CACHE.clear()
+    with pytest.warns(UserWarning, match="metadata lookup failed"):
+        ds = wdx.get_daily(monitoring_location_id="USGS-1")
+    assert "value" in ds.data_vars  # observations survived
+    # standard_name/units come from the frame; long_name (metadata) is absent
+    assert "long_name" not in ds["value"].attrs
+    assert ds["value"].attrs["units"] == "ft3 s-1"
+
+
+def test_metadata_cache_bounded_and_clearable():
+    # The cache is bounded (FIFO eviction) so a long-running, many-site
+    # process can't grow it without limit.
+    def fake(monitoring_location_id):
+        rows = [
+            {
+                "monitoring_location_id": s,
+                "parameter_code": "00060",
+                "parameter_name": f"name-{s}",
+            }
+            for s in monitoring_location_id
+        ]
+        return pd.DataFrame(rows), SimpleNamespace(url=None)
+
+    cache = wdx._MetadataCache(fake, maxsize=3)
+    for i in range(10):
+        cache.lookup([f"S{i}"])
+    assert len(cache) <= 3
+
+    cache.lookup(["S99"])
+    assert len(cache) > 0
+    cache.clear()  # instance opt-out empties it
+    assert len(cache) == 0
+
+    # the public helper clears the real per-process caches
+    wdx._TS_CACHE._entries["X"] = {"params": {}, "site": {}}
+    wdx._FIELD_CACHE._entries["Y"] = {"params": {}, "site": {}}
+    wdx.clear_metadata_cache()
+    assert len(wdx._TS_CACHE) == 0 and len(wdx._FIELD_CACHE) == 0

From de3239f4064906acef5d590a6677f3a43542e6bc Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Fri, 5 Jun 2026 08:34:32 -0500
Subject: [PATCH 2/7] fix(waterdata.xarray): default to the dense grid and
 harden edge cases

Make the (monitoring_location_id, time) dense grid the default layout
(pass dense=False for the contiguous ragged array), add select_series()
and clear_metadata_cache(), and fix defects found in review:

- NaN-truthiness: a present-but-null parameter_name/description no longer
  masks the fallback (was naming a variable "nan" / dropping long_name)
- dense (site, time) collisions dedup deterministically (smallest value)
  rather than keeping an arbitrary upstream-order row
- partial point geometry keeps a numeric NaN-filled lon/lat coordinate
  instead of a CF-invalid object array
- list-valued flag columns (qualifier) are flattened to strings so the
  datasets serialize to netCDF
- omit CF featureType on the preliminary flat stats layout

Rewrite the demo notebook around the dense default and refresh the README.
56 offline tests pass; demo executes clean against the live API.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 README.md                         |  26 ++++
 dataretrieval/waterdata/xarray.py | 249 ++++++++++++++++++++++--------
 demos/waterdata_xarray_demo.ipynb | 244 +++++++++++++----------------
 tests/waterdata_xarray_test.py    | 181 ++++++++++++++++++++--
 4 files changed, 488 insertions(+), 212 deletions(-)

diff --git a/README.md b/README.md
index e56d31c2..c149e29e 100644
--- a/README.md
+++ b/README.md
@@ -118,6 +118,32 @@ import logging
 logging.basicConfig(level=logging.DEBUG)
 ```
 
+#### Returning xarray Datasets
+
+The Water Data time-series, peaks, field-measurement, statistics, and samples
+getters have [xarray](https://docs.xarray.dev/) counterparts in
+`dataretrieval.waterdata.xarray` that return a CF-conventions
+`xarray.Dataset` instead of a DataFrame — ready to write to netCDF or hand to
+the CF-aware scientific Python stack. Install the optional dependency with
+`pip install dataretrieval[xarray]`:
+
+```python
+from dataretrieval.waterdata import xarray as wdx
+
+# Same arguments as waterdata.get_daily, but returns a CF Dataset
+ds = wdx.get_daily(
+    monitoring_location_id='USGS-01646500',
+    parameter_code='00060',  # Discharge
+    time='2024-10-01/2025-09-30',
+)
+```
+
+See the
+[xarray demo notebook](https://github.com/DOI-USGS/dataretrieval-python/blob/main/demos/waterdata_xarray_demo.ipynb)
+for a walkthrough of the default dense `(site, time)` grid and the
+`dense=False` contiguous-ragged layout for large multi-site pulls, the CF
+metadata, and writing to netCDF.
+
 ### Water Quality Portal (WQP)
 
 Access water quality data from multiple agencies:
diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
index 0eef2c2e..b15912c0 100644
--- a/dataretrieval/waterdata/xarray.py
+++ b/dataretrieval/waterdata/xarray.py
@@ -4,24 +4,25 @@
 :mod:`dataretrieval.waterdata`, but returns a CF-conventions
 :class:`xarray.Dataset` instead of a :class:`pandas.DataFrame`.
 
-By default the data is returned as a CF *contiguous ragged array*
-(``featureType = "timeSeries"``): every observation is concatenated along a
-single ``obs`` dimension, and each (monitoring location, parameter,
-statistic) series is one instance along a ``timeseries`` dimension whose
-``row_size`` records how many observations it contributes. This stores only
-real observations -- no NaN fill -- so it scales to large, very ragged
-multi-site pulls where record lengths differ by decades. Parameter,
-statistic, unit, and location identity become per-instance metadata, and
-``time`` is a 1-D coordinate along ``obs``. The trade-off is that ``time``
-is no longer a dimension you can index directly: to select by time you
-first regroup the flat ``obs`` back into per-series views (e.g. with
-``cf-xarray``, or via the offsets implied by ``row_size``).
-
-Pass ``dense=True`` for the alternative gridded layout: one named data
-variable per parameter on a ``(monitoring_location_id, time)`` grid, NaN
-where a series has no observation. This is ergonomic for a few overlapping
-series (``ds["discharge"].sel(time=...)`` just works) but the union time
-axis and NaN fill make it memory-costly for large ragged collections.
+By default the data is returned on a CF ``(monitoring_location_id, time)``
+grid (``featureType = "timeSeries"``): one named data variable per parameter
+(``discharge``, ``temperature_water``, ...), NaN where a series has no
+observation. This is the ergonomic layout -- ``ds["discharge"].sel(
+monitoring_location_id=..., time=...)`` just works -- and
+``monitoring_location_id`` is the instance dimension carrying
+``cf_role = "timeseries_id"``. The cost is the union time axis and NaN fill,
+which grow for large, very ragged multi-site collections.
+
+Pass ``dense=False`` for the alternative CF *contiguous ragged array*: every
+observation is concatenated along a single ``obs`` dimension, and each
+(monitoring location, parameter, statistic) series is one instance along a
+``timeseries`` dimension whose ``row_size`` records how many observations it
+contributes. This stores only real observations -- no NaN fill -- so it
+scales to large multi-site pulls where record lengths differ by decades.
+Parameter, statistic, unit, and location identity become per-instance
+metadata and ``time`` is a 1-D coordinate along ``obs``, so to select one
+series use :func:`select_series` (or regroup the flat ``obs`` via the offsets
+implied by ``row_size``, e.g. with ``cf-xarray``).
 
 Either way the CF metadata is derived from columns the getter already
 returns (``unit_of_measure`` -> ``units``, ``statistic_id`` ->
@@ -60,6 +61,7 @@
 from dataclasses import dataclass, field, replace
 from functools import wraps as _wraps
 
+import numpy as _np
 import pandas as _pd
 
 try:
@@ -81,6 +83,7 @@
 
 __all__ = [
     "clear_metadata_cache",
+    "select_series",
     "get_continuous",
     "get_daily",
     "get_field_measurements",
@@ -139,15 +142,12 @@ def _slug(name) -> str:
     return s or "value"
 
 
-def _first(series):
-    """First non-null value of a column, or None."""
-    nonnull = series.dropna()
-    return nonnull.iloc[0] if len(nonnull) else None
-
-
 def _first_present(frame, col):
     """First non-null value of ``col`` if the column is present, else None."""
-    return _first(frame[col]) if col in frame else None
+    if col not in frame:
+        return None
+    nonnull = frame[col].dropna()
+    return nonnull.iloc[0] if len(nonnull) else None
 
 
 def _unique_present(frame, col):
@@ -155,6 +155,30 @@ def _unique_present(frame, col):
     return frame[col].dropna().unique() if col in frame else []
 
 
+def _none_if_nan(value):
+    """Collapse a pandas NaN to None.
+
+    A NaN is *truthy*, so ``meta.get(col) or fallback`` keeps the NaN instead of
+    falling back. Metadata frames yield NaN for a column that is present but null
+    for a given parameter, so normalize it to None before any such ``or`` chain.
+    """
+    return value if _pd.notna(value) else None
+
+
+def _scalarize(cell):
+    """Collapse a list-valued flag cell into a single string; pass scalars through.
+
+    The Water Data API returns some ancillary (flag) columns -- notably
+    ``qualifier`` -- as a *list* of codes per observation. xarray/netCDF cannot
+    encode an object array whose elements are lists, so join the codes into one
+    space-separated string (an empty list becomes None).
+    """
+    if isinstance(cell, (list, tuple)):
+        parts = [str(v) for v in cell if v is not None and _pd.notna(v)]
+        return " ".join(parts) if parts else None
+    return cell
+
+
 def _sites(df):
     """Unique monitoring-location ids present in a values frame."""
     if _SITE in df:
@@ -203,10 +227,10 @@ def _point_coords(df, site):
     if geo.empty:
         return None
     lon, lat = {}, {}
-    for _, row in geo.iterrows():
-        xy = _lonlat(row["geometry"])
+    for site_id, geom in zip(geo[site].to_numpy(), geo["geometry"].to_numpy()):
+        xy = _lonlat(geom)
         if xy is not None:
-            lon[row[site]], lat[row[site]] = xy
+            lon[site_id], lat[site_id] = xy
     if not lon:
         return None  # no point geometry; skip rather than guess
     return lon, lat
@@ -241,6 +265,11 @@ def _prepare_values(df, group_cols, ancillary_cols):
         work["time"], format="ISO8601", errors="coerce", utc=True
     ).dt.tz_localize(None)
     work["value"] = _pd.to_numeric(work["value"], errors="coerce")
+    # Flatten any list-valued flag cells (e.g. ``qualifier``) to strings so the
+    # ancillary variables stay netCDF-encodable (``_scalarize`` passes scalars
+    # through unchanged).
+    for c in ancillary:
+        work[c] = work[c].map(_scalarize)
     n_before = len(work)
     work = work[work["time"].notna()]
     dropped = n_before - len(work)
@@ -266,8 +295,10 @@ def _var_attrs(desc, *, unit, pcode, stat, default_cell_method):
     names follow the (layout-specific) data-variable name.
     """
     attrs: dict[str, str] = {}
-    long_name = desc.get("parameter_description") or desc.get("parameter_name")
-    if long_name and _pd.notna(long_name):
+    long_name = _none_if_nan(desc.get("parameter_description")) or _none_if_nan(
+        desc.get("parameter_name")
+    )
+    if long_name:
         attrs["long_name"] = str(long_name)
 
     if unit is not None and _pd.notna(unit):
@@ -294,28 +325,36 @@ def _var_attrs(desc, *, unit, pcode, stat, default_cell_method):
     return attrs
 
 
-def _dataset_attrs(service, base_meta):
-    """Dataset-level provenance (CF + ACDD) attributes."""
+def _dataset_attrs(service, base_meta, *, feature_type):
+    """Dataset-level provenance (CF + ACDD) attributes.
+
+    ``feature_type`` is the CF discrete-sampling ``featureType`` to advertise, or
+    None to omit it (sourced from the builder's ``feature_type`` class attr).
+    The series layouts are genuine ``timeSeries`` geometries; the preliminary
+    flat stats table is not a DSG at all, so it passes None rather than mislabel
+    itself.
+    """
     attrs = {
         "Conventions": "CF-1.11",
         "institution": "U.S. Geological Survey",
         "source": f"USGS Water Data API ({service})",
-        "featureType": "timeSeries",
         "history": (
             f"{_dt.datetime.now(_dt.timezone.utc).isoformat(timespec='seconds')} "
             "created by dataretrieval.waterdata.xarray"
         ),
     }
+    if feature_type:
+        attrs["featureType"] = feature_type
     url = getattr(base_meta, "url", None)
     if url:
         attrs["references"] = str(url)
     return attrs
 
 
-def _empty_dataset(service, base_meta):
+def _empty_dataset(service, base_meta, *, feature_type):
     """An attribute-only Dataset for an empty / unconvertible response."""
     ds = _xr.Dataset()
-    ds.attrs = _dataset_attrs(service, base_meta)
+    ds.attrs = _dataset_attrs(service, base_meta, feature_type=feature_type)
     return ds
 
 
@@ -434,6 +473,63 @@ def clear_metadata_cache():
     _FIELD_CACHE.clear()
 
 
+def select_series(ds, **keys):
+    """Extract one time series from a ragged-layout Dataset by label.
+
+    The ragged layout (``dense=False``) stores each series as a contiguous block
+    along ``obs`` located by ``row_size``, so ``time`` is a flat ``obs`` axis and
+    a single series is addressed by its per-series coordinates --
+    ``parameter_code``, ``monitoring_location_id``, ``statistic_id`` (or, for
+    samples, ``characteristic`` / ``sample_fraction``) -- rather than a named
+    variable::
+
+        ds = wdx.get_daily(
+            monitoring_location_id=[...], parameter_code=[...], dense=False
+        )
+        q = select_series(
+            ds, monitoring_location_id="USGS-05407000", parameter_code="00060"
+        )
+        q["value"].sel(time="2023-07")  # time is now a real dimension
+
+    This is the ragged-layout counterpart to ``ds[name].sel(...)`` on the default
+    dense Dataset. It selects the one matching instance and returns its
+    observations as a 1-D, time-indexed ``xarray.Dataset`` (``value`` plus any
+    ancillary flag variables), with the series' identity carried as scalar
+    coordinates. Raises ``KeyError`` if no instance matches and ``ValueError``
+    if more than one does (add keys to disambiguate).
+    """
+    if "row_size" not in ds.variables or "obs" not in ds.dims:
+        raise ValueError(
+            "select_series expects a ragged Dataset (from dense=False); the "
+            "default dense Dataset already exposes one variable per parameter, "
+            "so select by name instead, e.g. "
+            "ds[variable].sel(monitoring_location_id=...)."
+        )
+    inst_coords = [c for c in ds.coords if ds[c].dims == ("timeseries",)]
+    mask = _np.ones(ds.sizes["timeseries"], dtype=bool)
+    for key, value in keys.items():
+        if key not in inst_coords:
+            raise KeyError(
+                f"{key!r} is not a per-series coordinate; choose from {inst_coords}."
+            )
+        mask &= ds[key].to_numpy() == value
+    matches = _np.flatnonzero(mask)
+    if matches.size == 0:
+        raise KeyError(f"no time series matches {keys}.")
+    if matches.size > 1:
+        raise ValueError(
+            f"{matches.size} time series match {keys}; add more of "
+            f"{inst_coords} to select exactly one."
+        )
+    i = int(matches[0])
+    # Slice this instance's contiguous obs block (the CF sample_dimension link),
+    # then promote time from an obs coordinate to the dimension.
+    starts = _np.concatenate([[0], _np.cumsum(ds["row_size"].to_numpy())])
+    block = slice(int(starts[i]), int(starts[i + 1]))
+    series = ds.isel(timeseries=i, obs=block).swap_dims(obs="time")
+    return series.drop_vars("row_size", errors="ignore")
+
+
 # === column schemas ========================================================
 
 # Water-quality samples (Samples DB / WQX) speak a different column vocabulary
@@ -516,6 +612,10 @@ class _DatasetBuilder:
     :func:`_build_stats` helpers rather than directly.
     """
 
+    # The CF ``featureType`` this layout produces (None to omit it). The series
+    # layouts are timeSeries DSGs; the flat stats table overrides this to None.
+    feature_type = "timeSeries"
+
     def __init__(self, df, base_meta, *, service):
         self.df = df
         self.base_meta = base_meta
@@ -528,11 +628,15 @@ def _is_empty(self):
         return self.df is None or len(self.df) == 0
 
     def _empty(self):
-        return _empty_dataset(self.service, self.base_meta)
+        return _empty_dataset(
+            self.service, self.base_meta, feature_type=self.feature_type
+        )
 
     def _apply_provenance(self, ds):
         """Set the dataset-level CF/ACDD attributes and ``date_modified``."""
-        ds.attrs = _dataset_attrs(self.service, self.base_meta)
+        ds.attrs = _dataset_attrs(
+            self.service, self.base_meta, feature_type=self.feature_type
+        )
         dm = _date_modified(self.df)
         if dm:
             ds.attrs["date_modified"] = dm
@@ -593,9 +697,12 @@ def _add_spatial_coords(self, ds, dim, order):
         coords = _point_coords(self.df, _SITE)
         if coords is not None:
             lon, lat = coords
+            # Fill sites lacking point geometry with NaN (not None) so the
+            # coordinate stays a numeric float array -- a CF longitude/latitude
+            # coord must be numeric; an object array with None is CF-invalid.
             ds = ds.assign_coords(
-                longitude=(dim, [lon.get(k) for k in order]),
-                latitude=(dim, [lat.get(k) for k in order]),
+                longitude=(dim, [lon.get(k, _np.nan) for k in order]),
+                latitude=(dim, [lat.get(k, _np.nan) for k in order]),
             )
             ds["longitude"].attrs = {
                 "standard_name": "longitude",
@@ -617,7 +724,7 @@ def _add_spatial_coords(self, ds, dim, order):
 
 
 class _RaggedBuilder(_SeriesBuilder):
-    """CF *contiguous ragged array* layout (the default).
+    """CF *contiguous ragged array* layout (the ``dense=False`` opt-in).
 
     All observations are concatenated into one ``obs`` dimension; each series (a
     ``schema.group_cols`` combination at a site) is an instance along
@@ -734,10 +841,15 @@ def _finalize(self, ds, inst_frame, ancillary, value_attrs):
         }
         # Join the instance keys into a cf_role id, skipping null parts so a
         # missing key (e.g. a characteristic with no sample fraction) doesn't
-        # render as a literal "nan" token.
-        ts_id = inst_frame.apply(
-            lambda r: ":".join(str(x) for x in r if _pd.notna(x)), axis=1
-        ).to_numpy()
+        # render as a literal "nan" token. Iterating the row arrays avoids the
+        # per-instance Series boxing of ``apply(axis=1)``.
+        ts_id = _np.array(
+            [
+                ":".join(str(x) for x in row if _pd.notna(x))
+                for row in inst_frame.to_numpy()
+            ],
+            dtype=object,
+        )
         ds = ds.assign_coords(timeseries_id=("timeseries", ts_id))
         ds["timeseries_id"].attrs["cf_role"] = "timeseries_id"
         ds[_SITE].attrs.setdefault("long_name", "monitoring location identifier")
@@ -745,10 +857,10 @@ def _finalize(self, ds, inst_frame, ancillary, value_attrs):
 
 
 class _DenseBuilder(_SeriesBuilder):
-    """Dense ``(monitoring_location_id, time)`` grid: one named variable per
-    parameter, NaN where a series has no observation. Ergonomic for a few
-    overlapping series but memory-costly for large ragged collections; see
-    :class:`_RaggedBuilder` for the default.
+    """Dense ``(monitoring_location_id, time)`` grid (the default): one named
+    variable per parameter, NaN where a series has no observation. Ergonomic for
+    a few overlapping series but memory-costly for large ragged collections; see
+    :class:`_RaggedBuilder` for the ``dense=False`` scaling layout.
     """
 
     def _build_series(self, work, group_cols, ancillary, has_unit):
@@ -793,10 +905,13 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit):
             if not sub.index.is_unique:
                 _warnings.warn(
                     f"'{name}' has multiple values per (site, time) -- two series "
-                    "share this (site, parameter, statistic); keeping the first. "
-                    "Filter the query to separate them.",
+                    "share this (site, parameter, statistic); keeping the smallest "
+                    "value. Filter the query to separate them.",
                     stacklevel=3,
                 )
+                # Sort by value first so the retained value is deterministic
+                # rather than dependent on the upstream response row order.
+                sub = sub.sort_values("value")
                 sub = sub[~sub.index.duplicated(keep="first")]
             ds_g = sub.to_xarray().rename(
                 {"value": name, **{c: f"{name}_{c}" for c in ancillary}}
@@ -818,7 +933,7 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit):
     @staticmethod
     def _variable_name(desc, pcode, stat, used):
         """A unique slug for a variable; disambiguate same-parameter series."""
-        name = _slug(desc.get("parameter_name") or pcode or "value")
+        name = _slug(_none_if_nan(desc.get("parameter_name")) or pcode or "value")
         if name in used:  # same parameter, different statistic -> distinct var
             op = CF_CELL_METHODS.get(str(stat)) or (str(stat) if stat else None)
             name = f"{name}_{_slug(op)}" if op else name
@@ -836,6 +951,10 @@ class _StatsBuilder(_DatasetBuilder):
     provenance only. A richer percentile / day-of-year layout is future work.
     """
 
+    # Not a CF discrete-sampling geometry: a flat percentile table has no
+    # obs/time/cf_role/sample_dimension, so it must not claim a ``featureType``.
+    feature_type = None
+
     def build(self):
         if self._is_empty():
             return self._empty()
@@ -896,9 +1015,10 @@ def _xr_doc(func, *, cf_metadata=True, allow_dense=True):
         )
     elif allow_dense:
         returns = (
-            "a CF-conventions ``xarray.Dataset`` as a contiguous ragged array "
-            "(pass ``dense=True`` for the NaN-filled (site, time) grid with one "
-            "named variable per parameter)"
+            "a CF-conventions ``xarray.Dataset`` on a (site, time) grid with one "
+            "named variable per parameter (pass ``dense=False`` for the "
+            "contiguous ragged array that stores only real observations and "
+            "scales to large, very ragged multi-site pulls)"
         )
     else:
         returns = (
@@ -924,9 +1044,10 @@ def _public_signature(wrapped, *, has_dense):
     """The signature ``help()`` / IDEs should show for a wrapper.
 
     Keeps the wrapped getter's own parameters (so its real arguments stay
-    discoverable), adds the keyword-only ``dense`` toggle where the layout
-    offers it, and declares the ``xarray.Dataset`` return -- correcting the
-    DataFrame signature that ``functools.wraps`` would otherwise carry over.
+    discoverable), adds the keyword-only ``dense`` toggle (default ``True``;
+    pass ``dense=False`` for the ragged layout) where the layout offers it, and
+    declares the ``xarray.Dataset`` return -- correcting the DataFrame signature
+    that ``functools.wraps`` would otherwise carry over.
     """
     sig = _inspect.signature(wrapped)
     params = list(sig.parameters.values())
@@ -944,7 +1065,7 @@ def _public_signature(wrapped, *, has_dense):
             _inspect.Parameter(
                 "dense",
                 _inspect.Parameter.KEYWORD_ONLY,
-                default=False,
+                default=True,
                 annotation="bool",
             ),
         )
@@ -974,10 +1095,14 @@ class _Service:
 def _make_getter(spec):
     """Build the public getter for one ``_Service`` spec (Factory)."""
 
-    has_dense = spec.layout != "stats" and spec.allow_dense
+    is_stats = spec.layout == "stats"
+    has_dense = not is_stats and spec.allow_dense
 
+    # The (site, time) grid is the default where it is offered; ``dense=False``
+    # opts into the ragged array. Services without a grid (samples, stats)
+    # default to their only layout, so a plain call never trips the warning.
     @_wraps(spec.getter)  # carry __name__/__wrapped__ through to help()/IDEs
-    def getter(*args, dense=False, **kwargs):
+    def getter(*args, dense=has_dense, **kwargs):
         if dense and not has_dense:
             _warnings.warn(
                 f"{spec.getter.__name__} has no dense layout; ignoring dense=True.",
@@ -985,7 +1110,7 @@ def getter(*args, dense=False, **kwargs):
             )
             dense = False
         df, base_meta = _fetch(spec.getter, args, kwargs)
-        if spec.layout == "stats":
+        if is_stats:
             return _build_stats(df, base_meta, spec.service)
         if spec.metadata_cache is not None:
             series_meta, site_meta = spec.metadata_cache.lookup(_sites(df))
@@ -1009,7 +1134,7 @@ def getter(*args, dense=False, **kwargs):
     # returning an ``xarray.Dataset``.
     getter.__doc__ = _xr_doc(
         spec.getter,
-        cf_metadata=(spec.layout != "stats"),
+        cf_metadata=not is_stats,
         allow_dense=spec.allow_dense,
     )
     getter.__signature__ = _public_signature(spec.getter, has_dense=has_dense)
diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb
index 3aa28246..55e9fcea 100644
--- a/demos/waterdata_xarray_demo.ipynb
+++ b/demos/waterdata_xarray_demo.ipynb
@@ -14,12 +14,17 @@
     "written onto the dataset as CF attributes, so the result is self-describing and\n",
     "ready to write to netCDF.\n",
     "\n",
+    "By **default** the data is returned on a `(monitoring_location_id, time)` grid —\n",
+    "one named variable per parameter, so `ds[\"discharge\"].sel(time=...)` just works.\n",
+    "For large, very ragged multi-site pulls, pass `dense=False` to get the compact CF\n",
+    "*contiguous ragged array* instead.\n",
+    "\n",
     "This notebook covers:\n",
     "\n",
-    "1. the default **ragged** layout and how to read it;\n",
-    "2. **why** ragged is the default (and the `dense=True` opt-out);\n",
-    "3. the one trade-off you need to know — **selecting by time**;\n",
-    "4. multiple parameters in one pull;\n",
+    "1. a single time series (the default dense grid);\n",
+    "2. multiple parameters and sites on one grid;\n",
+    "3. the **ragged** layout (`dense=False`) for large, very ragged pulls;\n",
+    "4. selecting one series from a ragged dataset;\n",
     "5. water-quality samples (`get_samples`);\n",
     "6. writing CF netCDF."
    ]
@@ -48,8 +53,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import numpy as np\n",
-    "\n",
     "from dataretrieval.waterdata import xarray as wdx"
    ]
   },
@@ -60,7 +63,8 @@
     "## 1. A single time series\n",
     "\n",
     "Pull a year of daily mean discharge (parameter `00060`, statistic `00003`) at one\n",
-    "gage. The wrapper takes the same arguments as `waterdata.get_daily`."
+    "gage. The wrapper takes the same arguments as `waterdata.get_daily` and returns a\n",
+    "CF dataset:"
    ]
   },
   {
@@ -71,8 +75,8 @@
    "source": [
     "ds = wdx.get_daily(\n",
     "    monitoring_location_id=\"USGS-05407000\",\n",
-    "    parameter_code=\"00060\",\n",
-    "    statistic_id=\"00003\",\n",
+    "    parameter_code=\"00060\",  # discharge\n",
+    "    statistic_id=\"00003\",  # daily mean\n",
     "    time=\"2023-01-01/2024-01-01\",\n",
     ")\n",
     "ds"
@@ -82,19 +86,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Note the shape of the result — this is a CF **contiguous ragged array**\n",
+    "The result is a CF `(monitoring_location_id, time)` grid\n",
     "(`featureType = \"timeSeries\"`):\n",
     "\n",
-    "* every observation lives along a single `obs` dimension;\n",
-    "* each *series* — one `(monitoring location, parameter, statistic)` — is one\n",
-    "  instance along the `timeseries` dimension;\n",
-    "* `row_size` records how many observations each series contributes (the CF\n",
-    "  `sample_dimension` link), and `timeseries_id` carries `cf_role`;\n",
-    "* because there is a single, homogeneous parameter here, the descriptors land\n",
-    "  directly on `value` (`long_name`, `units`, `cell_methods`, `standard_name`).\n",
-    "\n",
-    "The flag columns (`qualifier`, `approval_status`) are linked as\n",
-    "`ancillary_variables`, and dataset attributes carry provenance:"
+    "* each parameter is its own **named variable** (`discharge`), so `time` is a real\n",
+    "  dimension you can index by label;\n",
+    "* the descriptors derived from the data land on the variable as CF attributes\n",
+    "  (`long_name`, `units`, `cell_methods`, `standard_name`, `usgs_parameter_code`);\n",
+    "* `monitoring_location_id` is the instance dimension and carries\n",
+    "  `cf_role = \"timeseries_id\"`; the flag columns (`qualifier`, `approval_status`)\n",
+    "  are linked as `ancillary_variables`;\n",
+    "* dataset attributes carry provenance (`Conventions`, `references`,\n",
+    "  `date_modified`)."
    ]
   },
   {
@@ -103,19 +106,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(ds[\"value\"].attrs)\n",
-    "print({k: ds.attrs[k] for k in (\"Conventions\", \"featureType\", \"references\", \"date_modified\")})"
+    "print(ds[\"discharge\"].attrs)\n",
+    "print({k: ds.attrs[k] for k in (\"Conventions\", \"featureType\", \"references\")})\n",
+    "\n",
+    "# time is a real dimension -> label-based selection just works\n",
+    "ds[\"discharge\"].sel(time=\"2023-07-01\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 2. Why ragged is the default\n",
+    "## 2. Multiple parameters and sites\n",
     "\n",
-    "Real collections are *ragged*: some gages have a century of record, others a few\n",
-    "years. Pull discharge at two gages with very different start dates and look at\n",
-    "`row_size`:"
+    "Ask for several parameters across several gages in one call. Each parameter\n",
+    "becomes its own named variable on a shared `(monitoring_location_id, time)`\n",
+    "grid, NaN where a series has no observation (e.g. a gage that does not report a\n",
+    "given parameter):"
    ]
   },
   {
@@ -124,27 +131,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sites = [\"USGS-05407000\", \"USGS-02238500\"]  # records since 1913 vs 2008\n",
-    "# Bound the window so the docs build stays light; the start-date gap still\n",
-    "# shows up as different row_size (~24 vs ~16 years of daily values).\n",
-    "ragged = wdx.get_daily(\n",
-    "    monitoring_location_id=sites,\n",
-    "    parameter_code=\"00060\",\n",
+    "grid = wdx.get_daily(\n",
+    "    monitoring_location_id=[\"USGS-05407000\", \"USGS-03339000\"],\n",
+    "    parameter_code=[\"00060\", \"00010\"],  # discharge + water temperature\n",
     "    statistic_id=\"00003\",\n",
-    "    time=\"2000-01-01/2024-01-01\",\n",
+    "    time=\"2023-01-01/2024-01-01\",\n",
     ")\n",
-    "print(\"dims      :\", dict(ragged.sizes))\n",
-    "print(\"row_size  :\", dict(zip(ragged[\"monitoring_location_id\"].values, ragged[\"row_size\"].values)))"
+    "print(\"dims      :\", dict(grid.sizes))\n",
+    "print(\"variables :\", list(grid.data_vars))\n",
+    "grid"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The alternative is a **dense** `(monitoring_location_id, time)` grid — one named\n",
-    "variable per parameter, NaN where a series has no observation. It is convenient\n",
-    "(see the next section) but pays for a union time axis and NaN fill. Pass\n",
-    "`dense=True` to get it, and compare the in-memory size:"
+    "Now any slice is a labelled `.sel(...)` — one site's series, or all sites on a\n",
+    "given day as a vector:"
    ]
   },
   {
@@ -153,38 +156,26 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dense = wdx.get_daily(\n",
-    "    monitoring_location_id=sites,\n",
-    "    parameter_code=\"00060\",\n",
-    "    statistic_id=\"00003\",\n",
-    "    time=\"2000-01-01/2024-01-01\",\n",
-    "    dense=True,\n",
-    ")\n",
-    "print(\"dense dims :\", dict(dense.sizes))\n",
-    "print(\"dense var  :\", list(dense.data_vars))\n",
-    "print(f\"ragged {ragged.nbytes/1e6:6.2f} MB   dense {dense.nbytes/1e6:6.2f} MB\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "With only two gages the gap is modest, but it grows fast: a whole state's daily\n",
-    "discharge can be 15–30% dense, where the gridded layout balloons to hundreds of\n",
-    "megabytes (mostly NaN) while the ragged array stores only real observations.\n",
-    "That is why ragged is the default."
+    "# one site's discharge series\n",
+    "q = grid[\"discharge\"].sel(monitoring_location_id=\"USGS-05407000\")\n",
+    "print(\"single series :\", dict(q.sizes))\n",
+    "\n",
+    "# every site's discharge on one day\n",
+    "grid[\"discharge\"].sel(time=\"2023-07-01\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 3. The trade-off: selecting by time\n",
+    "## 3. The ragged layout for large, very ragged pulls\n",
     "\n",
-    "This is the one thing to internalize about the ragged layout.\n",
+    "The dense grid is ergonomic, but it pays for a union time axis and NaN fill.\n",
+    "Real collections are *ragged*: some gages have a century of record, others a few\n",
+    "years. For large multi-site pulls, pass `dense=False` to get the CF **contiguous\n",
+    "ragged array**, which stores only real observations — no NaN fill.\n",
     "\n",
-    "In the **dense** dataset, `time` is a real dimension with an index, so\n",
-    "label-based selection just works:"
+    "Pull discharge at two gages with very different start dates:"
    ]
   },
   {
@@ -193,39 +184,31 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# all sites on a given day, addressed as a (site, time) grid\n",
-    "dense[\"discharge\"].sel(time=\"2020-06-01\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In the **ragged** dataset, `value` is one flat list along `obs`, and `time` is\n",
-    "*just another variable* riding along `obs` — not a dimension. So `value` cannot\n",
-    "be addressed as a `(site, time)` grid, and a time selection returns whatever\n",
-    "observations happen to match, **mixed across sites with no labels** (and\n",
-    "identical dates across sites collide onto the same axis):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"value dims:\", ragged[\"value\"].dims)  # ('obs',) -- not (site, time)\n",
-    "# .sel(time=...) on the flat obs axis can't give you a per-series slice\n",
-    "ragged[\"value\"].sel(time=\"2020-06-01\")"
+    "sites = [\"USGS-05407000\", \"USGS-02238500\"]  # records since 1913 vs 2008\n",
+    "# Bound the window so the docs build stays light; the start-date gap still\n",
+    "# shows up as different row_size.\n",
+    "ragged = wdx.get_daily(\n",
+    "    monitoring_location_id=sites,\n",
+    "    parameter_code=\"00060\",\n",
+    "    statistic_id=\"00003\",\n",
+    "    time=\"2000-01-01/2024-01-01\",\n",
+    "    dense=False,\n",
+    ")\n",
+    "print(\"dims     :\", dict(ragged.sizes))\n",
+    "print(\"row_size :\", dict(zip(\n",
+    "    ragged[\"monitoring_location_id\"].values, ragged[\"row_size\"].values)))\n",
+    "ragged"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To recover the convenient, time-indexed view you first **regroup** the flat\n",
-    "`obs` back into per-series pieces using the offsets implied by `row_size`. A\n",
-    "tiny helper:"
+    "In the ragged array every observation lives along a single `obs` dimension; each\n",
+    "series — one `(monitoring location, parameter, statistic)` — is one instance\n",
+    "along `timeseries`, and `row_size` records how many observations it contributes\n",
+    "(the CF `sample_dimension` link). Because it stores no NaN fill, it stays compact\n",
+    "as collections get ragged. Compare the in-memory size against the dense grid:"
    ]
   },
   {
@@ -234,42 +217,36 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def series(ds, i):\n",
-    "    \"\"\"Instance ``i`` of a ragged dataset as a time-indexed DataArray.\"\"\"\n",
-    "    starts = np.concatenate([[0], np.cumsum(ds[\"row_size\"].values)])\n",
-    "    sl = slice(int(starts[i]), int(starts[i + 1]))\n",
-    "    da = ds[\"value\"].isel(obs=sl)\n",
-    "    return da.assign_coords(time=ds[\"time\"].isel(obs=sl)).swap_dims(obs=\"time\")\n",
-    "\n",
-    "\n",
-    "s0 = series(ragged, 0)\n",
-    "print(ragged[\"timeseries_id\"].values[0])\n",
-    "s0.sel(time=\"2020-06-01\")  # now .sel(time=...) works on a single series"
+    "dense = wdx.get_daily(\n",
+    "    monitoring_location_id=sites,\n",
+    "    parameter_code=\"00060\",\n",
+    "    statistic_id=\"00003\",\n",
+    "    time=\"2000-01-01/2024-01-01\",\n",
+    ")  # default grid\n",
+    "print(f\"ragged {ragged.nbytes/1e6:6.2f} MB   dense {dense.nbytes/1e6:6.2f} MB\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For anything beyond a one-off slice, the [`cf-xarray`](https://cf-xarray.readthedocs.io/)\n",
-    "package understands the CF ragged encoding (`cf_role`, `sample_dimension`) and\n",
-    "will regroup/decode for you. Or, when you know the pull is small and overlapping,\n",
-    "just ask for `dense=True` up front and use `.sel(time=...)` directly.\n",
-    "\n",
-    "**Rule of thumb:** ragged for storage and large multi-site pulls; `dense=True`\n",
-    "for ergonomic time-based slicing of a few overlapping series."
+    "With two gages the gap is modest, but it grows fast: a whole state's daily\n",
+    "discharge spans gages with wildly different record lengths, where the gridded\n",
+    "layout balloons (mostly NaN) while the ragged array stores only real\n",
+    "observations. **Rule of thumb:** the default grid for ergonomic, label-based\n",
+    "slicing of a few series; `dense=False` for storage and large, very ragged\n",
+    "multi-site pulls."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 4. Multiple parameters in one pull\n",
+    "## 4. Selecting one series from a ragged dataset\n",
     "\n",
-    "When a pull mixes parameters (and units), the ragged layout keeps a single\n",
-    "`value` and records the parameter/unit per *instance* — so nothing is\n",
-    "mislabeled. The homogeneous descriptors drop off `value` and live on the\n",
-    "`parameter_code` / `unit_of_measure` coordinates instead:"
+    "In the ragged layout `time` is a flat `obs` axis, not a dimension, so you can't\n",
+    "`.sel(time=...)` directly. Use `wdx.select_series` to pull one series by its\n",
+    "labels; it returns a tidy, time-indexed dataset on which `.sel(time=...)` works:"
    ]
   },
   {
@@ -278,22 +255,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "multi = wdx.get_daily(\n",
-    "    monitoring_location_id=\"USGS-05407000\",\n",
-    "    parameter_code=[\"00060\", \"00045\"],  # discharge + precipitation\n",
-    "    time=\"2023-06-01/2023-07-01\",\n",
+    "one = wdx.select_series(\n",
+    "    ragged, monitoring_location_id=\"USGS-05407000\", parameter_code=\"00060\"\n",
     ")\n",
-    "print(\"value long_name:\", multi[\"value\"].attrs.get(\"long_name\"))\n",
-    "print(\"per-instance parameter_code:\", multi[\"parameter_code\"].values)\n",
-    "print(\"per-instance unit_of_measure:\", multi[\"unit_of_measure\"].values)"
+    "print(\"dims :\", dict(one.sizes))\n",
+    "one[\"value\"].sel(time=\"2020-07\").mean()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In the `dense=True` form each parameter instead becomes its own named variable\n",
-    "(`discharge`, `precipitation`, …) on the shared time grid."
+    "For richer work, the [`cf-xarray`](https://cf-xarray.readthedocs.io/) package\n",
+    "understands the CF ragged encoding (`cf_role`, `sample_dimension`) and will\n",
+    "regroup/decode the whole dataset for you."
    ]
   },
   {
@@ -302,12 +277,13 @@
    "source": [
     "## 5. Water-quality samples\n",
     "\n",
-    "`get_samples` returns discrete water-quality results in the same ragged shape:\n",
-    "one instance per `(monitoring location, characteristic, sample fraction)`, with\n",
-    "the result value plus `detection_condition` and `status` as ancillary\n",
-    "(censoring) variables. Characteristics are free text, so no CF `standard_name`\n",
-    "is inferred and non-numeric results coerce to NaN (the `detection_condition`\n",
-    "variable preserves non-detects)."
+    "`get_samples` returns discrete water-quality results. Samples are irregular in\n",
+    "time and characteristic, so this getter is always ragged: one instance per\n",
+    "`(monitoring location, characteristic, sample fraction)`, with the result value\n",
+    "plus `detection_condition` and `status` as ancillary (censoring) variables.\n",
+    "Characteristics are free text, so no CF `standard_name` is inferred and\n",
+    "non-numeric results coerce to NaN (the `detection_condition` variable preserves\n",
+    "non-detects)."
    ]
   },
   {
@@ -334,10 +310,10 @@
    "source": [
     "## 6. Writing CF netCDF\n",
     "\n",
-    "Because the dataset already carries CF metadata in the standard ragged-array\n",
-    "encoding, it serializes straight to a self-describing netCDF file that\n",
-    "CF-aware tools (THREDDS, `cf-xarray`, Panoply, …) can read back (requires a\n",
-    "netCDF backend, e.g. `pip install netCDF4`):"
+    "Because the dataset already carries CF metadata, it serializes straight to a\n",
+    "self-describing netCDF file that CF-aware tools (THREDDS, `cf-xarray`, Panoply,\n",
+    "…) can read back. This works for either layout (requires a netCDF backend, e.g.\n",
+    "`pip install netCDF4`):"
    ]
   },
   {
@@ -350,7 +326,7 @@
     "import tempfile\n",
     "\n",
     "path = os.path.join(tempfile.mkdtemp(), \"daily_discharge.nc\")\n",
-    "ragged.to_netcdf(path)  # CF-1.11 contiguous ragged array\n",
+    "ds.to_netcdf(path)  # the dense grid from section 1\n",
     "print(\"wrote\", path)"
    ]
   },
@@ -378,5 +354,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 5
+ "nbformat_minor": 4
 }
diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py
index d74fc64c..27c4cd6a 100644
--- a/tests/waterdata_xarray_test.py
+++ b/tests/waterdata_xarray_test.py
@@ -109,6 +109,20 @@ def test_ancillary_variables_linked():
     )
 
 
+def test_list_valued_qualifier_is_flattened_to_string():
+    # The API returns ``qualifier`` as a list of codes per observation; the
+    # ancillary variable must be flattened to a netCDF-encodable string (object
+    # arrays of lists can't be written), empty lists -> missing.
+    df = _daily_frame()
+    df["qualifier"] = [["A", "e"], []]  # multi-code list, then empty list
+    for build in (wdx._build_ragged, wdx._build_dense):
+        ds = build(df, _meta(), service="daily", series_meta=_DISCHARGE_META)
+        qual = ds["qualifier"] if "qualifier" in ds else ds["discharge_qualifier"]
+        flat = qual.values.ravel().tolist()
+        assert not any(isinstance(v, (list, tuple)) for v in flat)  # no list cells
+        assert "A e" in flat  # multi-code list joined with a space
+
+
 def test_unknown_unit_passes_through_verbatim():
     df = _daily_frame()
     df["unit_of_measure"] = "widgets/s"  # units are read from the frame
@@ -197,6 +211,22 @@ def test_build_stats_flat_dataset():
     assert isinstance(ds, xr.Dataset)
     assert "p50_va" in ds.data_vars
     assert ds.attrs["Conventions"] == "CF-1.11"
+    # A flat percentile table is not a CF discrete-sampling geometry, so it must
+    # NOT advertise featureType=timeSeries (which would mislead cf-xarray/CF
+    # tooling into treating it as a timeseries DSG).
+    assert "featureType" not in ds.attrs
+
+
+def test_stats_empty_omits_feature_type():
+    # The empty-stats path must also stay free of the timeSeries featureType.
+    ds = wdx._build_stats(pd.DataFrame(), _meta(), "statistics")
+    assert list(ds.data_vars) == []
+    assert "featureType" not in ds.attrs
+    # the series builders still advertise the DSG featureType
+    series = wdx._build_ragged(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert series.attrs["featureType"] == "timeSeries"
 
 
 def test_build_stats_drops_hash_columns():
@@ -578,33 +608,95 @@ def test_ragged_field_schema_without_statistic():
     assert ds["value"].attrs["long_name"] == "Discharge, cubic feet per second"
 
 
-# --- dense opt-out wiring ---------------------------------------------------
+# --- select_series (label-based selection on the ragged layout) -------------
+
+
+def _two_instance_ragged():
+    """Ragged Dataset with two instances at one site: 00060 and 00010."""
+    meta = {**_DISCHARGE_META, **_TEMP_META}
+    return wdx._build_ragged(
+        pd.concat([_daily_frame(), _temp_frame()]),
+        _meta(),
+        service="continuous",
+        series_meta=meta,
+    )
+
+
+def test_select_series_returns_time_indexed_single_series():
+    ds = _two_instance_ragged()
+    s = wdx.select_series(ds, monitoring_location_id="USGS-1", parameter_code="00060")
+    # time is now a real dimension (not the flat obs axis)
+    assert set(s.sizes) == {"time"}
+    assert s["value"].dims == ("time",)
+    assert list(s["value"].values) == [100, 110]
+    # the series identity rides along as scalar coordinates
+    assert str(s["parameter_code"].values) == "00060"
+    assert str(s["unit_of_measure"].values) == "ft^3/s"
+    # ancillary flags follow the series; row_size is dropped
+    assert "approval_status" in s.data_vars
+    assert "row_size" not in s.variables
+    # .sel(time=...) works now that time is the dimension
+    assert s["value"].sel(time="2024-06-01").item() == 100
+
+
+def test_select_series_ambiguous_raises():
+    # selecting by site alone matches both instances -> ask for more keys
+    ds = _two_instance_ragged()
+    with pytest.raises(ValueError, match="2 time series match"):
+        wdx.select_series(ds, monitoring_location_id="USGS-1")
 
 
-def test_wrapper_defaults_to_ragged_and_dense_opt_out(monkeypatch):
-    # Default wrapper output is ragged; dense=True returns the gridded layout.
+def test_select_series_no_match_raises():
+    ds = _two_instance_ragged()
+    with pytest.raises(KeyError, match="no time series matches"):
+        wdx.select_series(ds, parameter_code="99999")
+
+
+def test_select_series_unknown_key_raises():
+    ds = _two_instance_ragged()
+    with pytest.raises(KeyError, match="not a per-series coordinate"):
+        wdx.select_series(ds, bogus="x")
+
+
+def test_select_series_on_dense_raises_helpful_error():
+    # On a dense Dataset, parameters are named variables; select_series points
+    # the user at ds[name].sel(...) instead of failing cryptically.
+    dense = wdx._build_dense(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    with pytest.raises(ValueError, match="expects a ragged Dataset"):
+        wdx.select_series(dense, monitoring_location_id="USGS-1")
+
+
+# --- ragged opt-out wiring --------------------------------------------------
+
+
+def test_wrapper_defaults_to_dense_and_ragged_opt_out(monkeypatch):
+    # Default wrapper output is the (site, time) grid; dense=False opts into the
+    # ragged array.
     monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_daily_frame(), _meta()))
     monkeypatch.setattr(wdx._TS_CACHE, "lookup", lambda site_ids: (_DISCHARGE_META, {}))
 
-    ragged = wdx.get_daily(monitoring_location_id="USGS-1")
-    assert "obs" in ragged.sizes and "value" in ragged.data_vars
-    assert "discharge" not in ragged.data_vars
-
-    dense = wdx.get_daily(monitoring_location_id="USGS-1", dense=True)
+    dense = wdx.get_daily(monitoring_location_id="USGS-1")
     assert "discharge" in dense.data_vars
     assert "obs" not in dense.sizes
 
+    ragged = wdx.get_daily(monitoring_location_id="USGS-1", dense=False)
+    assert "obs" in ragged.sizes and "value" in ragged.data_vars
+    assert "discharge" not in ragged.data_vars
+
 
 def test_series_wrapper_signature_advertises_dense_and_dataset_return():
     # functools.wraps would otherwise carry over the getter's DataFrame
     # signature and hide ``dense``; _make_getter publishes an accurate one.
+    # dense defaults to True (the grid is the default; dense=False -> ragged).
     import inspect
 
     sig = inspect.signature(wdx.get_daily)
     assert "monitoring_location_id" in sig.parameters  # real args stay visible
     dense = sig.parameters["dense"]
     assert dense.kind is inspect.Parameter.KEYWORD_ONLY
-    assert dense.default is False
+    assert dense.default is True
     assert "Dataset" in str(sig.return_annotation)
     assert wdx.get_daily.__annotations__["return"] == "xarray.Dataset"
 
@@ -755,8 +847,10 @@ def boom(monitoring_location_id):
 
 
 def test_wrapper_survives_metadata_failure(monkeypatch):
-    # End to end: a failing metadata endpoint still yields a CF dataset built
-    # from the data; only the metadata-sourced long_name is missing.
+    # End to end on the DEFAULT (dense) layout: a failing metadata endpoint still
+    # yields a CF dataset built from the data. With no metadata name the variable
+    # falls back to the parameter code -- never the literal "nan" -- and only the
+    # metadata-sourced long_name is missing.
     monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_daily_frame(), _meta()))
 
     def boom(monitoring_location_id):
@@ -765,11 +859,66 @@ def boom(monitoring_location_id):
     monkeypatch.setattr(wdx._TS_CACHE, "_getter", boom)
     wdx._TS_CACHE.clear()
     with pytest.warns(UserWarning, match="metadata lookup failed"):
-        ds = wdx.get_daily(monitoring_location_id="USGS-1")
-    assert "value" in ds.data_vars  # observations survived
-    # standard_name/units come from the frame; long_name (metadata) is absent
-    assert "long_name" not in ds["value"].attrs
-    assert ds["value"].attrs["units"] == "ft3 s-1"
+        ds = wdx.get_daily(monitoring_location_id="USGS-1")  # default (dense)
+    assert "00060" in ds.data_vars  # observations survived under the code name
+    assert "nan" not in ds.data_vars  # never a literal "nan" variable
+    v = ds["00060"]
+    assert "long_name" not in v.attrs  # name comes from metadata, which failed
+    assert v.attrs["units"] == "ft3 s-1"  # units come from the frame
+
+
+def test_nan_parameter_description_falls_back_to_name():
+    # A present-but-null parameter_description (NaN is truthy) must not mask the
+    # valid parameter_name; long_name falls back to the name.
+    meta = {
+        "00060": {"parameter_name": "Discharge", "parameter_description": float("nan")}
+    }
+    ds = wdx._build_dense(_daily_frame(), _meta(), service="daily", series_meta=meta)
+    assert ds["discharge"].attrs["long_name"] == "Discharge"
+
+
+def test_nan_parameter_name_uses_code_not_literal_nan():
+    # A present-but-null parameter_name must fall back to the parameter code,
+    # never produce a variable literally named "nan".
+    meta = {"00060": {"parameter_name": float("nan")}}
+    ds = wdx._build_dense(_daily_frame(), _meta(), service="daily", series_meta=meta)
+    assert "nan" not in ds.data_vars
+    assert "00060" in ds.data_vars
+
+
+def test_dense_collision_dedup_is_order_independent():
+    # Colliding (site, time) values must dedup deterministically (smallest),
+    # independent of the upstream row order.
+    a = _daily_frame(values=(100,), times=("2024-06-01",))
+    b = _daily_frame(values=(200,), times=("2024-06-01",))
+    kept = []
+    for frame in (pd.concat([a, b]), pd.concat([b, a])):
+        with pytest.warns(UserWarning, match="multiple values per"):
+            ds = wdx._build_dense(
+                frame, _meta(), service="daily", series_meta=_DISCHARGE_META
+            )
+        kept.append(
+            ds["discharge"]
+            .sel(monitoring_location_id="USGS-1", time="2024-06-01")
+            .item()
+        )
+    assert kept == [100, 100]  # smallest value, both input orders
+
+
+def test_partial_geometry_keeps_numeric_lonlat_coord():
+    # When only some sites have point geometry, the lon/lat coordinate must stay
+    # numeric (float, NaN-filled) -- not an object array with None, which is a
+    # CF-invalid spatial coordinate.
+    a = _daily_frame(site="USGS-A", values=(1, 2), times=("2024-06-01", "2024-06-02"))
+    b = _daily_frame(site="USGS-B", values=(3, 4), times=("2024-06-01", "2024-06-02"))
+    a["geometry"] = [[-90.44, 43.19]] * len(a)
+    b["geometry"] = [None] * len(b)  # this site carries no point geometry
+    ds = wdx._build_dense(
+        pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert ds["longitude"].dtype.kind == "f"  # float, not object
+    assert ds["longitude"].sel(monitoring_location_id="USGS-A").item() == -90.44
+    assert pd.isna(ds["longitude"].sel(monitoring_location_id="USGS-B").item())
 
 
 def test_metadata_cache_bounded_and_clearable():

From e47ad689586aa16c4783ea3e22277acfb2e118b8 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Fri, 5 Jun 2026 09:21:52 -0500
Subject: [PATCH 3/7] fix(waterdata.xarray): post-review hardening of the
 dense-default commit

Address the second code-review pass:
- dense (site, time) collision dedup is now fully deterministic: stable sort
  on value then the flag columns, so the retained ancillary is order-independent
- _scalarize handles numpy arrays and nested/array elements without raising
  (was list/tuple-only and called pd.notna on possibly-non-scalar elements)
- _none_if_nan is array-safe via an is_scalar guard (new _is_missing helper)
- normalize NaN name descriptors once in _MetadataCache._ingest
- only map _scalarize over flag columns that are actually sequence-valued
- fix the stale examples/index.rst (dense is the default; dense=False ragged)

58 offline tests pass; live get_daily/get_samples + to_netcdf verified.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 dataretrieval/waterdata/xarray.py | 52 +++++++++++++++++++++----------
 docs/source/examples/index.rst    |  7 +++--
 tests/waterdata_xarray_test.py    | 40 +++++++++++++++++++++++-
 3 files changed, 79 insertions(+), 20 deletions(-)

diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
index b15912c0..d542e389 100644
--- a/dataretrieval/waterdata/xarray.py
+++ b/dataretrieval/waterdata/xarray.py
@@ -155,6 +155,16 @@ def _unique_present(frame, col):
     return frame[col].dropna().unique() if col in frame else []
 
 
+def _is_missing(value):
+    """True for None / a *scalar* NaN; False for any non-scalar (list, array).
+
+    ``_pd.notna`` raises on a list/array (it returns an element-wise mask), so
+    guard with ``is_scalar`` before testing -- callers pass cells that may be
+    sequences.
+    """
+    return value is None or (_pd.api.types.is_scalar(value) and _pd.isna(value))
+
+
 def _none_if_nan(value):
     """Collapse a pandas NaN to None.
 
@@ -162,19 +172,21 @@ def _none_if_nan(value):
     falling back. Metadata frames yield NaN for a column that is present but null
     for a given parameter, so normalize it to None before any such ``or`` chain.
     """
-    return value if _pd.notna(value) else None
+    return None if _is_missing(value) else value
 
 
 def _scalarize(cell):
-    """Collapse a list-valued flag cell into a single string; pass scalars through.
+    """Collapse a sequence-valued flag cell into a single string; pass scalars through.
 
     The Water Data API returns some ancillary (flag) columns -- notably
     ``qualifier`` -- as a *list* of codes per observation. xarray/netCDF cannot
-    encode an object array whose elements are lists, so join the codes into one
-    space-separated string (an empty list becomes None).
+    encode an object array whose elements are sequences, so join the codes into
+    one space-separated string (an empty/all-missing sequence becomes None).
+    Handles lists, tuples, and numpy arrays; missing elements are dropped
+    element-by-element without assuming the element is scalar.
     """
-    if isinstance(cell, (list, tuple)):
-        parts = [str(v) for v in cell if v is not None and _pd.notna(v)]
+    if isinstance(cell, (list, tuple, _np.ndarray)):
+        parts = [str(v) for v in cell if not _is_missing(v)]
         return " ".join(parts) if parts else None
     return cell
 
@@ -265,11 +277,14 @@ def _prepare_values(df, group_cols, ancillary_cols):
         work["time"], format="ISO8601", errors="coerce", utc=True
     ).dt.tz_localize(None)
     work["value"] = _pd.to_numeric(work["value"], errors="coerce")
-    # Flatten any list-valued flag cells (e.g. ``qualifier``) to strings so the
-    # ancillary variables stay netCDF-encodable (``_scalarize`` passes scalars
-    # through unchanged).
+    # Flatten any sequence-valued flag cells (e.g. ``qualifier``) to strings so
+    # the ancillary variables stay netCDF-encodable. Only the columns that are
+    # actually sequence-valued pay the per-row map -- a flag column is uniformly
+    # typed, so the first non-null cell decides (skips the common scalar case).
     for c in ancillary:
-        work[c] = work[c].map(_scalarize)
+        nonnull = work[c].dropna()
+        if len(nonnull) and isinstance(nonnull.iloc[0], (list, tuple, _np.ndarray)):
+            work[c] = work[c].map(_scalarize)
     n_before = len(work)
     work = work[work["time"].notna()]
     dropped = n_before - len(work)
@@ -443,12 +458,16 @@ def _ingest(self, meta, todo):
                     continue
                 if has_pcode:
                     pcode = row.get("parameter_code")
-                    if _pd.notna(pcode):
+                    if not _is_missing(pcode):
+                        # Normalize NaN -> None at the source so every consumer of
+                        # the name descriptors gets a clean ``or``-able value.
                         fresh[site]["params"][str(pcode)] = {
-                            c: row.get(c) for c in name_cols
+                            c: _none_if_nan(row.get(c)) for c in name_cols
                         }
                 if not fresh[site]["site"]:
-                    desc = {c: row.get(c) for c in site_cols if _pd.notna(row.get(c))}
+                    desc = {
+                        c: row.get(c) for c in site_cols if not _is_missing(row.get(c))
+                    }
                     if desc:
                         fresh[site]["site"] = desc
         with self._lock:
@@ -909,9 +928,10 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit):
                     "value. Filter the query to separate them.",
                     stacklevel=3,
                 )
-                # Sort by value first so the retained value is deterministic
-                # rather than dependent on the upstream response row order.
-                sub = sub.sort_values("value")
+                # Sort by value then the flag columns, with a stable sort, so the
+                # whole retained row (value *and* its ancillary flags) is
+                # deterministic rather than dependent on the upstream row order.
+                sub = sub.sort_values(["value", *ancillary], kind="stable")
                 sub = sub[~sub.index.duplicated(keep="first")]
             ds_g = sub.to_xarray().rename(
                 {"value": name, **{c: f"{name}_{c}" for c in ancillary}}
diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst
index c86f6f4d..af9db9c6 100644
--- a/docs/source/examples/index.rst
+++ b/docs/source/examples/index.rst
@@ -35,9 +35,10 @@ functions and is executed against the live USGS Water Data API.
 CF-conventions ``xarray`` datasets from the ``waterdata`` module
 ----------------------------------------------------------------
 The ``waterdata.xarray`` wrappers return CF-conventions ``xarray.Dataset``
-objects (a ragged array by default, with a ``dense=True`` gridded opt-out).
-This notebook demonstrates the layouts, the time-selection trade-off, and
-writing CF netCDF.
+objects (a dense ``(monitoring_location_id, time)`` grid by default, with a
+``dense=False`` contiguous-ragged-array opt-out for large multi-site pulls).
+This notebook demonstrates the layouts, selecting one series, and writing CF
+netCDF.
 
 .. toctree::
     :maxdepth: 1
diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py
index 27c4cd6a..735e82ba 100644
--- a/tests/waterdata_xarray_test.py
+++ b/tests/waterdata_xarray_test.py
@@ -109,6 +109,20 @@ def test_ancillary_variables_linked():
     )
 
 
+def test_scalarize_handles_list_array_nested_and_scalar():
+    import numpy as np
+
+    assert wdx._scalarize(["A", "e"]) == "A e"
+    assert wdx._scalarize(("A", "e")) == "A e"
+    assert wdx._scalarize(np.array(["A", "e"])) == "A e"  # numpy array, not list
+    assert wdx._scalarize([]) is None  # empty -> missing
+    assert wdx._scalarize(["A", None]) == "A"  # missing element dropped
+    assert wdx._scalarize("A") == "A"  # scalar passes through
+    assert wdx._scalarize(None) is None
+    # a nested element must not raise (array-truth pitfall); it is stringified
+    assert isinstance(wdx._scalarize([["A", "e"], "z"]), str)
+
+
 def test_list_valued_qualifier_is_flattened_to_string():
     # The API returns ``qualifier`` as a list of codes per observation; the
     # ancillary variable must be flattened to a netCDF-encodable string (object
@@ -176,7 +190,9 @@ def test_multiple_parameters_outer_join_on_time():
 
 def test_collision_dedups_with_warning():
     # two values for the same (site, parameter, statistic, time) are ambiguous
-    # without the hash key -> keep the first and warn; site stays the dim.
+    # without the hash key -> keep the smallest (deterministically) and warn;
+    # site stays the dim. (test_dense_collision_dedup_is_order_independent pins
+    # the order-independence; here 100 is both the first and the smallest.)
     a = _daily_frame(values=(100,), times=("2024-06-01",))
     b = _daily_frame(values=(200,), times=("2024-06-01",))
     with pytest.warns(UserWarning, match="multiple values per"):
@@ -905,6 +921,28 @@ def test_dense_collision_dedup_is_order_independent():
     assert kept == [100, 100]  # smallest value, both input orders
 
 
+def test_dense_collision_kept_ancillary_is_order_independent():
+    # Two rows collide on (site, time) with the SAME value but different
+    # qualifiers; the retained flag must be deterministic (stable sort on value
+    # then ancillary), not dependent on upstream row order.
+    a = _daily_frame(values=(100,), times=("2024-06-01",))
+    b = _daily_frame(values=(100,), times=("2024-06-01",))
+    a["qualifier"] = ["X"]
+    b["qualifier"] = ["Y"]
+    kept = []
+    for frame in (pd.concat([a, b]), pd.concat([b, a])):
+        with pytest.warns(UserWarning, match="multiple values per"):
+            ds = wdx._build_dense(
+                frame, _meta(), service="daily", series_meta=_DISCHARGE_META
+            )
+        kept.append(
+            ds["discharge_qualifier"]
+            .sel(monitoring_location_id="USGS-1", time="2024-06-01")
+            .item()
+        )
+    assert kept == ["X", "X"]  # smaller flag kept, both input orders
+
+
 def test_partial_geometry_keeps_numeric_lonlat_coord():
     # When only some sites have point geometry, the lon/lat coordinate must stay
     # numeric (float, NaN-filled) -- not an object array with None, which is a

From 961be13da3e07502f7360c37f6002475ed087f26 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Fri, 5 Jun 2026 09:56:17 -0500
Subject: [PATCH 4/7] fix(waterdata.xarray): resolve remaining review findings

- samples now surface station longitude/latitude (mapped from
  Location_Longitude/Location_Latitude; _point_coords reads explicit lon/lat
  columns in addition to an OGC geometry column)
- metadata cache: a single large pull is no longer subject to within-batch
  FIFO eviction (the call's result is built from the freshly-parsed entries),
  and sites with no metadata are no longer negatively cached, so they retry
- dense variable naming is deterministic and unambiguous: a bare name
  (e.g. discharge) is used only when unique; same-named series are all
  disambiguated by cell method / statistic / parameter code
- dense multi-unit label is deterministic (sorted) instead of row-order dependent
- row_size is int64 (was int32) to avoid overflow / cumsum truncation
- select_series rejects descriptor coords as keys (lon/lat float-equality
  footgun) and can match a null instance key

64 offline tests pass; live samples lon/lat + dense naming verified.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 dataretrieval/waterdata/xarray.py | 138 +++++++++++++++++++++++-------
 tests/waterdata_xarray_test.py    | 108 ++++++++++++++++++++++-
 2 files changed, 215 insertions(+), 31 deletions(-)

diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
index d542e389..867d1ab9 100644
--- a/dataretrieval/waterdata/xarray.py
+++ b/dataretrieval/waterdata/xarray.py
@@ -232,7 +232,27 @@ def _lonlat(geom):
 
 
 def _point_coords(df, site):
-    """lon/lat dicts keyed by site from point geometry, or None."""
+    """lon/lat dicts keyed by site, or None.
+
+    Reads either a ``geometry`` column (the time-series getters' OGC response) or
+    explicit ``longitude`` / ``latitude`` columns (the Samples profile, mapped via
+    :data:`_SAMPLES_RENAME`) -- so every service surfaces station coordinates.
+    """
+    if {"longitude", "latitude"}.issubset(df.columns):
+        geo = df.dropna(subset=["longitude", "latitude"]).drop_duplicates(site)
+        if geo.empty:
+            return None
+        lon, lat = {}, {}
+        for site_id, x, y in zip(
+            geo[site].to_numpy(),
+            geo["longitude"].to_numpy(),
+            geo["latitude"].to_numpy(),
+        ):
+            try:
+                lon[site_id], lat[site_id] = float(x), float(y)
+            except (TypeError, ValueError):
+                continue
+        return (lon, lat) if lon else None
     if "geometry" not in df.columns:
         return None
     geo = df.dropna(subset=["geometry"]).drop_duplicates(site)
@@ -408,8 +428,9 @@ def lookup(self, site_ids):
         """
         sites = sorted({str(s) for s in site_ids if _pd.notna(s)})
         # Racy read of the keys is fine: a concurrent miss just re-fetches (the
-        # fetch is idempotent); only the writes in _ingest take the lock.
+        # fetch is idempotent); only the writes in _store take the lock.
         todo = [s for s in sites if s not in self._entries]
+        fresh: dict[str, dict] = {}
         if todo:
             try:
                 meta, _ = self._getter(monitoring_location_id=todo)
@@ -420,12 +441,17 @@ def lookup(self, site_ids):
                     stacklevel=2,
                 )
             else:
-                self._ingest(meta, todo)
+                fresh = self._parse(meta, todo)
+                self._store(fresh)
         param_meta: dict[str, dict] = {}
         site_meta: dict[str, dict] = {}
         with self._lock:
             for s in sites:
-                entry = self._entries.get(s, {})
+                # Prefer this call's freshly-parsed entry over the cache: the
+                # bounded cache may have already evicted just-fetched sites when a
+                # single pull's ``todo`` exceeds maxsize, but the current call
+                # must still see every site it fetched.
+                entry = fresh.get(s) or self._entries.get(s, {})
                 param_meta.update(entry.get("params", {}))
                 if entry.get("site"):
                     site_meta[s] = entry["site"]
@@ -439,12 +465,8 @@ def clear(self):
     def __len__(self):
         return len(self._entries)
 
-    def _ingest(self, meta, todo):
-        """Parse ``meta`` into per-site entries, then merge + evict under lock.
-
-        The parsing runs lock-free on a local dict; only the (cheap) merge into
-        the shared cache and the FIFO eviction past ``maxsize`` hold the lock.
-        """
+    def _parse(self, meta, todo):
+        """Parse ``meta`` into per-site ``{params, site}`` entries (lock-free)."""
         fresh = {s: {"params": {}, "site": {}} for s in todo}
         if not meta.empty:
             name_cols = [c for c in _NAME_DESCRIPTORS if c in meta.columns]
@@ -470,8 +492,20 @@ def _ingest(self, meta, todo):
                     }
                     if desc:
                         fresh[site]["site"] = desc
+        return fresh
+
+    def _store(self, fresh):
+        """Merge non-empty entries into the bounded cache (FIFO eviction).
+
+        Sites that came back with no metadata are *not* cached, so a later call
+        retries them rather than being stuck with a sticky empty result; the
+        current call still sees them via the freshly-parsed ``fresh`` dict.
+        """
+        keep = {s: e for s, e in fresh.items() if e["params"] or e["site"]}
+        if not keep:
+            return
         with self._lock:
-            self._entries.update(fresh)
+            self._entries.update(keep)
             while len(self._entries) > self._maxsize:
                 self._entries.pop(next(iter(self._entries)))
 
@@ -524,14 +558,24 @@ def select_series(ds, **keys):
             "so select by name instead, e.g. "
             "ds[variable].sel(monitoring_location_id=...)."
         )
-    inst_coords = [c for c in ds.coords if ds[c].dims == ("timeseries",)]
+    # Selectable keys are the series *identity* coordinates only -- exclude the
+    # per-series descriptors (lon/lat are a float-equality footgun; unit/HUC/state
+    # are not series identifiers).
+    descriptors = {"longitude", "latitude", "unit_of_measure", *_SITE_DESCRIPTORS}
+    inst_coords = [
+        c for c in ds.coords if ds[c].dims == ("timeseries",) and c not in descriptors
+    ]
     mask = _np.ones(ds.sizes["timeseries"], dtype=bool)
     for key, value in keys.items():
         if key not in inst_coords:
             raise KeyError(
-                f"{key!r} is not a per-series coordinate; choose from {inst_coords}."
+                f"{key!r} is not a per-series identity coordinate; choose from "
+                f"{inst_coords}."
             )
-        mask &= ds[key].to_numpy() == value
+        arr = ds[key].to_numpy()
+        # NaN never equals anything, so match a missing instance key (e.g. a
+        # characteristic with no sample fraction) by null-ness instead.
+        mask &= _pd.isna(arr) if _is_missing(value) else (arr == value)
     matches = _np.flatnonzero(mask)
     if matches.size == 0:
         raise KeyError(f"no time series matches {keys}.")
@@ -563,6 +607,10 @@ def select_series(ds, **keys):
     "Result_SampleFraction": "sample_fraction",
     "Result_ResultDetectionCondition": "detection_condition",
     "Result_MeasureStatusIdentifier": "status",
+    # Samples carry position as explicit columns (no OGC ``geometry``); map them
+    # to the canonical names so _point_coords surfaces station lon/lat.
+    "Location_Longitude": "longitude",
+    "Location_Latitude": "latitude",
 }
 _CANONICAL_COORD_ATTRS = {
     "parameter_code": {"long_name": "USGS parameter code"},
@@ -786,7 +834,9 @@ def _assemble(self, work, inst_cols, ancillary, has_unit):
         )
         data_vars = {
             "value": ("obs", work["value"].to_numpy()),
-            "row_size": ("timeseries", row_size.to_numpy().astype("int32")),
+            # int64 (not int32): a single long, high-frequency series can exceed
+            # 2^31 observations, and the select_series cumsum must not overflow.
+            "row_size": ("timeseries", row_size.to_numpy().astype("int64")),
         }
         for c in ancillary:
             data_vars[c] = ("obs", work[c].to_numpy())
@@ -899,16 +949,25 @@ def _build_series(self, work, group_cols, ancillary, has_unit):
 
     def _variable_datasets(self, work, group_cols, ancillary, has_unit):
         """One pivoted ``(site, time)`` Dataset per (parameter, statistic)."""
-        datasets, used = [], set()
+        # First pass: gather each group's identity and base name, so naming can
+        # see the whole set (a bare name is only used when it is unambiguous).
+        specs = []
         for _, group in work.groupby(group_cols, dropna=False):
             pcode = _first_present(group, "parameter_code")
             stat = _first_present(group, "statistic_id")
-            group_units = group["unit_of_measure"].dropna().unique() if has_unit else ()
-            unit = group_units[0] if len(group_units) else None
             desc = self.series_meta.get(str(pcode), {}) if pcode is not None else {}
-
-            name = self._variable_name(desc, pcode, stat, used)
-            used.add(name)
+            base = _slug(_none_if_nan(desc.get("parameter_name")) or pcode or "value")
+            specs.append((group, pcode, stat, desc, base))
+        names = self._disambiguate([s[4] for s in specs], [(s[1], s[2]) for s in specs])
+
+        datasets = []
+        for (group, pcode, stat, desc, _base), name in zip(specs, names):
+            # Sort the units so the chosen label is deterministic across pulls
+            # (values are not converted either way; see the multi-unit warning).
+            group_units = (
+                sorted(group["unit_of_measure"].dropna().unique()) if has_unit else []
+            )
+            unit = group_units[0] if group_units else None
 
             if len(group_units) > 1:
                 # One variable can carry only one ``units`` attr; surface the
@@ -951,15 +1010,34 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit):
         return datasets
 
     @staticmethod
-    def _variable_name(desc, pcode, stat, used):
-        """A unique slug for a variable; disambiguate same-parameter series."""
-        name = _slug(_none_if_nan(desc.get("parameter_name")) or pcode or "value")
-        if name in used:  # same parameter, different statistic -> distinct var
-            op = CF_CELL_METHODS.get(str(stat)) or (str(stat) if stat else None)
-            name = f"{name}_{_slug(op)}" if op else name
-        while name in used:
-            name += "_x"
-        return name
+    def _disambiguate(bases, keys):
+        """Map per-group base slugs to unique, deterministic variable names.
+
+        ``keys[i]`` is the group's ``(parameter_code, statistic_id)``. A base used
+        by exactly one group stays bare (e.g. ``discharge``); a base shared by
+        several groups is disambiguated for *all* of them -- by the statistic's
+        cell-method operator (``discharge_maximum`` / ``discharge_mean``), falling
+        back to the statistic id then the parameter code -- so a bare name never
+        silently refers to an arbitrary one of several same-named series.
+        """
+        counts: dict[str, int] = {}
+        for b in bases:
+            counts[b] = counts.get(b, 0) + 1
+        names, used = [], set()
+        for base, (pcode, stat) in zip(bases, keys):
+            if counts[base] == 1:
+                name = base
+            else:
+                op = CF_CELL_METHODS.get(str(stat)) if stat is not None else None
+                suffix = op or (str(stat) if stat is not None else None)
+                name = f"{base}_{_slug(suffix)}" if suffix else base
+                if name == base or name in used:  # statistic didn't separate them
+                    name = f"{base}_{_slug(pcode)}" if pcode is not None else base
+            while name in used:
+                name += "_x"
+            used.add(name)
+            names.append(name)
+        return names
 
 
 class _StatsBuilder(_DatasetBuilder):
diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py
index 735e82ba..4a5cc5d2 100644
--- a/tests/waterdata_xarray_test.py
+++ b/tests/waterdata_xarray_test.py
@@ -655,6 +655,41 @@ def test_select_series_returns_time_indexed_single_series():
     assert s["value"].sel(time="2024-06-01").item() == 100
 
 
+def test_dense_same_parameter_two_statistics_no_bare_name():
+    # 00060 under both 00001 (max) and 00003 (mean): the bare 'discharge' name is
+    # ambiguous, so BOTH variables are disambiguated by their cell method -- no
+    # order-dependent bare 'discharge' that silently means one of them.
+    mx = _daily_frame(values=(500,), times=("2024-06-01",))
+    mx["statistic_id"] = "00001"
+    mn = _daily_frame(values=(100,), times=("2024-06-01",))
+    ds = wdx._build_dense(
+        pd.concat([mx, mn]), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert "discharge" not in ds.data_vars  # no bare (ambiguous) name
+    assert {"discharge_maximum", "discharge_mean"} <= set(ds.data_vars)
+    assert ds["discharge_maximum"].attrs["cell_methods"] == "time: maximum"
+    assert ds["discharge_mean"].attrs["cell_methods"] == "time: mean"
+
+
+def test_dense_single_statistic_keeps_bare_name():
+    # The common single-statistic case keeps the clean bare name.
+    ds = wdx._build_dense(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert "discharge" in ds.data_vars
+
+
+def test_select_series_matches_nan_instance_key():
+    # An instance whose key is null (samples with no sample_fraction) must be
+    # selectable by passing None, since `== NaN` never matches.
+    df = _samples_frame()
+    df["Result_SampleFraction"] = None
+    ds = _samples_ds(df)
+    s = wdx.select_series(ds, characteristic="Temperature, water", sample_fraction=None)
+    assert set(s.sizes) == {"time"}
+    assert "value" in s.data_vars
+
+
 def test_select_series_ambiguous_raises():
     # selecting by site alone matches both instances -> ask for more keys
     ds = _two_instance_ragged()
@@ -670,8 +705,11 @@ def test_select_series_no_match_raises():
 
 def test_select_series_unknown_key_raises():
     ds = _two_instance_ragged()
-    with pytest.raises(KeyError, match="not a per-series coordinate"):
+    with pytest.raises(KeyError, match="not a per-series identity coordinate"):
         wdx.select_series(ds, bogus="x")
+    # descriptor coords (lon/lat/unit/HUC/state) are not selectable identity keys
+    with pytest.raises(KeyError, match="not a per-series identity coordinate"):
+        wdx.select_series(ds, unit_of_measure="ft^3/s")
 
 
 def test_select_series_on_dense_raises_helpful_error():
@@ -760,6 +798,20 @@ def _samples_ds(frame):
     )
 
 
+def test_samples_surface_lonlat_from_location_columns():
+    # Samples carry position as Location_Latitude/Location_Longitude (no OGC
+    # geometry); the dataset must still get numeric longitude/latitude coords.
+    frame = _samples_frame()
+    frame["Location_Longitude"] = [-90.44]
+    frame["Location_Latitude"] = [43.19]
+    ds = _samples_ds(frame)
+    assert "longitude" in ds.coords and "latitude" in ds.coords
+    assert ds["longitude"].dtype.kind == "f"
+    assert float(ds["longitude"].values[0]) == -90.44
+    assert float(ds["latitude"].values[0]) == 43.19
+    assert ds["longitude"].attrs["units"] == "degrees_east"
+
+
 def test_build_samples_single_characteristic():
     ds = _samples_ds(_samples_frame())
     assert set(ds.sizes) == {"obs", "timeseries"}
@@ -988,3 +1040,57 @@ def fake(monitoring_location_id):
     wdx._FIELD_CACHE._entries["Y"] = {"params": {}, "site": {}}
     wdx.clear_metadata_cache()
     assert len(wdx._TS_CACHE) == 0 and len(wdx._FIELD_CACHE) == 0
+
+
+def test_metadata_missing_site_is_not_negatively_cached():
+    # A site the metadata endpoint returns nothing for must NOT be cached as an
+    # empty entry (which would never be retried); a later call re-fetches it.
+    calls = []
+
+    def fake(monitoring_location_id):
+        calls.append(list(monitoring_location_id))
+        # respond only for S1, never for S2
+        rows = [
+            {
+                "monitoring_location_id": s,
+                "parameter_code": "00060",
+                "parameter_name": s,
+            }
+            for s in monitoring_location_id
+            if s == "S1"
+        ]
+        return pd.DataFrame(rows), SimpleNamespace(url=None)
+
+    cache = wdx._MetadataCache(fake)
+    cache.lookup(["S1", "S2"])
+    cache.lookup(["S1", "S2"])
+    # S1 cached (hit, not re-fetched); S2 never cached, so it is re-requested.
+    assert calls[0] == ["S1", "S2"]
+    assert calls[1] == ["S2"]  # only the still-uncached S2
+    assert "S1" in cache._entries and "S2" not in cache._entries
+
+
+def test_metadata_lookup_survives_within_batch_eviction():
+    # A single pull whose site count exceeds maxsize must still return metadata
+    # for every requested site, even though the bounded cache can't hold them all.
+    sites = ["S0", "S1", "S2", "S3", "S4"]
+
+    def fake(monitoring_location_id):
+        rows = [
+            {
+                "monitoring_location_id": s,
+                "parameter_code": f"p{s}",  # distinct per site
+                "parameter_name": f"name-{s}",
+                "hydrologic_unit_code": f"huc-{s}",
+            }
+            for s in monitoring_location_id
+        ]
+        return pd.DataFrame(rows), SimpleNamespace(url=None)
+
+    cache = wdx._MetadataCache(fake, maxsize=2)
+    param_meta, site_meta = cache.lookup(sites)
+    # every requested site's metadata is in the result even though the bounded
+    # cache evicted most of the just-fetched batch.
+    assert {f"p{s}" for s in sites} <= set(param_meta)
+    assert set(site_meta) == set(sites)
+    assert len(cache) <= 2  # cache stayed bounded

From 1e771581b96b40a8e7a3af856cb15a420cb8dfb0 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Fri, 5 Jun 2026 10:21:25 -0500
Subject: [PATCH 5/7] feat(waterdata.xarray): add to_awkward() ragged ->
 awkward view

Convert a ragged (dense=False) Dataset to a one-record-per-series
awkward.Array: row_size is awkward's offsets and obs is its flat content, so
it is a near-zero-copy re-view -- each series carries its scalar identity
metadata plus jagged time/value/flag fields, no NaN fill, with per-series ops
vectorized across the whole collection (e.g. ak.mean(arr.value, axis=1)).

awkward is NOT a dependency: to_awkward lazy-imports it and raises an
informative ModuleNotFoundError ("pip install awkward") when absent. Object
columns route through ak.from_iter (NaN -> missing) so flags become a clean
option[string]; numeric/datetime content stays numpy.

Adds offline tests (importorskip awkward; the missing-dep error is tested by
simulating the absent import) and a demo note.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 dataretrieval/waterdata/xarray.py | 58 +++++++++++++++++++++++++++++++
 demos/waterdata_xarray_demo.ipynb | 23 ++++++++++++
 tests/waterdata_xarray_test.py    | 47 +++++++++++++++++++++++++
 3 files changed, 128 insertions(+)

diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
index 867d1ab9..7f4b9248 100644
--- a/dataretrieval/waterdata/xarray.py
+++ b/dataretrieval/waterdata/xarray.py
@@ -84,6 +84,7 @@
 __all__ = [
     "clear_metadata_cache",
     "select_series",
+    "to_awkward",
     "get_continuous",
     "get_daily",
     "get_field_measurements",
@@ -593,6 +594,63 @@ def select_series(ds, **keys):
     return series.drop_vars("row_size", errors="ignore")
 
 
+def to_awkward(ds):
+    """Convert a ragged (``dense=False``) Dataset to a per-series ``awkward.Array``.
+
+    The CF contiguous-ragged layout (``row_size`` offsets + a flat ``obs``
+    dimension) is structurally identical to awkward's jagged ``ListOffsetArray``,
+    so this is a near-zero-copy re-view: each timeseries instance becomes one
+    record carrying its per-series identity/metadata (scalar fields such as
+    ``monitoring_location_id`` / ``parameter_code`` / ``longitude``) plus its
+    observations as variable-length jagged fields (``time`` / ``value`` / flags).
+    No NaN fill, each series on its own time axis -- per-series operations then
+    vectorize across every series at once, e.g. ``ak.mean(arr.value, axis=1)``::
+
+        ds = wdx.get_daily(..., dense=False)
+        arr = wdx.to_awkward(ds)
+        ak.mean(arr.value, axis=1)  # per-series means
+        arr[arr.parameter_code == "00060"]  # filter series by metadata
+
+    ``awkward`` is an optional dependency that is *not* installed with
+    ``dataretrieval``; install it separately (``pip install awkward``).
+    """
+    try:
+        import awkward as ak
+    except ModuleNotFoundError as exc:  # pragma: no cover - exercised only sans awkward
+        raise ModuleNotFoundError(
+            "to_awkward requires the optional 'awkward' dependency, which is not "
+            "installed with dataretrieval. Install it with:  pip install awkward"
+        ) from exc
+    if "row_size" not in ds.variables or "obs" not in ds.dims:
+        raise ValueError(
+            "to_awkward expects a ragged Dataset (from dense=False); the default "
+            "dense Dataset is already a (monitoring_location_id, time) grid."
+        )
+    counts = ds["row_size"].to_numpy()
+
+    def _content(values):
+        # awkward rejects numpy object dtype; route string/None columns through
+        # from_iter (normalizing NaN -> missing), and keep numeric/datetime
+        # content as numpy -- that part is the zero-copy re-view.
+        if values.dtype == object:
+            return ak.from_iter([_none_if_nan(v) for v in values.tolist()])
+        return values
+
+    # Per-series (timeseries-dim) coords -> scalar record fields; obs-dim
+    # variables/coords -> jagged fields (unflattened by row_size). ``row_size``
+    # itself is the offsets, already encoded in the jagged structure.
+    record = {}
+    for name in (*ds.data_vars, *ds.coords):
+        if name == "row_size":
+            continue
+        da = ds[name]
+        if da.dims == ("timeseries",):
+            record[name] = _content(da.to_numpy())
+        elif da.dims == ("obs",):
+            record[name] = ak.unflatten(_content(da.to_numpy()), counts)
+    return ak.zip(record, depth_limit=1)
+
+
 # === column schemas ========================================================
 
 # Water-quality samples (Samples DB / WQX) speak a different column vocabulary
diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb
index 55e9fcea..641ec498 100644
--- a/demos/waterdata_xarray_demo.ipynb
+++ b/demos/waterdata_xarray_demo.ipynb
@@ -271,6 +271,29 @@
     "regroup/decode the whole dataset for you."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The whole collection at once: `to_awkward`\n",
+    "\n",
+    "For analysis across *all* series at once, convert the ragged dataset to an\n",
+    "[awkward](https://awkward-array.org/) array. The contiguous-ragged layout\n",
+    "(`row_size` offsets + a flat `obs` axis) *is* awkward's jagged layout, so this is\n",
+    "a near-zero-copy re-view: each series becomes one record (its metadata as scalar\n",
+    "fields, its observations jagged), and per-series operations vectorize with no NaN\n",
+    "fill. `awkward` is an optional dependency that is *not* installed with\n",
+    "`dataretrieval` (`pip install awkward`):\n",
+    "\n",
+    "```python\n",
+    "import awkward as ak\n",
+    "\n",
+    "arr = wdx.to_awkward(ragged)        # one record per series\n",
+    "ak.mean(arr.value, axis=1)          # per-series means, all at once\n",
+    "arr[arr.parameter_code == \"00060\"]  # filter series by metadata\n",
+    "```"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py
index 4a5cc5d2..5293f60c 100644
--- a/tests/waterdata_xarray_test.py
+++ b/tests/waterdata_xarray_test.py
@@ -722,6 +722,53 @@ def test_select_series_on_dense_raises_helpful_error():
         wdx.select_series(dense, monitoring_location_id="USGS-1")
 
 
+def test_to_awkward_missing_dependency_raises_informative(monkeypatch):
+    # awkward is NOT a dependency; calling to_awkward without it must raise a
+    # clear, actionable error rather than a bare ImportError. (Simulated so the
+    # test holds whether or not awkward happens to be installed.)
+    import builtins
+
+    real_import = builtins.__import__
+
+    def fake_import(name, *args, **kwargs):
+        if name == "awkward":
+            raise ModuleNotFoundError("No module named 'awkward'")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", fake_import)
+    ds = wdx._build_ragged(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    with pytest.raises(ModuleNotFoundError, match="pip install awkward"):
+        wdx.to_awkward(ds)
+
+
+def test_to_awkward_converts_ragged_to_jagged_records():
+    ak = pytest.importorskip("awkward")
+    ds = _two_instance_ragged()  # two series at USGS-1: 00060 and 00010
+    arr = wdx.to_awkward(ds)
+    assert len(arr) == ds.sizes["timeseries"]  # one record per series
+    # scalar identity fields + jagged observation fields
+    assert {"monitoring_location_id", "parameter_code", "value", "time"} <= set(
+        arr.fields
+    )
+    # faithful: per-series lengths == row_size, total obs preserved, no fill
+    assert ak.num(arr.value).tolist() == ds["row_size"].values.tolist()
+    assert int(ak.sum(ak.num(arr.value))) == ds.sizes["obs"]
+    # per-series reductions vectorize across all series at once
+    means = ak.mean(arr.value, axis=1)
+    assert len(means) == len(arr)
+
+
+def test_to_awkward_on_dense_raises():
+    pytest.importorskip("awkward")
+    dense = wdx._build_dense(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    with pytest.raises(ValueError, match="expects a ragged Dataset"):
+        wdx.to_awkward(dense)
+
+
 # --- ragged opt-out wiring --------------------------------------------------
 
 

From b83b9a5725717b19ce795f54907576212168b345 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Fri, 5 Jun 2026 10:44:25 -0500
Subject: [PATCH 6/7] refactor(waterdata.xarray): nest to_awkward observations
 under an `obs` field

Group each series' per-observation fields (time, value, qualifier,
approval_status) into a single jagged `obs` list of records, instead of
several parallel top-level jagged fields. Each series now reads as
"identity metadata (scalar fields) + obs (a list of {time, value, ...}
observations)" -- `arr.obs.value`, `ak.mean(arr.obs.value, axis=1)` -- which
is clearer and the conventional awkward idiom.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 dataretrieval/waterdata/xarray.py | 37 +++++++++++++++++++------------
 demos/waterdata_xarray_demo.ipynb | 15 +++++++------
 tests/waterdata_xarray_test.py    | 14 ++++++------
 3 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
index 7f4b9248..0cc6e254 100644
--- a/dataretrieval/waterdata/xarray.py
+++ b/dataretrieval/waterdata/xarray.py
@@ -599,16 +599,19 @@ def to_awkward(ds):
 
     The CF contiguous-ragged layout (``row_size`` offsets + a flat ``obs``
     dimension) is structurally identical to awkward's jagged ``ListOffsetArray``,
-    so this is a near-zero-copy re-view: each timeseries instance becomes one
-    record carrying its per-series identity/metadata (scalar fields such as
-    ``monitoring_location_id`` / ``parameter_code`` / ``longitude``) plus its
-    observations as variable-length jagged fields (``time`` / ``value`` / flags).
-    No NaN fill, each series on its own time axis -- per-series operations then
-    vectorize across every series at once, e.g. ``ak.mean(arr.value, axis=1)``::
+    so this is a near-zero-copy re-view. Each timeseries instance becomes one
+    record: its per-series identity/metadata as scalar fields
+    (``monitoring_location_id`` / ``parameter_code`` / ``longitude`` / ...) plus
+    a single ``obs`` field holding that series' observations as a
+    variable-length list of ``{time, value, <flags>}`` records. No NaN fill, each
+    series on its own time axis -- per-series operations then vectorize across
+    every series at once, e.g. ``ak.mean(arr.obs.value, axis=1)``::
 
         ds = wdx.get_daily(..., dense=False)
         arr = wdx.to_awkward(ds)
-        ak.mean(arr.value, axis=1)  # per-series means
+        arr[0].monitoring_location_id  # one series' metadata
+        arr.obs.value  # jagged values, all series
+        ak.mean(arr.obs.value, axis=1)  # per-series means
         arr[arr.parameter_code == "00060"]  # filter series by metadata
 
     ``awkward`` is an optional dependency that is *not* installed with
@@ -636,19 +639,25 @@ def _content(values):
             return ak.from_iter([_none_if_nan(v) for v in values.tolist()])
         return values
 
-    # Per-series (timeseries-dim) coords -> scalar record fields; obs-dim
-    # variables/coords -> jagged fields (unflattened by row_size). ``row_size``
-    # itself is the offsets, already encoded in the jagged structure.
-    record = {}
+    # Per-series (timeseries-dim) coords -> scalar identity fields. Obs-dim
+    # variables/coords -> a single jagged ``obs`` field whose elements are the
+    # observation records, so each series is "metadata + a list of observations"
+    # rather than several parallel jagged fields. ``row_size`` is the offsets,
+    # already encoded by the unflatten.
+    scalars, obs_fields = {}, {}
     for name in (*ds.data_vars, *ds.coords):
         if name == "row_size":
             continue
         da = ds[name]
         if da.dims == ("timeseries",):
-            record[name] = _content(da.to_numpy())
+            scalars[name] = _content(da.to_numpy())
         elif da.dims == ("obs",):
-            record[name] = ak.unflatten(_content(da.to_numpy()), counts)
-    return ak.zip(record, depth_limit=1)
+            obs_fields[name] = ak.unflatten(_content(da.to_numpy()), counts)
+    # time first, then value, then any flags, for a readable observation record.
+    order = [n for n in ("time", "value") if n in obs_fields]
+    order += [n for n in obs_fields if n not in order]
+    obs = ak.zip({n: obs_fields[n] for n in order})
+    return ak.zip({**scalars, "obs": obs}, depth_limit=1)
 
 
 # === column schemas ========================================================
diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb
index 641ec498..a0775e1b 100644
--- a/demos/waterdata_xarray_demo.ipynb
+++ b/demos/waterdata_xarray_demo.ipynb
@@ -280,17 +280,18 @@
     "For analysis across *all* series at once, convert the ragged dataset to an\n",
     "[awkward](https://awkward-array.org/) array. The contiguous-ragged layout\n",
     "(`row_size` offsets + a flat `obs` axis) *is* awkward's jagged layout, so this is\n",
-    "a near-zero-copy re-view: each series becomes one record (its metadata as scalar\n",
-    "fields, its observations jagged), and per-series operations vectorize with no NaN\n",
-    "fill. `awkward` is an optional dependency that is *not* installed with\n",
-    "`dataretrieval` (`pip install awkward`):\n",
+    "a near-zero-copy re-view: each series becomes one record — its identity as scalar\n",
+    "fields plus an `obs` list of `{time, value, …}` observations — so per-series\n",
+    "operations vectorize with no NaN fill. `awkward` is an optional dependency that\n",
+    "is *not* installed with `dataretrieval` (`pip install awkward`):\n",
     "\n",
     "```python\n",
     "import awkward as ak\n",
     "\n",
-    "arr = wdx.to_awkward(ragged)        # one record per series\n",
-    "ak.mean(arr.value, axis=1)          # per-series means, all at once\n",
-    "arr[arr.parameter_code == \"00060\"]  # filter series by metadata\n",
+    "arr = wdx.to_awkward(ragged)            # one record per series\n",
+    "arr[0].parameter_code                   # a series' metadata\n",
+    "ak.mean(arr.obs.value, axis=1)          # per-series means, all at once\n",
+    "arr[arr.parameter_code == \"00060\"]      # filter series by metadata\n",
     "```"
    ]
   },
diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py
index 5293f60c..0c96d4dd 100644
--- a/tests/waterdata_xarray_test.py
+++ b/tests/waterdata_xarray_test.py
@@ -748,15 +748,15 @@ def test_to_awkward_converts_ragged_to_jagged_records():
     ds = _two_instance_ragged()  # two series at USGS-1: 00060 and 00010
     arr = wdx.to_awkward(ds)
     assert len(arr) == ds.sizes["timeseries"]  # one record per series
-    # scalar identity fields + jagged observation fields
-    assert {"monitoring_location_id", "parameter_code", "value", "time"} <= set(
-        arr.fields
-    )
+    # per-series identity is scalar fields; observations are nested under ``obs``
+    assert {"monitoring_location_id", "parameter_code", "obs"} <= set(arr.fields)
+    assert {"time", "value"} <= set(arr.obs.fields)
+    assert "value" not in arr.fields  # not a parallel top-level series
     # faithful: per-series lengths == row_size, total obs preserved, no fill
-    assert ak.num(arr.value).tolist() == ds["row_size"].values.tolist()
-    assert int(ak.sum(ak.num(arr.value))) == ds.sizes["obs"]
+    assert ak.num(arr.obs.value).tolist() == ds["row_size"].values.tolist()
+    assert int(ak.sum(ak.num(arr.obs.value))) == ds.sizes["obs"]
     # per-series reductions vectorize across all series at once
-    means = ak.mean(arr.value, axis=1)
+    means = ak.mean(arr.obs.value, axis=1)
     assert len(means) == len(arr)
 
 

From 6f819bcbc1711fcd82ecc5f5a9202d1f491e19ba Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Fri, 5 Jun 2026 10:53:23 -0500
Subject: [PATCH 7/7] refactor(waterdata.xarray): simplify point-coords and
 naming dedup (/simplify)

- _point_coords: merge the explicit-longitude/latitude path and the geometry
  path into one dedup/loop/return scaffold with a swappable per-row extractor,
  and route the lon/lat float coercion through _lonlat instead of a second
  open-coded try/except (kills the duplicated branch).
- _DenseBuilder._disambiguate: replace the `name == base` proxy (a confusing
  stand-in for "no suffix") with an explicit "suffix didn't separate them"
  condition, and use collections.Counter for the base-name counts.

No behavior change; 67 tests pass, both spatial-coordinate paths verified live.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 dataretrieval/waterdata/xarray.py | 50 +++++++++++++++----------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
index 0cc6e254..24e09489 100644
--- a/dataretrieval/waterdata/xarray.py
+++ b/dataretrieval/waterdata/xarray.py
@@ -57,6 +57,7 @@
 import re as _re
 import threading as _threading
 import warnings as _warnings
+from collections import Counter as _Counter
 from collections.abc import Callable
 from dataclasses import dataclass, field, replace
 from functools import wraps as _wraps
@@ -239,34 +240,31 @@ def _point_coords(df, site):
     explicit ``longitude`` / ``latitude`` columns (the Samples profile, mapped via
     :data:`_SAMPLES_RENAME`) -- so every service surfaces station coordinates.
     """
+    # Both sources reduce to a per-row "lonlat-able" value that _lonlat decodes
+    # (a (lon, lat) tuple for the explicit columns, the geometry object
+    # otherwise), so the dedup/loop/coercion scaffolding is shared.
     if {"longitude", "latitude"}.issubset(df.columns):
-        geo = df.dropna(subset=["longitude", "latitude"]).drop_duplicates(site)
-        if geo.empty:
-            return None
-        lon, lat = {}, {}
-        for site_id, x, y in zip(
-            geo[site].to_numpy(),
-            geo["longitude"].to_numpy(),
-            geo["latitude"].to_numpy(),
-        ):
-            try:
-                lon[site_id], lat[site_id] = float(x), float(y)
-            except (TypeError, ValueError):
-                continue
-        return (lon, lat) if lon else None
-    if "geometry" not in df.columns:
+        subset = ["longitude", "latitude"]
+
+        def _geoms(g):
+            return list(zip(g["longitude"].to_numpy(), g["latitude"].to_numpy()))
+    elif "geometry" in df.columns:
+        subset = ["geometry"]
+
+        def _geoms(g):
+            return g["geometry"].to_numpy()
+    else:
         return None
-    geo = df.dropna(subset=["geometry"]).drop_duplicates(site)
+
+    geo = df.dropna(subset=subset).drop_duplicates(site)
     if geo.empty:
         return None
     lon, lat = {}, {}
-    for site_id, geom in zip(geo[site].to_numpy(), geo["geometry"].to_numpy()):
-        xy = _lonlat(geom)
+    for site_id, geom in zip(geo[site].to_numpy(), _geoms(geo)):
+        xy = _lonlat(geom)  # skips non-point / unparseable rather than guessing
         if xy is not None:
             lon[site_id], lat[site_id] = xy
-    if not lon:
-        return None  # no point geometry; skip rather than guess
-    return lon, lat
+    return (lon, lat) if lon else None
 
 
 def _prepare_values(df, group_cols, ancillary_cols):
@@ -1087,20 +1085,20 @@ def _disambiguate(bases, keys):
         back to the statistic id then the parameter code -- so a bare name never
         silently refers to an arbitrary one of several same-named series.
         """
-        counts: dict[str, int] = {}
-        for b in bases:
-            counts[b] = counts.get(b, 0) + 1
+        counts = _Counter(bases)
         names, used = [], set()
         for base, (pcode, stat) in zip(bases, keys):
             if counts[base] == 1:
                 name = base
             else:
+                # statistic cell-method (or raw id); if that doesn't yield a
+                # fresh name, fall back to the parameter code.
                 op = CF_CELL_METHODS.get(str(stat)) if stat is not None else None
                 suffix = op or (str(stat) if stat is not None else None)
                 name = f"{base}_{_slug(suffix)}" if suffix else base
-                if name == base or name in used:  # statistic didn't separate them
+                if name in used or suffix is None:
                     name = f"{base}_{_slug(pcode)}" if pcode is not None else base
-            while name in used:
+            while name in used:  # final guard: append until unique
                 name += "_x"
             used.add(name)
             names.append(name)