From 20daa0020b92325e35200cdf74b42e82c72b7b2c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 29 May 2026 09:01:18 -0400 Subject: [PATCH 1/7] feat(waterdata): add waterdata.xarray module returning CF datasets Add dataretrieval.waterdata.xarray, optional-dependency wrappers that mirror the Water Data time-series getters but return CF-conventions xarray.Dataset objects instead of bare DataFrames. - Ragged (CF contiguous ragged array) layout by default; pass dense=True for the NaN-filled (monitoring_location_id, time) grid with one named variable per parameter. - CF metadata is derived from columns the getters already return (unit_of_measure -> units, statistic_id -> cell_methods, parameter_code -> standard_name/vertical_datum), plus a cached parameter-name lookup; sites carry cf_role=timeseries_id with lon/lat. - Coverage: get_daily, get_continuous, get_latest_continuous, get_latest_daily, get_nearest_continuous, get_peaks, get_field_measurements, get_samples, and preliminary get_stats_por / get_stats_date_range. - xarray is an optional extra (pip install dataretrieval[xarray]); the core package never imports it. Hash-valued ID columns are dropped inside the xarray builders, so the plain getters are left untouched. CF vocabulary maps live in waterdata.types (xarray-free, plain data). Adds a demo notebook + docs entry and offline converter unit tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- dataretrieval/waterdata/types.py | 59 + dataretrieval/waterdata/xarray.py | 1081 +++++++++++++++++ demos/waterdata_xarray_demo.ipynb | 382 ++++++ docs/source/examples/index.rst | 12 + .../examples/waterdata_xarray_demo.nblink | 3 + pyproject.toml | 8 + tests/waterdata_xarray_test.py | 803 ++++++++++++ 7 files changed, 2348 insertions(+) create mode 100644 dataretrieval/waterdata/xarray.py create mode 100644 demos/waterdata_xarray_demo.ipynb create mode 100644 docs/source/examples/waterdata_xarray_demo.nblink create mode 100644 tests/waterdata_xarray_test.py diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py index f5e1496b..93b8d19b 100644 --- a/dataretrieval/waterdata/types.py +++ b/dataretrieval/waterdata/types.py @@ -74,3 +74,62 @@ "count", ], } + + +# --- CF / xarray vocabulary mappings --------------------------------------- +# Lookup tables used by :mod:`dataretrieval.waterdata.xarray` to translate +# USGS terms into CF-conventions metadata. Each is intentionally partial: +# anything not listed falls back to a sensible default (raw unit string kept +# verbatim; no standard_name emitted) rather than guessing a wrong CF term. +# They are plain data, so they live here rather than in the (xarray-optional) +# converter module and can be extended without importing xarray. + +# USGS unit strings -> UDUNITS / CF-canonical form. +CF_UNIT_MAP = { + "ft^3/s": "ft3 s-1", + "ft3/s": "ft3 s-1", + "ft": "ft", + "in": "in", + "degC": "degC", + "deg C": "degC", + "uS/cm": "uS/cm", + "mg/l": "mg L-1", + "mg/L": "mg L-1", + # UDUNITS 'ton' is the US short ton; 'short_ton' is not a valid UDUNITS name. + "tons/day": "ton day-1", + "%": "percent", +} + +# USGS statistic_id -> the operator in a CF ``cell_methods`` string. +CF_CELL_METHODS = { + "00001": "maximum", + "00002": "minimum", + "00003": "mean", + "00006": "sum", + "00008": "median", + "00011": "point", # instantaneous +} + +# USGS 5-digit parameter code -> CF standard_name. Deliberately conservative; +# codes without a confident match are left without a standard_name. +CF_STANDARD_NAMES = { + "00060": "water_volume_transport_in_river_channel", + # 00010 (water temperature) is intentionally omitted: ``water_temperature`` + # is NOT a CF standard name, and the only valid CF water-temperature name, + # ``sea_water_temperature``, is wrong-domain for USGS freshwater/groundwater. + # Leaving it unmapped keeps the variable's ``long_name`` without emitting an + # invalid or misleading ``standard_name``. + "00065": "water_surface_height_above_reference_datum", + "63160": "water_surface_height_above_reference_datum", + "00045": "lwe_thickness_of_precipitation_amount", +} + +# USGS parameter code -> vertical reference datum, attached as a +# ``vertical_datum`` attribute. The two water-surface-height parameters share +# the CF standard_name water_surface_height_above_reference_datum, so the datum +# distinguishes them: gage height (00065) is measured from a local site (gage) +# datum, while stream water level (63160) is referenced to NAVD88. +CF_VERTICAL_DATUM = { + "00065": "local site datum", + "63160": "NAVD88", +} diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py new file mode 100644 index 00000000..0eef2c2e --- /dev/null +++ b/dataretrieval/waterdata/xarray.py @@ -0,0 +1,1081 @@ +"""xarray-returning wrappers for the Water Data getters. + +Each public function here mirrors the same-named function in +:mod:`dataretrieval.waterdata`, but returns a CF-conventions +:class:`xarray.Dataset` instead of a :class:`pandas.DataFrame`. + +By default the data is returned as a CF *contiguous ragged array* +(``featureType = "timeSeries"``): every observation is concatenated along a +single ``obs`` dimension, and each (monitoring location, parameter, +statistic) series is one instance along a ``timeseries`` dimension whose +``row_size`` records how many observations it contributes. This stores only +real observations -- no NaN fill -- so it scales to large, very ragged +multi-site pulls where record lengths differ by decades. Parameter, +statistic, unit, and location identity become per-instance metadata, and +``time`` is a 1-D coordinate along ``obs``. The trade-off is that ``time`` +is no longer a dimension you can index directly: to select by time you +first regroup the flat ``obs`` back into per-series views (e.g. with +``cf-xarray``, or via the offsets implied by ``row_size``). + +Pass ``dense=True`` for the alternative gridded layout: one named data +variable per parameter on a ``(monitoring_location_id, time)`` grid, NaN +where a series has no observation. This is ergonomic for a few overlapping +series (``ds["discharge"].sel(time=...)`` just works) but the union time +axis and NaN fill make it memory-costly for large ragged collections. + +Either way the CF metadata is derived from columns the getter already +returns (``unit_of_measure`` -> ``units``, ``statistic_id`` -> +``cell_methods``, ``parameter_code`` -> ``standard_name``), plus the +human-readable parameter name from a small cached metadata lookup. The +timeseries identity carries ``cf_role = "timeseries_id"`` -- the synthesized +``timeseries_id`` coordinate when ragged, ``monitoring_location_id`` when +dense -- each site has ``longitude`` / ``latitude`` (and +``hydrologic_unit_code`` / ``state_name`` when the metadata call already +provides them), and dataset attributes carry ``Conventions``, provenance, +the request URL, and ``date_modified``. + +Internally the conversion is organized around three small object roles: + +* a :class:`_Schema` value object describes a service's column dialect; +* a :class:`_DatasetBuilder` strategy turns a values frame into a Dataset -- + the time-series :class:`_RaggedBuilder` / :class:`_DenseBuilder` (sharing a + :class:`_SeriesBuilder` base) and the flat :class:`_StatsBuilder`; +* a :class:`_MetadataCache` memoizes the supplementary metadata lookup. + +A :class:`_Service` record wires those together for each public getter. + +This module requires the optional ``xarray`` dependency:: + + pip install dataretrieval[xarray] +""" + +from __future__ import annotations + +import datetime as _dt +import inspect as _inspect +import re as _re +import threading as _threading +import warnings as _warnings +from collections.abc import Callable +from dataclasses import dataclass, field, replace +from functools import wraps as _wraps + +import pandas as _pd + +try: + import xarray as _xr +except ModuleNotFoundError as _exc: # pragma: no cover - exercised only sans xarray + raise ModuleNotFoundError( + "dataretrieval.waterdata.xarray requires the optional 'xarray' " + "dependency. Install it with: pip install dataretrieval[xarray]" + ) from _exc + +from . import api as _api +from .nearest import get_nearest_continuous as _get_nearest_continuous +from .types import ( + CF_CELL_METHODS, + CF_STANDARD_NAMES, + CF_UNIT_MAP, + CF_VERTICAL_DATUM, +) + +__all__ = [ + "clear_metadata_cache", + "get_continuous", + "get_daily", + "get_field_measurements", + "get_latest_continuous", + "get_latest_daily", + "get_nearest_continuous", + "get_peaks", + "get_samples", + "get_stats_date_range", + "get_stats_por", +] + + +# === constants ============================================================= + +# The CF vocabulary lookups (USGS units -> UDUNITS, statistic_id -> +# cell_methods operator, parameter_code -> standard_name) are plain data and +# live in ``types`` -- imported as CF_UNIT_MAP / CF_CELL_METHODS / +# CF_STANDARD_NAMES at the top of this module. + +# Columns kept off the value pivot but surfaced as ancillary (flag) variables. +_ANCILLARY = ("qualifier", "approval_status") + +# The monitoring-location ("site") column. A site is the DSG location; a +# timeseries *instance* is a site plus its group columns (parameter/statistic), +# so one site can host several instances. +_SITE = "monitoring_location_id" + +# Only the human-readable name is sourced from the metadata endpoint; units, +# statistic, and parameter code all come from the values frame itself. +_NAME_DESCRIPTORS = ("parameter_name", "parameter_description") + +# Per-site descriptors that ride along on the same metadata call (no extra +# request) and are surfaced as per-site auxiliary coordinates. Only +# fields the endpoint already returns are used -- station name/altitude live on +# the monitoring-locations endpoint and would need a separate call, so they are +# intentionally not fetched here. +_SITE_DESCRIPTORS = ("hydrologic_unit_code", "state_name") + +# CF attributes for the per-site coordinates built from the descriptors above. +_SITE_COORD_ATTRS = { + "hydrologic_unit_code": {"long_name": "hydrologic unit code (HUC)"}, + "state_name": {"long_name": "state name"}, +} + +# Upper bound on the per-process metadata cache (see :class:`_MetadataCache`). +_CACHE_MAXSIZE = 4096 + + +# === stateless helpers ===================================================== + + +def _slug(name) -> str: + """A lower_snake_case, identifier-safe variable name.""" + s = _re.sub(r"[^0-9a-zA-Z]+", "_", str(name).strip().lower()).strip("_") + return s or "value" + + +def _first(series): + """First non-null value of a column, or None.""" + nonnull = series.dropna() + return nonnull.iloc[0] if len(nonnull) else None + + +def _first_present(frame, col): + """First non-null value of ``col`` if the column is present, else None.""" + return _first(frame[col]) if col in frame else None + + +def _unique_present(frame, col): + """Distinct non-null values of ``col`` if present, else an empty list.""" + return frame[col].dropna().unique() if col in frame else [] + + +def _sites(df): + """Unique monitoring-location ids present in a values frame.""" + if _SITE in df: + return df[_SITE].unique() + return [] + + +def _date_modified(df): + """Most recent upstream record-refresh time in the frame, ISO-8601 or None. + + ``last_modified`` reflects when each record was last refreshed in the USGS + database; the maximum is surfaced as the dataset's ``date_modified`` (ACDD) + so a reader knows how current the pull is. + """ + if df is None or "last_modified" not in getattr(df, "columns", ()): + return None + ts = _pd.to_datetime(df["last_modified"], errors="coerce", utc=True).dropna() + return ts.max().isoformat() if len(ts) else None + + +def _lonlat(geom): + """``(lon, lat)`` from a geometry, or None if it isn't a point. + + Geometry comes back as a shapely ``Point`` when geopandas is installed but + as a plain ``[lon, lat]`` list when it is not (the OGC GeoJSON coordinates + are flattened into a list). Handle both so the spatial coordinates survive + either way; anything else (a polygon, a malformed cell) is skipped rather + than guessed. + """ + x, y = getattr(geom, "x", None), getattr(geom, "y", None) + if x is not None and y is not None: + return x, y + if isinstance(geom, (list, tuple)) and len(geom) >= 2: + try: + return float(geom[0]), float(geom[1]) + except (TypeError, ValueError): + return None + return None + + +def _point_coords(df, site): + """lon/lat dicts keyed by site from point geometry, or None.""" + if "geometry" not in df.columns: + return None + geo = df.dropna(subset=["geometry"]).drop_duplicates(site) + if geo.empty: + return None + lon, lat = {}, {} + for _, row in geo.iterrows(): + xy = _lonlat(row["geometry"]) + if xy is not None: + lon[row[site]], lat[row[site]] = xy + if not lon: + return None # no point geometry; skip rather than guess + return lon, lat + + +def _prepare_values(df, group_cols, ancillary_cols): + """Slim the frame and coerce types, shared by the dense / ragged builders. + + Keeps only the columns we convert, parses ``time`` to naive-UTC + ``datetime64`` (xarray has no tz dtype), coerces ``value`` to numeric, and + drops rows whose ``time`` is unparseable/missing (with a warning, so + observations are never lost without a trace). Returns + ``(work, group_cols, ancillary, has_unit)`` filtered to columns present. + """ + group_cols = [c for c in group_cols if c in df.columns] + ancillary = [c for c in ancillary_cols if c in df.columns] + has_unit = "unit_of_measure" in df.columns + cols = [_SITE, "time", "value", *group_cols, *ancillary] + if has_unit: + cols.append("unit_of_measure") + present = [c for c in dict.fromkeys(cols) if c in df.columns] + work = df.loc[:, present].copy() + # Instance id, time, and value are mandatory to build a series. A response + # that lacks any of them (e.g. a non-result Samples profile) has nothing to + # convert, so return an empty frame -> empty Dataset rather than KeyError. + if not {_SITE, "time", "value"}.issubset(work.columns): + return work.iloc[0:0], group_cols, ancillary, has_unit + # ``format="ISO8601"`` matches the Water Data API timestamps and avoids + # pandas' slow per-element ``dateutil`` fallback (and its warning) on large + # frames; the tz is normalized to UTC then dropped (xarray has no tz dtype). + work["time"] = _pd.to_datetime( + work["time"], format="ISO8601", errors="coerce", utc=True + ).dt.tz_localize(None) + work["value"] = _pd.to_numeric(work["value"], errors="coerce") + n_before = len(work) + work = work[work["time"].notna()] + dropped = n_before - len(work) + if dropped: + _warnings.warn( + f"dropped {dropped} row(s) with an unparseable or missing time.", + stacklevel=3, + ) + return work, group_cols, ancillary, has_unit + + +def _cf_units(unit): + """Map a USGS unit string to its CF / UDUNITS form, or pass it through.""" + return CF_UNIT_MAP.get(str(unit), str(unit)) + + +def _var_attrs(desc, *, unit, pcode, stat, default_cell_method): + """Build the CF attribute dict for one data variable. + + ``unit``, ``pcode`` and ``stat`` are read from the values frame; ``desc`` + supplies only the human-readable name from the metadata lookup. Linking the + ``ancillary_variables`` is left to each builder, since the flag-variable + names follow the (layout-specific) data-variable name. + """ + attrs: dict[str, str] = {} + long_name = desc.get("parameter_description") or desc.get("parameter_name") + if long_name and _pd.notna(long_name): + attrs["long_name"] = str(long_name) + + if unit is not None and _pd.notna(unit): + attrs["units"] = _cf_units(unit) + + op = ( + CF_CELL_METHODS.get(str(stat)) if stat is not None and _pd.notna(stat) else None + ) + op = op or default_cell_method + if op: + attrs["cell_methods"] = f"time: {op}" + + if pcode is not None and _pd.notna(pcode): + sn = CF_STANDARD_NAMES.get(str(pcode)) + if sn: + attrs["standard_name"] = sn + datum = CF_VERTICAL_DATUM.get(str(pcode)) + if datum: + attrs["vertical_datum"] = datum + attrs["usgs_parameter_code"] = str(pcode) + + if stat is not None and _pd.notna(stat): + attrs["usgs_statistic_id"] = str(stat) + return attrs + + +def _dataset_attrs(service, base_meta): + """Dataset-level provenance (CF + ACDD) attributes.""" + attrs = { + "Conventions": "CF-1.11", + "institution": "U.S. Geological Survey", + "source": f"USGS Water Data API ({service})", + "featureType": "timeSeries", + "history": ( + f"{_dt.datetime.now(_dt.timezone.utc).isoformat(timespec='seconds')} " + "created by dataretrieval.waterdata.xarray" + ), + } + url = getattr(base_meta, "url", None) + if url: + attrs["references"] = str(url) + return attrs + + +def _empty_dataset(service, base_meta): + """An attribute-only Dataset for an empty / unconvertible response.""" + ds = _xr.Dataset() + ds.attrs = _dataset_attrs(service, base_meta) + return ds + + +# === metadata cache ======================================================== + + +class _MetadataCache: + """Per-process, bounded, thread-safe cache of supplementary metadata. + + Keyed by monitoring location, this memoizes the human-readable parameter + name and a couple of site descriptors that the values getters don't return. + ``getter`` is called as ``getter(monitoring_location_id=[...])`` and must + return ``(DataFrame, metadata)``; one fetch is issued per not-yet-cached + batch of sites and reused thereafter. + + The metadata is supplementary, so a fetch failure degrades to "no metadata" + with a warning rather than discarding the already-retrieved observations, + and the failed sites are left uncached so a later, recovered call retries + them. The cache is bounded (oldest-inserted entries are evicted past + ``maxsize``) and its mutation is serialized, so a long-running, many-site, + multi-threaded process stays both correct and bounded. + """ + + def __init__(self, getter, *, maxsize=_CACHE_MAXSIZE): + self._getter = getter + self._maxsize = maxsize + self._entries: dict[str, dict] = {} + self._lock = _threading.Lock() + + def lookup(self, site_ids): + """``(param_meta, site_meta)`` for ``site_ids``, fetching any misses. + + ``param_meta`` is ``{parameter_code: {name descriptors}}`` (keyed by the + stable parameter code, so no hash id is needed) and ``site_meta`` is + ``{monitoring_location_id: {site descriptors present in the response}}``. + """ + sites = sorted({str(s) for s in site_ids if _pd.notna(s)}) + # Racy read of the keys is fine: a concurrent miss just re-fetches (the + # fetch is idempotent); only the writes in _ingest take the lock. + todo = [s for s in sites if s not in self._entries] + if todo: + try: + meta, _ = self._getter(monitoring_location_id=todo) + except Exception as exc: # supplementary lookup; never lose the data + _warnings.warn( + f"metadata lookup failed ({exc!r}); returning the dataset " + "without parameter names / site descriptors.", + stacklevel=2, + ) + else: + self._ingest(meta, todo) + param_meta: dict[str, dict] = {} + site_meta: dict[str, dict] = {} + with self._lock: + for s in sites: + entry = self._entries.get(s, {}) + param_meta.update(entry.get("params", {})) + if entry.get("site"): + site_meta[s] = entry["site"] + return param_meta, site_meta + + def clear(self): + """Empty the cache (release memory / force a refresh).""" + with self._lock: + self._entries.clear() + + def __len__(self): + return len(self._entries) + + def _ingest(self, meta, todo): + """Parse ``meta`` into per-site entries, then merge + evict under lock. + + The parsing runs lock-free on a local dict; only the (cheap) merge into + the shared cache and the FIFO eviction past ``maxsize`` hold the lock. + """ + fresh = {s: {"params": {}, "site": {}} for s in todo} + if not meta.empty: + name_cols = [c for c in _NAME_DESCRIPTORS if c in meta.columns] + site_cols = [c for c in _SITE_DESCRIPTORS if c in meta.columns] + has_pcode = "parameter_code" in meta.columns + # to_dict("records") is markedly faster than iterrows() (no per-row + # Series boxing) and the dict access below is identical. + for row in meta.to_dict("records"): + site = row.get("monitoring_location_id") + if site not in fresh: + continue + if has_pcode: + pcode = row.get("parameter_code") + if _pd.notna(pcode): + fresh[site]["params"][str(pcode)] = { + c: row.get(c) for c in name_cols + } + if not fresh[site]["site"]: + desc = {c: row.get(c) for c in site_cols if _pd.notna(row.get(c))} + if desc: + fresh[site]["site"] = desc + with self._lock: + self._entries.update(fresh) + while len(self._entries) > self._maxsize: + self._entries.pop(next(iter(self._entries))) + + +# One cache per metadata endpoint, shared across getters for the process life. +_TS_CACHE = _MetadataCache(_api.get_time_series_metadata) +_FIELD_CACHE = _MetadataCache(_api.get_field_measurements_metadata) + + +def clear_metadata_cache(): + """Empty the per-process metadata caches (parameter names, site descriptors). + + The xarray getters cache the supplementary metadata lookup per monitoring + location for the life of the process. Long-running services can call this to + release memory or to force a refresh after the upstream metadata changed. + """ + _TS_CACHE.clear() + _FIELD_CACHE.clear() + + +# === column schemas ======================================================== + +# Water-quality samples (Samples DB / WQX) speak a different column vocabulary +# than the time-series getters; map their tidy backbone onto the canonical +# names the builders understand. +_SAMPLES_RENAME = { + "Location_Identifier": _SITE, + "Activity_StartDateTime": "time", + "Result_Measure": "value", + "Result_MeasureUnit": "unit_of_measure", + "Result_Characteristic": "characteristic", + "Result_SampleFraction": "sample_fraction", + "Result_ResultDetectionCondition": "detection_condition", + "Result_MeasureStatusIdentifier": "status", +} +_CANONICAL_COORD_ATTRS = { + "parameter_code": {"long_name": "USGS parameter code"}, + "statistic_id": {"long_name": "USGS statistic code"}, + "unit_of_measure": {"long_name": "unit of measurement"}, +} +_SAMPLES_COORD_ATTRS = { + "characteristic": {"long_name": "characteristic name"}, + "sample_fraction": {"long_name": "sample fraction"}, + "unit_of_measure": {"long_name": "unit of measurement"}, + "detection_condition": {"long_name": "result detection condition"}, + "status": {"long_name": "result status"}, +} + + +@dataclass(frozen=True) +class _Schema: + """How a service's columns map onto the canonical builder vocabulary. + + One object describes a column dialect so a single set of builders serves + both the time-series getters and the (differently-named) Samples data: + + * ``rename`` -- source -> canonical column names (empty for the getters + that already use the canonical names); + * ``group_cols`` -- columns that identify one series (an instance); + * ``ancillary`` -- per-observation flag columns carried alongside ``value``; + * ``label_col`` -- the column whose value names the variable / ``long_name`` + (``parameter_code`` for the getters, ``characteristic`` for samples); + * ``infer_standard_name`` -- whether ``label_col`` is a USGS parameter code + eligible for a CF ``standard_name`` lookup (False for free-text + characteristics); + * ``coord_attrs`` -- ``long_name`` etc. for the per-instance metadata vars. + """ + + rename: dict = field(default_factory=dict) + group_cols: tuple = ("parameter_code", "statistic_id") + ancillary: tuple = _ANCILLARY + label_col: str = "parameter_code" + infer_standard_name: bool = True + coord_attrs: dict = field(default_factory=dict) + + +_CANONICAL = _Schema(coord_attrs=_CANONICAL_COORD_ATTRS) +_FIELD = replace(_CANONICAL, group_cols=("parameter_code",)) +_SAMPLES = _Schema( + rename=_SAMPLES_RENAME, + group_cols=("characteristic", "sample_fraction"), + ancillary=("detection_condition", "status"), + label_col="characteristic", + infer_standard_name=False, + coord_attrs=_SAMPLES_COORD_ATTRS, +) + + +# === dataset builders ====================================================== + + +class _DatasetBuilder: + """Strategy base: convert one values frame into a CF ``xarray.Dataset``. + + Holds only what every layout needs -- the frame, the request metadata, and + the dataset-level provenance scaffolding -- and leaves :meth:`build` to the + subclass. The series layouts share more state and live under + :class:`_SeriesBuilder`; :class:`_StatsBuilder` extends this base directly. + Construct via the :func:`_build_ragged` / :func:`_build_dense` / + :func:`_build_stats` helpers rather than directly. + """ + + def __init__(self, df, base_meta, *, service): + self.df = df + self.base_meta = base_meta + self.service = service + + def build(self): + raise NotImplementedError + + def _is_empty(self): + return self.df is None or len(self.df) == 0 + + def _empty(self): + return _empty_dataset(self.service, self.base_meta) + + def _apply_provenance(self, ds): + """Set the dataset-level CF/ACDD attributes and ``date_modified``.""" + ds.attrs = _dataset_attrs(self.service, self.base_meta) + dm = _date_modified(self.df) + if dm: + ds.attrs["date_modified"] = dm + return ds + + +class _SeriesBuilder(_DatasetBuilder): + """Base for the time-series layouts (ragged / dense). + + Adds the column ``schema``, the metadata lookups, and the prepare / + spatial-coordinate scaffolding the two series layouts share. :meth:`build` + is a template: it renames the dialect, slims and type-checks the frame, + short-circuits an empty result, and hands the cleaned frame to the subclass + via :meth:`_build_series`. + """ + + def __init__( + self, + df, + base_meta, + *, + service, + schema=_CANONICAL, + series_meta=None, + site_meta=None, + default_cell_method=None, + ): + # Rename the source dialect onto the canonical names once, up front, so + # every downstream step speaks one vocabulary. + if df is not None and schema.rename: + df = df.rename(columns=schema.rename) + super().__init__(df, base_meta, service=service) + self.schema = schema + self.series_meta = series_meta or {} + self.site_meta = site_meta or {} + self.default_cell_method = default_cell_method + + def build(self): + if self._is_empty(): + return self._empty() + work, group_cols, ancillary, has_unit = _prepare_values( + self.df, self.schema.group_cols, self.schema.ancillary + ) + if work.empty: + return self._empty() + return self._build_series(work, group_cols, ancillary, has_unit) + + def _build_series(self, work, group_cols, ancillary, has_unit): + raise NotImplementedError + + def _add_spatial_coords(self, ds, dim, order): + """Attach lon/lat and per-site descriptor coords along ``dim``. + + ``order`` is the list of site ids in ``dim`` order (one per instance), + so the same logic serves both the dense (the site dimension) and ragged + (the timeseries dimension) layouts. + """ + coords = _point_coords(self.df, _SITE) + if coords is not None: + lon, lat = coords + ds = ds.assign_coords( + longitude=(dim, [lon.get(k) for k in order]), + latitude=(dim, [lat.get(k) for k in order]), + ) + ds["longitude"].attrs = { + "standard_name": "longitude", + "units": "degrees_east", + } + ds["latitude"].attrs = { + "standard_name": "latitude", + "units": "degrees_north", + } + # Instance-indexed site descriptors carried along on the metadata call + # (HUC, state); added only where present so absent fields don't appear. + if self.site_meta: + for col, col_attrs in _SITE_COORD_ATTRS.items(): + vals = [self.site_meta.get(str(k), {}).get(col) for k in order] + if any(v is not None for v in vals): + ds = ds.assign_coords({col: (dim, vals)}) + ds[col].attrs.update(col_attrs) + return ds + + +class _RaggedBuilder(_SeriesBuilder): + """CF *contiguous ragged array* layout (the default). + + All observations are concatenated into one ``obs`` dimension; each series (a + ``schema.group_cols`` combination at a site) is an instance along + ``timeseries`` with ``row_size`` giving its length. Only real observations + are stored (no NaN fill), so this scales to large, very ragged multi-site + pulls. Per-series parameter / statistic / unit are instance coordinates; + descriptors homogeneous across instances are also written onto ``value``. + """ + + def _build_series(self, work, group_cols, ancillary, has_unit): + inst_cols = [_SITE, *group_cols] + ds, inst_frame = self._assemble(work, inst_cols, ancillary, has_unit) + value_attrs = self._value_attrs(ds, inst_frame) + ds = self._finalize(ds, inst_frame, ancillary, value_attrs) + + for col, col_attrs in self.schema.coord_attrs.items(): + if col in ds.variables: + ds[col].attrs.update(col_attrs) + order = inst_frame[_SITE].to_numpy() + return self._add_spatial_coords(ds, "timeseries", order) + + def _assemble(self, work, inst_cols, ancillary, has_unit): + """Cleaned values frame -> CF contiguous-ragged Dataset skeleton. + + Observations are concatenated along a single ``obs`` dimension, sorted + so each instance's rows are contiguous; ``row_size`` records how many + obs each instance contributes (the CF ``sample_dimension`` link). + ``unit_of_measure`` (when present) is carried as one value per instance. + Returns ``(ds, inst_frame)`` where ``inst_frame`` is one row/instance. + """ + inst_first = ["unit_of_measure"] if has_unit else [] + work = work.sort_values([*inst_cols, "time"], kind="stable") + grp = work.groupby(inst_cols, dropna=False, sort=False) + row_size = grp.size() + idx = row_size.index + inst_frame = ( + idx.to_frame(index=False) + if isinstance(idx, _pd.MultiIndex) + else _pd.DataFrame({inst_cols[0]: idx}) + ) + data_vars = { + "value": ("obs", work["value"].to_numpy()), + "row_size": ("timeseries", row_size.to_numpy().astype("int32")), + } + for c in ancillary: + data_vars[c] = ("obs", work[c].to_numpy()) + coords = {"time": ("obs", work["time"].to_numpy())} + for c in inst_cols: + coords[c] = ("timeseries", inst_frame[c].to_numpy()) + for c in inst_first: + coords[c] = ("timeseries", grp[c].first().to_numpy()) + return _xr.Dataset(data_vars, coords=coords), inst_frame + + def _value_attrs(self, ds, inst_frame): + """CF attributes for ``value``. + + Homogeneous descriptors (a single parameter / statistic / unit across + all instances) are written onto ``value``; otherwise ``value`` stays + generic and the per-instance coordinates carry each series' identity. + """ + schema = self.schema + labels = _unique_present(inst_frame, schema.label_col) + stats = _unique_present(inst_frame, "statistic_id") + units = ( + ds["unit_of_measure"].to_series().dropna().unique() + if "unit_of_measure" in ds.coords + else [] + ) + unit = units[0] if len(units) == 1 else None + if len(labels) == 1: + if schema.infer_standard_name: + desc, pcode = self.series_meta.get(str(labels[0]), {}), labels[0] + else: # free-text label (e.g. a characteristic): it *is* the name + desc, pcode = {"parameter_name": str(labels[0])}, None + return _var_attrs( + desc, + unit=unit, + pcode=pcode, + stat=stats[0] if len(stats) == 1 else None, + default_cell_method=self.default_cell_method, + ) + value_attrs = { + "long_name": "measured value", + "comment": ( + "multiple series with differing metadata are stacked here; see " + "the per-timeseries coordinates for each series' identity" + ), + } + if unit is not None: + value_attrs["units"] = _cf_units(unit) + # A service-wide cell method (e.g. samples are instantaneous grabs) + # still applies when the statistic doesn't vary; the time-series getters + # leave per-parameter cell methods to the per-instance coordinates. + if self.default_cell_method and not schema.infer_standard_name: + value_attrs["cell_methods"] = f"time: {self.default_cell_method}" + return value_attrs + + def _finalize(self, ds, inst_frame, ancillary, value_attrs): + """Attach the structural + provenance CF metadata for a ragged Dataset. + + Sets the ``value`` attrs (with ``ancillary_variables`` linked), dataset + provenance, the ``row_size`` ``sample_dimension`` link, and a + per-instance ``timeseries_id`` carrying ``cf_role`` (a site alone isn't + unique once it has several series). + """ + if ancillary: + value_attrs = {**value_attrs, "ancillary_variables": " ".join(ancillary)} + ds["value"].attrs = value_attrs + ds = self._apply_provenance(ds) + ds["time"].attrs.setdefault("standard_name", "time") + ds["row_size"].attrs = { + "long_name": "number of observations per time series", + "sample_dimension": "obs", + } + # Join the instance keys into a cf_role id, skipping null parts so a + # missing key (e.g. a characteristic with no sample fraction) doesn't + # render as a literal "nan" token. + ts_id = inst_frame.apply( + lambda r: ":".join(str(x) for x in r if _pd.notna(x)), axis=1 + ).to_numpy() + ds = ds.assign_coords(timeseries_id=("timeseries", ts_id)) + ds["timeseries_id"].attrs["cf_role"] = "timeseries_id" + ds[_SITE].attrs.setdefault("long_name", "monitoring location identifier") + return ds + + +class _DenseBuilder(_SeriesBuilder): + """Dense ``(monitoring_location_id, time)`` grid: one named variable per + parameter, NaN where a series has no observation. Ergonomic for a few + overlapping series but memory-costly for large ragged collections; see + :class:`_RaggedBuilder` for the default. + """ + + def _build_series(self, work, group_cols, ancillary, has_unit): + # Outer join on time: parameters sampled on different clocks share a + # union time axis, NaN where a given parameter has no observation. + ds = _xr.merge( + self._variable_datasets(work, group_cols, ancillary, has_unit), + combine_attrs="drop_conflicts", + join="outer", + ) + ds = self._apply_provenance(ds) + ds["time"].attrs.setdefault("standard_name", "time") + if _SITE in ds.coords: + ds[_SITE].attrs.setdefault("cf_role", "timeseries_id") + order = list(ds[_SITE].values) if _SITE in ds.coords else [] + return self._add_spatial_coords(ds, _SITE, order) + + def _variable_datasets(self, work, group_cols, ancillary, has_unit): + """One pivoted ``(site, time)`` Dataset per (parameter, statistic).""" + datasets, used = [], set() + for _, group in work.groupby(group_cols, dropna=False): + pcode = _first_present(group, "parameter_code") + stat = _first_present(group, "statistic_id") + group_units = group["unit_of_measure"].dropna().unique() if has_unit else () + unit = group_units[0] if len(group_units) else None + desc = self.series_meta.get(str(pcode), {}) if pcode is not None else {} + + name = self._variable_name(desc, pcode, stat, used) + used.add(name) + + if len(group_units) > 1: + # One variable can carry only one ``units`` attr; surface the + # mix instead of silently labeling every value with the first. + _warnings.warn( + f"'{name}' spans multiple units {list(group_units)}; labeling " + f"with '{unit}'. Filter by site/parameter to avoid mixing " + "units in one variable.", + stacklevel=3, + ) + + sub = group.set_index([_SITE, "time"])[["value", *ancillary]] + if not sub.index.is_unique: + _warnings.warn( + f"'{name}' has multiple values per (site, time) -- two series " + "share this (site, parameter, statistic); keeping the first. " + "Filter the query to separate them.", + stacklevel=3, + ) + sub = sub[~sub.index.duplicated(keep="first")] + ds_g = sub.to_xarray().rename( + {"value": name, **{c: f"{name}_{c}" for c in ancillary}} + ) + ds_g[name].attrs = _var_attrs( + desc, + unit=unit, + pcode=pcode, + stat=stat, + default_cell_method=self.default_cell_method, + ) + if ancillary: + ds_g[name].attrs["ancillary_variables"] = " ".join( + f"{name}_{c}" for c in ancillary + ) + datasets.append(ds_g) + return datasets + + @staticmethod + def _variable_name(desc, pcode, stat, used): + """A unique slug for a variable; disambiguate same-parameter series.""" + name = _slug(desc.get("parameter_name") or pcode or "value") + if name in used: # same parameter, different statistic -> distinct var + op = CF_CELL_METHODS.get(str(stat)) or (str(stat) if stat else None) + name = f"{name}_{_slug(op)}" if op else name + while name in used: + name += "_x" + return name + + +class _StatsBuilder(_DatasetBuilder): + """Best-effort, preliminary conversion of the statistics tables. + + The statistics service returns percentile tables keyed by time-of-year + rather than a (time, value) series, so this produces a flat Dataset (one + variable per column over an ``index`` dimension) with dataset-level + provenance only. A richer percentile / day-of-year layout is future work. + """ + + def build(self): + if self._is_empty(): + return self._empty() + # The series builders surface only the columns they convert, so opaque + # hash IDs never reach those datasets. This flat path keeps every column, + # so drop the stats service's hash-valued IDs (and geometry) here to keep + # the CF dataset free of per-record UUID coordinates. + drop = ("geometry", "computation_id", "parent_time_series_id", "time_series_id") + flat = self.df.drop(columns=[c for c in drop if c in self.df.columns]) + ds = _xr.Dataset.from_dataframe(flat.reset_index(drop=True)) + ds = self._apply_provenance(ds) + ds.attrs["comment"] = "preliminary flat conversion; see module docs" + return ds + + +def _build_ragged(df, base_meta, **kwargs): + """Functional entry point for the ragged layout (see :class:`_RaggedBuilder`).""" + return _RaggedBuilder(df, base_meta, **kwargs).build() + + +def _build_dense(df, base_meta, **kwargs): + """Functional entry point for the dense layout (see :class:`_DenseBuilder`).""" + return _DenseBuilder(df, base_meta, **kwargs).build() + + +def _build_stats(df, base_meta, service): + """Functional entry point for the stats layout (see :class:`_StatsBuilder`).""" + return _StatsBuilder(df, base_meta, service=service).build() + + +# === public getters ======================================================== + + +def _fetch(func, args, kwargs): + """Call the underlying getter, dropping a stray ``include_hash`` kwarg. + + The xarray builders surface only the columns they convert, so the opaque + hash-valued ID columns (per-record UUIDs, per-series join keys) never reach + the dataset regardless of what the getter returns. ``include_hash`` is not a + parameter of the plain getters, so it is swallowed here to keep passing it + to an xarray wrapper harmless. + """ + kwargs.pop("include_hash", None) + return func(*args, **kwargs) + + +def _xr_doc(func, *, cf_metadata=True, allow_dense=True): + """Prepend an xarray note to the wrapped getter's docstring. + + ``cf_metadata=False`` describes the preliminary stats path (a flat Dataset + without per-variable CF attributes); ``allow_dense=False`` describes a + ragged-only path (samples), where ``dense=`` does not apply. + """ + if not cf_metadata: + returns = ( + "a preliminary, flat ``xarray.Dataset`` (dataset-level provenance " + "only; per-variable CF metadata is not yet populated)" + ) + elif allow_dense: + returns = ( + "a CF-conventions ``xarray.Dataset`` as a contiguous ragged array " + "(pass ``dense=True`` for the NaN-filled (site, time) grid with one " + "named variable per parameter)" + ) + else: + returns = ( + "a CF-conventions ``xarray.Dataset`` as a contiguous ragged array " + "(always ragged; discrete samples are too sparse for a dense grid, " + "so ``dense=`` does not apply)" + ) + # A column-0 summary paragraph + a cleandoc'd body keeps the combined + # docstring valid numpydoc (the wrapped getter's first line is unindented + # but its body is source-indented, which would otherwise break dedent and + # over-indent the Parameters/Returns sections). + note = ( + "xarray wrapper: same arguments as " + f"``dataretrieval.waterdata.{func.__name__}``, but returns {returns}. " + "Hash-valued ID columns are always omitted here; the ``include_hash`` " + "flag does not apply." + ) + body = _inspect.cleandoc(func.__doc__) if func.__doc__ else "" + return f"{note}\n\n{body}" if body else note + + +def _public_signature(wrapped, *, has_dense): + """The signature ``help()`` / IDEs should show for a wrapper. + + Keeps the wrapped getter's own parameters (so its real arguments stay + discoverable), adds the keyword-only ``dense`` toggle where the layout + offers it, and declares the ``xarray.Dataset`` return -- correcting the + DataFrame signature that ``functools.wraps`` would otherwise carry over. + """ + sig = _inspect.signature(wrapped) + params = list(sig.parameters.values()) + if has_dense: + cut = next( + ( + i + for i, p in enumerate(params) + if p.kind is _inspect.Parameter.VAR_KEYWORD + ), + len(params), + ) + params.insert( + cut, + _inspect.Parameter( + "dense", + _inspect.Parameter.KEYWORD_ONLY, + default=False, + annotation="bool", + ), + ) + return sig.replace(parameters=params, return_annotation="xarray.Dataset") + + +@dataclass(frozen=True) +class _Service: + """Per-service configuration that drives one public xarray getter. + + The variation across services is data, not behaviour: which getter to call, + which :class:`_MetadataCache` (if any) supplies the parameter names, the + column ``schema``, the fallback ``cell_methods`` operator, whether the + result is a time series or the preliminary stats table, and whether a dense + grid is offered. + """ + + getter: Callable + service: str + metadata_cache: _MetadataCache | None = None + schema: _Schema = _CANONICAL + default_cell_method: str | None = None + layout: str = "series" # "series" | "stats" + allow_dense: bool = True + + +def _make_getter(spec): + """Build the public getter for one ``_Service`` spec (Factory).""" + + has_dense = spec.layout != "stats" and spec.allow_dense + + @_wraps(spec.getter) # carry __name__/__wrapped__ through to help()/IDEs + def getter(*args, dense=False, **kwargs): + if dense and not has_dense: + _warnings.warn( + f"{spec.getter.__name__} has no dense layout; ignoring dense=True.", + stacklevel=2, + ) + dense = False + df, base_meta = _fetch(spec.getter, args, kwargs) + if spec.layout == "stats": + return _build_stats(df, base_meta, spec.service) + if spec.metadata_cache is not None: + series_meta, site_meta = spec.metadata_cache.lookup(_sites(df)) + else: + series_meta, site_meta = {}, {} + build = _build_dense if dense else _build_ragged + return build( + df, + base_meta, + service=spec.service, + schema=spec.schema, + series_meta=series_meta, + site_meta=site_meta, + default_cell_method=spec.default_cell_method, + ) + + # ``@_wraps`` copied the getter's docstring, signature and annotations + # verbatim -- which would advertise a DataFrame return and hide ``dense``. + # Replace the docstring with the xarray note and publish an accurate + # signature: the getter's own args, plus ``dense`` where it applies, + # returning an ``xarray.Dataset``. + getter.__doc__ = _xr_doc( + spec.getter, + cf_metadata=(spec.layout != "stats"), + allow_dense=spec.allow_dense, + ) + getter.__signature__ = _public_signature(spec.getter, has_dense=has_dense) + getter.__annotations__ = { + **getter.__annotations__, + "return": "xarray.Dataset", + **({"dense": "bool"} if has_dense else {}), + } + return getter + + +get_daily = _make_getter(_Service(_api.get_daily, "daily", metadata_cache=_TS_CACHE)) +get_continuous = _make_getter( + _Service(_api.get_continuous, "continuous", metadata_cache=_TS_CACHE) +) +get_latest_continuous = _make_getter( + _Service( + _api.get_latest_continuous, + "latest-continuous", + metadata_cache=_TS_CACHE, + default_cell_method="point", + ) +) +get_latest_daily = _make_getter( + _Service( + _api.get_latest_daily, + "latest-daily", + metadata_cache=_TS_CACHE, + default_cell_method="point", + ) +) +get_nearest_continuous = _make_getter( + _Service( + _get_nearest_continuous, + "continuous", + metadata_cache=_TS_CACHE, + default_cell_method="point", + ) +) +get_peaks = _make_getter( + _Service( + _api.get_peaks, + "peaks", + metadata_cache=_TS_CACHE, + default_cell_method="maximum", + ) +) +get_field_measurements = _make_getter( + _Service( + _api.get_field_measurements, + "field-measurements", + metadata_cache=_FIELD_CACHE, + schema=_FIELD, + default_cell_method="point", + ) +) +get_stats_por = _make_getter(_Service(_api.get_stats_por, "statistics", layout="stats")) +get_stats_date_range = _make_getter( + _Service(_api.get_stats_date_range, "statistics", layout="stats") +) +get_samples = _make_getter( + _Service( + _api.get_samples, + "samples", + schema=_SAMPLES, + default_cell_method="point", + allow_dense=False, + ) +) diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb new file mode 100644 index 00000000..3aa28246 --- /dev/null +++ b/demos/waterdata_xarray_demo.ipynb @@ -0,0 +1,382 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CF-conventions `xarray` datasets from the `waterdata` module\n", + "\n", + "The `dataretrieval.waterdata.xarray` module mirrors the `waterdata` getters\n", + "(`get_daily`, `get_continuous`, `get_peaks`, `get_samples`, …) but returns a\n", + "[CF-conventions](https://cfconventions.org/) [`xarray.Dataset`](https://docs.xarray.dev/)\n", + "instead of a `pandas.DataFrame`. Units, statistics, and station coordinates come\n", + "straight from the data, and parameter names from a small cached lookup; all are\n", + "written onto the dataset as CF attributes, so the result is self-describing and\n", + "ready to write to netCDF.\n", + "\n", + "This notebook covers:\n", + "\n", + "1. the default **ragged** layout and how to read it;\n", + "2. **why** ragged is the default (and the `dense=True` opt-out);\n", + "3. the one trade-off you need to know — **selecting by time**;\n", + "4. multiple parameters in one pull;\n", + "5. water-quality samples (`get_samples`);\n", + "6. writing CF netCDF." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "The xarray helpers are an optional extra:\n", + "\n", + "```bash\n", + "pip install dataretrieval[xarray]\n", + "```\n", + "\n", + "As with the rest of `waterdata`, signing up for a free\n", + "[API key](https://api.waterdata.usgs.gov/signup/) gives you higher rate limits.\n", + "Import the xarray wrappers under a short alias so they don't shadow the plain\n", + "`DataFrame`-returning getters:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from dataretrieval.waterdata import xarray as wdx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. A single time series\n", + "\n", + "Pull a year of daily mean discharge (parameter `00060`, statistic `00003`) at one\n", + "gage. The wrapper takes the same arguments as `waterdata.get_daily`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = wdx.get_daily(\n", + " monitoring_location_id=\"USGS-05407000\",\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\",\n", + " time=\"2023-01-01/2024-01-01\",\n", + ")\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note the shape of the result — this is a CF **contiguous ragged array**\n", + "(`featureType = \"timeSeries\"`):\n", + "\n", + "* every observation lives along a single `obs` dimension;\n", + "* each *series* — one `(monitoring location, parameter, statistic)` — is one\n", + " instance along the `timeseries` dimension;\n", + "* `row_size` records how many observations each series contributes (the CF\n", + " `sample_dimension` link), and `timeseries_id` carries `cf_role`;\n", + "* because there is a single, homogeneous parameter here, the descriptors land\n", + " directly on `value` (`long_name`, `units`, `cell_methods`, `standard_name`).\n", + "\n", + "The flag columns (`qualifier`, `approval_status`) are linked as\n", + "`ancillary_variables`, and dataset attributes carry provenance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(ds[\"value\"].attrs)\n", + "print({k: ds.attrs[k] for k in (\"Conventions\", \"featureType\", \"references\", \"date_modified\")})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Why ragged is the default\n", + "\n", + "Real collections are *ragged*: some gages have a century of record, others a few\n", + "years. Pull discharge at two gages with very different start dates and look at\n", + "`row_size`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sites = [\"USGS-05407000\", \"USGS-02238500\"] # records since 1913 vs 2008\n", + "# Bound the window so the docs build stays light; the start-date gap still\n", + "# shows up as different row_size (~24 vs ~16 years of daily values).\n", + "ragged = wdx.get_daily(\n", + " monitoring_location_id=sites,\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\",\n", + " time=\"2000-01-01/2024-01-01\",\n", + ")\n", + "print(\"dims :\", dict(ragged.sizes))\n", + "print(\"row_size :\", dict(zip(ragged[\"monitoring_location_id\"].values, ragged[\"row_size\"].values)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The alternative is a **dense** `(monitoring_location_id, time)` grid — one named\n", + "variable per parameter, NaN where a series has no observation. It is convenient\n", + "(see the next section) but pays for a union time axis and NaN fill. Pass\n", + "`dense=True` to get it, and compare the in-memory size:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dense = wdx.get_daily(\n", + " monitoring_location_id=sites,\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\",\n", + " time=\"2000-01-01/2024-01-01\",\n", + " dense=True,\n", + ")\n", + "print(\"dense dims :\", dict(dense.sizes))\n", + "print(\"dense var :\", list(dense.data_vars))\n", + "print(f\"ragged {ragged.nbytes/1e6:6.2f} MB dense {dense.nbytes/1e6:6.2f} MB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With only two gages the gap is modest, but it grows fast: a whole state's daily\n", + "discharge can be 15–30% dense, where the gridded layout balloons to hundreds of\n", + "megabytes (mostly NaN) while the ragged array stores only real observations.\n", + "That is why ragged is the default." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. The trade-off: selecting by time\n", + "\n", + "This is the one thing to internalize about the ragged layout.\n", + "\n", + "In the **dense** dataset, `time` is a real dimension with an index, so\n", + "label-based selection just works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# all sites on a given day, addressed as a (site, time) grid\n", + "dense[\"discharge\"].sel(time=\"2020-06-01\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the **ragged** dataset, `value` is one flat list along `obs`, and `time` is\n", + "*just another variable* riding along `obs` — not a dimension. So `value` cannot\n", + "be addressed as a `(site, time)` grid, and a time selection returns whatever\n", + "observations happen to match, **mixed across sites with no labels** (and\n", + "identical dates across sites collide onto the same axis):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"value dims:\", ragged[\"value\"].dims) # ('obs',) -- not (site, time)\n", + "# .sel(time=...) on the flat obs axis can't give you a per-series slice\n", + "ragged[\"value\"].sel(time=\"2020-06-01\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To recover the convenient, time-indexed view you first **regroup** the flat\n", + "`obs` back into per-series pieces using the offsets implied by `row_size`. A\n", + "tiny helper:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def series(ds, i):\n", + " \"\"\"Instance ``i`` of a ragged dataset as a time-indexed DataArray.\"\"\"\n", + " starts = np.concatenate([[0], np.cumsum(ds[\"row_size\"].values)])\n", + " sl = slice(int(starts[i]), int(starts[i + 1]))\n", + " da = ds[\"value\"].isel(obs=sl)\n", + " return da.assign_coords(time=ds[\"time\"].isel(obs=sl)).swap_dims(obs=\"time\")\n", + "\n", + "\n", + "s0 = series(ragged, 0)\n", + "print(ragged[\"timeseries_id\"].values[0])\n", + "s0.sel(time=\"2020-06-01\") # now .sel(time=...) works on a single series" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For anything beyond a one-off slice, the [`cf-xarray`](https://cf-xarray.readthedocs.io/)\n", + "package understands the CF ragged encoding (`cf_role`, `sample_dimension`) and\n", + "will regroup/decode for you. Or, when you know the pull is small and overlapping,\n", + "just ask for `dense=True` up front and use `.sel(time=...)` directly.\n", + "\n", + "**Rule of thumb:** ragged for storage and large multi-site pulls; `dense=True`\n", + "for ergonomic time-based slicing of a few overlapping series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Multiple parameters in one pull\n", + "\n", + "When a pull mixes parameters (and units), the ragged layout keeps a single\n", + "`value` and records the parameter/unit per *instance* — so nothing is\n", + "mislabeled. The homogeneous descriptors drop off `value` and live on the\n", + "`parameter_code` / `unit_of_measure` coordinates instead:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "multi = wdx.get_daily(\n", + " monitoring_location_id=\"USGS-05407000\",\n", + " parameter_code=[\"00060\", \"00045\"], # discharge + precipitation\n", + " time=\"2023-06-01/2023-07-01\",\n", + ")\n", + "print(\"value long_name:\", multi[\"value\"].attrs.get(\"long_name\"))\n", + "print(\"per-instance parameter_code:\", multi[\"parameter_code\"].values)\n", + "print(\"per-instance unit_of_measure:\", multi[\"unit_of_measure\"].values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the `dense=True` form each parameter instead becomes its own named variable\n", + "(`discharge`, `precipitation`, …) on the shared time grid." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Water-quality samples\n", + "\n", + "`get_samples` returns discrete water-quality results in the same ragged shape:\n", + "one instance per `(monitoring location, characteristic, sample fraction)`, with\n", + "the result value plus `detection_condition` and `status` as ancillary\n", + "(censoring) variables. Characteristics are free text, so no CF `standard_name`\n", + "is inferred and non-numeric results coerce to NaN (the `detection_condition`\n", + "variable preserves non-detects)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wq = wdx.get_samples(\n", + " service=\"results\",\n", + " profile=\"basicphyschem\",\n", + " monitoringLocationIdentifier=\"USGS-05406500\",\n", + " activityStartDateLower=\"2019-01-01\",\n", + " activityStartDateUpper=\"2020-01-01\",\n", + ")\n", + "print(\"dims:\", dict(wq.sizes))\n", + "print(\"characteristics:\", sorted(set(wq[\"characteristic\"].values))[:8], \"...\")\n", + "print(\"ancillary:\", wq[\"value\"].attrs.get(\"ancillary_variables\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Writing CF netCDF\n", + "\n", + "Because the dataset already carries CF metadata in the standard ragged-array\n", + "encoding, it serializes straight to a self-describing netCDF file that\n", + "CF-aware tools (THREDDS, `cf-xarray`, Panoply, …) can read back (requires a\n", + "netCDF backend, e.g. `pip install netCDF4`):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import tempfile\n", + "\n", + "path = os.path.join(tempfile.mkdtemp(), \"daily_discharge.nc\")\n", + "ragged.to_netcdf(path) # CF-1.11 contiguous ragged array\n", + "print(\"wrote\", path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References\n", + "\n", + "* CF conventions — [Discrete Sampling Geometries](https://cfconventions.org/cf-conventions/cf-conventions.html#discrete-sampling-geometries)\n", + " (contiguous ragged array representation)\n", + "* [`cf-xarray`](https://cf-xarray.readthedocs.io/) — decode/group CF DSG datasets\n", + "* [`xarray`](https://docs.xarray.dev/) documentation" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 91d7bd1f..c86f6f4d 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -32,6 +32,18 @@ functions and is executed against the live USGS Water Data API. USGS_WaterData_ContinuousData_Examples USGS_WaterData_ReferenceLists_Examples +CF-conventions ``xarray`` datasets from the ``waterdata`` module +---------------------------------------------------------------- +The ``waterdata.xarray`` wrappers return CF-conventions ``xarray.Dataset`` +objects (a ragged array by default, with a ``dense=True`` gridded opt-out). +This notebook demonstrates the layouts, the time-selection trade-off, and +writing CF netCDF. + +.. toctree:: + :maxdepth: 1 + + waterdata_xarray_demo + Simple uses of the ``dataretrieval`` package -------------------------------------------- diff --git a/docs/source/examples/waterdata_xarray_demo.nblink b/docs/source/examples/waterdata_xarray_demo.nblink new file mode 100644 index 00000000..9c886545 --- /dev/null +++ b/docs/source/examples/waterdata_xarray_demo.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../demos/waterdata_xarray_demo.ipynb" +} diff --git a/pyproject.toml b/pyproject.toml index 62ac7478..2c90f806 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,10 +57,18 @@ doc = [ # bound as the [nldi] extra. "geopandas>=0.10", "mapclassify", + # waterdata_xarray_demo.ipynb imports dataretrieval.waterdata.xarray and + # writes a CF netCDF file; nbsphinx executes it at build time, so the doc + # environment needs xarray and a netCDF backend. + "xarray>=2023.1.0", + "netCDF4", ] nldi = [ 'geopandas>=0.10' ] +xarray = [ + 'xarray>=2023.1.0' +] [project.urls] homepage = "https://github.com/DOI-USGS/dataretrieval-python" diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py new file mode 100644 index 00000000..d74fc64c --- /dev/null +++ b/tests/waterdata_xarray_test.py @@ -0,0 +1,803 @@ +"""Offline unit tests for dataretrieval.waterdata.xarray converters. + +These exercise the pure DataFrame -> xarray.Dataset converters with synthetic +frames, so they run without network access. Live end-to-end behavior is +covered by the waterdata getters' own tests. +""" + +from types import SimpleNamespace + +import pandas as pd +import pytest + +xr = pytest.importorskip("xarray") +from dataretrieval.waterdata import xarray as wdx # noqa: E402 + + +def _meta(url="https://example.test/items"): + return SimpleNamespace(url=url) + + +def _daily_frame( + time_series_id="A", + site="USGS-1", + values=(100, 110), + times=("2024-06-01", "2024-06-02"), +): + n = len(values) + return pd.DataFrame( + { + "time": list(times), + "value": list(values), + "monitoring_location_id": [site] * n, + "parameter_code": ["00060"] * n, + "statistic_id": ["00003"] * n, + "unit_of_measure": ["ft^3/s"] * n, + "qualifier": [None] * n, + "approval_status": ["Approved"] * n, + "time_series_id": [time_series_id] * n, + } + ) + + +# series_meta is keyed by parameter_code and supplies only the readable name; +# units/statistic/parameter_code come from the values frame itself. +_DISCHARGE_META = { + "00060": { + "parameter_name": "Discharge", + "parameter_description": "Discharge, cubic feet per second", + } +} + + +def _temp_frame(times=("2024-06-02", "2024-06-03"), values=(18.0, 19.0)): + """A water-temperature (00010, instantaneous) frame, parallel to _daily_frame.""" + n = len(values) + return pd.DataFrame( + { + "time": list(times), + "value": list(values), + "monitoring_location_id": ["USGS-1"] * n, + "parameter_code": ["00010"] * n, + "statistic_id": ["00011"] * n, + "unit_of_measure": ["degC"] * n, + "qualifier": [None] * n, + "approval_status": ["Approved"] * n, + "time_series_id": ["B"] * n, + } + ) + + +_TEMP_META = { + "00010": { + "parameter_name": "Temperature, water", + "parameter_description": "Temperature, water, degrees Celsius", + } +} + + +def test_build_timeseries_cf_attributes(): + ds = wdx._build_dense( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert isinstance(ds, xr.Dataset) + assert "discharge" in ds.data_vars + v = ds["discharge"] + assert v.attrs["long_name"] == "Discharge, cubic feet per second" + assert v.attrs["units"] == "ft3 s-1" # UDUNITS-normalized from "ft^3/s" + assert v.attrs["cell_methods"] == "time: mean" + assert v.attrs["standard_name"] == "water_volume_transport_in_river_channel" + assert v.attrs["usgs_parameter_code"] == "00060" + assert v.attrs["usgs_statistic_id"] == "00003" + # dataset-level provenance + assert ds.attrs["Conventions"] == "CF-1.11" + assert ds.attrs["featureType"] == "timeSeries" + assert ds.attrs["references"] == "https://example.test/items" + # site is the DSG instance dimension + assert ds["monitoring_location_id"].attrs.get("cf_role") == "timeseries_id" + assert ds.sizes["time"] == 2 + + +def test_ancillary_variables_linked(): + ds = wdx._build_dense( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "discharge_qualifier" in ds.data_vars + assert "discharge_approval_status" in ds.data_vars + assert ds["discharge"].attrs["ancillary_variables"] == ( + "discharge_qualifier discharge_approval_status" + ) + + +def test_unknown_unit_passes_through_verbatim(): + df = _daily_frame() + df["unit_of_measure"] = "widgets/s" # units are read from the frame + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert ds["discharge"].attrs["units"] == "widgets/s" + + +def test_missing_standard_name_is_omitted(): + # parameter_code with no curated CF mapping -> no standard_name key + meta = {"99999": {"parameter_name": "Mystery", "parameter_description": "Mystery"}} + df = _daily_frame() + df["parameter_code"] = "99999" + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=meta) + assert "standard_name" not in ds["mystery"].attrs + assert ds["mystery"].attrs["usgs_parameter_code"] == "99999" + + +def test_vertical_datum_distinguishes_stage_parameters(): + # 00065 (gage height) and 63160 (water level above NAVD88) share the CF + # standard_name water_surface_height_above_reference_datum; the vertical_datum + # attribute records the differing reference datum so they're distinguishable. + for pcode, datum in (("00065", "local site datum"), ("63160", "NAVD88")): + df = _daily_frame() + df["parameter_code"] = pcode + ds = wdx._build_ragged(df, _meta(), service="continuous", series_meta={}) + v = ds["value"] + assert v.attrs["standard_name"] == "water_surface_height_above_reference_datum" + assert v.attrs["vertical_datum"] == datum + # a parameter with no datum mapping gets no vertical_datum attr + ds = wdx._build_ragged( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "vertical_datum" not in ds["value"].attrs + + +def test_multiple_parameters_outer_join_on_time(): + # discharge at t1,t2 ; temperature at t2,t3 -> union time, NaN fill + q = _daily_frame(values=(100, 110), times=("2024-06-01", "2024-06-02")) + t = _temp_frame() + meta = {**_DISCHARGE_META, **_TEMP_META} + ds = wdx._build_dense( + pd.concat([q, t]), _meta(), service="continuous", series_meta=meta + ) + assert {"discharge", "temperature_water"} <= set(ds.data_vars) + assert ds.sizes["time"] == 3 # union of {t1,t2,t3} + # temperature has no value at t1 -> NaN + assert pd.isna(ds["temperature_water"].sel(time="2024-06-01").item()) + # cell_methods derived from statistic_id 00011 (instantaneous) -> point + assert ds["temperature_water"].attrs["cell_methods"] == "time: point" + + +def test_collision_dedups_with_warning(): + # two values for the same (site, parameter, statistic, time) are ambiguous + # without the hash key -> keep the first and warn; site stays the dim. + a = _daily_frame(values=(100,), times=("2024-06-01",)) + b = _daily_frame(values=(200,), times=("2024-06-01",)) + with pytest.warns(UserWarning, match="multiple values per"): + ds = wdx._build_dense( + pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "monitoring_location_id" in ds.dims + assert ds.sizes["time"] == 1 + assert ( + ds["discharge"].sel(monitoring_location_id="USGS-1", time="2024-06-01").item() + == 100 + ) + + +def test_empty_frame_returns_dataset_with_conventions(): + ds = wdx._build_dense(pd.DataFrame(), _meta(), service="daily", series_meta={}) + assert isinstance(ds, xr.Dataset) + assert list(ds.data_vars) == [] + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_build_stats_flat_dataset(): + df = pd.DataFrame( + { + "monitoring_location_id": ["USGS-1", "USGS-1"], + "parameter_code": ["00060", "00060"], + "month": [1, 2], + "p50_va": [120.0, 130.0], + } + ) + ds = wdx._build_stats(df, _meta(), "statistics") + assert isinstance(ds, xr.Dataset) + assert "p50_va" in ds.data_vars + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_build_stats_drops_hash_columns(): + # The plain getters no longer drop hash IDs, so the flat stats builder must + # drop the stats service's per-record / per-series UUIDs itself to keep the + # CF dataset free of opaque coordinates. + df = pd.DataFrame( + { + "monitoring_location_id": ["USGS-1"], + "parameter_code": ["00060"], + "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], + "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], + "time_series_id": ["b026-24dfa11f8503"], + "p50_va": [120.0], + } + ) + ds = wdx._build_stats(df, _meta(), "statistics") + for hash_col in ("computation_id", "parent_time_series_id", "time_series_id"): + assert hash_col not in ds.variables + assert "p50_va" in ds.data_vars + + +def test_ragged_omits_hash_columns(): + # The synthetic daily frame carries a time_series_id hash column; the ragged + # builder whitelists the columns it converts, so the hash never surfaces in + # the dataset (the timeseries path stays hash-free without the getter's + # help). + ds = wdx._build_ragged( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "time_series_id" not in ds.variables + + +def test_public_wrappers_exist_and_are_documented(): + for name in [n for n in wdx.__all__ if n.startswith("get_")]: + fn = getattr(wdx, name) + assert callable(fn) + # _make_getter carries the wrapped getter's name through, so the public + # symbol, its __name__, and the docstring reference all agree. + assert fn.__name__ == name + assert "xarray wrapper" in (fn.__doc__ or "") + assert f"dataretrieval.waterdata.{name}" in (fn.__doc__ or "") + + +def test_fetch_strips_include_hash(): + # The xarray path never surfaces hash columns, so include_hash must be + # dropped before the underlying getter is called (no wasted fetch). + captured = {} + + def fake_getter(*args, **kwargs): + captured.update(kwargs) + return "df", "meta" + + df, meta = wdx._fetch( + fake_getter, (), {"include_hash": True, "parameter_code": "00060"} + ) + assert (df, meta) == ("df", "meta") + assert "include_hash" not in captured + assert captured == {"parameter_code": "00060"} + + +def test_every_wrapper_routes_through_fetch(monkeypatch): + # Pin the wiring: each public wrapper must delegate the underlying fetch to + # _fetch (which is what strips include_hash). Guards against a wrapper + # quietly reverting to calling the getter directly and leaking the flag. + seen = [] + + def spy(func, args, kwargs): + seen.append(dict(kwargs)) + return pd.DataFrame(), SimpleNamespace(url=None) # empty -> no network + + monkeypatch.setattr(wdx, "_fetch", spy) + for name in [n for n in wdx.__all__ if n.startswith("get_")]: + seen.clear() + ds = getattr(wdx, name)(monitoring_location_id="USGS-1", include_hash=True) + assert isinstance(ds, xr.Dataset) + assert len(seen) == 1, f"{name} did not route through _fetch" + # the wrapper hands include_hash to _fetch; _fetch is what drops it + # (asserted in test_fetch_strips_include_hash) + assert seen[0].get("include_hash") is True + + +def test_unparseable_time_dropped_with_warning(): + # A bad/missing time must be dropped explicitly (with a warning), not + # silently swallowed by to_xarray. + df = _daily_frame( + values=(100, 110, 120), + times=("2024-06-01", "not-a-date", "2024-06-03"), + ) + with pytest.warns(UserWarning, match="unparseable or missing time"): + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert ds.sizes["time"] == 2 # the bad-time row is gone, the good ones stay + assert 110 not in ds["discharge"].values # the dropped value did not survive + + +def test_all_unparseable_time_returns_empty_dataset(): + df = _daily_frame(values=(1, 2), times=("bad-a", "bad-b")) + with pytest.warns(UserWarning, match="unparseable or missing time"): + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert list(ds.data_vars) == [] + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_mixed_units_in_one_variable_warns(): + # Same (parameter, statistic) across two sites but different units -> one + # variable can hold only one units attr; warn instead of silently mislabeling. + a = _daily_frame(site="USGS-1", values=(100,), times=("2024-06-01",)) + b = _daily_frame(site="USGS-2", values=(3,), times=("2024-06-01",)) + b["unit_of_measure"] = "m3 s-1" + with pytest.warns(UserWarning, match="spans multiple units"): + ds = wdx._build_dense( + pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert ds.sizes["monitoring_location_id"] == 2 + assert ds["discharge"].attrs["units"] == "ft3 s-1" # first unit (ft^3/s) + + +def test_point_coords_from_list_geometry(): + # Without geopandas the OGC geometry is a plain [lon, lat] list, not a + # shapely Point. lon/lat must still be extracted (regression: the old + # ``.x``/``.y`` access raised AttributeError and silently dropped them). + df = _daily_frame() + df["geometry"] = [[-90.44, 43.19]] * len(df) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert "longitude" in ds.coords and "latitude" in ds.coords + assert ds["longitude"].sel(monitoring_location_id="USGS-1").item() == -90.44 + assert ds["latitude"].sel(monitoring_location_id="USGS-1").item() == 43.19 + assert ds["longitude"].attrs["units"] == "degrees_east" + assert ds["latitude"].attrs["standard_name"] == "latitude" + + +def test_point_coords_from_pointlike_geometry(): + # The shapely-Point path (geopandas installed) still works: any object + # exposing .x/.y is read directly. + df = _daily_frame() + df["geometry"] = [SimpleNamespace(x=-90.44, y=43.19)] * len(df) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert ds["longitude"].sel(monitoring_location_id="USGS-1").item() == -90.44 + assert ds["latitude"].sel(monitoring_location_id="USGS-1").item() == 43.19 + + +def test_non_point_geometry_skipped(): + # A non-point geometry (no .x/.y, not a [lon, lat] pair) is skipped, not + # guessed -- no lon/lat coords are added. + df = _daily_frame() + df["geometry"] = [object()] * len(df) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert "longitude" not in ds.coords and "latitude" not in ds.coords + + +def test_date_modified_from_last_modified(): + # The newest last_modified becomes the dataset-level date_modified (ACDD). + df = _daily_frame() + df["last_modified"] = ["2024-06-01T00:00:00Z", "2024-06-10T12:00:00Z"] + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert ds.attrs["date_modified"].startswith("2024-06-10") + + +def test_no_date_modified_without_last_modified(): + # The default frame has no last_modified column -> no date_modified attr. + ds = wdx._build_dense( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "date_modified" not in ds.attrs + + +def test_site_metadata_coordinates(): + # HUC / state_name ride along on the metadata call and become + # instance-indexed auxiliary coordinates. + site_meta = { + "USGS-1": {"hydrologic_unit_code": "07070005", "state_name": "Wisconsin"} + } + ds = wdx._build_dense( + _daily_frame(), + _meta(), + service="daily", + series_meta=_DISCHARGE_META, + site_meta=site_meta, + ) + huc = ds["hydrologic_unit_code"].sel(monitoring_location_id="USGS-1").item() + assert huc == "07070005" + assert ds["state_name"].sel(monitoring_location_id="USGS-1").item() == "Wisconsin" + assert ds["hydrologic_unit_code"].attrs["long_name"] == "hydrologic unit code (HUC)" + + +def test_site_metadata_absent_adds_no_coords(): + # No site_meta -> no HUC/state coords (back-compat with the old signature). + ds = wdx._build_dense( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "hydrologic_unit_code" not in ds.coords + assert "state_name" not in ds.coords + + +# --- ragged layout (the default) ------------------------------------------- + + +def test_ragged_structure_and_cf_attrs(): + # Single (site, parameter, statistic) -> one instance; value attrs are set + # because the descriptors are homogeneous across the (one) instance. + ds = wdx._build_ragged( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert set(ds.sizes) == {"obs", "timeseries"} + assert ds.sizes == {"obs": 2, "timeseries": 1} + assert ds["value"].dims == ("obs",) + assert list(ds["value"].values) == [100, 110] + assert ds["row_size"].dims == ("timeseries",) + assert int(ds["row_size"][0]) == 2 + assert ds["row_size"].attrs["sample_dimension"] == "obs" + # per-instance identity + cf_role on the synthesized timeseries_id + assert ds["monitoring_location_id"].dims == ("timeseries",) + assert ds["timeseries_id"].attrs["cf_role"] == "timeseries_id" + assert ds["timeseries_id"].values[0] == "USGS-1:00060:00003" + # homogeneous descriptors land on value + v = ds["value"] + assert v.attrs["long_name"] == "Discharge, cubic feet per second" + assert v.attrs["units"] == "ft3 s-1" + assert v.attrs["cell_methods"] == "time: mean" + assert v.attrs["standard_name"] == "water_volume_transport_in_river_channel" + assert v.attrs["ancillary_variables"] == "qualifier approval_status" + assert ds.attrs["featureType"] == "timeSeries" + # ancillary flags are per-observation; metadata is per-instance + assert ds["qualifier"].dims == ("obs",) + assert ds["approval_status"].dims == ("obs",) + assert ds["parameter_code"].dims == ("timeseries",) + assert ds["parameter_code"].values.tolist() == ["00060"] + assert ds["statistic_id"].values.tolist() == ["00003"] + assert ds["unit_of_measure"].values.tolist() == ["ft^3/s"] + assert ds["parameter_code"].attrs["long_name"] == "USGS parameter code" + + +def test_ragged_stores_only_real_observations(): + # Two sites of very different length: obs == sum of lengths, no NaN fill. + a = _daily_frame( + site="USGS-A", + values=(1, 2, 3), + times=("2024-06-01", "2024-06-02", "2024-06-03"), + ) + b = _daily_frame(site="USGS-B", values=(9,), times=("2024-06-03",)) + ds = wdx._build_ragged( + pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert ds.sizes == {"obs": 4, "timeseries": 2} # 3 + 1, not 2 x 3 grid + assert not pd.isna(ds["value"].values).any() + assert sorted(ds["row_size"].values.tolist()) == [1, 3] + + +def test_ragged_mixed_parameters_value_is_generic(): + # Mixed parameters/units -> value carries no single units/standard_name; + # the per-instance parameter_code coordinate disambiguates. + q = _daily_frame(values=(100, 110), times=("2024-06-01", "2024-06-02")) + t = _temp_frame() + meta = {**_DISCHARGE_META, **_TEMP_META} + ds = wdx._build_ragged( + pd.concat([q, t]), _meta(), service="continuous", series_meta=meta + ) + assert ds.sizes == {"obs": 4, "timeseries": 2} + assert ds["value"].attrs["long_name"] == "measured value" + assert "units" not in ds["value"].attrs # ft^3/s vs degC -> not homogeneous + assert "standard_name" not in ds["value"].attrs + assert set(ds["parameter_code"].values) == {"00060", "00010"} + + +def test_ragged_keeps_duplicate_observations_without_warning(): + # The dense path warns + dedups colliding (site, time); ragged just keeps + # both observations -- no grid, so no ambiguity. Assert specifically that + # neither dense-path diagnostic fires (not "no warning at all", so an + # unrelated pandas warning can't fail the test for the wrong reason). + a = _daily_frame(values=(100,), times=("2024-06-01",)) + b = _daily_frame(values=(200,), times=("2024-06-01",)) + import warnings as _w + + with _w.catch_warnings(record=True) as caught: + _w.simplefilter("always") + ds = wdx._build_ragged( + pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + msgs = [str(w.message) for w in caught] + assert not any( + "multiple values per" in m or "spans multiple units" in m for m in msgs + ), msgs + assert ds.sizes["obs"] == 2 + assert sorted(ds["value"].values.tolist()) == [100, 200] + + +def test_ragged_lonlat_and_date_modified_per_instance(): + df = _daily_frame() + df["geometry"] = [[-90.44, 43.19]] * len(df) + df["last_modified"] = ["2024-06-01T00:00:00Z", "2024-06-10T12:00:00Z"] + ds = wdx._build_ragged(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert ds["longitude"].dims == ("timeseries",) + assert float(ds["longitude"][0]) == -90.44 + assert float(ds["latitude"][0]) == 43.19 + assert ds.attrs["date_modified"].startswith("2024-06-10") + + +def _instance_blocks(ds): + """Map each instance's (parameter, statistic) -> (time-sorted values, unit) + by walking the row_size offsets -- the reader's view of a ragged array.""" + starts, acc = [], 0 + for n in ds["row_size"].values.tolist(): + starts.append(acc) + acc += int(n) + starts.append(acc) + blocks = {} + for i in range(ds.sizes["timeseries"]): + sl = slice(starts[i], starts[i + 1]) + key = (str(ds["parameter_code"].values[i]), str(ds["statistic_id"].values[i])) + blocks[key] = ( + ds["value"].values[sl].tolist(), + str(ds["unit_of_measure"].values[i]), + ) + return blocks + + +def test_ragged_alignment_survives_interleaved_input(): + # The critical invariant: row_size, the contiguous obs blocks, and the + # per-instance metadata must all stay aligned even when the input rows are + # interleaved across instances and out of time order. A regression in the + # sort/group ordering would silently map values to the wrong series. + cols = [ + "monitoring_location_id", + "parameter_code", + "statistic_id", + "unit_of_measure", + "time", + "value", + ] + # Values are deliberately NON-monotonic with time within each instance, so + # a regression that ordered obs by value (instead of time) would produce a + # different sequence and fail -- "sorted values" alone wouldn't catch that. + rows = [ + ("USGS-1", "00060", "00003", "ft^3/s", "2024-06-02", 100), # later, smaller + ("USGS-1", "00010", "00011", "degC", "2024-06-01", 19), + ("USGS-1", "00060", "00001", "ft^3/s", "2024-06-03", 500), + ("USGS-1", "00060", "00003", "ft^3/s", "2024-06-01", 150), # earlier, larger + ("USGS-1", "00010", "00011", "degC", "2024-06-03", 18), + ("USGS-1", "00060", "00001", "ft^3/s", "2024-06-01", 480), + ] + df = pd.DataFrame(rows, columns=cols) + ds = wdx._build_ragged(df, _meta(), service="daily", series_meta={}) + + assert ds.sizes == {"obs": 6, "timeseries": 3} + assert int(ds["row_size"].sum()) == ds.sizes["obs"] + blocks = _instance_blocks(ds) + # each instance's values + unit are the ones that belong to it, in TIME order + # (not value order: 150 precedes 100, and 19 precedes 18) + assert blocks[("00060", "00003")] == ([150, 100], "ft^3/s") + assert blocks[("00060", "00001")] == ([480, 500], "ft^3/s") + assert blocks[("00010", "00011")] == ([19, 18], "degC") + + +def test_ragged_field_schema_without_statistic(): + # The field-measurements schema groups by parameter_code only (no + # statistic_id): the instance has no statistic segment, and the service + # default cell method fills in. + df = pd.DataFrame( + { + "monitoring_location_id": ["USGS-1", "USGS-1"], + "parameter_code": ["00060", "00060"], + "time": ["2024-06-01", "2024-06-02"], + "value": [100, 110], + "unit_of_measure": ["ft^3/s", "ft^3/s"], + } + ) + ds = wdx._build_ragged( + df, + _meta(), + service="field-measurements", + schema=wdx._FIELD, + series_meta=_DISCHARGE_META, + default_cell_method="point", + ) + assert ds.sizes == {"obs": 2, "timeseries": 1} + assert "statistic_id" not in ds.coords + assert ds["timeseries_id"].values[0] == "USGS-1:00060" # no stat segment + assert ds["value"].attrs["cell_methods"] == "time: point" + assert ds["value"].attrs["long_name"] == "Discharge, cubic feet per second" + + +# --- dense opt-out wiring --------------------------------------------------- + + +def test_wrapper_defaults_to_ragged_and_dense_opt_out(monkeypatch): + # Default wrapper output is ragged; dense=True returns the gridded layout. + monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_daily_frame(), _meta())) + monkeypatch.setattr(wdx._TS_CACHE, "lookup", lambda site_ids: (_DISCHARGE_META, {})) + + ragged = wdx.get_daily(monitoring_location_id="USGS-1") + assert "obs" in ragged.sizes and "value" in ragged.data_vars + assert "discharge" not in ragged.data_vars + + dense = wdx.get_daily(monitoring_location_id="USGS-1", dense=True) + assert "discharge" in dense.data_vars + assert "obs" not in dense.sizes + + +def test_series_wrapper_signature_advertises_dense_and_dataset_return(): + # functools.wraps would otherwise carry over the getter's DataFrame + # signature and hide ``dense``; _make_getter publishes an accurate one. + import inspect + + sig = inspect.signature(wdx.get_daily) + assert "monitoring_location_id" in sig.parameters # real args stay visible + dense = sig.parameters["dense"] + assert dense.kind is inspect.Parameter.KEYWORD_ONLY + assert dense.default is False + assert "Dataset" in str(sig.return_annotation) + assert wdx.get_daily.__annotations__["return"] == "xarray.Dataset" + + +def test_no_dense_wrappers_omit_dense_from_signature(): + # stats + samples have no dense layout, so ``dense`` must not be advertised + # (the wrapper still accept-and-ignores it; see the dense-warning tests). + import inspect + + for name in ("get_stats_por", "get_samples"): + sig = inspect.signature(getattr(wdx, name)) + assert "dense" not in sig.parameters, name + assert "Dataset" in str(sig.return_annotation), name + + +# --- water-quality samples -------------------------------------------------- + + +def _samples_frame(characteristics=("Temperature, water",), units=("deg C",)): + rows = [] + for ch, u in zip(characteristics, units): + rows.append( + { + "Location_Identifier": "USGS-1", + "Activity_StartDateTime": "2020-07-10T12:00:00Z", + "Result_Characteristic": ch, + "Result_SampleFraction": "Total", + "Result_Measure": 12.5, + "Result_MeasureUnit": u, + "Result_ResultDetectionCondition": None, + "Result_MeasureStatusIdentifier": "Provisional", + } + ) + return pd.DataFrame(rows) + + +def _samples_ds(frame): + """Build a samples ragged Dataset the way the get_samples service does.""" + return wdx._build_ragged( + frame, + _meta(), + service="samples", + schema=wdx._SAMPLES, + default_cell_method="point", + ) + + +def test_build_samples_single_characteristic(): + ds = _samples_ds(_samples_frame()) + assert set(ds.sizes) == {"obs", "timeseries"} + assert ds["value"].dims == ("obs",) + assert ds["characteristic"].values[0] == "Temperature, water" + assert ds["value"].attrs["long_name"] == "Temperature, water" + assert ds["value"].attrs["cell_methods"] == "time: point" + # censoring columns are ancillary, linked from value + assert "detection_condition" in ds.variables and "status" in ds.variables + assert ds["value"].attrs["ancillary_variables"] == "detection_condition status" + assert ds["timeseries_id"].attrs["cf_role"] == "timeseries_id" + + +def test_build_samples_mixed_characteristics_generic_value(): + ds = _samples_ds( + _samples_frame(("Temperature, water", "pH"), ("deg C", "std units")) + ) + assert ds.sizes["timeseries"] == 2 + assert ds["value"].attrs["long_name"] == "measured value" + assert set(ds["characteristic"].values) == {"Temperature, water", "pH"} + + +def test_get_samples_wrapper_builds_ragged(monkeypatch): + monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_samples_frame(), _meta())) + ds = wdx.get_samples(monitoringLocationIdentifier="USGS-1", include_hash=True) + assert "obs" in ds.sizes and "value" in ds.data_vars + assert ds["characteristic"].values[0] == "Temperature, water" + + +def test_prepare_values_missing_required_column_returns_empty(): + # A frame lacking a mandatory column (here: no value) must degrade to an + # empty Dataset, not raise KeyError from the column slim. + df = _daily_frame().drop(columns=["value"]) + ds = wdx._build_ragged(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert isinstance(ds, xr.Dataset) + assert list(ds.data_vars) == [] + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_build_samples_missing_value_returns_empty(): + # A non-result Samples profile (no Result_Measure -> no "value") must not + # crash; it has nothing to convert. + df = pd.DataFrame( + { + "Location_Identifier": ["USGS-1"], + "Activity_StartDateTime": ["2020-07-10T12:00:00Z"], + "Result_Characteristic": ["pH"], + } + ) + ds = _samples_ds(df) + assert list(ds.data_vars) == [] + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_get_samples_ignores_dense_with_warning(monkeypatch): + # dense=True is advertised generically; get_samples is always ragged, so it + # must accept-and-ignore (with a warning) rather than leak dense= to the + # underlying getter (which would TypeError). + monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_samples_frame(), _meta())) + with pytest.warns(UserWarning, match="no dense layout"): + ds = wdx.get_samples(monitoringLocationIdentifier="USGS-1", dense=True) + assert "obs" in ds.sizes # still ragged + + +def test_get_stats_ignores_dense_with_warning(monkeypatch): + # The stats layout has no dense grid either; dense=True is ignored with the + # same warning (consistent with samples), not silently swallowed. + frame = pd.DataFrame({"monitoring_location_id": ["USGS-1"], "p50_va": [120.0]}) + monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (frame, _meta())) + with pytest.warns(UserWarning, match="no dense layout"): + ds = wdx.get_stats_por(monitoring_location_id="USGS-1", dense=True) + assert isinstance(ds, xr.Dataset) + + +def test_timeseries_id_skips_missing_instance_key(): + # A NaN instance key (e.g. a characteristic with no sample fraction) must + # not render as a literal "nan" token in the cf_role timeseries_id. + df = _samples_frame() + df["Result_SampleFraction"] = None + ds = _samples_ds(df) + assert all("nan" not in tid for tid in ds["timeseries_id"].values) + assert ds["timeseries_id"].values[0] == "USGS-1:Temperature, water" + + +# --- metadata-lookup resilience + cache control ----------------------------- + + +def test_metadata_lookup_failure_degrades_with_warning(): + # The metadata lookup is supplementary (only the readable name + site + # descriptors); a fetch failure must warn and return empty rather than + # discard already-fetched data -- and must NOT cache the failure, so a + # later recovered call retries. + def boom(monitoring_location_id): + raise RuntimeError("network down") + + cache = wdx._MetadataCache(boom) + with pytest.warns(UserWarning, match="metadata lookup failed"): + param_meta, site_meta = cache.lookup(["USGS-1"]) + assert param_meta == {} and site_meta == {} + assert len(cache) == 0 # failure not cached -> retryable + + +def test_wrapper_survives_metadata_failure(monkeypatch): + # End to end: a failing metadata endpoint still yields a CF dataset built + # from the data; only the metadata-sourced long_name is missing. + monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_daily_frame(), _meta())) + + def boom(monitoring_location_id): + raise RuntimeError("network down") + + monkeypatch.setattr(wdx._TS_CACHE, "_getter", boom) + wdx._TS_CACHE.clear() + with pytest.warns(UserWarning, match="metadata lookup failed"): + ds = wdx.get_daily(monitoring_location_id="USGS-1") + assert "value" in ds.data_vars # observations survived + # standard_name/units come from the frame; long_name (metadata) is absent + assert "long_name" not in ds["value"].attrs + assert ds["value"].attrs["units"] == "ft3 s-1" + + +def test_metadata_cache_bounded_and_clearable(): + # The cache is bounded (FIFO eviction) so a long-running, many-site + # process can't grow it without limit. + def fake(monitoring_location_id): + rows = [ + { + "monitoring_location_id": s, + "parameter_code": "00060", + "parameter_name": f"name-{s}", + } + for s in monitoring_location_id + ] + return pd.DataFrame(rows), SimpleNamespace(url=None) + + cache = wdx._MetadataCache(fake, maxsize=3) + for i in range(10): + cache.lookup([f"S{i}"]) + assert len(cache) <= 3 + + cache.lookup(["S99"]) + assert len(cache) > 0 + cache.clear() # instance opt-out empties it + assert len(cache) == 0 + + # the public helper clears the real per-process caches + wdx._TS_CACHE._entries["X"] = {"params": {}, "site": {}} + wdx._FIELD_CACHE._entries["Y"] = {"params": {}, "site": {}} + wdx.clear_metadata_cache() + assert len(wdx._TS_CACHE) == 0 and len(wdx._FIELD_CACHE) == 0 From de3239f4064906acef5d590a6677f3a43542e6bc Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 5 Jun 2026 08:34:32 -0500 Subject: [PATCH 2/7] fix(waterdata.xarray): default to the dense grid and harden edge cases Make the (monitoring_location_id, time) dense grid the default layout (pass dense=False for the contiguous ragged array), add select_series() and clear_metadata_cache(), and fix defects found in review: - NaN-truthiness: a present-but-null parameter_name/description no longer masks the fallback (was naming a variable "nan" / dropping long_name) - dense (site, time) collisions dedup deterministically (smallest value) rather than keeping an arbitrary upstream-order row - partial point geometry keeps a numeric NaN-filled lon/lat coordinate instead of a CF-invalid object array - list-valued flag columns (qualifier) are flattened to strings so the datasets serialize to netCDF - omit CF featureType on the preliminary flat stats layout Rewrite the demo notebook around the dense default and refresh the README. 56 offline tests pass; demo executes clean against the live API. Co-Authored-By: Claude Opus 4.8 --- README.md | 26 ++++ dataretrieval/waterdata/xarray.py | 249 ++++++++++++++++++++++-------- demos/waterdata_xarray_demo.ipynb | 244 +++++++++++++---------------- tests/waterdata_xarray_test.py | 181 ++++++++++++++++++++-- 4 files changed, 488 insertions(+), 212 deletions(-) diff --git a/README.md b/README.md index e56d31c2..c149e29e 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,32 @@ import logging logging.basicConfig(level=logging.DEBUG) ``` +#### Returning xarray Datasets + +The Water Data time-series, peaks, field-measurement, statistics, and samples +getters have [xarray](https://docs.xarray.dev/) counterparts in +`dataretrieval.waterdata.xarray` that return a CF-conventions +`xarray.Dataset` instead of a DataFrame — ready to write to netCDF or hand to +the CF-aware scientific Python stack. Install the optional dependency with +`pip install dataretrieval[xarray]`: + +```python +from dataretrieval.waterdata import xarray as wdx + +# Same arguments as waterdata.get_daily, but returns a CF Dataset +ds = wdx.get_daily( + monitoring_location_id='USGS-01646500', + parameter_code='00060', # Discharge + time='2024-10-01/2025-09-30', +) +``` + +See the +[xarray demo notebook](https://github.com/DOI-USGS/dataretrieval-python/blob/main/demos/waterdata_xarray_demo.ipynb) +for a walkthrough of the default dense `(site, time)` grid and the +`dense=False` contiguous-ragged layout for large multi-site pulls, the CF +metadata, and writing to netCDF. + ### Water Quality Portal (WQP) Access water quality data from multiple agencies: diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 0eef2c2e..b15912c0 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -4,24 +4,25 @@ :mod:`dataretrieval.waterdata`, but returns a CF-conventions :class:`xarray.Dataset` instead of a :class:`pandas.DataFrame`. -By default the data is returned as a CF *contiguous ragged array* -(``featureType = "timeSeries"``): every observation is concatenated along a -single ``obs`` dimension, and each (monitoring location, parameter, -statistic) series is one instance along a ``timeseries`` dimension whose -``row_size`` records how many observations it contributes. This stores only -real observations -- no NaN fill -- so it scales to large, very ragged -multi-site pulls where record lengths differ by decades. Parameter, -statistic, unit, and location identity become per-instance metadata, and -``time`` is a 1-D coordinate along ``obs``. The trade-off is that ``time`` -is no longer a dimension you can index directly: to select by time you -first regroup the flat ``obs`` back into per-series views (e.g. with -``cf-xarray``, or via the offsets implied by ``row_size``). - -Pass ``dense=True`` for the alternative gridded layout: one named data -variable per parameter on a ``(monitoring_location_id, time)`` grid, NaN -where a series has no observation. This is ergonomic for a few overlapping -series (``ds["discharge"].sel(time=...)`` just works) but the union time -axis and NaN fill make it memory-costly for large ragged collections. +By default the data is returned on a CF ``(monitoring_location_id, time)`` +grid (``featureType = "timeSeries"``): one named data variable per parameter +(``discharge``, ``temperature_water``, ...), NaN where a series has no +observation. This is the ergonomic layout -- ``ds["discharge"].sel( +monitoring_location_id=..., time=...)`` just works -- and +``monitoring_location_id`` is the instance dimension carrying +``cf_role = "timeseries_id"``. The cost is the union time axis and NaN fill, +which grow for large, very ragged multi-site collections. + +Pass ``dense=False`` for the alternative CF *contiguous ragged array*: every +observation is concatenated along a single ``obs`` dimension, and each +(monitoring location, parameter, statistic) series is one instance along a +``timeseries`` dimension whose ``row_size`` records how many observations it +contributes. This stores only real observations -- no NaN fill -- so it +scales to large multi-site pulls where record lengths differ by decades. +Parameter, statistic, unit, and location identity become per-instance +metadata and ``time`` is a 1-D coordinate along ``obs``, so to select one +series use :func:`select_series` (or regroup the flat ``obs`` via the offsets +implied by ``row_size``, e.g. with ``cf-xarray``). Either way the CF metadata is derived from columns the getter already returns (``unit_of_measure`` -> ``units``, ``statistic_id`` -> @@ -60,6 +61,7 @@ from dataclasses import dataclass, field, replace from functools import wraps as _wraps +import numpy as _np import pandas as _pd try: @@ -81,6 +83,7 @@ __all__ = [ "clear_metadata_cache", + "select_series", "get_continuous", "get_daily", "get_field_measurements", @@ -139,15 +142,12 @@ def _slug(name) -> str: return s or "value" -def _first(series): - """First non-null value of a column, or None.""" - nonnull = series.dropna() - return nonnull.iloc[0] if len(nonnull) else None - - def _first_present(frame, col): """First non-null value of ``col`` if the column is present, else None.""" - return _first(frame[col]) if col in frame else None + if col not in frame: + return None + nonnull = frame[col].dropna() + return nonnull.iloc[0] if len(nonnull) else None def _unique_present(frame, col): @@ -155,6 +155,30 @@ def _unique_present(frame, col): return frame[col].dropna().unique() if col in frame else [] +def _none_if_nan(value): + """Collapse a pandas NaN to None. + + A NaN is *truthy*, so ``meta.get(col) or fallback`` keeps the NaN instead of + falling back. Metadata frames yield NaN for a column that is present but null + for a given parameter, so normalize it to None before any such ``or`` chain. + """ + return value if _pd.notna(value) else None + + +def _scalarize(cell): + """Collapse a list-valued flag cell into a single string; pass scalars through. + + The Water Data API returns some ancillary (flag) columns -- notably + ``qualifier`` -- as a *list* of codes per observation. xarray/netCDF cannot + encode an object array whose elements are lists, so join the codes into one + space-separated string (an empty list becomes None). + """ + if isinstance(cell, (list, tuple)): + parts = [str(v) for v in cell if v is not None and _pd.notna(v)] + return " ".join(parts) if parts else None + return cell + + def _sites(df): """Unique monitoring-location ids present in a values frame.""" if _SITE in df: @@ -203,10 +227,10 @@ def _point_coords(df, site): if geo.empty: return None lon, lat = {}, {} - for _, row in geo.iterrows(): - xy = _lonlat(row["geometry"]) + for site_id, geom in zip(geo[site].to_numpy(), geo["geometry"].to_numpy()): + xy = _lonlat(geom) if xy is not None: - lon[row[site]], lat[row[site]] = xy + lon[site_id], lat[site_id] = xy if not lon: return None # no point geometry; skip rather than guess return lon, lat @@ -241,6 +265,11 @@ def _prepare_values(df, group_cols, ancillary_cols): work["time"], format="ISO8601", errors="coerce", utc=True ).dt.tz_localize(None) work["value"] = _pd.to_numeric(work["value"], errors="coerce") + # Flatten any list-valued flag cells (e.g. ``qualifier``) to strings so the + # ancillary variables stay netCDF-encodable (``_scalarize`` passes scalars + # through unchanged). + for c in ancillary: + work[c] = work[c].map(_scalarize) n_before = len(work) work = work[work["time"].notna()] dropped = n_before - len(work) @@ -266,8 +295,10 @@ def _var_attrs(desc, *, unit, pcode, stat, default_cell_method): names follow the (layout-specific) data-variable name. """ attrs: dict[str, str] = {} - long_name = desc.get("parameter_description") or desc.get("parameter_name") - if long_name and _pd.notna(long_name): + long_name = _none_if_nan(desc.get("parameter_description")) or _none_if_nan( + desc.get("parameter_name") + ) + if long_name: attrs["long_name"] = str(long_name) if unit is not None and _pd.notna(unit): @@ -294,28 +325,36 @@ def _var_attrs(desc, *, unit, pcode, stat, default_cell_method): return attrs -def _dataset_attrs(service, base_meta): - """Dataset-level provenance (CF + ACDD) attributes.""" +def _dataset_attrs(service, base_meta, *, feature_type): + """Dataset-level provenance (CF + ACDD) attributes. + + ``feature_type`` is the CF discrete-sampling ``featureType`` to advertise, or + None to omit it (sourced from the builder's ``feature_type`` class attr). + The series layouts are genuine ``timeSeries`` geometries; the preliminary + flat stats table is not a DSG at all, so it passes None rather than mislabel + itself. + """ attrs = { "Conventions": "CF-1.11", "institution": "U.S. Geological Survey", "source": f"USGS Water Data API ({service})", - "featureType": "timeSeries", "history": ( f"{_dt.datetime.now(_dt.timezone.utc).isoformat(timespec='seconds')} " "created by dataretrieval.waterdata.xarray" ), } + if feature_type: + attrs["featureType"] = feature_type url = getattr(base_meta, "url", None) if url: attrs["references"] = str(url) return attrs -def _empty_dataset(service, base_meta): +def _empty_dataset(service, base_meta, *, feature_type): """An attribute-only Dataset for an empty / unconvertible response.""" ds = _xr.Dataset() - ds.attrs = _dataset_attrs(service, base_meta) + ds.attrs = _dataset_attrs(service, base_meta, feature_type=feature_type) return ds @@ -434,6 +473,63 @@ def clear_metadata_cache(): _FIELD_CACHE.clear() +def select_series(ds, **keys): + """Extract one time series from a ragged-layout Dataset by label. + + The ragged layout (``dense=False``) stores each series as a contiguous block + along ``obs`` located by ``row_size``, so ``time`` is a flat ``obs`` axis and + a single series is addressed by its per-series coordinates -- + ``parameter_code``, ``monitoring_location_id``, ``statistic_id`` (or, for + samples, ``characteristic`` / ``sample_fraction``) -- rather than a named + variable:: + + ds = wdx.get_daily( + monitoring_location_id=[...], parameter_code=[...], dense=False + ) + q = select_series( + ds, monitoring_location_id="USGS-05407000", parameter_code="00060" + ) + q["value"].sel(time="2023-07") # time is now a real dimension + + This is the ragged-layout counterpart to ``ds[name].sel(...)`` on the default + dense Dataset. It selects the one matching instance and returns its + observations as a 1-D, time-indexed ``xarray.Dataset`` (``value`` plus any + ancillary flag variables), with the series' identity carried as scalar + coordinates. Raises ``KeyError`` if no instance matches and ``ValueError`` + if more than one does (add keys to disambiguate). + """ + if "row_size" not in ds.variables or "obs" not in ds.dims: + raise ValueError( + "select_series expects a ragged Dataset (from dense=False); the " + "default dense Dataset already exposes one variable per parameter, " + "so select by name instead, e.g. " + "ds[variable].sel(monitoring_location_id=...)." + ) + inst_coords = [c for c in ds.coords if ds[c].dims == ("timeseries",)] + mask = _np.ones(ds.sizes["timeseries"], dtype=bool) + for key, value in keys.items(): + if key not in inst_coords: + raise KeyError( + f"{key!r} is not a per-series coordinate; choose from {inst_coords}." + ) + mask &= ds[key].to_numpy() == value + matches = _np.flatnonzero(mask) + if matches.size == 0: + raise KeyError(f"no time series matches {keys}.") + if matches.size > 1: + raise ValueError( + f"{matches.size} time series match {keys}; add more of " + f"{inst_coords} to select exactly one." + ) + i = int(matches[0]) + # Slice this instance's contiguous obs block (the CF sample_dimension link), + # then promote time from an obs coordinate to the dimension. + starts = _np.concatenate([[0], _np.cumsum(ds["row_size"].to_numpy())]) + block = slice(int(starts[i]), int(starts[i + 1])) + series = ds.isel(timeseries=i, obs=block).swap_dims(obs="time") + return series.drop_vars("row_size", errors="ignore") + + # === column schemas ======================================================== # Water-quality samples (Samples DB / WQX) speak a different column vocabulary @@ -516,6 +612,10 @@ class _DatasetBuilder: :func:`_build_stats` helpers rather than directly. """ + # The CF ``featureType`` this layout produces (None to omit it). The series + # layouts are timeSeries DSGs; the flat stats table overrides this to None. + feature_type = "timeSeries" + def __init__(self, df, base_meta, *, service): self.df = df self.base_meta = base_meta @@ -528,11 +628,15 @@ def _is_empty(self): return self.df is None or len(self.df) == 0 def _empty(self): - return _empty_dataset(self.service, self.base_meta) + return _empty_dataset( + self.service, self.base_meta, feature_type=self.feature_type + ) def _apply_provenance(self, ds): """Set the dataset-level CF/ACDD attributes and ``date_modified``.""" - ds.attrs = _dataset_attrs(self.service, self.base_meta) + ds.attrs = _dataset_attrs( + self.service, self.base_meta, feature_type=self.feature_type + ) dm = _date_modified(self.df) if dm: ds.attrs["date_modified"] = dm @@ -593,9 +697,12 @@ def _add_spatial_coords(self, ds, dim, order): coords = _point_coords(self.df, _SITE) if coords is not None: lon, lat = coords + # Fill sites lacking point geometry with NaN (not None) so the + # coordinate stays a numeric float array -- a CF longitude/latitude + # coord must be numeric; an object array with None is CF-invalid. ds = ds.assign_coords( - longitude=(dim, [lon.get(k) for k in order]), - latitude=(dim, [lat.get(k) for k in order]), + longitude=(dim, [lon.get(k, _np.nan) for k in order]), + latitude=(dim, [lat.get(k, _np.nan) for k in order]), ) ds["longitude"].attrs = { "standard_name": "longitude", @@ -617,7 +724,7 @@ def _add_spatial_coords(self, ds, dim, order): class _RaggedBuilder(_SeriesBuilder): - """CF *contiguous ragged array* layout (the default). + """CF *contiguous ragged array* layout (the ``dense=False`` opt-in). All observations are concatenated into one ``obs`` dimension; each series (a ``schema.group_cols`` combination at a site) is an instance along @@ -734,10 +841,15 @@ def _finalize(self, ds, inst_frame, ancillary, value_attrs): } # Join the instance keys into a cf_role id, skipping null parts so a # missing key (e.g. a characteristic with no sample fraction) doesn't - # render as a literal "nan" token. - ts_id = inst_frame.apply( - lambda r: ":".join(str(x) for x in r if _pd.notna(x)), axis=1 - ).to_numpy() + # render as a literal "nan" token. Iterating the row arrays avoids the + # per-instance Series boxing of ``apply(axis=1)``. + ts_id = _np.array( + [ + ":".join(str(x) for x in row if _pd.notna(x)) + for row in inst_frame.to_numpy() + ], + dtype=object, + ) ds = ds.assign_coords(timeseries_id=("timeseries", ts_id)) ds["timeseries_id"].attrs["cf_role"] = "timeseries_id" ds[_SITE].attrs.setdefault("long_name", "monitoring location identifier") @@ -745,10 +857,10 @@ def _finalize(self, ds, inst_frame, ancillary, value_attrs): class _DenseBuilder(_SeriesBuilder): - """Dense ``(monitoring_location_id, time)`` grid: one named variable per - parameter, NaN where a series has no observation. Ergonomic for a few - overlapping series but memory-costly for large ragged collections; see - :class:`_RaggedBuilder` for the default. + """Dense ``(monitoring_location_id, time)`` grid (the default): one named + variable per parameter, NaN where a series has no observation. Ergonomic for + a few overlapping series but memory-costly for large ragged collections; see + :class:`_RaggedBuilder` for the ``dense=False`` scaling layout. """ def _build_series(self, work, group_cols, ancillary, has_unit): @@ -793,10 +905,13 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit): if not sub.index.is_unique: _warnings.warn( f"'{name}' has multiple values per (site, time) -- two series " - "share this (site, parameter, statistic); keeping the first. " - "Filter the query to separate them.", + "share this (site, parameter, statistic); keeping the smallest " + "value. Filter the query to separate them.", stacklevel=3, ) + # Sort by value first so the retained value is deterministic + # rather than dependent on the upstream response row order. + sub = sub.sort_values("value") sub = sub[~sub.index.duplicated(keep="first")] ds_g = sub.to_xarray().rename( {"value": name, **{c: f"{name}_{c}" for c in ancillary}} @@ -818,7 +933,7 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit): @staticmethod def _variable_name(desc, pcode, stat, used): """A unique slug for a variable; disambiguate same-parameter series.""" - name = _slug(desc.get("parameter_name") or pcode or "value") + name = _slug(_none_if_nan(desc.get("parameter_name")) or pcode or "value") if name in used: # same parameter, different statistic -> distinct var op = CF_CELL_METHODS.get(str(stat)) or (str(stat) if stat else None) name = f"{name}_{_slug(op)}" if op else name @@ -836,6 +951,10 @@ class _StatsBuilder(_DatasetBuilder): provenance only. A richer percentile / day-of-year layout is future work. """ + # Not a CF discrete-sampling geometry: a flat percentile table has no + # obs/time/cf_role/sample_dimension, so it must not claim a ``featureType``. + feature_type = None + def build(self): if self._is_empty(): return self._empty() @@ -896,9 +1015,10 @@ def _xr_doc(func, *, cf_metadata=True, allow_dense=True): ) elif allow_dense: returns = ( - "a CF-conventions ``xarray.Dataset`` as a contiguous ragged array " - "(pass ``dense=True`` for the NaN-filled (site, time) grid with one " - "named variable per parameter)" + "a CF-conventions ``xarray.Dataset`` on a (site, time) grid with one " + "named variable per parameter (pass ``dense=False`` for the " + "contiguous ragged array that stores only real observations and " + "scales to large, very ragged multi-site pulls)" ) else: returns = ( @@ -924,9 +1044,10 @@ def _public_signature(wrapped, *, has_dense): """The signature ``help()`` / IDEs should show for a wrapper. Keeps the wrapped getter's own parameters (so its real arguments stay - discoverable), adds the keyword-only ``dense`` toggle where the layout - offers it, and declares the ``xarray.Dataset`` return -- correcting the - DataFrame signature that ``functools.wraps`` would otherwise carry over. + discoverable), adds the keyword-only ``dense`` toggle (default ``True``; + pass ``dense=False`` for the ragged layout) where the layout offers it, and + declares the ``xarray.Dataset`` return -- correcting the DataFrame signature + that ``functools.wraps`` would otherwise carry over. """ sig = _inspect.signature(wrapped) params = list(sig.parameters.values()) @@ -944,7 +1065,7 @@ def _public_signature(wrapped, *, has_dense): _inspect.Parameter( "dense", _inspect.Parameter.KEYWORD_ONLY, - default=False, + default=True, annotation="bool", ), ) @@ -974,10 +1095,14 @@ class _Service: def _make_getter(spec): """Build the public getter for one ``_Service`` spec (Factory).""" - has_dense = spec.layout != "stats" and spec.allow_dense + is_stats = spec.layout == "stats" + has_dense = not is_stats and spec.allow_dense + # The (site, time) grid is the default where it is offered; ``dense=False`` + # opts into the ragged array. Services without a grid (samples, stats) + # default to their only layout, so a plain call never trips the warning. @_wraps(spec.getter) # carry __name__/__wrapped__ through to help()/IDEs - def getter(*args, dense=False, **kwargs): + def getter(*args, dense=has_dense, **kwargs): if dense and not has_dense: _warnings.warn( f"{spec.getter.__name__} has no dense layout; ignoring dense=True.", @@ -985,7 +1110,7 @@ def getter(*args, dense=False, **kwargs): ) dense = False df, base_meta = _fetch(spec.getter, args, kwargs) - if spec.layout == "stats": + if is_stats: return _build_stats(df, base_meta, spec.service) if spec.metadata_cache is not None: series_meta, site_meta = spec.metadata_cache.lookup(_sites(df)) @@ -1009,7 +1134,7 @@ def getter(*args, dense=False, **kwargs): # returning an ``xarray.Dataset``. getter.__doc__ = _xr_doc( spec.getter, - cf_metadata=(spec.layout != "stats"), + cf_metadata=not is_stats, allow_dense=spec.allow_dense, ) getter.__signature__ = _public_signature(spec.getter, has_dense=has_dense) diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb index 3aa28246..55e9fcea 100644 --- a/demos/waterdata_xarray_demo.ipynb +++ b/demos/waterdata_xarray_demo.ipynb @@ -14,12 +14,17 @@ "written onto the dataset as CF attributes, so the result is self-describing and\n", "ready to write to netCDF.\n", "\n", + "By **default** the data is returned on a `(monitoring_location_id, time)` grid —\n", + "one named variable per parameter, so `ds[\"discharge\"].sel(time=...)` just works.\n", + "For large, very ragged multi-site pulls, pass `dense=False` to get the compact CF\n", + "*contiguous ragged array* instead.\n", + "\n", "This notebook covers:\n", "\n", - "1. the default **ragged** layout and how to read it;\n", - "2. **why** ragged is the default (and the `dense=True` opt-out);\n", - "3. the one trade-off you need to know — **selecting by time**;\n", - "4. multiple parameters in one pull;\n", + "1. a single time series (the default dense grid);\n", + "2. multiple parameters and sites on one grid;\n", + "3. the **ragged** layout (`dense=False`) for large, very ragged pulls;\n", + "4. selecting one series from a ragged dataset;\n", "5. water-quality samples (`get_samples`);\n", "6. writing CF netCDF." ] @@ -48,8 +53,6 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "\n", "from dataretrieval.waterdata import xarray as wdx" ] }, @@ -60,7 +63,8 @@ "## 1. A single time series\n", "\n", "Pull a year of daily mean discharge (parameter `00060`, statistic `00003`) at one\n", - "gage. The wrapper takes the same arguments as `waterdata.get_daily`." + "gage. The wrapper takes the same arguments as `waterdata.get_daily` and returns a\n", + "CF dataset:" ] }, { @@ -71,8 +75,8 @@ "source": [ "ds = wdx.get_daily(\n", " monitoring_location_id=\"USGS-05407000\",\n", - " parameter_code=\"00060\",\n", - " statistic_id=\"00003\",\n", + " parameter_code=\"00060\", # discharge\n", + " statistic_id=\"00003\", # daily mean\n", " time=\"2023-01-01/2024-01-01\",\n", ")\n", "ds" @@ -82,19 +86,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note the shape of the result — this is a CF **contiguous ragged array**\n", + "The result is a CF `(monitoring_location_id, time)` grid\n", "(`featureType = \"timeSeries\"`):\n", "\n", - "* every observation lives along a single `obs` dimension;\n", - "* each *series* — one `(monitoring location, parameter, statistic)` — is one\n", - " instance along the `timeseries` dimension;\n", - "* `row_size` records how many observations each series contributes (the CF\n", - " `sample_dimension` link), and `timeseries_id` carries `cf_role`;\n", - "* because there is a single, homogeneous parameter here, the descriptors land\n", - " directly on `value` (`long_name`, `units`, `cell_methods`, `standard_name`).\n", - "\n", - "The flag columns (`qualifier`, `approval_status`) are linked as\n", - "`ancillary_variables`, and dataset attributes carry provenance:" + "* each parameter is its own **named variable** (`discharge`), so `time` is a real\n", + " dimension you can index by label;\n", + "* the descriptors derived from the data land on the variable as CF attributes\n", + " (`long_name`, `units`, `cell_methods`, `standard_name`, `usgs_parameter_code`);\n", + "* `monitoring_location_id` is the instance dimension and carries\n", + " `cf_role = \"timeseries_id\"`; the flag columns (`qualifier`, `approval_status`)\n", + " are linked as `ancillary_variables`;\n", + "* dataset attributes carry provenance (`Conventions`, `references`,\n", + " `date_modified`)." ] }, { @@ -103,19 +106,23 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds[\"value\"].attrs)\n", - "print({k: ds.attrs[k] for k in (\"Conventions\", \"featureType\", \"references\", \"date_modified\")})" + "print(ds[\"discharge\"].attrs)\n", + "print({k: ds.attrs[k] for k in (\"Conventions\", \"featureType\", \"references\")})\n", + "\n", + "# time is a real dimension -> label-based selection just works\n", + "ds[\"discharge\"].sel(time=\"2023-07-01\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Why ragged is the default\n", + "## 2. Multiple parameters and sites\n", "\n", - "Real collections are *ragged*: some gages have a century of record, others a few\n", - "years. Pull discharge at two gages with very different start dates and look at\n", - "`row_size`:" + "Ask for several parameters across several gages in one call. Each parameter\n", + "becomes its own named variable on a shared `(monitoring_location_id, time)`\n", + "grid, NaN where a series has no observation (e.g. a gage that does not report a\n", + "given parameter):" ] }, { @@ -124,27 +131,23 @@ "metadata": {}, "outputs": [], "source": [ - "sites = [\"USGS-05407000\", \"USGS-02238500\"] # records since 1913 vs 2008\n", - "# Bound the window so the docs build stays light; the start-date gap still\n", - "# shows up as different row_size (~24 vs ~16 years of daily values).\n", - "ragged = wdx.get_daily(\n", - " monitoring_location_id=sites,\n", - " parameter_code=\"00060\",\n", + "grid = wdx.get_daily(\n", + " monitoring_location_id=[\"USGS-05407000\", \"USGS-03339000\"],\n", + " parameter_code=[\"00060\", \"00010\"], # discharge + water temperature\n", " statistic_id=\"00003\",\n", - " time=\"2000-01-01/2024-01-01\",\n", + " time=\"2023-01-01/2024-01-01\",\n", ")\n", - "print(\"dims :\", dict(ragged.sizes))\n", - "print(\"row_size :\", dict(zip(ragged[\"monitoring_location_id\"].values, ragged[\"row_size\"].values)))" + "print(\"dims :\", dict(grid.sizes))\n", + "print(\"variables :\", list(grid.data_vars))\n", + "grid" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The alternative is a **dense** `(monitoring_location_id, time)` grid — one named\n", - "variable per parameter, NaN where a series has no observation. It is convenient\n", - "(see the next section) but pays for a union time axis and NaN fill. Pass\n", - "`dense=True` to get it, and compare the in-memory size:" + "Now any slice is a labelled `.sel(...)` — one site's series, or all sites on a\n", + "given day as a vector:" ] }, { @@ -153,38 +156,26 @@ "metadata": {}, "outputs": [], "source": [ - "dense = wdx.get_daily(\n", - " monitoring_location_id=sites,\n", - " parameter_code=\"00060\",\n", - " statistic_id=\"00003\",\n", - " time=\"2000-01-01/2024-01-01\",\n", - " dense=True,\n", - ")\n", - "print(\"dense dims :\", dict(dense.sizes))\n", - "print(\"dense var :\", list(dense.data_vars))\n", - "print(f\"ragged {ragged.nbytes/1e6:6.2f} MB dense {dense.nbytes/1e6:6.2f} MB\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "With only two gages the gap is modest, but it grows fast: a whole state's daily\n", - "discharge can be 15–30% dense, where the gridded layout balloons to hundreds of\n", - "megabytes (mostly NaN) while the ragged array stores only real observations.\n", - "That is why ragged is the default." + "# one site's discharge series\n", + "q = grid[\"discharge\"].sel(monitoring_location_id=\"USGS-05407000\")\n", + "print(\"single series :\", dict(q.sizes))\n", + "\n", + "# every site's discharge on one day\n", + "grid[\"discharge\"].sel(time=\"2023-07-01\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. The trade-off: selecting by time\n", + "## 3. The ragged layout for large, very ragged pulls\n", "\n", - "This is the one thing to internalize about the ragged layout.\n", + "The dense grid is ergonomic, but it pays for a union time axis and NaN fill.\n", + "Real collections are *ragged*: some gages have a century of record, others a few\n", + "years. For large multi-site pulls, pass `dense=False` to get the CF **contiguous\n", + "ragged array**, which stores only real observations — no NaN fill.\n", "\n", - "In the **dense** dataset, `time` is a real dimension with an index, so\n", - "label-based selection just works:" + "Pull discharge at two gages with very different start dates:" ] }, { @@ -193,39 +184,31 @@ "metadata": {}, "outputs": [], "source": [ - "# all sites on a given day, addressed as a (site, time) grid\n", - "dense[\"discharge\"].sel(time=\"2020-06-01\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the **ragged** dataset, `value` is one flat list along `obs`, and `time` is\n", - "*just another variable* riding along `obs` — not a dimension. So `value` cannot\n", - "be addressed as a `(site, time)` grid, and a time selection returns whatever\n", - "observations happen to match, **mixed across sites with no labels** (and\n", - "identical dates across sites collide onto the same axis):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"value dims:\", ragged[\"value\"].dims) # ('obs',) -- not (site, time)\n", - "# .sel(time=...) on the flat obs axis can't give you a per-series slice\n", - "ragged[\"value\"].sel(time=\"2020-06-01\")" + "sites = [\"USGS-05407000\", \"USGS-02238500\"] # records since 1913 vs 2008\n", + "# Bound the window so the docs build stays light; the start-date gap still\n", + "# shows up as different row_size.\n", + "ragged = wdx.get_daily(\n", + " monitoring_location_id=sites,\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\",\n", + " time=\"2000-01-01/2024-01-01\",\n", + " dense=False,\n", + ")\n", + "print(\"dims :\", dict(ragged.sizes))\n", + "print(\"row_size :\", dict(zip(\n", + " ragged[\"monitoring_location_id\"].values, ragged[\"row_size\"].values)))\n", + "ragged" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To recover the convenient, time-indexed view you first **regroup** the flat\n", - "`obs` back into per-series pieces using the offsets implied by `row_size`. A\n", - "tiny helper:" + "In the ragged array every observation lives along a single `obs` dimension; each\n", + "series — one `(monitoring location, parameter, statistic)` — is one instance\n", + "along `timeseries`, and `row_size` records how many observations it contributes\n", + "(the CF `sample_dimension` link). Because it stores no NaN fill, it stays compact\n", + "as collections get ragged. Compare the in-memory size against the dense grid:" ] }, { @@ -234,42 +217,36 @@ "metadata": {}, "outputs": [], "source": [ - "def series(ds, i):\n", - " \"\"\"Instance ``i`` of a ragged dataset as a time-indexed DataArray.\"\"\"\n", - " starts = np.concatenate([[0], np.cumsum(ds[\"row_size\"].values)])\n", - " sl = slice(int(starts[i]), int(starts[i + 1]))\n", - " da = ds[\"value\"].isel(obs=sl)\n", - " return da.assign_coords(time=ds[\"time\"].isel(obs=sl)).swap_dims(obs=\"time\")\n", - "\n", - "\n", - "s0 = series(ragged, 0)\n", - "print(ragged[\"timeseries_id\"].values[0])\n", - "s0.sel(time=\"2020-06-01\") # now .sel(time=...) works on a single series" + "dense = wdx.get_daily(\n", + " monitoring_location_id=sites,\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\",\n", + " time=\"2000-01-01/2024-01-01\",\n", + ") # default grid\n", + "print(f\"ragged {ragged.nbytes/1e6:6.2f} MB dense {dense.nbytes/1e6:6.2f} MB\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For anything beyond a one-off slice, the [`cf-xarray`](https://cf-xarray.readthedocs.io/)\n", - "package understands the CF ragged encoding (`cf_role`, `sample_dimension`) and\n", - "will regroup/decode for you. Or, when you know the pull is small and overlapping,\n", - "just ask for `dense=True` up front and use `.sel(time=...)` directly.\n", - "\n", - "**Rule of thumb:** ragged for storage and large multi-site pulls; `dense=True`\n", - "for ergonomic time-based slicing of a few overlapping series." + "With two gages the gap is modest, but it grows fast: a whole state's daily\n", + "discharge spans gages with wildly different record lengths, where the gridded\n", + "layout balloons (mostly NaN) while the ragged array stores only real\n", + "observations. **Rule of thumb:** the default grid for ergonomic, label-based\n", + "slicing of a few series; `dense=False` for storage and large, very ragged\n", + "multi-site pulls." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. Multiple parameters in one pull\n", + "## 4. Selecting one series from a ragged dataset\n", "\n", - "When a pull mixes parameters (and units), the ragged layout keeps a single\n", - "`value` and records the parameter/unit per *instance* — so nothing is\n", - "mislabeled. The homogeneous descriptors drop off `value` and live on the\n", - "`parameter_code` / `unit_of_measure` coordinates instead:" + "In the ragged layout `time` is a flat `obs` axis, not a dimension, so you can't\n", + "`.sel(time=...)` directly. Use `wdx.select_series` to pull one series by its\n", + "labels; it returns a tidy, time-indexed dataset on which `.sel(time=...)` works:" ] }, { @@ -278,22 +255,20 @@ "metadata": {}, "outputs": [], "source": [ - "multi = wdx.get_daily(\n", - " monitoring_location_id=\"USGS-05407000\",\n", - " parameter_code=[\"00060\", \"00045\"], # discharge + precipitation\n", - " time=\"2023-06-01/2023-07-01\",\n", + "one = wdx.select_series(\n", + " ragged, monitoring_location_id=\"USGS-05407000\", parameter_code=\"00060\"\n", ")\n", - "print(\"value long_name:\", multi[\"value\"].attrs.get(\"long_name\"))\n", - "print(\"per-instance parameter_code:\", multi[\"parameter_code\"].values)\n", - "print(\"per-instance unit_of_measure:\", multi[\"unit_of_measure\"].values)" + "print(\"dims :\", dict(one.sizes))\n", + "one[\"value\"].sel(time=\"2020-07\").mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In the `dense=True` form each parameter instead becomes its own named variable\n", - "(`discharge`, `precipitation`, …) on the shared time grid." + "For richer work, the [`cf-xarray`](https://cf-xarray.readthedocs.io/) package\n", + "understands the CF ragged encoding (`cf_role`, `sample_dimension`) and will\n", + "regroup/decode the whole dataset for you." ] }, { @@ -302,12 +277,13 @@ "source": [ "## 5. Water-quality samples\n", "\n", - "`get_samples` returns discrete water-quality results in the same ragged shape:\n", - "one instance per `(monitoring location, characteristic, sample fraction)`, with\n", - "the result value plus `detection_condition` and `status` as ancillary\n", - "(censoring) variables. Characteristics are free text, so no CF `standard_name`\n", - "is inferred and non-numeric results coerce to NaN (the `detection_condition`\n", - "variable preserves non-detects)." + "`get_samples` returns discrete water-quality results. Samples are irregular in\n", + "time and characteristic, so this getter is always ragged: one instance per\n", + "`(monitoring location, characteristic, sample fraction)`, with the result value\n", + "plus `detection_condition` and `status` as ancillary (censoring) variables.\n", + "Characteristics are free text, so no CF `standard_name` is inferred and\n", + "non-numeric results coerce to NaN (the `detection_condition` variable preserves\n", + "non-detects)." ] }, { @@ -334,10 +310,10 @@ "source": [ "## 6. Writing CF netCDF\n", "\n", - "Because the dataset already carries CF metadata in the standard ragged-array\n", - "encoding, it serializes straight to a self-describing netCDF file that\n", - "CF-aware tools (THREDDS, `cf-xarray`, Panoply, …) can read back (requires a\n", - "netCDF backend, e.g. `pip install netCDF4`):" + "Because the dataset already carries CF metadata, it serializes straight to a\n", + "self-describing netCDF file that CF-aware tools (THREDDS, `cf-xarray`, Panoply,\n", + "…) can read back. This works for either layout (requires a netCDF backend, e.g.\n", + "`pip install netCDF4`):" ] }, { @@ -350,7 +326,7 @@ "import tempfile\n", "\n", "path = os.path.join(tempfile.mkdtemp(), \"daily_discharge.nc\")\n", - "ragged.to_netcdf(path) # CF-1.11 contiguous ragged array\n", + "ds.to_netcdf(path) # the dense grid from section 1\n", "print(\"wrote\", path)" ] }, @@ -378,5 +354,5 @@ } }, "nbformat": 4, - "nbformat_minor": 5 + "nbformat_minor": 4 } diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index d74fc64c..27c4cd6a 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -109,6 +109,20 @@ def test_ancillary_variables_linked(): ) +def test_list_valued_qualifier_is_flattened_to_string(): + # The API returns ``qualifier`` as a list of codes per observation; the + # ancillary variable must be flattened to a netCDF-encodable string (object + # arrays of lists can't be written), empty lists -> missing. + df = _daily_frame() + df["qualifier"] = [["A", "e"], []] # multi-code list, then empty list + for build in (wdx._build_ragged, wdx._build_dense): + ds = build(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + qual = ds["qualifier"] if "qualifier" in ds else ds["discharge_qualifier"] + flat = qual.values.ravel().tolist() + assert not any(isinstance(v, (list, tuple)) for v in flat) # no list cells + assert "A e" in flat # multi-code list joined with a space + + def test_unknown_unit_passes_through_verbatim(): df = _daily_frame() df["unit_of_measure"] = "widgets/s" # units are read from the frame @@ -197,6 +211,22 @@ def test_build_stats_flat_dataset(): assert isinstance(ds, xr.Dataset) assert "p50_va" in ds.data_vars assert ds.attrs["Conventions"] == "CF-1.11" + # A flat percentile table is not a CF discrete-sampling geometry, so it must + # NOT advertise featureType=timeSeries (which would mislead cf-xarray/CF + # tooling into treating it as a timeseries DSG). + assert "featureType" not in ds.attrs + + +def test_stats_empty_omits_feature_type(): + # The empty-stats path must also stay free of the timeSeries featureType. + ds = wdx._build_stats(pd.DataFrame(), _meta(), "statistics") + assert list(ds.data_vars) == [] + assert "featureType" not in ds.attrs + # the series builders still advertise the DSG featureType + series = wdx._build_ragged( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert series.attrs["featureType"] == "timeSeries" def test_build_stats_drops_hash_columns(): @@ -578,33 +608,95 @@ def test_ragged_field_schema_without_statistic(): assert ds["value"].attrs["long_name"] == "Discharge, cubic feet per second" -# --- dense opt-out wiring --------------------------------------------------- +# --- select_series (label-based selection on the ragged layout) ------------- + + +def _two_instance_ragged(): + """Ragged Dataset with two instances at one site: 00060 and 00010.""" + meta = {**_DISCHARGE_META, **_TEMP_META} + return wdx._build_ragged( + pd.concat([_daily_frame(), _temp_frame()]), + _meta(), + service="continuous", + series_meta=meta, + ) + + +def test_select_series_returns_time_indexed_single_series(): + ds = _two_instance_ragged() + s = wdx.select_series(ds, monitoring_location_id="USGS-1", parameter_code="00060") + # time is now a real dimension (not the flat obs axis) + assert set(s.sizes) == {"time"} + assert s["value"].dims == ("time",) + assert list(s["value"].values) == [100, 110] + # the series identity rides along as scalar coordinates + assert str(s["parameter_code"].values) == "00060" + assert str(s["unit_of_measure"].values) == "ft^3/s" + # ancillary flags follow the series; row_size is dropped + assert "approval_status" in s.data_vars + assert "row_size" not in s.variables + # .sel(time=...) works now that time is the dimension + assert s["value"].sel(time="2024-06-01").item() == 100 + + +def test_select_series_ambiguous_raises(): + # selecting by site alone matches both instances -> ask for more keys + ds = _two_instance_ragged() + with pytest.raises(ValueError, match="2 time series match"): + wdx.select_series(ds, monitoring_location_id="USGS-1") -def test_wrapper_defaults_to_ragged_and_dense_opt_out(monkeypatch): - # Default wrapper output is ragged; dense=True returns the gridded layout. +def test_select_series_no_match_raises(): + ds = _two_instance_ragged() + with pytest.raises(KeyError, match="no time series matches"): + wdx.select_series(ds, parameter_code="99999") + + +def test_select_series_unknown_key_raises(): + ds = _two_instance_ragged() + with pytest.raises(KeyError, match="not a per-series coordinate"): + wdx.select_series(ds, bogus="x") + + +def test_select_series_on_dense_raises_helpful_error(): + # On a dense Dataset, parameters are named variables; select_series points + # the user at ds[name].sel(...) instead of failing cryptically. + dense = wdx._build_dense( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + with pytest.raises(ValueError, match="expects a ragged Dataset"): + wdx.select_series(dense, monitoring_location_id="USGS-1") + + +# --- ragged opt-out wiring -------------------------------------------------- + + +def test_wrapper_defaults_to_dense_and_ragged_opt_out(monkeypatch): + # Default wrapper output is the (site, time) grid; dense=False opts into the + # ragged array. monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_daily_frame(), _meta())) monkeypatch.setattr(wdx._TS_CACHE, "lookup", lambda site_ids: (_DISCHARGE_META, {})) - ragged = wdx.get_daily(monitoring_location_id="USGS-1") - assert "obs" in ragged.sizes and "value" in ragged.data_vars - assert "discharge" not in ragged.data_vars - - dense = wdx.get_daily(monitoring_location_id="USGS-1", dense=True) + dense = wdx.get_daily(monitoring_location_id="USGS-1") assert "discharge" in dense.data_vars assert "obs" not in dense.sizes + ragged = wdx.get_daily(monitoring_location_id="USGS-1", dense=False) + assert "obs" in ragged.sizes and "value" in ragged.data_vars + assert "discharge" not in ragged.data_vars + def test_series_wrapper_signature_advertises_dense_and_dataset_return(): # functools.wraps would otherwise carry over the getter's DataFrame # signature and hide ``dense``; _make_getter publishes an accurate one. + # dense defaults to True (the grid is the default; dense=False -> ragged). import inspect sig = inspect.signature(wdx.get_daily) assert "monitoring_location_id" in sig.parameters # real args stay visible dense = sig.parameters["dense"] assert dense.kind is inspect.Parameter.KEYWORD_ONLY - assert dense.default is False + assert dense.default is True assert "Dataset" in str(sig.return_annotation) assert wdx.get_daily.__annotations__["return"] == "xarray.Dataset" @@ -755,8 +847,10 @@ def boom(monitoring_location_id): def test_wrapper_survives_metadata_failure(monkeypatch): - # End to end: a failing metadata endpoint still yields a CF dataset built - # from the data; only the metadata-sourced long_name is missing. + # End to end on the DEFAULT (dense) layout: a failing metadata endpoint still + # yields a CF dataset built from the data. With no metadata name the variable + # falls back to the parameter code -- never the literal "nan" -- and only the + # metadata-sourced long_name is missing. monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_daily_frame(), _meta())) def boom(monitoring_location_id): @@ -765,11 +859,66 @@ def boom(monitoring_location_id): monkeypatch.setattr(wdx._TS_CACHE, "_getter", boom) wdx._TS_CACHE.clear() with pytest.warns(UserWarning, match="metadata lookup failed"): - ds = wdx.get_daily(monitoring_location_id="USGS-1") - assert "value" in ds.data_vars # observations survived - # standard_name/units come from the frame; long_name (metadata) is absent - assert "long_name" not in ds["value"].attrs - assert ds["value"].attrs["units"] == "ft3 s-1" + ds = wdx.get_daily(monitoring_location_id="USGS-1") # default (dense) + assert "00060" in ds.data_vars # observations survived under the code name + assert "nan" not in ds.data_vars # never a literal "nan" variable + v = ds["00060"] + assert "long_name" not in v.attrs # name comes from metadata, which failed + assert v.attrs["units"] == "ft3 s-1" # units come from the frame + + +def test_nan_parameter_description_falls_back_to_name(): + # A present-but-null parameter_description (NaN is truthy) must not mask the + # valid parameter_name; long_name falls back to the name. + meta = { + "00060": {"parameter_name": "Discharge", "parameter_description": float("nan")} + } + ds = wdx._build_dense(_daily_frame(), _meta(), service="daily", series_meta=meta) + assert ds["discharge"].attrs["long_name"] == "Discharge" + + +def test_nan_parameter_name_uses_code_not_literal_nan(): + # A present-but-null parameter_name must fall back to the parameter code, + # never produce a variable literally named "nan". + meta = {"00060": {"parameter_name": float("nan")}} + ds = wdx._build_dense(_daily_frame(), _meta(), service="daily", series_meta=meta) + assert "nan" not in ds.data_vars + assert "00060" in ds.data_vars + + +def test_dense_collision_dedup_is_order_independent(): + # Colliding (site, time) values must dedup deterministically (smallest), + # independent of the upstream row order. + a = _daily_frame(values=(100,), times=("2024-06-01",)) + b = _daily_frame(values=(200,), times=("2024-06-01",)) + kept = [] + for frame in (pd.concat([a, b]), pd.concat([b, a])): + with pytest.warns(UserWarning, match="multiple values per"): + ds = wdx._build_dense( + frame, _meta(), service="daily", series_meta=_DISCHARGE_META + ) + kept.append( + ds["discharge"] + .sel(monitoring_location_id="USGS-1", time="2024-06-01") + .item() + ) + assert kept == [100, 100] # smallest value, both input orders + + +def test_partial_geometry_keeps_numeric_lonlat_coord(): + # When only some sites have point geometry, the lon/lat coordinate must stay + # numeric (float, NaN-filled) -- not an object array with None, which is a + # CF-invalid spatial coordinate. + a = _daily_frame(site="USGS-A", values=(1, 2), times=("2024-06-01", "2024-06-02")) + b = _daily_frame(site="USGS-B", values=(3, 4), times=("2024-06-01", "2024-06-02")) + a["geometry"] = [[-90.44, 43.19]] * len(a) + b["geometry"] = [None] * len(b) # this site carries no point geometry + ds = wdx._build_dense( + pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert ds["longitude"].dtype.kind == "f" # float, not object + assert ds["longitude"].sel(monitoring_location_id="USGS-A").item() == -90.44 + assert pd.isna(ds["longitude"].sel(monitoring_location_id="USGS-B").item()) def test_metadata_cache_bounded_and_clearable(): From e47ad689586aa16c4783ea3e22277acfb2e118b8 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 5 Jun 2026 09:21:52 -0500 Subject: [PATCH 3/7] fix(waterdata.xarray): post-review hardening of the dense-default commit Address the second code-review pass: - dense (site, time) collision dedup is now fully deterministic: stable sort on value then the flag columns, so the retained ancillary is order-independent - _scalarize handles numpy arrays and nested/array elements without raising (was list/tuple-only and called pd.notna on possibly-non-scalar elements) - _none_if_nan is array-safe via an is_scalar guard (new _is_missing helper) - normalize NaN name descriptors once in _MetadataCache._ingest - only map _scalarize over flag columns that are actually sequence-valued - fix the stale examples/index.rst (dense is the default; dense=False ragged) 58 offline tests pass; live get_daily/get_samples + to_netcdf verified. Co-Authored-By: Claude Opus 4.8 --- dataretrieval/waterdata/xarray.py | 52 +++++++++++++++++++++---------- docs/source/examples/index.rst | 7 +++-- tests/waterdata_xarray_test.py | 40 +++++++++++++++++++++++- 3 files changed, 79 insertions(+), 20 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index b15912c0..d542e389 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -155,6 +155,16 @@ def _unique_present(frame, col): return frame[col].dropna().unique() if col in frame else [] +def _is_missing(value): + """True for None / a *scalar* NaN; False for any non-scalar (list, array). + + ``_pd.notna`` raises on a list/array (it returns an element-wise mask), so + guard with ``is_scalar`` before testing -- callers pass cells that may be + sequences. + """ + return value is None or (_pd.api.types.is_scalar(value) and _pd.isna(value)) + + def _none_if_nan(value): """Collapse a pandas NaN to None. @@ -162,19 +172,21 @@ def _none_if_nan(value): falling back. Metadata frames yield NaN for a column that is present but null for a given parameter, so normalize it to None before any such ``or`` chain. """ - return value if _pd.notna(value) else None + return None if _is_missing(value) else value def _scalarize(cell): - """Collapse a list-valued flag cell into a single string; pass scalars through. + """Collapse a sequence-valued flag cell into a single string; pass scalars through. The Water Data API returns some ancillary (flag) columns -- notably ``qualifier`` -- as a *list* of codes per observation. xarray/netCDF cannot - encode an object array whose elements are lists, so join the codes into one - space-separated string (an empty list becomes None). + encode an object array whose elements are sequences, so join the codes into + one space-separated string (an empty/all-missing sequence becomes None). + Handles lists, tuples, and numpy arrays; missing elements are dropped + element-by-element without assuming the element is scalar. """ - if isinstance(cell, (list, tuple)): - parts = [str(v) for v in cell if v is not None and _pd.notna(v)] + if isinstance(cell, (list, tuple, _np.ndarray)): + parts = [str(v) for v in cell if not _is_missing(v)] return " ".join(parts) if parts else None return cell @@ -265,11 +277,14 @@ def _prepare_values(df, group_cols, ancillary_cols): work["time"], format="ISO8601", errors="coerce", utc=True ).dt.tz_localize(None) work["value"] = _pd.to_numeric(work["value"], errors="coerce") - # Flatten any list-valued flag cells (e.g. ``qualifier``) to strings so the - # ancillary variables stay netCDF-encodable (``_scalarize`` passes scalars - # through unchanged). + # Flatten any sequence-valued flag cells (e.g. ``qualifier``) to strings so + # the ancillary variables stay netCDF-encodable. Only the columns that are + # actually sequence-valued pay the per-row map -- a flag column is uniformly + # typed, so the first non-null cell decides (skips the common scalar case). for c in ancillary: - work[c] = work[c].map(_scalarize) + nonnull = work[c].dropna() + if len(nonnull) and isinstance(nonnull.iloc[0], (list, tuple, _np.ndarray)): + work[c] = work[c].map(_scalarize) n_before = len(work) work = work[work["time"].notna()] dropped = n_before - len(work) @@ -443,12 +458,16 @@ def _ingest(self, meta, todo): continue if has_pcode: pcode = row.get("parameter_code") - if _pd.notna(pcode): + if not _is_missing(pcode): + # Normalize NaN -> None at the source so every consumer of + # the name descriptors gets a clean ``or``-able value. fresh[site]["params"][str(pcode)] = { - c: row.get(c) for c in name_cols + c: _none_if_nan(row.get(c)) for c in name_cols } if not fresh[site]["site"]: - desc = {c: row.get(c) for c in site_cols if _pd.notna(row.get(c))} + desc = { + c: row.get(c) for c in site_cols if not _is_missing(row.get(c)) + } if desc: fresh[site]["site"] = desc with self._lock: @@ -909,9 +928,10 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit): "value. Filter the query to separate them.", stacklevel=3, ) - # Sort by value first so the retained value is deterministic - # rather than dependent on the upstream response row order. - sub = sub.sort_values("value") + # Sort by value then the flag columns, with a stable sort, so the + # whole retained row (value *and* its ancillary flags) is + # deterministic rather than dependent on the upstream row order. + sub = sub.sort_values(["value", *ancillary], kind="stable") sub = sub[~sub.index.duplicated(keep="first")] ds_g = sub.to_xarray().rename( {"value": name, **{c: f"{name}_{c}" for c in ancillary}} diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index c86f6f4d..af9db9c6 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -35,9 +35,10 @@ functions and is executed against the live USGS Water Data API. CF-conventions ``xarray`` datasets from the ``waterdata`` module ---------------------------------------------------------------- The ``waterdata.xarray`` wrappers return CF-conventions ``xarray.Dataset`` -objects (a ragged array by default, with a ``dense=True`` gridded opt-out). -This notebook demonstrates the layouts, the time-selection trade-off, and -writing CF netCDF. +objects (a dense ``(monitoring_location_id, time)`` grid by default, with a +``dense=False`` contiguous-ragged-array opt-out for large multi-site pulls). +This notebook demonstrates the layouts, selecting one series, and writing CF +netCDF. .. toctree:: :maxdepth: 1 diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index 27c4cd6a..735e82ba 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -109,6 +109,20 @@ def test_ancillary_variables_linked(): ) +def test_scalarize_handles_list_array_nested_and_scalar(): + import numpy as np + + assert wdx._scalarize(["A", "e"]) == "A e" + assert wdx._scalarize(("A", "e")) == "A e" + assert wdx._scalarize(np.array(["A", "e"])) == "A e" # numpy array, not list + assert wdx._scalarize([]) is None # empty -> missing + assert wdx._scalarize(["A", None]) == "A" # missing element dropped + assert wdx._scalarize("A") == "A" # scalar passes through + assert wdx._scalarize(None) is None + # a nested element must not raise (array-truth pitfall); it is stringified + assert isinstance(wdx._scalarize([["A", "e"], "z"]), str) + + def test_list_valued_qualifier_is_flattened_to_string(): # The API returns ``qualifier`` as a list of codes per observation; the # ancillary variable must be flattened to a netCDF-encodable string (object @@ -176,7 +190,9 @@ def test_multiple_parameters_outer_join_on_time(): def test_collision_dedups_with_warning(): # two values for the same (site, parameter, statistic, time) are ambiguous - # without the hash key -> keep the first and warn; site stays the dim. + # without the hash key -> keep the smallest (deterministically) and warn; + # site stays the dim. (test_dense_collision_dedup_is_order_independent pins + # the order-independence; here 100 is both the first and the smallest.) a = _daily_frame(values=(100,), times=("2024-06-01",)) b = _daily_frame(values=(200,), times=("2024-06-01",)) with pytest.warns(UserWarning, match="multiple values per"): @@ -905,6 +921,28 @@ def test_dense_collision_dedup_is_order_independent(): assert kept == [100, 100] # smallest value, both input orders +def test_dense_collision_kept_ancillary_is_order_independent(): + # Two rows collide on (site, time) with the SAME value but different + # qualifiers; the retained flag must be deterministic (stable sort on value + # then ancillary), not dependent on upstream row order. + a = _daily_frame(values=(100,), times=("2024-06-01",)) + b = _daily_frame(values=(100,), times=("2024-06-01",)) + a["qualifier"] = ["X"] + b["qualifier"] = ["Y"] + kept = [] + for frame in (pd.concat([a, b]), pd.concat([b, a])): + with pytest.warns(UserWarning, match="multiple values per"): + ds = wdx._build_dense( + frame, _meta(), service="daily", series_meta=_DISCHARGE_META + ) + kept.append( + ds["discharge_qualifier"] + .sel(monitoring_location_id="USGS-1", time="2024-06-01") + .item() + ) + assert kept == ["X", "X"] # smaller flag kept, both input orders + + def test_partial_geometry_keeps_numeric_lonlat_coord(): # When only some sites have point geometry, the lon/lat coordinate must stay # numeric (float, NaN-filled) -- not an object array with None, which is a From 961be13da3e07502f7360c37f6002475ed087f26 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 5 Jun 2026 09:56:17 -0500 Subject: [PATCH 4/7] fix(waterdata.xarray): resolve remaining review findings - samples now surface station longitude/latitude (mapped from Location_Longitude/Location_Latitude; _point_coords reads explicit lon/lat columns in addition to an OGC geometry column) - metadata cache: a single large pull is no longer subject to within-batch FIFO eviction (the call's result is built from the freshly-parsed entries), and sites with no metadata are no longer negatively cached, so they retry - dense variable naming is deterministic and unambiguous: a bare name (e.g. discharge) is used only when unique; same-named series are all disambiguated by cell method / statistic / parameter code - dense multi-unit label is deterministic (sorted) instead of row-order dependent - row_size is int64 (was int32) to avoid overflow / cumsum truncation - select_series rejects descriptor coords as keys (lon/lat float-equality footgun) and can match a null instance key 64 offline tests pass; live samples lon/lat + dense naming verified. Co-Authored-By: Claude Opus 4.8 --- dataretrieval/waterdata/xarray.py | 138 +++++++++++++++++++++++------- tests/waterdata_xarray_test.py | 108 ++++++++++++++++++++++- 2 files changed, 215 insertions(+), 31 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index d542e389..867d1ab9 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -232,7 +232,27 @@ def _lonlat(geom): def _point_coords(df, site): - """lon/lat dicts keyed by site from point geometry, or None.""" + """lon/lat dicts keyed by site, or None. + + Reads either a ``geometry`` column (the time-series getters' OGC response) or + explicit ``longitude`` / ``latitude`` columns (the Samples profile, mapped via + :data:`_SAMPLES_RENAME`) -- so every service surfaces station coordinates. + """ + if {"longitude", "latitude"}.issubset(df.columns): + geo = df.dropna(subset=["longitude", "latitude"]).drop_duplicates(site) + if geo.empty: + return None + lon, lat = {}, {} + for site_id, x, y in zip( + geo[site].to_numpy(), + geo["longitude"].to_numpy(), + geo["latitude"].to_numpy(), + ): + try: + lon[site_id], lat[site_id] = float(x), float(y) + except (TypeError, ValueError): + continue + return (lon, lat) if lon else None if "geometry" not in df.columns: return None geo = df.dropna(subset=["geometry"]).drop_duplicates(site) @@ -408,8 +428,9 @@ def lookup(self, site_ids): """ sites = sorted({str(s) for s in site_ids if _pd.notna(s)}) # Racy read of the keys is fine: a concurrent miss just re-fetches (the - # fetch is idempotent); only the writes in _ingest take the lock. + # fetch is idempotent); only the writes in _store take the lock. todo = [s for s in sites if s not in self._entries] + fresh: dict[str, dict] = {} if todo: try: meta, _ = self._getter(monitoring_location_id=todo) @@ -420,12 +441,17 @@ def lookup(self, site_ids): stacklevel=2, ) else: - self._ingest(meta, todo) + fresh = self._parse(meta, todo) + self._store(fresh) param_meta: dict[str, dict] = {} site_meta: dict[str, dict] = {} with self._lock: for s in sites: - entry = self._entries.get(s, {}) + # Prefer this call's freshly-parsed entry over the cache: the + # bounded cache may have already evicted just-fetched sites when a + # single pull's ``todo`` exceeds maxsize, but the current call + # must still see every site it fetched. + entry = fresh.get(s) or self._entries.get(s, {}) param_meta.update(entry.get("params", {})) if entry.get("site"): site_meta[s] = entry["site"] @@ -439,12 +465,8 @@ def clear(self): def __len__(self): return len(self._entries) - def _ingest(self, meta, todo): - """Parse ``meta`` into per-site entries, then merge + evict under lock. - - The parsing runs lock-free on a local dict; only the (cheap) merge into - the shared cache and the FIFO eviction past ``maxsize`` hold the lock. - """ + def _parse(self, meta, todo): + """Parse ``meta`` into per-site ``{params, site}`` entries (lock-free).""" fresh = {s: {"params": {}, "site": {}} for s in todo} if not meta.empty: name_cols = [c for c in _NAME_DESCRIPTORS if c in meta.columns] @@ -470,8 +492,20 @@ def _ingest(self, meta, todo): } if desc: fresh[site]["site"] = desc + return fresh + + def _store(self, fresh): + """Merge non-empty entries into the bounded cache (FIFO eviction). + + Sites that came back with no metadata are *not* cached, so a later call + retries them rather than being stuck with a sticky empty result; the + current call still sees them via the freshly-parsed ``fresh`` dict. + """ + keep = {s: e for s, e in fresh.items() if e["params"] or e["site"]} + if not keep: + return with self._lock: - self._entries.update(fresh) + self._entries.update(keep) while len(self._entries) > self._maxsize: self._entries.pop(next(iter(self._entries))) @@ -524,14 +558,24 @@ def select_series(ds, **keys): "so select by name instead, e.g. " "ds[variable].sel(monitoring_location_id=...)." ) - inst_coords = [c for c in ds.coords if ds[c].dims == ("timeseries",)] + # Selectable keys are the series *identity* coordinates only -- exclude the + # per-series descriptors (lon/lat are a float-equality footgun; unit/HUC/state + # are not series identifiers). + descriptors = {"longitude", "latitude", "unit_of_measure", *_SITE_DESCRIPTORS} + inst_coords = [ + c for c in ds.coords if ds[c].dims == ("timeseries",) and c not in descriptors + ] mask = _np.ones(ds.sizes["timeseries"], dtype=bool) for key, value in keys.items(): if key not in inst_coords: raise KeyError( - f"{key!r} is not a per-series coordinate; choose from {inst_coords}." + f"{key!r} is not a per-series identity coordinate; choose from " + f"{inst_coords}." ) - mask &= ds[key].to_numpy() == value + arr = ds[key].to_numpy() + # NaN never equals anything, so match a missing instance key (e.g. a + # characteristic with no sample fraction) by null-ness instead. + mask &= _pd.isna(arr) if _is_missing(value) else (arr == value) matches = _np.flatnonzero(mask) if matches.size == 0: raise KeyError(f"no time series matches {keys}.") @@ -563,6 +607,10 @@ def select_series(ds, **keys): "Result_SampleFraction": "sample_fraction", "Result_ResultDetectionCondition": "detection_condition", "Result_MeasureStatusIdentifier": "status", + # Samples carry position as explicit columns (no OGC ``geometry``); map them + # to the canonical names so _point_coords surfaces station lon/lat. + "Location_Longitude": "longitude", + "Location_Latitude": "latitude", } _CANONICAL_COORD_ATTRS = { "parameter_code": {"long_name": "USGS parameter code"}, @@ -786,7 +834,9 @@ def _assemble(self, work, inst_cols, ancillary, has_unit): ) data_vars = { "value": ("obs", work["value"].to_numpy()), - "row_size": ("timeseries", row_size.to_numpy().astype("int32")), + # int64 (not int32): a single long, high-frequency series can exceed + # 2^31 observations, and the select_series cumsum must not overflow. + "row_size": ("timeseries", row_size.to_numpy().astype("int64")), } for c in ancillary: data_vars[c] = ("obs", work[c].to_numpy()) @@ -899,16 +949,25 @@ def _build_series(self, work, group_cols, ancillary, has_unit): def _variable_datasets(self, work, group_cols, ancillary, has_unit): """One pivoted ``(site, time)`` Dataset per (parameter, statistic).""" - datasets, used = [], set() + # First pass: gather each group's identity and base name, so naming can + # see the whole set (a bare name is only used when it is unambiguous). + specs = [] for _, group in work.groupby(group_cols, dropna=False): pcode = _first_present(group, "parameter_code") stat = _first_present(group, "statistic_id") - group_units = group["unit_of_measure"].dropna().unique() if has_unit else () - unit = group_units[0] if len(group_units) else None desc = self.series_meta.get(str(pcode), {}) if pcode is not None else {} - - name = self._variable_name(desc, pcode, stat, used) - used.add(name) + base = _slug(_none_if_nan(desc.get("parameter_name")) or pcode or "value") + specs.append((group, pcode, stat, desc, base)) + names = self._disambiguate([s[4] for s in specs], [(s[1], s[2]) for s in specs]) + + datasets = [] + for (group, pcode, stat, desc, _base), name in zip(specs, names): + # Sort the units so the chosen label is deterministic across pulls + # (values are not converted either way; see the multi-unit warning). + group_units = ( + sorted(group["unit_of_measure"].dropna().unique()) if has_unit else [] + ) + unit = group_units[0] if group_units else None if len(group_units) > 1: # One variable can carry only one ``units`` attr; surface the @@ -951,15 +1010,34 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit): return datasets @staticmethod - def _variable_name(desc, pcode, stat, used): - """A unique slug for a variable; disambiguate same-parameter series.""" - name = _slug(_none_if_nan(desc.get("parameter_name")) or pcode or "value") - if name in used: # same parameter, different statistic -> distinct var - op = CF_CELL_METHODS.get(str(stat)) or (str(stat) if stat else None) - name = f"{name}_{_slug(op)}" if op else name - while name in used: - name += "_x" - return name + def _disambiguate(bases, keys): + """Map per-group base slugs to unique, deterministic variable names. + + ``keys[i]`` is the group's ``(parameter_code, statistic_id)``. A base used + by exactly one group stays bare (e.g. ``discharge``); a base shared by + several groups is disambiguated for *all* of them -- by the statistic's + cell-method operator (``discharge_maximum`` / ``discharge_mean``), falling + back to the statistic id then the parameter code -- so a bare name never + silently refers to an arbitrary one of several same-named series. + """ + counts: dict[str, int] = {} + for b in bases: + counts[b] = counts.get(b, 0) + 1 + names, used = [], set() + for base, (pcode, stat) in zip(bases, keys): + if counts[base] == 1: + name = base + else: + op = CF_CELL_METHODS.get(str(stat)) if stat is not None else None + suffix = op or (str(stat) if stat is not None else None) + name = f"{base}_{_slug(suffix)}" if suffix else base + if name == base or name in used: # statistic didn't separate them + name = f"{base}_{_slug(pcode)}" if pcode is not None else base + while name in used: + name += "_x" + used.add(name) + names.append(name) + return names class _StatsBuilder(_DatasetBuilder): diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index 735e82ba..4a5cc5d2 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -655,6 +655,41 @@ def test_select_series_returns_time_indexed_single_series(): assert s["value"].sel(time="2024-06-01").item() == 100 +def test_dense_same_parameter_two_statistics_no_bare_name(): + # 00060 under both 00001 (max) and 00003 (mean): the bare 'discharge' name is + # ambiguous, so BOTH variables are disambiguated by their cell method -- no + # order-dependent bare 'discharge' that silently means one of them. + mx = _daily_frame(values=(500,), times=("2024-06-01",)) + mx["statistic_id"] = "00001" + mn = _daily_frame(values=(100,), times=("2024-06-01",)) + ds = wdx._build_dense( + pd.concat([mx, mn]), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "discharge" not in ds.data_vars # no bare (ambiguous) name + assert {"discharge_maximum", "discharge_mean"} <= set(ds.data_vars) + assert ds["discharge_maximum"].attrs["cell_methods"] == "time: maximum" + assert ds["discharge_mean"].attrs["cell_methods"] == "time: mean" + + +def test_dense_single_statistic_keeps_bare_name(): + # The common single-statistic case keeps the clean bare name. + ds = wdx._build_dense( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "discharge" in ds.data_vars + + +def test_select_series_matches_nan_instance_key(): + # An instance whose key is null (samples with no sample_fraction) must be + # selectable by passing None, since `== NaN` never matches. + df = _samples_frame() + df["Result_SampleFraction"] = None + ds = _samples_ds(df) + s = wdx.select_series(ds, characteristic="Temperature, water", sample_fraction=None) + assert set(s.sizes) == {"time"} + assert "value" in s.data_vars + + def test_select_series_ambiguous_raises(): # selecting by site alone matches both instances -> ask for more keys ds = _two_instance_ragged() @@ -670,8 +705,11 @@ def test_select_series_no_match_raises(): def test_select_series_unknown_key_raises(): ds = _two_instance_ragged() - with pytest.raises(KeyError, match="not a per-series coordinate"): + with pytest.raises(KeyError, match="not a per-series identity coordinate"): wdx.select_series(ds, bogus="x") + # descriptor coords (lon/lat/unit/HUC/state) are not selectable identity keys + with pytest.raises(KeyError, match="not a per-series identity coordinate"): + wdx.select_series(ds, unit_of_measure="ft^3/s") def test_select_series_on_dense_raises_helpful_error(): @@ -760,6 +798,20 @@ def _samples_ds(frame): ) +def test_samples_surface_lonlat_from_location_columns(): + # Samples carry position as Location_Latitude/Location_Longitude (no OGC + # geometry); the dataset must still get numeric longitude/latitude coords. + frame = _samples_frame() + frame["Location_Longitude"] = [-90.44] + frame["Location_Latitude"] = [43.19] + ds = _samples_ds(frame) + assert "longitude" in ds.coords and "latitude" in ds.coords + assert ds["longitude"].dtype.kind == "f" + assert float(ds["longitude"].values[0]) == -90.44 + assert float(ds["latitude"].values[0]) == 43.19 + assert ds["longitude"].attrs["units"] == "degrees_east" + + def test_build_samples_single_characteristic(): ds = _samples_ds(_samples_frame()) assert set(ds.sizes) == {"obs", "timeseries"} @@ -988,3 +1040,57 @@ def fake(monitoring_location_id): wdx._FIELD_CACHE._entries["Y"] = {"params": {}, "site": {}} wdx.clear_metadata_cache() assert len(wdx._TS_CACHE) == 0 and len(wdx._FIELD_CACHE) == 0 + + +def test_metadata_missing_site_is_not_negatively_cached(): + # A site the metadata endpoint returns nothing for must NOT be cached as an + # empty entry (which would never be retried); a later call re-fetches it. + calls = [] + + def fake(monitoring_location_id): + calls.append(list(monitoring_location_id)) + # respond only for S1, never for S2 + rows = [ + { + "monitoring_location_id": s, + "parameter_code": "00060", + "parameter_name": s, + } + for s in monitoring_location_id + if s == "S1" + ] + return pd.DataFrame(rows), SimpleNamespace(url=None) + + cache = wdx._MetadataCache(fake) + cache.lookup(["S1", "S2"]) + cache.lookup(["S1", "S2"]) + # S1 cached (hit, not re-fetched); S2 never cached, so it is re-requested. + assert calls[0] == ["S1", "S2"] + assert calls[1] == ["S2"] # only the still-uncached S2 + assert "S1" in cache._entries and "S2" not in cache._entries + + +def test_metadata_lookup_survives_within_batch_eviction(): + # A single pull whose site count exceeds maxsize must still return metadata + # for every requested site, even though the bounded cache can't hold them all. + sites = ["S0", "S1", "S2", "S3", "S4"] + + def fake(monitoring_location_id): + rows = [ + { + "monitoring_location_id": s, + "parameter_code": f"p{s}", # distinct per site + "parameter_name": f"name-{s}", + "hydrologic_unit_code": f"huc-{s}", + } + for s in monitoring_location_id + ] + return pd.DataFrame(rows), SimpleNamespace(url=None) + + cache = wdx._MetadataCache(fake, maxsize=2) + param_meta, site_meta = cache.lookup(sites) + # every requested site's metadata is in the result even though the bounded + # cache evicted most of the just-fetched batch. + assert {f"p{s}" for s in sites} <= set(param_meta) + assert set(site_meta) == set(sites) + assert len(cache) <= 2 # cache stayed bounded From 1e771581b96b40a8e7a3af856cb15a420cb8dfb0 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 5 Jun 2026 10:21:25 -0500 Subject: [PATCH 5/7] feat(waterdata.xarray): add to_awkward() ragged -> awkward view Convert a ragged (dense=False) Dataset to a one-record-per-series awkward.Array: row_size is awkward's offsets and obs is its flat content, so it is a near-zero-copy re-view -- each series carries its scalar identity metadata plus jagged time/value/flag fields, no NaN fill, with per-series ops vectorized across the whole collection (e.g. ak.mean(arr.value, axis=1)). awkward is NOT a dependency: to_awkward lazy-imports it and raises an informative ModuleNotFoundError ("pip install awkward") when absent. Object columns route through ak.from_iter (NaN -> missing) so flags become a clean option[string]; numeric/datetime content stays numpy. Adds offline tests (importorskip awkward; the missing-dep error is tested by simulating the absent import) and a demo note. Co-Authored-By: Claude Opus 4.8 --- dataretrieval/waterdata/xarray.py | 58 +++++++++++++++++++++++++++++++ demos/waterdata_xarray_demo.ipynb | 23 ++++++++++++ tests/waterdata_xarray_test.py | 47 +++++++++++++++++++++++++ 3 files changed, 128 insertions(+) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 867d1ab9..7f4b9248 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -84,6 +84,7 @@ __all__ = [ "clear_metadata_cache", "select_series", + "to_awkward", "get_continuous", "get_daily", "get_field_measurements", @@ -593,6 +594,63 @@ def select_series(ds, **keys): return series.drop_vars("row_size", errors="ignore") +def to_awkward(ds): + """Convert a ragged (``dense=False``) Dataset to a per-series ``awkward.Array``. + + The CF contiguous-ragged layout (``row_size`` offsets + a flat ``obs`` + dimension) is structurally identical to awkward's jagged ``ListOffsetArray``, + so this is a near-zero-copy re-view: each timeseries instance becomes one + record carrying its per-series identity/metadata (scalar fields such as + ``monitoring_location_id`` / ``parameter_code`` / ``longitude``) plus its + observations as variable-length jagged fields (``time`` / ``value`` / flags). + No NaN fill, each series on its own time axis -- per-series operations then + vectorize across every series at once, e.g. ``ak.mean(arr.value, axis=1)``:: + + ds = wdx.get_daily(..., dense=False) + arr = wdx.to_awkward(ds) + ak.mean(arr.value, axis=1) # per-series means + arr[arr.parameter_code == "00060"] # filter series by metadata + + ``awkward`` is an optional dependency that is *not* installed with + ``dataretrieval``; install it separately (``pip install awkward``). + """ + try: + import awkward as ak + except ModuleNotFoundError as exc: # pragma: no cover - exercised only sans awkward + raise ModuleNotFoundError( + "to_awkward requires the optional 'awkward' dependency, which is not " + "installed with dataretrieval. Install it with: pip install awkward" + ) from exc + if "row_size" not in ds.variables or "obs" not in ds.dims: + raise ValueError( + "to_awkward expects a ragged Dataset (from dense=False); the default " + "dense Dataset is already a (monitoring_location_id, time) grid." + ) + counts = ds["row_size"].to_numpy() + + def _content(values): + # awkward rejects numpy object dtype; route string/None columns through + # from_iter (normalizing NaN -> missing), and keep numeric/datetime + # content as numpy -- that part is the zero-copy re-view. + if values.dtype == object: + return ak.from_iter([_none_if_nan(v) for v in values.tolist()]) + return values + + # Per-series (timeseries-dim) coords -> scalar record fields; obs-dim + # variables/coords -> jagged fields (unflattened by row_size). ``row_size`` + # itself is the offsets, already encoded in the jagged structure. + record = {} + for name in (*ds.data_vars, *ds.coords): + if name == "row_size": + continue + da = ds[name] + if da.dims == ("timeseries",): + record[name] = _content(da.to_numpy()) + elif da.dims == ("obs",): + record[name] = ak.unflatten(_content(da.to_numpy()), counts) + return ak.zip(record, depth_limit=1) + + # === column schemas ======================================================== # Water-quality samples (Samples DB / WQX) speak a different column vocabulary diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb index 55e9fcea..641ec498 100644 --- a/demos/waterdata_xarray_demo.ipynb +++ b/demos/waterdata_xarray_demo.ipynb @@ -271,6 +271,29 @@ "regroup/decode the whole dataset for you." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The whole collection at once: `to_awkward`\n", + "\n", + "For analysis across *all* series at once, convert the ragged dataset to an\n", + "[awkward](https://awkward-array.org/) array. The contiguous-ragged layout\n", + "(`row_size` offsets + a flat `obs` axis) *is* awkward's jagged layout, so this is\n", + "a near-zero-copy re-view: each series becomes one record (its metadata as scalar\n", + "fields, its observations jagged), and per-series operations vectorize with no NaN\n", + "fill. `awkward` is an optional dependency that is *not* installed with\n", + "`dataretrieval` (`pip install awkward`):\n", + "\n", + "```python\n", + "import awkward as ak\n", + "\n", + "arr = wdx.to_awkward(ragged) # one record per series\n", + "ak.mean(arr.value, axis=1) # per-series means, all at once\n", + "arr[arr.parameter_code == \"00060\"] # filter series by metadata\n", + "```" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index 4a5cc5d2..5293f60c 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -722,6 +722,53 @@ def test_select_series_on_dense_raises_helpful_error(): wdx.select_series(dense, monitoring_location_id="USGS-1") +def test_to_awkward_missing_dependency_raises_informative(monkeypatch): + # awkward is NOT a dependency; calling to_awkward without it must raise a + # clear, actionable error rather than a bare ImportError. (Simulated so the + # test holds whether or not awkward happens to be installed.) + import builtins + + real_import = builtins.__import__ + + def fake_import(name, *args, **kwargs): + if name == "awkward": + raise ModuleNotFoundError("No module named 'awkward'") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + ds = wdx._build_ragged( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + with pytest.raises(ModuleNotFoundError, match="pip install awkward"): + wdx.to_awkward(ds) + + +def test_to_awkward_converts_ragged_to_jagged_records(): + ak = pytest.importorskip("awkward") + ds = _two_instance_ragged() # two series at USGS-1: 00060 and 00010 + arr = wdx.to_awkward(ds) + assert len(arr) == ds.sizes["timeseries"] # one record per series + # scalar identity fields + jagged observation fields + assert {"monitoring_location_id", "parameter_code", "value", "time"} <= set( + arr.fields + ) + # faithful: per-series lengths == row_size, total obs preserved, no fill + assert ak.num(arr.value).tolist() == ds["row_size"].values.tolist() + assert int(ak.sum(ak.num(arr.value))) == ds.sizes["obs"] + # per-series reductions vectorize across all series at once + means = ak.mean(arr.value, axis=1) + assert len(means) == len(arr) + + +def test_to_awkward_on_dense_raises(): + pytest.importorskip("awkward") + dense = wdx._build_dense( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + with pytest.raises(ValueError, match="expects a ragged Dataset"): + wdx.to_awkward(dense) + + # --- ragged opt-out wiring -------------------------------------------------- From b83b9a5725717b19ce795f54907576212168b345 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 5 Jun 2026 10:44:25 -0500 Subject: [PATCH 6/7] refactor(waterdata.xarray): nest to_awkward observations under an `obs` field Group each series' per-observation fields (time, value, qualifier, approval_status) into a single jagged `obs` list of records, instead of several parallel top-level jagged fields. Each series now reads as "identity metadata (scalar fields) + obs (a list of {time, value, ...} observations)" -- `arr.obs.value`, `ak.mean(arr.obs.value, axis=1)` -- which is clearer and the conventional awkward idiom. Co-Authored-By: Claude Opus 4.8 --- dataretrieval/waterdata/xarray.py | 37 +++++++++++++++++++------------ demos/waterdata_xarray_demo.ipynb | 15 +++++++------ tests/waterdata_xarray_test.py | 14 ++++++------ 3 files changed, 38 insertions(+), 28 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 7f4b9248..0cc6e254 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -599,16 +599,19 @@ def to_awkward(ds): The CF contiguous-ragged layout (``row_size`` offsets + a flat ``obs`` dimension) is structurally identical to awkward's jagged ``ListOffsetArray``, - so this is a near-zero-copy re-view: each timeseries instance becomes one - record carrying its per-series identity/metadata (scalar fields such as - ``monitoring_location_id`` / ``parameter_code`` / ``longitude``) plus its - observations as variable-length jagged fields (``time`` / ``value`` / flags). - No NaN fill, each series on its own time axis -- per-series operations then - vectorize across every series at once, e.g. ``ak.mean(arr.value, axis=1)``:: + so this is a near-zero-copy re-view. Each timeseries instance becomes one + record: its per-series identity/metadata as scalar fields + (``monitoring_location_id`` / ``parameter_code`` / ``longitude`` / ...) plus + a single ``obs`` field holding that series' observations as a + variable-length list of ``{time, value, }`` records. No NaN fill, each + series on its own time axis -- per-series operations then vectorize across + every series at once, e.g. ``ak.mean(arr.obs.value, axis=1)``:: ds = wdx.get_daily(..., dense=False) arr = wdx.to_awkward(ds) - ak.mean(arr.value, axis=1) # per-series means + arr[0].monitoring_location_id # one series' metadata + arr.obs.value # jagged values, all series + ak.mean(arr.obs.value, axis=1) # per-series means arr[arr.parameter_code == "00060"] # filter series by metadata ``awkward`` is an optional dependency that is *not* installed with @@ -636,19 +639,25 @@ def _content(values): return ak.from_iter([_none_if_nan(v) for v in values.tolist()]) return values - # Per-series (timeseries-dim) coords -> scalar record fields; obs-dim - # variables/coords -> jagged fields (unflattened by row_size). ``row_size`` - # itself is the offsets, already encoded in the jagged structure. - record = {} + # Per-series (timeseries-dim) coords -> scalar identity fields. Obs-dim + # variables/coords -> a single jagged ``obs`` field whose elements are the + # observation records, so each series is "metadata + a list of observations" + # rather than several parallel jagged fields. ``row_size`` is the offsets, + # already encoded by the unflatten. + scalars, obs_fields = {}, {} for name in (*ds.data_vars, *ds.coords): if name == "row_size": continue da = ds[name] if da.dims == ("timeseries",): - record[name] = _content(da.to_numpy()) + scalars[name] = _content(da.to_numpy()) elif da.dims == ("obs",): - record[name] = ak.unflatten(_content(da.to_numpy()), counts) - return ak.zip(record, depth_limit=1) + obs_fields[name] = ak.unflatten(_content(da.to_numpy()), counts) + # time first, then value, then any flags, for a readable observation record. + order = [n for n in ("time", "value") if n in obs_fields] + order += [n for n in obs_fields if n not in order] + obs = ak.zip({n: obs_fields[n] for n in order}) + return ak.zip({**scalars, "obs": obs}, depth_limit=1) # === column schemas ======================================================== diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb index 641ec498..a0775e1b 100644 --- a/demos/waterdata_xarray_demo.ipynb +++ b/demos/waterdata_xarray_demo.ipynb @@ -280,17 +280,18 @@ "For analysis across *all* series at once, convert the ragged dataset to an\n", "[awkward](https://awkward-array.org/) array. The contiguous-ragged layout\n", "(`row_size` offsets + a flat `obs` axis) *is* awkward's jagged layout, so this is\n", - "a near-zero-copy re-view: each series becomes one record (its metadata as scalar\n", - "fields, its observations jagged), and per-series operations vectorize with no NaN\n", - "fill. `awkward` is an optional dependency that is *not* installed with\n", - "`dataretrieval` (`pip install awkward`):\n", + "a near-zero-copy re-view: each series becomes one record — its identity as scalar\n", + "fields plus an `obs` list of `{time, value, …}` observations — so per-series\n", + "operations vectorize with no NaN fill. `awkward` is an optional dependency that\n", + "is *not* installed with `dataretrieval` (`pip install awkward`):\n", "\n", "```python\n", "import awkward as ak\n", "\n", - "arr = wdx.to_awkward(ragged) # one record per series\n", - "ak.mean(arr.value, axis=1) # per-series means, all at once\n", - "arr[arr.parameter_code == \"00060\"] # filter series by metadata\n", + "arr = wdx.to_awkward(ragged) # one record per series\n", + "arr[0].parameter_code # a series' metadata\n", + "ak.mean(arr.obs.value, axis=1) # per-series means, all at once\n", + "arr[arr.parameter_code == \"00060\"] # filter series by metadata\n", "```" ] }, diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index 5293f60c..0c96d4dd 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -748,15 +748,15 @@ def test_to_awkward_converts_ragged_to_jagged_records(): ds = _two_instance_ragged() # two series at USGS-1: 00060 and 00010 arr = wdx.to_awkward(ds) assert len(arr) == ds.sizes["timeseries"] # one record per series - # scalar identity fields + jagged observation fields - assert {"monitoring_location_id", "parameter_code", "value", "time"} <= set( - arr.fields - ) + # per-series identity is scalar fields; observations are nested under ``obs`` + assert {"monitoring_location_id", "parameter_code", "obs"} <= set(arr.fields) + assert {"time", "value"} <= set(arr.obs.fields) + assert "value" not in arr.fields # not a parallel top-level series # faithful: per-series lengths == row_size, total obs preserved, no fill - assert ak.num(arr.value).tolist() == ds["row_size"].values.tolist() - assert int(ak.sum(ak.num(arr.value))) == ds.sizes["obs"] + assert ak.num(arr.obs.value).tolist() == ds["row_size"].values.tolist() + assert int(ak.sum(ak.num(arr.obs.value))) == ds.sizes["obs"] # per-series reductions vectorize across all series at once - means = ak.mean(arr.value, axis=1) + means = ak.mean(arr.obs.value, axis=1) assert len(means) == len(arr) From 6f819bcbc1711fcd82ecc5f5a9202d1f491e19ba Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 5 Jun 2026 10:53:23 -0500 Subject: [PATCH 7/7] refactor(waterdata.xarray): simplify point-coords and naming dedup (/simplify) - _point_coords: merge the explicit-longitude/latitude path and the geometry path into one dedup/loop/return scaffold with a swappable per-row extractor, and route the lon/lat float coercion through _lonlat instead of a second open-coded try/except (kills the duplicated branch). - _DenseBuilder._disambiguate: replace the `name == base` proxy (a confusing stand-in for "no suffix") with an explicit "suffix didn't separate them" condition, and use collections.Counter for the base-name counts. No behavior change; 67 tests pass, both spatial-coordinate paths verified live. Co-Authored-By: Claude Opus 4.8 --- dataretrieval/waterdata/xarray.py | 50 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 0cc6e254..24e09489 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -57,6 +57,7 @@ import re as _re import threading as _threading import warnings as _warnings +from collections import Counter as _Counter from collections.abc import Callable from dataclasses import dataclass, field, replace from functools import wraps as _wraps @@ -239,34 +240,31 @@ def _point_coords(df, site): explicit ``longitude`` / ``latitude`` columns (the Samples profile, mapped via :data:`_SAMPLES_RENAME`) -- so every service surfaces station coordinates. """ + # Both sources reduce to a per-row "lonlat-able" value that _lonlat decodes + # (a (lon, lat) tuple for the explicit columns, the geometry object + # otherwise), so the dedup/loop/coercion scaffolding is shared. if {"longitude", "latitude"}.issubset(df.columns): - geo = df.dropna(subset=["longitude", "latitude"]).drop_duplicates(site) - if geo.empty: - return None - lon, lat = {}, {} - for site_id, x, y in zip( - geo[site].to_numpy(), - geo["longitude"].to_numpy(), - geo["latitude"].to_numpy(), - ): - try: - lon[site_id], lat[site_id] = float(x), float(y) - except (TypeError, ValueError): - continue - return (lon, lat) if lon else None - if "geometry" not in df.columns: + subset = ["longitude", "latitude"] + + def _geoms(g): + return list(zip(g["longitude"].to_numpy(), g["latitude"].to_numpy())) + elif "geometry" in df.columns: + subset = ["geometry"] + + def _geoms(g): + return g["geometry"].to_numpy() + else: return None - geo = df.dropna(subset=["geometry"]).drop_duplicates(site) + + geo = df.dropna(subset=subset).drop_duplicates(site) if geo.empty: return None lon, lat = {}, {} - for site_id, geom in zip(geo[site].to_numpy(), geo["geometry"].to_numpy()): - xy = _lonlat(geom) + for site_id, geom in zip(geo[site].to_numpy(), _geoms(geo)): + xy = _lonlat(geom) # skips non-point / unparseable rather than guessing if xy is not None: lon[site_id], lat[site_id] = xy - if not lon: - return None # no point geometry; skip rather than guess - return lon, lat + return (lon, lat) if lon else None def _prepare_values(df, group_cols, ancillary_cols): @@ -1087,20 +1085,20 @@ def _disambiguate(bases, keys): back to the statistic id then the parameter code -- so a bare name never silently refers to an arbitrary one of several same-named series. """ - counts: dict[str, int] = {} - for b in bases: - counts[b] = counts.get(b, 0) + 1 + counts = _Counter(bases) names, used = [], set() for base, (pcode, stat) in zip(bases, keys): if counts[base] == 1: name = base else: + # statistic cell-method (or raw id); if that doesn't yield a + # fresh name, fall back to the parameter code. op = CF_CELL_METHODS.get(str(stat)) if stat is not None else None suffix = op or (str(stat) if stat is not None else None) name = f"{base}_{_slug(suffix)}" if suffix else base - if name == base or name in used: # statistic didn't separate them + if name in used or suffix is None: name = f"{base}_{_slug(pcode)}" if pcode is not None else base - while name in used: + while name in used: # final guard: append until unique name += "_x" used.add(name) names.append(name)