Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/api/esmvalcore.io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ Submodules
esmvalcore.io.intake_esgf
esmvalcore.io.local
esmvalcore.io.protocol
esmvalcore.io.xcube
5 changes: 5 additions & 0 deletions doc/api/esmvalcore.io.xcube.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
esmvalcore.io.xcube
===================

.. automodule:: esmvalcore.io.xcube
:no-inherited-members:
8 changes: 7 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ dependencies:
- scipy >=1.6
- shapely >=2.0.0
- xarray
- xcube
- xcube-cci
- yamale
- zarr >3
- zarr >2
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

zarr3 is perfectly able to read zarr2 datasets, bud

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xcube requires zarr==2

Copy link
Copy Markdown
Contributor

@valeriupredoi valeriupredoi Dec 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well that's a bummer - that means it can't read Zarr3 spec?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also Zarr2 is borderline archaic - good luck to us trying to maintain such an environment

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like it's on the TODO list: xcube-dev/xcube#1182

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good, otherwise without a pixi env this would be diabolically hard to maintain 😁

# Python packages needed for building docs
- autodocsumm >=0.2.2
- ipython
Expand All @@ -66,3 +68,7 @@ dependencies:
- pytest-mock
- pytest-xdist
# Not on conda forge - vprof
- pip:
- git+https://github.com/ESMValGroup/fixer-prototype.git@main#subdirectory=packages/fixer
- git+https://github.com/ESMValGroup/fixer-prototype.git@main#subdirectory=packages/fixer-cmip7
- git+https://github.com/ESMValGroup/fixer-prototype.git@main#subdirectory=packages/fixer-esa-cci
25 changes: 25 additions & 0 deletions esmvalcore/config/configurations/data-xcube-esacci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Read data from the ESA Climate Data Centre (ESA CCI) using xcube.
# More information available at
# https://xcube.readthedocs.io/en/latest/dataaccess.html#esa-climate-data-centre-esa-cci-cciodp-ccizarr-esa-cci-kc.
#
# Three data stores are configured for the same project; the `priority`
# value determines which one is preferred (lower values win).
projects:
  ESACCI:
    data:
      ccizarr:
        type: "esmvalcore.io.xcube.XCubeDataSource"
        data_store_id: "ccizarr"
        priority: 1
        values: &values
          # Use this to define the mapping between the short_name used by
          # ESMValCore and the variable name in the source dataset.
          short_name:
            prw: tcwv
      esa-cci-kc:
        type: "esmvalcore.io.xcube.XCubeDataSource"
        data_store_id: "esa-cci-kc"
        priority: 2
        # Reuse the short_name mapping defined for the first store.
        values: *values
      cciodp:
        type: "esmvalcore.io.xcube.XCubeDataSource"
        data_store_id: "cciodp"
        priority: 3
        values: *values
7 changes: 7 additions & 0 deletions esmvalcore/config/configurations/defaults/cmor_tables.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ projects:
- cmip6/Tables
- cmip6-custom
strict: false
# Data from the ESA Climate Data Centre (ESA CCI).
ESACCI:
cmor_table:
type: esmvalcore.cmor.table.CMIP6Info
paths:
- cmip7/tables
- cmip6-custom
# Data from selected climate models that can be read in its native format by ESMValCore.
ACCESS:
cmor_table:
Expand Down
281 changes: 281 additions & 0 deletions esmvalcore/io/xcube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
"""Access data using `xcube <https://xcube.readthedocs.io>`_.

Run the command ``esmvaltool config copy data-xcube-esacci.yml`` to update
your :ref:`configuration <config-data-sources>` to use this module. This will
create a file with the following content in your configuration directory:

.. literalinclude:: ../configurations/data-xcube-esacci.yml
:language: yaml
:caption: Contents of ``data-xcube-esacci.yml``

"""

from __future__ import annotations

import copy
import fnmatch
import logging
from dataclasses import dataclass, field
from functools import cached_property
from typing import TYPE_CHECKING, Any

import xcube.core.store
from fixer import fix

import esmvalcore.io.protocol
from esmvalcore.iris_helpers import dataset_to_iris

if TYPE_CHECKING:
import iris.cube

from esmvalcore.typing import Facets, FacetValue


logger = logging.getLogger(__name__)

# Map ISO 8601 duration strings — as found in the dataset attribute
# ``time_coverage_resolution`` (see ``_get_frequency`` below) — to the
# frequency facet values used by ESMValCore.
FREQUENCIES = {
    "P1D": "day",
    "P1M": "mon",
    "P1Y": "yr",
}


@dataclass
class XCubeDataset(esmvalcore.io.protocol.DataElement):
    """A single dataset located through xcube_ that can be loaded on demand."""

    name: str
    """Unique identifier of the dataset within its store."""

    facets: Facets = field(repr=False)
    """Key-value pairs that were used to locate this dataset."""

    store: xcube.core.store.store.DataStore = field(repr=False)
    """The xcube data store that holds the dataset."""

    open_params: dict[str, Any] = field(default_factory=dict, repr=False)
    """Extra keyword arguments passed to the store when opening the data."""

    # Lazily populated cache of the dataset attributes; ``None`` until
    # either ``to_iris`` runs or the setter is called explicitly.
    _attributes: dict[str, Any] | None = field(
        init=False,
        repr=False,
        default=None,
    )

    def __hash__(self) -> int:
        """Return a number uniquely representing the data element."""
        key = (self.name, self.facets.get("version"))
        return hash(key)

    def prepare(self) -> None:
        """Prepare the data for access."""
        self.store.preload_data(self.name)

    @property
    def attributes(self) -> dict[str, Any]:
        """Key-value pairs describing the data.

        Raises
        ------
        ValueError
            If the attributes have not been read or assigned yet.
        """
        if self._attributes is not None:
            return self._attributes
        msg = (
            "Attributes have not been read yet. Call the `to_iris` method "
            "first to read the attributes from the file."
        )
        raise ValueError(msg)

    @attributes.setter
    def attributes(self, value: dict[str, Any]) -> None:
        self._attributes = value

    def to_iris(self) -> iris.cube.CubeList:
        """Load the data as Iris cubes.

        Returns
        -------
        :
            The loaded data.
        """
        ds = self.store.open_data(self.name, **self.open_params)
        ds = fix(ds, self.name)
        # Record where the data came from so later processing steps can
        # group cubes by their source.
        ds.attrs["source_file"] = repr(self)

        # Cache a snapshot of the attributes before conversion.
        self.attributes = copy.deepcopy(ds.attrs)
        return dataset_to_iris(ds)


_DATASETS_LOGGED: set[str] = set()


@dataclass
class XCubeDataSource(esmvalcore.io.protocol.DataSource):
    """Data source for finding data in an xcube data store.

    Datasets are discovered by listing the data ids available in the
    configured xcube data store and matching them against the requested
    ``dataset`` and ``short_name`` facets.
    """

    name: str
    """A name identifying the data source."""

    project: str
    """The project that the data source provides data for."""

    priority: int
    """The priority of the data source. Lower values have priority."""

    debug_info: str = field(init=False, repr=False, default="")
    """A string containing debug information when no data is found."""

    data_store_id: str
    """Name of the data store.

    A list of available data stores can be found in the `xcube documentation
    <https://xcube.readthedocs.io/en/latest/dataaccess.html#available-data-stores>`__.
    """

    values: dict[str, dict[str, str]] = field(default_factory=dict)
    """Mapping between the ESMValCore and xcube facet values."""

    data_store_params: dict[str, Any] = field(default_factory=dict, repr=False)
    """Parameters to use when creating the data store."""

    open_params: dict[str, Any] = field(default_factory=dict, repr=False)
    """Parameters to use when opening the data."""

    @cached_property
    def store(self) -> xcube.core.store.store.DataStore:
        """The store containing the data."""
        return xcube.core.store.new_data_store(
            self.data_store_id,
            **self.data_store_params,
        )

    @cached_property
    def _available_datasets(self) -> list[str]:
        # Cached once per instance; listing the remote store can be slow.
        return self.store.list_data_ids()

    def _get_frequency(
        self,
        description: xcube.core.store.descriptor.DatasetDescriptor,
    ) -> str | None:
        """Get the frequency of the dataset.

        Parameters
        ----------
        description:
            The description of the dataset to get the frequency from.

        Returns
        -------
        :
            The frequency of the dataset, or ``None`` if it cannot be determined.
        """
        # Prefer the ISO 8601 duration in the `time_coverage_resolution`
        # attribute, translated via the module-level FREQUENCIES mapping.
        frequency = FREQUENCIES.get(
            description.attrs.get("time_coverage_resolution", ""),
        )
        if not frequency:
            # Fall back to looking for a frequency token (e.g. "mon") among
            # the dot-separated components of the data id; only accept it
            # when the match is unambiguous.
            values = [
                v
                for v in FREQUENCIES.values()
                if v in description.data_id.split(".")
            ]
            if len(values) == 1:
                frequency = values[0]
        return frequency

    def find_data(self, **facets: FacetValue) -> list[XCubeDataset]:
        """Find data.

        Parameters
        ----------
        **facets :
            Find data matching these facets. The ``dataset`` and
            ``short_name`` facets may contain ``fnmatch``-style wildcards;
            both default to ``"*"`` when not given.

        Returns
        -------
        :
            A list of data elements that have been found.
        """
        result = []
        # Normalize the requested short names to a list of strings, then
        # translate them to the variable names used in the xcube store
        # (unmapped names are passed through unchanged).
        requested_short_names = facets.get("short_name", "*")
        if isinstance(requested_short_names, str | int | float):
            requested_short_names = [str(requested_short_names)]
        requested_xcube_short_names = [
            self.values.get("short_name", {}).get(short_name, short_name)
            for short_name in requested_short_names
        ]
        # Normalize the requested dataset patterns to a list of strings.
        requested_datasets = facets.get("dataset", "*")
        if isinstance(requested_datasets, str | int | float):
            requested_datasets = [str(requested_datasets)]

        # Pre-populate debug_info with the "nothing matched" message; it is
        # overwritten below as soon as something more specific is known.
        self.debug_info = (
            "No dataset matching "
            + ", ".join(f"'{d}'" for d in requested_datasets)
            + f" was found in {self.data_store_id}. Available datasets are:\n"
            + "\n".join(sorted(self._available_datasets))
        )
        # NOTE(review): if a data_id matches more than one requested dataset
        # pattern it is appended to the result once per matching pattern —
        # confirm this duplication is intended (with the default single "*"
        # pattern it cannot occur).
        for data_id in self._available_datasets:
            for dataset_pattern in requested_datasets:
                if fnmatch.fnmatchcase(data_id, dataset_pattern):
                    description: xcube.core.store.descriptor.DatasetDescriptor = self.store.describe_data(
                        data_id,
                    )
                    # Skip entries whose description is not a plain
                    # DatasetDescriptor.
                    if not isinstance(
                        description,
                        xcube.core.store.descriptor.DatasetDescriptor,
                    ):
                        continue
                    # Match the variables available in the dataset against
                    # the requested (xcube-side) variable name patterns.
                    available_xcube_short_names = list(description.data_vars)
                    xcube_short_names = [
                        short_name
                        for short_name in available_xcube_short_names
                        for short_name_pattern in requested_xcube_short_names
                        if fnmatch.fnmatchcase(short_name, short_name_pattern)
                    ]
                    if not xcube_short_names:
                        self.debug_info = (
                            "No variable matching "
                            + ", ".join(
                                f"'{s}'" for s in requested_xcube_short_names
                            )
                            + f" was found in dataset '{data_id}'. Available variables are:\n"
                            + "\n".join(sorted(available_xcube_short_names))
                        )
                        continue

                    # Build a "start/end" timerange facet, stripping the
                    # dashes from the ISO dates.
                    timerange = f"{description.time_range[0]}/{description.time_range[1]}".replace(
                        "-",
                        "",
                    )
                    # Reverse-map the matched xcube variable names back to
                    # ESMValCore short names via the configured mapping.
                    # NOTE(review): variables without an entry in the mapping
                    # end up with an empty `short_names` list — confirm that
                    # an empty/list-valued short_name facet is acceptable to
                    # callers.
                    short_names = [
                        short_name
                        for short_name, xcube_short_name in self.values.get(
                            "short_name",
                            {},
                        ).items()
                        if xcube_short_name in xcube_short_names
                    ]
                    dataset = XCubeDataset(
                        name=data_id,
                        facets={
                            "dataset": data_id,
                            # A single matched name is stored as a scalar,
                            # otherwise the full list is kept.
                            "short_name": (
                                short_names[0]
                                if len(short_names) == 1
                                else short_names
                            ),
                            "timerange": timerange,
                        },
                        store=self.store,
                        open_params=copy.deepcopy(self.open_params),
                    )
                    # Only add the frequency facet when it could be
                    # determined from the description.
                    if frequency := self._get_frequency(description):
                        dataset.facets["frequency"] = frequency
                    dataset.attributes = description.attrs

                    result.append(dataset)

        if result:
            self.debug_info = (
                f"Found dataset{'' if len(result) == 1 else 's'} "
                f"{', '.join(d.name for d in result)} in data store "
                f"{self.data_store_id}."
            )

        return result
18 changes: 11 additions & 7 deletions esmvalcore/iris_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,9 +631,13 @@ def dataset_to_iris(
with ignore_warnings_context(ignore_warnings):
cubes = conversion_func(dataset)

# Restore the lat/lon coordinate units that iris changes to degrees
for coord_name in ["latitude", "longitude"]:
for cube in cubes:
for cube in cubes:
# Iris works best with masked arrays, so change NaNs to masked values.
array_module = da if cube.has_lazy_data() else np
cube.data = array_module.ma.masked_invalid(cube.core_data())

# Restore the lat/lon coordinate units that iris changes to degrees
for coord_name in ["latitude", "longitude"]:
try:
coord = cube.coord(coord_name)
except iris.exceptions.CoordinateNotFoundError:
Expand All @@ -643,9 +647,9 @@ def dataset_to_iris(
ds_coord = ds_coords[coord.var_name]
coord.units = _get_attribute(ds_coord, "units")

# If possible, add the source file as an attribute to support
# grouping by file when calling fix_metadata.
if filepath is not None:
cube.attributes.globals["source_file"] = str(filepath)
# If possible, add the source file as an attribute to support
# grouping by file when calling fix_metadata.
if filepath is not None:
cube.attributes.globals["source_file"] = str(filepath)

return cubes
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ dependencies = [
"stratify>=0.3",
"xarray",
"yamale",
"zarr>3",
"zarr>2",
]
description = "A community tool for pre-processing data from Earth system models in CMIP and running analysis scripts"
license = "Apache-2.0"
Expand Down