Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/api/esmvalcore.io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ Submodules
esmvalcore.io.intake_esgf
esmvalcore.io.local
esmvalcore.io.protocol
esmvalcore.io.xcube
5 changes: 5 additions & 0 deletions doc/api/esmvalcore.io.xcube.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
esmvalcore.io.xcube
===================

.. automodule:: esmvalcore.io.xcube
:no-inherited-members:
8 changes: 7 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ dependencies:
- scipy >=1.6
- shapely >=2.0.0
- xarray
- xcube
- xcube-cci
- yamale
- zarr >3
- zarr >2
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

zarr3 is perfectly able to read zarr2 datasets, bud

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xcube requires zarr==2

Copy link
Copy Markdown
Contributor

@valeriupredoi valeriupredoi Dec 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well that's a bummer - that means it can't read Zarr3 spec?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also Zarr2 is borderline archaic - good luck to us trying to maintain such an environment

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like it's on the TODO list: xcube-dev/xcube#1182

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good, otherwise without a pixi env this would be diabolically hard to maintain 😁

# Python packages needed for building docs
- autodocsumm >=0.2.2
- ipython
Expand All @@ -66,3 +68,7 @@ dependencies:
- pytest-mock
- pytest-xdist
# Not on conda forge - vprof
- pip:
- git+https://github.com/ESMValGroup/fixer-prototype.git@main#subdirectory=packages/fixer
- git+https://github.com/ESMValGroup/fixer-prototype.git@main#subdirectory=packages/fixer-cmip7
- git+https://github.com/ESMValGroup/fixer-prototype.git@main#subdirectory=packages/fixer-esa-cci
25 changes: 25 additions & 0 deletions esmvalcore/config/configurations/data-xcube-esacci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Read data from the ESA Climate Data Centre (ESA CCI) using xcube.
# More information available at
# https://xcube.readthedocs.io/en/latest/dataaccess.html#esa-climate-data-centre-esa-cci-cciodp-ccizarr-esa-cci-kc.
#
# Three data stores are configured for the same project; the `priority`
# value determines which one is preferred (lower values win).
projects:
  ESACCI:
    data:
      ccizarr:
        type: "esmvalcore.io.xcube.XCubeDataSource"
        data_store_id: "ccizarr"
        priority: 1
        values: &values
          # Use this to define the mapping between the short_name used by
          # ESMValCore and the variable name in the source dataset.
          short_name:
            prw: tcwv
      esa-cci-kc:
        type: "esmvalcore.io.xcube.XCubeDataSource"
        data_store_id: "esa-cci-kc"
        priority: 2
        # Reuse the short_name mapping defined for the first store.
        values: *values
      cciodp:
        type: "esmvalcore.io.xcube.XCubeDataSource"
        data_store_id: "cciodp"
        priority: 3
        values: *values
7 changes: 7 additions & 0 deletions esmvalcore/config/configurations/defaults/cmor_tables.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ projects:
- cmip6/Tables
- cmip6-custom
strict: false
# Data from the ESA Climate Data Centre (ESA CCI).
ESACCI:
cmor_table:
type: esmvalcore.cmor.table.CMIP6Info
paths:
- cmip7/tables
- cmip6-custom
# Data from selected climate models that can be read in its native format by ESMValCore.
ACCESS:
cmor_table:
Expand Down
281 changes: 281 additions & 0 deletions esmvalcore/io/xcube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
"""Access data using `xcube <https://xcube.readthedocs.io>`_.

Run the command ``esmvaltool config copy data-xcube-esacci.yml`` to update
your :ref:`configuration <config-data-sources>` to use this module. This will
create a file with the following content in your configuration directory:

.. literalinclude:: ../configurations/data-xcube-esacci.yml
:language: yaml
:caption: Contents of ``data-xcube-esacci.yml``

"""

from __future__ import annotations

import copy
import fnmatch
import logging
from dataclasses import dataclass, field
from functools import cached_property
from typing import TYPE_CHECKING, Any

import xcube.core.store
from fixer import fix

import esmvalcore.io.protocol
from esmvalcore.iris_helpers import dataset_to_iris

if TYPE_CHECKING:
import iris.cube

from esmvalcore.typing import Facets, FacetValue


logger = logging.getLogger(__name__)

# Map ISO 8601 duration strings — as found in the dataset attribute
# ``time_coverage_resolution`` (see ``_get_frequency`` below) — to the
# frequency facet values used by ESMValCore.
FREQUENCIES = {
    "P1D": "day",
    "P1M": "mon",
    "P1Y": "yr",
}


@dataclass
class XCubeDataset(esmvalcore.io.protocol.DataElement):
    """A single dataset located through xcube_ that can be loaded on demand."""

    name: str
    """Unique identifier of the dataset within its store."""

    facets: Facets = field(repr=False)
    """Key-value pairs that were used to locate this dataset."""

    store: xcube.core.store.store.DataStore = field(repr=False)
    """The xcube data store that holds the dataset."""

    open_params: dict[str, Any] = field(default_factory=dict, repr=False)
    """Extra keyword arguments passed to the store when opening the data."""

    # Lazily populated cache of the dataset attributes; ``None`` until
    # either ``to_iris`` runs or the setter is called explicitly.
    _attributes: dict[str, Any] | None = field(
        init=False,
        repr=False,
        default=None,
    )

    def __hash__(self) -> int:
        """Return a number uniquely representing the data element."""
        key = (self.name, self.facets.get("version"))
        return hash(key)

    def prepare(self) -> None:
        """Prepare the data for access."""
        self.store.preload_data(self.name)

    @property
    def attributes(self) -> dict[str, Any]:
        """Key-value pairs describing the data.

        Raises
        ------
        ValueError
            If the attributes have not been read or assigned yet.
        """
        if self._attributes is not None:
            return self._attributes
        msg = (
            "Attributes have not been read yet. Call the `to_iris` method "
            "first to read the attributes from the file."
        )
        raise ValueError(msg)

    @attributes.setter
    def attributes(self, value: dict[str, Any]) -> None:
        self._attributes = value

    def to_iris(self) -> iris.cube.CubeList:
        """Load the data as Iris cubes.

        Returns
        -------
        :
            The loaded data.
        """
        ds = self.store.open_data(self.name, **self.open_params)
        ds = fix(ds, self.name)
        # Record where the data came from so later processing steps can
        # group cubes by their source.
        ds.attrs["source_file"] = repr(self)

        # Cache a snapshot of the attributes before conversion.
        self.attributes = copy.deepcopy(ds.attrs)
        return dataset_to_iris(ds)


_DATASETS_LOGGED: set[str] = set()


@dataclass
class XCubeDataSource(esmvalcore.io.protocol.DataSource):
    """Data source for finding data in an xcube data store.

    Datasets are discovered by listing the data ids available in the
    configured xcube data store and matching them against the requested
    ``dataset`` and ``short_name`` facets.
    """

    name: str
    """A name identifying the data source."""

    project: str
    """The project that the data source provides data for."""

    priority: int
    """The priority of the data source. Lower values have priority."""

    debug_info: str = field(init=False, repr=False, default="")
    """A string containing debug information when no data is found."""

    data_store_id: str
    """Name of the data store.

    A list of available data stores can be found in the `xcube documentation
    <https://xcube.readthedocs.io/en/latest/dataaccess.html#available-data-stores>`__.
    """

    values: dict[str, dict[str, str]] = field(default_factory=dict)
    """Mapping between the ESMValCore and xcube facet values."""

    data_store_params: dict[str, Any] = field(default_factory=dict, repr=False)
    """Parameters to use when creating the data store."""

    open_params: dict[str, Any] = field(default_factory=dict, repr=False)
    """Parameters to use when opening the data."""

    @cached_property
    def store(self) -> xcube.core.store.store.DataStore:
        """The store containing the data."""
        return xcube.core.store.new_data_store(
            self.data_store_id,
            **self.data_store_params,
        )

    @cached_property
    def _available_datasets(self) -> list[str]:
        # Cached once per instance; listing the remote store can be slow.
        return self.store.list_data_ids()

    def _get_frequency(
        self,
        description: xcube.core.store.descriptor.DatasetDescriptor,
    ) -> str | None:
        """Get the frequency of the dataset.

        Parameters
        ----------
        description:
            The description of the dataset to get the frequency from.

        Returns
        -------
        :
            The frequency of the dataset, or ``None`` if it cannot be determined.
        """
        # Prefer the ISO 8601 duration in the `time_coverage_resolution`
        # attribute, translated via the module-level FREQUENCIES mapping.
        frequency = FREQUENCIES.get(
            description.attrs.get("time_coverage_resolution", ""),
        )
        if not frequency:
            # Fall back to looking for a frequency token (e.g. "mon") among
            # the dot-separated components of the data id; only accept it
            # when the match is unambiguous.
            values = [
                v
                for v in FREQUENCIES.values()
                if v in description.data_id.split(".")
            ]
            if len(values) == 1:
                frequency = values[0]
        return frequency

    def find_data(self, **facets: FacetValue) -> list[XCubeDataset]:
        """Find data.

        Parameters
        ----------
        **facets :
            Find data matching these facets. The ``dataset`` and
            ``short_name`` facets may contain ``fnmatch``-style wildcards;
            both default to ``"*"`` when not given.

        Returns
        -------
        :
            A list of data elements that have been found.
        """
        result = []
        # Normalize the requested short names to a list of strings, then
        # translate them to the variable names used in the xcube store
        # (unmapped names are passed through unchanged).
        requested_short_names = facets.get("short_name", "*")
        if isinstance(requested_short_names, str | int | float):
            requested_short_names = [str(requested_short_names)]
        requested_xcube_short_names = [
            self.values.get("short_name", {}).get(short_name, short_name)
            for short_name in requested_short_names
        ]
        # Normalize the requested dataset patterns to a list of strings.
        requested_datasets = facets.get("dataset", "*")
        if isinstance(requested_datasets, str | int | float):
            requested_datasets = [str(requested_datasets)]

        # Pre-populate debug_info with the "nothing matched" message; it is
        # overwritten below as soon as something more specific is known.
        self.debug_info = (
            "No dataset matching "
            + ", ".join(f"'{d}'" for d in requested_datasets)
            + f" was found in {self.data_store_id}. Available datasets are:\n"
            + "\n".join(sorted(self._available_datasets))
        )
        # NOTE(review): if a data_id matches more than one requested dataset
        # pattern it is appended to the result once per matching pattern —
        # confirm this duplication is intended (with the default single "*"
        # pattern it cannot occur).
        for data_id in self._available_datasets:
            for dataset_pattern in requested_datasets:
                if fnmatch.fnmatchcase(data_id, dataset_pattern):
                    description: xcube.core.store.descriptor.DatasetDescriptor = self.store.describe_data(
                        data_id,
                    )
                    # Skip entries whose description is not a plain
                    # DatasetDescriptor.
                    if not isinstance(
                        description,
                        xcube.core.store.descriptor.DatasetDescriptor,
                    ):
                        continue
                    # Match the variables available in the dataset against
                    # the requested (xcube-side) variable name patterns.
                    available_xcube_short_names = list(description.data_vars)
                    xcube_short_names = [
                        short_name
                        for short_name in available_xcube_short_names
                        for short_name_pattern in requested_xcube_short_names
                        if fnmatch.fnmatchcase(short_name, short_name_pattern)
                    ]
                    if not xcube_short_names:
                        self.debug_info = (
                            "No variable matching "
                            + ", ".join(
                                f"'{s}'" for s in requested_xcube_short_names
                            )
                            + f" was found in dataset '{data_id}'. Available variables are:\n"
                            + "\n".join(sorted(available_xcube_short_names))
                        )
                        continue

                    # Build a "start/end" timerange facet, stripping the
                    # dashes from the ISO dates.
                    timerange = f"{description.time_range[0]}/{description.time_range[1]}".replace(
                        "-",
                        "",
                    )
                    # Reverse-map the matched xcube variable names back to
                    # ESMValCore short names via the configured mapping.
                    # NOTE(review): variables without an entry in the mapping
                    # end up with an empty `short_names` list — confirm that
                    # an empty/list-valued short_name facet is acceptable to
                    # callers.
                    short_names = [
                        short_name
                        for short_name, xcube_short_name in self.values.get(
                            "short_name",
                            {},
                        ).items()
                        if xcube_short_name in xcube_short_names
                    ]
                    dataset = XCubeDataset(
                        name=data_id,
                        facets={
                            "dataset": data_id,
                            # A single matched name is stored as a scalar,
                            # otherwise the full list is kept.
                            "short_name": (
                                short_names[0]
                                if len(short_names) == 1
                                else short_names
                            ),
                            "timerange": timerange,
                        },
                        store=self.store,
                        open_params=copy.deepcopy(self.open_params),
                    )
                    # Only add the frequency facet when it could be
                    # determined from the description.
                    if frequency := self._get_frequency(description):
                        dataset.facets["frequency"] = frequency
                    dataset.attributes = description.attrs

                    result.append(dataset)

        if result:
            self.debug_info = (
                f"Found dataset{'' if len(result) == 1 else 's'} "
                f"{', '.join(d.name for d in result)} in data store "
                f"{self.data_store_id}."
            )

        return result
18 changes: 11 additions & 7 deletions esmvalcore/iris_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,9 +631,13 @@ def dataset_to_iris(
with ignore_warnings_context(ignore_warnings):
cubes = conversion_func(dataset)

# Restore the lat/lon coordinate units that iris changes to degrees
for coord_name in ["latitude", "longitude"]:
for cube in cubes:
for cube in cubes:
# Iris works best with masked arrays, so change NaNs to masked values.
array_module = da if cube.has_lazy_data() else np
cube.data = array_module.ma.masked_invalid(cube.core_data())

# Restore the lat/lon coordinate units that iris changes to degrees
for coord_name in ["latitude", "longitude"]:
try:
coord = cube.coord(coord_name)
except iris.exceptions.CoordinateNotFoundError:
Expand All @@ -643,9 +647,9 @@ def dataset_to_iris(
ds_coord = ds_coords[coord.var_name]
coord.units = _get_attribute(ds_coord, "units")

# If possible, add the source file as an attribute to support
# grouping by file when calling fix_metadata.
if filepath is not None:
cube.attributes.globals["source_file"] = str(filepath)
# If possible, add the source file as an attribute to support
# grouping by file when calling fix_metadata.
if filepath is not None:
cube.attributes.globals["source_file"] = str(filepath)

return cubes
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ dependencies = [
"stratify>=0.3",
"xarray",
"yamale",
"zarr>3",
"zarr>2",
]
description = "A community tool for pre-processing data from Earth system models in CMIP and running analysis scripts"
license = "Apache-2.0"
Expand Down