From 16e11666ff0f09e48a6185efb50546987c2c96b8 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 6 Mar 2026 15:20:17 +0100 Subject: [PATCH 01/12] enhancement to add stac catalog and item for a product along with the existing collection --- CHANGES.md | 20 +++ deep_code/constants.py | 1 + deep_code/tests/tools/test_publish.py | 126 ++++++++++++++++ .../utils/test_dataset_stac_generator.py | 131 +++++++++++++++- deep_code/tools/new.py | 14 ++ deep_code/tools/publish.py | 87 +++++++++-- deep_code/utils/dataset_stac_generator.py | 141 +++++++++++++++++- 7 files changed, 508 insertions(+), 12 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 0000658..fcce0d5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -60,3 +60,23 @@ - Fixed a bug in build_child_link_to_related_experiment for the publish mode `"all"`. ## Changes in 0.1.8 (in Development) + +- Fixed a crash in workflow publishing when `jupyter_notebook_url` is not provided in + the workflow config. `jupyter_kernel_info`, `application_link`, and `jnb_open_link` + are now only computed when a notebook URL is present, making the field truly optional. + +- Added STAC Item and S3-hosted STAC Catalog generation for Zarr datasets, enabling + a richer `STAC Collection → STAC Catalog (S3) → STAC Item` hierarchy alongside the + existing OSC metadata. + - A single STAC Item is generated per Zarr store, covering the full spatiotemporal + extent with two assets: `zarr-data` (`application/vnd+zarr`) and + `zarr-consolidated-metadata` (`.zmetadata`). + - The S3 STAC catalog and item are written directly to S3 via `fsspec`/`s3fs` + independently of the GitHub PR. + - The OSC STAC Collection gains a `child` link pointing to the S3 catalog root, + connecting the two levels of the hierarchy. + - Opt-in via the new `stac_catalog_s3_root` field in `dataset_config.yaml` + (e.g. `stac_catalog_s3_root: s3://my-bucket/stac/my-collection/`). 
+ - S3 write credentials are resolved from `S3_USER_STORAGE_KEY`/`S3_USER_STORAGE_SECRET`, + `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY`, or the boto3 default chain + (IAM role, `~/.aws/credentials`) — no secrets in config files. diff --git a/deep_code/constants.py b/deep_code/constants.py index 9d01cae..1e8cdcb 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -39,3 +39,4 @@ APPLICATION_STAC_EXTENSION_SPEC = ( "https://stac-extensions.github.io/application/v0.1.0/schema.json" ) +ZARR_MEDIA_TYPE = "application/vnd+zarr" diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 42dbc47..a83854e 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -1,4 +1,5 @@ import json +import os import tempfile import unittest from pathlib import Path @@ -174,6 +175,131 @@ def test_publish_builds_pr_params(self, mock_wf, mock_ds): assert "workflow/experiment: wf" in kwargs["pr_title"] + # ------------------------------------------------------------------ + # S3 credential resolution + # ------------------------------------------------------------------ + + def test_get_stac_s3_storage_options_prefers_xcube_env_vars(self): + env = { + "S3_USER_STORAGE_KEY": "xcube-key", + "S3_USER_STORAGE_SECRET": "xcube-secret", + "AWS_ACCESS_KEY_ID": "aws-key", + "AWS_SECRET_ACCESS_KEY": "aws-secret", + } + with patch.dict(os.environ, env): + opts = self.publisher._get_stac_s3_storage_options() + self.assertEqual(opts["key"], "xcube-key") + self.assertEqual(opts["secret"], "xcube-secret") + + def test_get_stac_s3_storage_options_falls_back_to_aws_env_vars(self): + env = {"AWS_ACCESS_KEY_ID": "aws-key", "AWS_SECRET_ACCESS_KEY": "aws-secret"} + # Ensure xcube vars are absent + patched_env = { + k: v + for k, v in os.environ.items() + if k not in ("S3_USER_STORAGE_KEY", "S3_USER_STORAGE_SECRET") + } + patched_env.update(env) + with patch.dict(os.environ, patched_env, clear=True): + opts = 
self.publisher._get_stac_s3_storage_options() + self.assertEqual(opts["key"], "aws-key") + self.assertEqual(opts["secret"], "aws-secret") + + def test_get_stac_s3_storage_options_returns_empty_for_boto3_chain(self): + no_cred_env = { + k: v + for k, v in os.environ.items() + if k + not in ( + "S3_USER_STORAGE_KEY", + "S3_USER_STORAGE_SECRET", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + ) + } + with patch.dict(os.environ, no_cred_env, clear=True): + opts = self.publisher._get_stac_s3_storage_options() + self.assertEqual(opts, {}) + + # ------------------------------------------------------------------ + # S3 write helper + # ------------------------------------------------------------------ + + @patch("deep_code.tools.publish.fsspec.open") + def test_write_stac_catalog_to_s3(self, mock_fsspec_open): + mock_file = MagicMock() + mock_ctx = MagicMock() + mock_ctx.__enter__ = MagicMock(return_value=mock_file) + mock_ctx.__exit__ = MagicMock(return_value=False) + mock_fsspec_open.return_value = mock_ctx + + file_dict = { + "s3://bucket/catalog.json": {"type": "Catalog", "id": "test"}, + "s3://bucket/col/item.json": {"type": "Feature", "id": "item"}, + } + self.publisher._write_stac_catalog_to_s3( + file_dict, {"key": "k", "secret": "s"} + ) + + self.assertEqual(mock_fsspec_open.call_count, 2) + mock_fsspec_open.assert_any_call( + "s3://bucket/catalog.json", "w", key="k", secret="s" + ) + mock_fsspec_open.assert_any_call( + "s3://bucket/col/item.json", "w", key="k", secret="s" + ) + + # ------------------------------------------------------------------ + # End-to-end zarr STAC publishing wired into publish() + # ------------------------------------------------------------------ + + @patch("deep_code.tools.publish.fsspec.open") + @patch.object(Publisher, "publish_dataset", return_value={"github_file.json": {}}) + def test_publish_writes_zarr_stac_to_s3_when_configured( + self, mock_publish_ds, mock_fsspec_open + ): + 
self.publisher.dataset_config["stac_catalog_s3_root"] = ( + "s3://test-bucket/stac/" + ) + + mock_ctx = MagicMock() + mock_ctx.__enter__ = MagicMock(return_value=MagicMock()) + mock_ctx.__exit__ = MagicMock(return_value=False) + mock_fsspec_open.return_value = mock_ctx + + mock_generator = MagicMock() + mock_generator.build_zarr_stac_catalog_file_dict.return_value = { + "s3://test-bucket/stac/catalog.json": {"type": "Catalog"}, + "s3://test-bucket/stac/test-collection/item.json": {"type": "Feature"}, + } + # Simulate what publish_dataset() normally does: store the generator + self.publisher._last_generator = mock_generator + self.publisher.gh_publisher.publish_files.return_value = "PR_URL" + + self.publisher.publish(mode="dataset") + + mock_generator.build_zarr_stac_catalog_file_dict.assert_called_once_with( + "s3://test-bucket/stac/" + ) + # Two S3 files written: catalog.json + item.json + self.assertEqual(mock_fsspec_open.call_count, 2) + + @patch.object(Publisher, "publish_dataset", return_value={"github_file.json": {}}) + def test_publish_skips_zarr_stac_when_not_configured(self, mock_publish_ds): + # No stac_catalog_s3_root in config + self.publisher.dataset_config = { + "collection_id": "test-collection", + "dataset_id": "test-dataset", + } + self.publisher.gh_publisher.publish_files.return_value = "PR_URL" + + with patch.object( + self.publisher, "_write_stac_catalog_to_s3" + ) as mock_write: + self.publisher.publish(mode="dataset") + mock_write.assert_not_called() + + class TestParseGithubNotebookUrl: @pytest.mark.parametrize( "url,repo_url,repo_name,branch,file_path", diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 1e47b58..47dffcb 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -8,7 +8,7 @@ from unittest.mock import MagicMock, patch import numpy as np -from pystac import Catalog, Collection +from 
pystac import Catalog, Collection, Item from xarray import DataArray, Dataset from deep_code.constants import ( @@ -16,6 +16,7 @@ OSC_THEME_SCHEME, PRODUCT_BASE_CATALOG_SELF_HREF, VARIABLE_BASE_CATALOG_SELF_HREF, + ZARR_MEDIA_TYPE, ) from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator, Theme @@ -177,6 +178,134 @@ def test_update_deepesdl_collection(self, mock_from_file): self.assertGreaterEqual(calls, 1 + len(self.generator.osc_themes)) mock_coll.set_self_href.assert_called_once_with(DEEPESDL_COLLECTION_SELF_HREF) + # ------------------------------------------------------------------ + # Zarr STAC Item / Catalog generation + # ------------------------------------------------------------------ + + def test_build_zarr_stac_item_structure(self): + """Item has correct geometry, bbox, datetime range, assets, and links.""" + s3_root = "s3://test-bucket/stac/my-collection/" + item = self.generator.build_zarr_stac_item(s3_root) + + self.assertIsInstance(item, Item) + self.assertEqual(item.id, "mock-collection-id") + + # Spatial + self.assertEqual(item.bbox, [-180.0, -90.0, 180.0, 90.0]) + self.assertEqual(item.geometry["type"], "Polygon") + coords = item.geometry["coordinates"][0] + self.assertEqual(len(coords), 5) # closed ring + + # Temporal — datetime must be null; start/end in properties + self.assertIsNone(item.datetime) + self.assertIn("start_datetime", item.properties) + self.assertIn("end_datetime", item.properties) + # Timezone-aware ISO strings + self.assertTrue(item.properties["start_datetime"].endswith("+00:00")) + self.assertTrue(item.properties["end_datetime"].endswith("+00:00")) + + # Assets + self.assertIn("zarr-data", item.assets) + self.assertIn("zarr-consolidated-metadata", item.assets) + + zarr_asset = item.assets["zarr-data"] + self.assertEqual(zarr_asset.href, "s3://mock-bucket/mock-dataset") + self.assertEqual(zarr_asset.media_type, ZARR_MEDIA_TYPE) + self.assertIn("data", zarr_asset.roles) + + meta_asset = 
item.assets["zarr-consolidated-metadata"] + self.assertEqual( + meta_asset.href, "s3://mock-bucket/mock-dataset/.zmetadata" + ) + self.assertIn("metadata", meta_asset.roles) + + # Self href + self.assertEqual( + item.self_href, + "s3://test-bucket/stac/my-collection/mock-collection-id/item.json", + ) + + # Required link rels + link_rels = {link.rel for link in item.links} + self.assertIn("root", link_rels) + self.assertIn("parent", link_rels) + self.assertIn("collection", link_rels) + + # root and parent point to the S3 catalog + root_link = next(l for l in item.links if l.rel == "root") + self.assertEqual( + root_link.target, + "s3://test-bucket/stac/my-collection/catalog.json", + ) + + # collection link points to the OSC GitHub collection + coll_link = next(l for l in item.links if l.rel == "collection") + self.assertIn("open-science-catalog-metadata", coll_link.target) + self.assertIn("mock-collection-id", coll_link.target) + + def test_build_zarr_stac_item_trailing_slash_normalised(self): + """Trailing slash on s3_root should not produce double slashes.""" + item_with = self.generator.build_zarr_stac_item("s3://bucket/stac/") + item_without = self.generator.build_zarr_stac_item("s3://bucket/stac") + self.assertEqual(item_with.self_href, item_without.self_href) + + def test_build_zarr_stac_catalog_file_dict_keys(self): + """file_dict contains exactly the catalog and item S3 paths.""" + s3_root = "s3://test-bucket/stac/my-collection/" + file_dict = self.generator.build_zarr_stac_catalog_file_dict(s3_root) + + catalog_path = "s3://test-bucket/stac/my-collection/catalog.json" + item_path = ( + "s3://test-bucket/stac/my-collection/mock-collection-id/item.json" + ) + self.assertIn(catalog_path, file_dict) + self.assertIn(item_path, file_dict) + self.assertEqual(len(file_dict), 2) + + def test_build_zarr_stac_catalog_file_dict_content(self): + """Catalog dict is type Catalog; item dict is type Feature with assets.""" + s3_root = "s3://test-bucket/stac/my-collection/" 
+ file_dict = self.generator.build_zarr_stac_catalog_file_dict(s3_root) + + catalog_dict = file_dict["s3://test-bucket/stac/my-collection/catalog.json"] + self.assertEqual(catalog_dict["type"], "Catalog") + self.assertEqual(catalog_dict["id"], "mock-collection-id-stac-catalog") + + item_dict = file_dict[ + "s3://test-bucket/stac/my-collection/mock-collection-id/item.json" + ] + self.assertEqual(item_dict["type"], "Feature") + self.assertEqual(item_dict["id"], "mock-collection-id") + self.assertIn("assets", item_dict) + self.assertIn("zarr-data", item_dict["assets"]) + self.assertIn("zarr-consolidated-metadata", item_dict["assets"]) + + def test_build_dataset_stac_collection_adds_s3_catalog_child_link(self): + """Child link to S3 catalog is added when stac_catalog_s3_root is provided.""" + s3_root = "s3://test-bucket/stac/my-collection/" + collection = self.generator.build_dataset_stac_collection( + mode="dataset", stac_catalog_s3_root=s3_root + ) + child_links = [l for l in collection.links if l.rel == "child"] + s3_child = next( + (l for l in child_links if "s3://" in str(l.target)), None + ) + self.assertIsNotNone(s3_child, "Expected a child link pointing to S3 catalog") + self.assertEqual( + s3_child.target, + "s3://test-bucket/stac/my-collection/catalog.json", + ) + + def test_build_dataset_stac_collection_no_s3_child_link_by_default(self): + """No S3 child link is added when stac_catalog_s3_root is absent.""" + collection = self.generator.build_dataset_stac_collection(mode="dataset") + s3_child_links = [ + l + for l in collection.links + if l.rel == "child" and "s3://" in str(getattr(l, "target", "")) + ] + self.assertEqual(len(s3_child_links), 0) + class TestFormatString(unittest.TestCase): def test_single_word(self): diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index 3e1bd08..485d7f0 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -79,8 +79,22 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str: 
template, sort_keys=False, width=1000, default_flow_style=False ) + stac_catalog_comment = ( + "\n# Optional: publish a STAC Catalog + Item to S3 alongside the Zarr.\n" + "# When set, deep-code writes:\n" + "# {stac_catalog_s3_root}/catalog.json (STAC Catalog root)\n" + "# {stac_catalog_s3_root}/{collection_id}/item.json (STAC Item for the whole Zarr)\n" + "# and adds a 'child' link from the OSC Collection to this S3 catalog.\n" + "# S3 write credentials are resolved in order from:\n" + "# 1. S3_USER_STORAGE_KEY / S3_USER_STORAGE_SECRET env vars\n" + "# 2. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY env vars\n" + "# 3. boto3 default chain (IAM role, ~/.aws/credentials)\n" + "# stac_catalog_s3_root: s3://[YOUR-BUCKET]/stac/[collection-id]/\n" + ) + if output_path: with open(output_path, "w") as f: f.write("# Complete Dataset Configuration Template\n") f.write("# Replace all [PLACEHOLDER] values with your actual data\n\n") f.write(yaml_str) + f.write(stac_catalog_comment) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 8eec27c..4055904 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -3,7 +3,9 @@ # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. 
import copy +import json import logging +import os from datetime import datetime from pathlib import Path from typing import Any, Literal @@ -286,9 +288,14 @@ def publish_dataset( osc_themes=osc_themes, cf_params=cf_params, ) + # Store so publish() can reuse it for zarr STAC catalog generation + self._last_generator = generator + stac_catalog_s3_root = self.dataset_config.get("stac_catalog_s3_root") variable_ids = generator.get_variable_ids() - ds_collection = generator.build_dataset_stac_collection(mode=mode) + ds_collection = generator.build_dataset_stac_collection( + mode=mode, stac_catalog_s3_root=stac_catalog_s3_root + ) # Prepare a dictionary of file paths and content file_dict = {} @@ -416,17 +423,21 @@ def generate_workflow_experiment_records( wf_record_properties = rg.build_record_properties(properties_list, contacts) # make a copy for experiment record exp_record_properties = copy.deepcopy(wf_record_properties) - jupyter_kernel_info = wf_record_properties.jupyter_kernel_info.to_dict() + jupyter_kernel_info = {} + if jupyter_notebook_url: + jupyter_kernel_info = wf_record_properties.jupyter_kernel_info.to_dict() link_builder = LinksBuilder(osc_themes, jupyter_kernel_info) theme_links = link_builder.build_theme_links_for_records() - application_link = link_builder.build_link_to_jnb( - self.workflow_title, jupyter_notebook_url - ) - jnb_open_link = link_builder.make_related_link_for_opening_jnb_from_github( - jupyter_notebook_url=jupyter_notebook_url - ) - + application_link = [] + jnb_open_link = [] + if jupyter_notebook_url: + jnb_open_link = link_builder.make_related_link_for_opening_jnb_from_github( + jupyter_notebook_url=jupyter_notebook_url + ) + application_link = link_builder.build_link_to_jnb( + self.workflow_title, jupyter_notebook_url + ) workflow_record = WorkflowAsOgcRecord( id=workflow_id, type="Feature", @@ -505,6 +516,49 @@ def generate_workflow_experiment_records( return {} return file_dict + def _get_stac_s3_storage_options(self) -> dict: + 
"""Resolve S3 credentials for writing the STAC catalog. + + Priority (first match wins): + + 1. xcube user-storage env vars — ``S3_USER_STORAGE_KEY`` / + ``S3_USER_STORAGE_SECRET`` (already used by :func:`open_dataset`). + 2. Standard AWS env vars — ``AWS_ACCESS_KEY_ID`` / + ``AWS_SECRET_ACCESS_KEY``. + 3. boto3 default credential chain — IAM role attached to the + JupyterHub pod, ``~/.aws/credentials`` profile, etc. + An empty ``storage_options`` dict lets ``s3fs`` fall through + to this chain automatically; no secrets are required in code. + + .. note:: + **JupyterHub best practice**: prefer IAM roles (instance / pod + identity) over env-var credentials. IAM roles are scoped to the + specific S3 prefix the user owns, require no secret rotation, and + are never visible to other users on the hub. Per-user env vars + set by the JupyterHub spawner (not server-wide) are an acceptable + fallback — they are private to each user's server process. + Avoid hard-coding credentials in YAML config files. + """ + key = os.environ.get("S3_USER_STORAGE_KEY") or os.environ.get( + "AWS_ACCESS_KEY_ID" + ) + secret = os.environ.get("S3_USER_STORAGE_SECRET") or os.environ.get( + "AWS_SECRET_ACCESS_KEY" + ) + if key and secret: + return {"key": key, "secret": secret} + # Fall through to boto3 chain (IAM role / ~/.aws/credentials) + return {} + + def _write_stac_catalog_to_s3( + self, file_dict: dict[str, dict], storage_options: dict + ) -> None: + """Write STAC catalog and item JSON files to S3 via fsspec/s3fs.""" + for s3_path, content in file_dict.items(): + logger.info(f"Writing STAC file to {s3_path}") + with fsspec.open(s3_path, "w", **storage_options) as f: + json.dump(content, f, indent=2) + def publish( self, write_to_file: bool = False, @@ -530,6 +584,21 @@ def publish( ds_files = self.publish_dataset(write_to_file=False, mode=mode) files.update(ds_files) + # Publish STAC catalog + item to S3 when stac_catalog_s3_root is configured. 
+ # This is independent of the GitHub PR and happens immediately. + stac_catalog_s3_root = self.dataset_config.get("stac_catalog_s3_root") + if stac_catalog_s3_root and hasattr(self, "_last_generator"): + logger.info( + f"Publishing STAC catalog to S3: {stac_catalog_s3_root}" + ) + zarr_stac_files = self._last_generator.build_zarr_stac_catalog_file_dict( + stac_catalog_s3_root + ) + self._write_stac_catalog_to_s3( + zarr_stac_files, self._get_stac_s3_storage_options() + ) + logger.info("STAC catalog written to S3.") + if mode in ("workflow", "all"): wf_files = self.generate_workflow_experiment_records( write_to_file=False, mode=mode diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 29ff3b7..71e783b 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -7,13 +7,15 @@ from datetime import datetime, timezone import pandas as pd -from pystac import Catalog, Collection, Extent, Link, SpatialExtent, TemporalExtent +import pystac +from pystac import Catalog, Collection, Extent, Item, Asset, Link, SpatialExtent, TemporalExtent from deep_code.constants import ( DEEPESDL_COLLECTION_SELF_HREF, OSC_THEME_SCHEME, PRODUCT_BASE_CATALOG_SELF_HREF, VARIABLE_BASE_CATALOG_SELF_HREF, + ZARR_MEDIA_TYPE, ) from deep_code.utils.helper import open_dataset from deep_code.utils.ogc_api_record import Theme, ThemeConcept @@ -377,7 +379,132 @@ def build_theme(osc_themes: list[str]) -> Theme: concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes] return Theme(concepts=concepts, scheme=OSC_THEME_SCHEME) - def build_dataset_stac_collection(self, mode: str) -> Collection: + def build_zarr_stac_item(self, stac_catalog_s3_root: str) -> Item: + """Build a single STAC Item representing the entire Zarr store. + + One item covers the full spatiotemporal extent of the dataset. + Assets point to the Zarr store and its consolidated metadata. 
+ + Args: + stac_catalog_s3_root: S3 root URL where the STAC catalog will be hosted + (e.g. ``s3://my-bucket/stac/``). Used to build self/root/parent hrefs. + + Returns: + A :class:`pystac.Item` ready to be serialised to S3. + """ + spatial_extent = self._get_spatial_extent() + temporal_extent = self._get_temporal_extent() + general_metadata = self._get_general_metadata() + + bbox = spatial_extent.bboxes[0] # [lon_min, lat_min, lon_max, lat_max] + lon_min, lat_min, lon_max, lat_max = bbox + geometry = { + "type": "Polygon", + "coordinates": [[ + [lon_min, lat_min], + [lon_max, lat_min], + [lon_max, lat_max], + [lon_min, lat_max], + [lon_min, lat_min], + ]], + } + + start_dt, end_dt = temporal_extent.intervals[0] + # Ensure UTC timezone so ISO strings are STAC-compliant + if start_dt is not None and start_dt.tzinfo is None: + start_dt = start_dt.replace(tzinfo=timezone.utc) + if end_dt is not None and end_dt.tzinfo is None: + end_dt = end_dt.replace(tzinfo=timezone.utc) + + now_iso = datetime.now(timezone.utc).isoformat() + root = stac_catalog_s3_root.rstrip("/") + catalog_href = f"{root}/catalog.json" + item_href = f"{root}/{self.collection_id}/item.json" + osc_collection_href = ( + "https://esa-earthcode.github.io/open-science-catalog-metadata" + f"/products/{self.collection_id}/collection.json" + ) + + item = Item( + id=self.collection_id, + geometry=geometry, + bbox=bbox, + datetime=None, + properties={ + "start_datetime": start_dt.isoformat() if start_dt else None, + "end_datetime": end_dt.isoformat() if end_dt else None, + "description": general_metadata.get("description", ""), + "created": now_iso, + "updated": now_iso, + }, + ) + item.collection_id = self.collection_id + item.set_self_href(item_href) + item.add_link(Link(rel="root", target=catalog_href, media_type="application/json")) + item.add_link(Link(rel="parent", target=catalog_href, media_type="application/json")) + item.add_link(Link( + rel="collection", + target=osc_collection_href, + 
media_type="application/json", + title=self.collection_id, + )) + item.add_asset("zarr-data", Asset( + href=self.access_link, + media_type=ZARR_MEDIA_TYPE, + title="Zarr Data Store", + roles=["data"], + )) + item.add_asset("zarr-consolidated-metadata", Asset( + href=f"{self.access_link}/.zmetadata", + media_type="application/json", + title="Consolidated Zarr Metadata", + roles=["metadata"], + )) + return item + + def build_zarr_stac_catalog_file_dict( + self, stac_catalog_s3_root: str + ) -> dict[str, dict]: + """Generate the STAC Catalog and Item JSON for the Zarr store. + + The catalog acts as the root for the dataset-level STAC hierarchy + that lives on S3 alongside the data:: + + {stac_catalog_s3_root}/ + ├── catalog.json # STAC Catalog (root) + └── {collection_id}/ + └── item.json # STAC Item (whole Zarr) + + Args: + stac_catalog_s3_root: S3 root URL (e.g. ``s3://my-bucket/stac/``). + + Returns: + ``{s3_path: content_dict}`` for every file to be written to S3. + """ + root = stac_catalog_s3_root.rstrip("/") + catalog_href = f"{root}/catalog.json" + + item = self.build_zarr_stac_item(stac_catalog_s3_root) + + catalog = Catalog( + id=f"{self.collection_id}-stac-catalog", + description=f"STAC Catalog for {self.collection_id}", + ) + catalog.set_self_href(catalog_href) + catalog.add_link(Link(rel="root", target=catalog_href, media_type="application/json")) + catalog.add_link(Link( + rel="item", + target=f"./{self.collection_id}/item.json", + media_type="application/json", + title=self.collection_id, + )) + + return { + catalog_href: catalog.to_dict(transform_hrefs=False), + f"{root}/{self.collection_id}/item.json": item.to_dict(transform_hrefs=False), + } + + def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | None = None) -> Collection: """Build an OSC STAC Collection for the dataset. 
Returns: @@ -496,6 +623,16 @@ def build_dataset_stac_collection(self, mode: str) -> Collection: collection.license = self.license_type + # Link to the S3-hosted STAC catalog when provided + if stac_catalog_s3_root: + catalog_href = stac_catalog_s3_root.rstrip("/") + "/catalog.json" + collection.add_link(Link( + rel="child", + target=catalog_href, + media_type="application/json", + title="STAC Catalog", + )) + # Validate OSC extension fields try: osc_extension.validate_extension() From 74ead07477e57c550701b3caea9128d49fbd9d13 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 6 Mar 2026 15:34:49 +0100 Subject: [PATCH 02/12] refactor according to ruff checks --- .../tests/utils/test_dataset_stac_generator.py | 14 +++++++------- deep_code/utils/dataset_stac_generator.py | 1 - 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 47dffcb..6704802 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -232,14 +232,14 @@ def test_build_zarr_stac_item_structure(self): self.assertIn("collection", link_rels) # root and parent point to the S3 catalog - root_link = next(l for l in item.links if l.rel == "root") + root_link = next(lnk for lnk in item.links if lnk.rel == "root") self.assertEqual( root_link.target, "s3://test-bucket/stac/my-collection/catalog.json", ) # collection link points to the OSC GitHub collection - coll_link = next(l for l in item.links if l.rel == "collection") + coll_link = next(lnk for lnk in item.links if lnk.rel == "collection") self.assertIn("open-science-catalog-metadata", coll_link.target) self.assertIn("mock-collection-id", coll_link.target) @@ -286,9 +286,9 @@ def test_build_dataset_stac_collection_adds_s3_catalog_child_link(self): collection = self.generator.build_dataset_stac_collection( mode="dataset", stac_catalog_s3_root=s3_root ) - child_links = [l 
for l in collection.links if l.rel == "child"] + child_links = [lnk for lnk in collection.links if lnk.rel == "child"] s3_child = next( - (l for l in child_links if "s3://" in str(l.target)), None + (lnk for lnk in child_links if "s3://" in str(lnk.target)), None ) self.assertIsNotNone(s3_child, "Expected a child link pointing to S3 catalog") self.assertEqual( @@ -300,9 +300,9 @@ def test_build_dataset_stac_collection_no_s3_child_link_by_default(self): """No S3 child link is added when stac_catalog_s3_root is absent.""" collection = self.generator.build_dataset_stac_collection(mode="dataset") s3_child_links = [ - l - for l in collection.links - if l.rel == "child" and "s3://" in str(getattr(l, "target", "")) + lnk + for lnk in collection.links + if lnk.rel == "child" and "s3://" in str(getattr(lnk, "target", "")) ] self.assertEqual(len(s3_child_links), 0) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 71e783b..60a2298 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -7,7 +7,6 @@ from datetime import datetime, timezone import pandas as pd -import pystac from pystac import Catalog, Collection, Extent, Item, Asset, Link, SpatialExtent, TemporalExtent from deep_code.constants import ( From 92465569a622028ad96ec3716b65e9eb2129018e Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 6 Mar 2026 19:19:52 +0100 Subject: [PATCH 03/12] =?UTF-8?q?=20No=20pystac=20serialization=20?= =?UTF-8?q?=E2=80=94=20avoids=20all=20pystac=20link-reordering=20and=20hre?= =?UTF-8?q?f-transformation=20side=20effects?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.md | 6 +- deep_code/tests/tools/test_publish.py | 7 +- .../utils/test_dataset_stac_generator.py | 175 +++++++++++++----- deep_code/tools/publish.py | 14 +- deep_code/utils/dataset_stac_generator.py | 162 +++++++++------- deep_code/version.py | 2 +- 6 files changed, 237 
insertions(+), 129 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index fcce0d5..9f1abcf 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -73,8 +73,10 @@ `zarr-consolidated-metadata` (`.zmetadata`). - The S3 STAC catalog and item are written directly to S3 via `fsspec`/`s3fs` independently of the GitHub PR. - - The OSC STAC Collection gains a `child` link pointing to the S3 catalog root, - connecting the two levels of the hierarchy. + - The OSC STAC Collection gains a `via` link pointing to the S3 catalog root, + connecting the two levels of the hierarchy. (`child` is intentionally avoided + because the OSC validator requires every `child` link to resolve to a file inside + the metadata repository.) - Opt-in via the new `stac_catalog_s3_root` field in `dataset_config.yaml` (e.g. `stac_catalog_s3_root: s3://my-bucket/stac/my-collection/`). - S3 write credentials are resolved from `S3_USER_STORAGE_KEY`/`S3_USER_STORAGE_SECRET`, diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index a83854e..ba25029 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -190,10 +190,10 @@ def test_get_stac_s3_storage_options_prefers_xcube_env_vars(self): opts = self.publisher._get_stac_s3_storage_options() self.assertEqual(opts["key"], "xcube-key") self.assertEqual(opts["secret"], "xcube-secret") + self.assertEqual(opts["s3_additional_kwargs"], {"ACL": ""}) def test_get_stac_s3_storage_options_falls_back_to_aws_env_vars(self): env = {"AWS_ACCESS_KEY_ID": "aws-key", "AWS_SECRET_ACCESS_KEY": "aws-secret"} - # Ensure xcube vars are absent patched_env = { k: v for k, v in os.environ.items() @@ -204,8 +204,9 @@ def test_get_stac_s3_storage_options_falls_back_to_aws_env_vars(self): opts = self.publisher._get_stac_s3_storage_options() self.assertEqual(opts["key"], "aws-key") self.assertEqual(opts["secret"], "aws-secret") + self.assertEqual(opts["s3_additional_kwargs"], {"ACL": ""}) - def 
test_get_stac_s3_storage_options_returns_empty_for_boto3_chain(self): + def test_get_stac_s3_storage_options_returns_acl_suppression_for_boto3_chain(self): no_cred_env = { k: v for k, v in os.environ.items() @@ -219,7 +220,7 @@ def test_get_stac_s3_storage_options_returns_empty_for_boto3_chain(self): } with patch.dict(os.environ, no_cred_env, clear=True): opts = self.publisher._get_stac_s3_storage_options() - self.assertEqual(opts, {}) + self.assertEqual(opts, {"s3_additional_kwargs": {"ACL": ""}}) # ------------------------------------------------------------------ # S3 write helper diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 6704802..b73ec88 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -8,7 +8,7 @@ from unittest.mock import MagicMock, patch import numpy as np -from pystac import Catalog, Collection, Item +from pystac import Catalog, Item from xarray import DataArray, Dataset from deep_code.constants import ( @@ -141,42 +141,115 @@ def test_build_variable_catalog(self, mock_add_themes, mock_add_gcmd): # Self href ends with var1/catalog.json self.assertTrue(catalog.self_href.endswith("/var1/catalog.json")) - @patch("pystac.Catalog.from_file") - def test_update_product_base_catalog(self, mock_from_file): - """Test linking product catalog.""" - mock_cat = MagicMock(spec=Catalog) - mock_from_file.return_value = mock_cat - - result = self.generator.update_product_base_catalog("path.json") - self.assertIs(result, mock_cat) - mock_cat.add_link.assert_called_once() - mock_cat.set_self_href.assert_called_once_with(PRODUCT_BASE_CATALOG_SELF_HREF) - - @patch("pystac.Catalog.from_file") - def test_update_variable_base_catalog(self, mock_from_file): - """Test linking variable base catalog.""" - mock_cat = MagicMock(spec=Catalog) - mock_from_file.return_value = mock_cat + def test_update_product_base_catalog(self): 
+ """Child link is appended; existing links (including self) are untouched.""" + base = { + "type": "Catalog", + "id": "products", + "stac_version": "1.0.0", + "description": "Products", + "links": [ + { + "rel": "self", + "href": PRODUCT_BASE_CATALOG_SELF_HREF, + "type": "application/json", + } + ], + } + import tempfile, json as _json + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp: + _json.dump(base, tmp) + tmp_path = tmp.name + + result = self.generator.update_product_base_catalog(tmp_path) + + import os + + os.unlink(tmp_path) + + self.assertIsInstance(result, dict) + rels = [lnk["rel"] for lnk in result["links"]] + # self link must still be present and still first + self.assertEqual(result["links"][0]["rel"], "self") + self.assertEqual(result["links"][0]["href"], PRODUCT_BASE_CATALOG_SELF_HREF) + self.assertIn("child", rels) + child = next(lnk for lnk in result["links"] if lnk["rel"] == "child") + self.assertIn("mock-collection-id", child["href"]) + + def test_update_variable_base_catalog(self): + """Child links for each variable are appended.""" + base = { + "type": "Catalog", + "id": "variables", + "stac_version": "1.0.0", + "description": "Variables", + "links": [ + { + "rel": "self", + "href": VARIABLE_BASE_CATALOG_SELF_HREF, + "type": "application/json", + } + ], + } + import tempfile, json as _json, os + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp: + _json.dump(base, tmp) + tmp_path = tmp.name vars_ = ["v1", "v2"] - result = self.generator.update_variable_base_catalog("vars.json", vars_) - self.assertIs(result, mock_cat) - # Expect one add_link per variable - self.assertEqual(mock_cat.add_link.call_count, len(vars_)) - mock_cat.set_self_href.assert_called_once_with(VARIABLE_BASE_CATALOG_SELF_HREF) - - @patch("pystac.Collection.from_file") - def test_update_deepesdl_collection(self, mock_from_file): - """Test updating DeepESDL collection.""" - mock_coll = 
MagicMock(spec=Collection) - mock_from_file.return_value = mock_coll - - result = self.generator.update_deepesdl_collection("deep.json") - self.assertIs(result, mock_coll) - # Expect child and theme related links for each theme - calls = mock_coll.add_link.call_count - self.assertGreaterEqual(calls, 1 + len(self.generator.osc_themes)) - mock_coll.set_self_href.assert_called_once_with(DEEPESDL_COLLECTION_SELF_HREF) + result = self.generator.update_variable_base_catalog(tmp_path, vars_) + os.unlink(tmp_path) + + self.assertIsInstance(result, dict) + child_hrefs = [ + lnk["href"] for lnk in result["links"] if lnk["rel"] == "child" + ] + self.assertEqual(len(child_hrefs), len(vars_)) + # self link must remain in place + self.assertEqual(result["links"][0]["rel"], "self") + + def test_update_deepesdl_collection(self): + """Child and theme-related links are appended; existing links kept.""" + base = { + "type": "Collection", + "id": "deep-esdl", + "stac_version": "1.0.0", + "description": "DeepESDL", + "extent": {}, + "links": [ + { + "rel": "self", + "href": DEEPESDL_COLLECTION_SELF_HREF, + "type": "application/json", + } + ], + } + import tempfile, json as _json, os + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp: + _json.dump(base, tmp) + tmp_path = tmp.name + + result = self.generator.update_deepesdl_collection(tmp_path) + os.unlink(tmp_path) + + self.assertIsInstance(result, dict) + rels = [lnk["rel"] for lnk in result["links"]] + # child link added + self.assertIn("child", rels) + # one related link per theme + related = [lnk for lnk in result["links"] if lnk["rel"] == "related"] + self.assertGreaterEqual(len(related), len(self.generator.osc_themes)) + # self link still present + self.assertEqual(result["links"][0]["rel"], "self") # ------------------------------------------------------------------ # Zarr STAC Item / Catalog generation @@ -280,31 +353,39 @@ def test_build_zarr_stac_catalog_file_dict_content(self): 
self.assertIn("zarr-data", item_dict["assets"]) self.assertIn("zarr-consolidated-metadata", item_dict["assets"]) - def test_build_dataset_stac_collection_adds_s3_catalog_child_link(self): - """Child link to S3 catalog is added when stac_catalog_s3_root is provided.""" + def test_build_dataset_stac_collection_adds_s3_catalog_via_link(self): + """A 'via' link to the S3 catalog is added when stac_catalog_s3_root is provided. + + rel='via' is used (not 'child') because the OSC validator requires every + 'child' link to resolve to a file inside the metadata repository. + """ s3_root = "s3://test-bucket/stac/my-collection/" collection = self.generator.build_dataset_stac_collection( mode="dataset", stac_catalog_s3_root=s3_root ) - child_links = [lnk for lnk in collection.links if lnk.rel == "child"] - s3_child = next( - (lnk for lnk in child_links if "s3://" in str(lnk.target)), None + s3_via = next( + ( + lnk + for lnk in collection.links + if lnk.rel == "via" and "catalog.json" in str(lnk.target) + ), + None, ) - self.assertIsNotNone(s3_child, "Expected a child link pointing to S3 catalog") + self.assertIsNotNone(s3_via, "Expected a 'via' link pointing to S3 catalog") self.assertEqual( - s3_child.target, + s3_via.target, "s3://test-bucket/stac/my-collection/catalog.json", ) - def test_build_dataset_stac_collection_no_s3_child_link_by_default(self): - """No S3 child link is added when stac_catalog_s3_root is absent.""" + def test_build_dataset_stac_collection_no_s3_via_link_by_default(self): + """No S3 catalog 'via' link is added when stac_catalog_s3_root is absent.""" collection = self.generator.build_dataset_stac_collection(mode="dataset") - s3_child_links = [ + s3_catalog_links = [ lnk for lnk in collection.links - if lnk.rel == "child" and "s3://" in str(getattr(lnk, "target", "")) + if lnk.rel == "via" and "catalog.json" in str(getattr(lnk, "target", "")) ] - self.assertEqual(len(s3_child_links), 0) + self.assertEqual(len(s3_catalog_links), 0) class 
TestFormatString(unittest.TestCase): diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 4055904..0e59b01 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -215,8 +215,7 @@ def _update_and_add_to_file_dict( full_path = ( Path(self.gh_publisher.github_automation.local_clone_dir) / catalog_path ) - updated_catalog = update_method(full_path, *args) - file_dict[full_path] = updated_catalog.to_dict() + file_dict[full_path] = update_method(full_path, *args) def _update_variable_catalogs(self, generator, file_dict, variable_ids): """Update or create variable catalogs and add them to file_dict. @@ -243,10 +242,9 @@ def _update_variable_catalogs(self, generator, file_dict, variable_ids): Path(self.gh_publisher.github_automation.local_clone_dir) / var_file_path ) - updated_catalog = generator.update_existing_variable_catalog( + file_dict[var_file_path] = generator.update_existing_variable_catalog( full_path, var_id ) - file_dict[var_file_path] = updated_catalog.to_dict() def publish_dataset( self, @@ -545,10 +543,14 @@ def _get_stac_s3_storage_options(self) -> dict: secret = os.environ.get("S3_USER_STORAGE_SECRET") or os.environ.get( "AWS_SECRET_ACCESS_KEY" ) + # s3_additional_kwargs={"ACL": ""} suppresses the ACL header that s3fs + # adds by default; required when Object Ownership is set to + # BucketOwnerEnforced (ACLs disabled) to avoid AccessDenied errors. 
+ base = {"s3_additional_kwargs": {"ACL": ""}} if key and secret: - return {"key": key, "secret": secret} + return {"key": key, "secret": secret, **base} # Fall through to boto3 chain (IAM role / ~/.aws/credentials) - return {} + return base def _write_stac_catalog_to_s3( self, file_dict: dict[str, dict], storage_options: dict diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 60a2298..5b503e1 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -3,6 +3,7 @@ # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. +import json import logging from datetime import datetime, timezone @@ -272,38 +273,50 @@ def build_variable_catalog(self, var_metadata) -> Catalog: return var_catalog - def update_product_base_catalog(self, product_catalog_path) -> Catalog: - """Link product to base product catalog""" - product_base_catalog = Catalog.from_file(product_catalog_path) - product_base_catalog.add_link( - Link( - rel="child", - target=f"./{self.collection_id}/collection.json", - media_type="application/json", - title=self.collection_id, - ) + @staticmethod + def _append_link_if_absent(links: list, new_link: dict) -> None: + """Append *new_link* to *links* only when no existing entry has the + same ``rel`` and ``href`` (prevents duplicates on repeated publishes).""" + if not any( + lnk.get("rel") == new_link["rel"] and lnk.get("href") == new_link["href"] + for lnk in links + ): + links.append(new_link) + + def update_product_base_catalog(self, product_catalog_path) -> dict: + """Append a child link to the products base catalog and return the + modified JSON dict without touching any existing links.""" + with open(product_catalog_path, encoding="utf-8") as f: + data = json.load(f) + self._append_link_if_absent( + data.setdefault("links", []), + { + "rel": "child", + "href": f"./{self.collection_id}/collection.json", + 
"type": "application/json", + "title": self.collection_id, + }, ) - # 'self' link: the direct URL where this JSON is hosted - product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF) - return product_base_catalog + return data def update_variable_base_catalog( self, variable_base_catalog_path, variable_ids - ) -> (Catalog): - """Link product to base product catalog""" - variable_base_catalog = Catalog.from_file(variable_base_catalog_path) + ) -> dict: + """Append child links for each variable to the variables base catalog.""" + with open(variable_base_catalog_path, encoding="utf-8") as f: + data = json.load(f) + links = data.setdefault("links", []) for var_id in variable_ids: - variable_base_catalog.add_link( - Link( - rel="child", - target=f"./{var_id}/catalog.json", - media_type="application/json", - title=self.format_string(var_id), - ) + self._append_link_if_absent( + links, + { + "rel": "child", + "href": f"./{var_id}/catalog.json", + "type": "application/json", + "title": self.format_string(var_id), + }, ) - # 'self' link: the direct URL where this JSON is hosted - variable_base_catalog.set_self_href(VARIABLE_BASE_CATALOG_SELF_HREF) - return variable_base_catalog + return data def add_themes_as_related_links_var_catalog(self, var_catalog): """Add themes as related links to variable catalog""" @@ -317,52 +330,58 @@ def add_themes_as_related_links_var_catalog(self, var_catalog): ) ) - def update_deepesdl_collection(self, deepesdl_collection_full_path): - deepesdl_collection = Collection.from_file(deepesdl_collection_full_path) - deepesdl_collection.add_link( - Link( - rel="child", - target=f"../../products/{self.collection_id}/collection.json", - media_type="application/json", - title=self.collection_id, - ) + def update_deepesdl_collection(self, deepesdl_collection_full_path) -> dict: + """Append child and theme-related links to the DeepESDL collection.""" + with open(deepesdl_collection_full_path, encoding="utf-8") as f: + data = json.load(f) + 
links = data.setdefault("links", []) + self._append_link_if_absent( + links, + { + "rel": "child", + "href": f"../../products/{self.collection_id}/collection.json", + "type": "application/json", + "title": self.collection_id, + }, ) - # add themes to deepesdl for theme in self.osc_themes: - deepesdl_collection.add_link( - Link( - rel="related", - target=f"../../themes/{theme}/catalog.json", - media_type="application/json", - title=f"Theme: {self.format_string(theme)}", - ) - ) - deepesdl_collection.set_self_href(DEEPESDL_COLLECTION_SELF_HREF) - return deepesdl_collection - - def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog: - existing_catalog = Catalog.from_file(var_file_path) - now_iso = datetime.now(timezone.utc).isoformat() - existing_catalog.extra_fields["updated"] = now_iso - - # add 'child' link as the product - existing_catalog.add_link( - Link( - rel="child", - target=f"../../products/{self.collection_id}/collection.json", - media_type="application/json", - title=self.collection_id, + self._append_link_if_absent( + links, + { + "rel": "related", + "href": f"../../themes/{theme}/catalog.json", + "type": "application/json", + "title": f"Theme: {self.format_string(theme)}", + }, ) + return data + + def update_existing_variable_catalog(self, var_file_path, var_id) -> dict: + """Append child and theme links to an existing variable catalog.""" + with open(var_file_path, encoding="utf-8") as f: + data = json.load(f) + data["updated"] = datetime.now(timezone.utc).isoformat() + links = data.setdefault("links", []) + self._append_link_if_absent( + links, + { + "rel": "child", + "href": f"../../products/{self.collection_id}/collection.json", + "type": "application/json", + "title": self.collection_id, + }, ) - self.add_themes_as_related_links_var_catalog(existing_catalog) - self_href = ( - f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables" - f"/{var_id}/catalog.json" - ) - # 'self' link: the direct URL where this 
JSON is hosted - existing_catalog.set_self_href(self_href) - - return existing_catalog + for theme in self.osc_themes: + self._append_link_if_absent( + links, + { + "rel": "related", + "href": f"../../themes/{theme}/catalog.json", + "type": "application/json", + "title": f"Theme: {self.format_string(theme)}", + }, + ) + return data @staticmethod def format_string(s: str) -> str: @@ -622,11 +641,14 @@ def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | N collection.license = self.license_type - # Link to the S3-hosted STAC catalog when provided + # Link to the S3-hosted STAC catalog when provided. + # Uses rel="via" (not "child") because the OSC validator requires every + # "child" link to resolve to a file inside the metadata repository; + # the S3 catalog lives outside the repo and would fail that check. if stac_catalog_s3_root: catalog_href = stac_catalog_s3_root.rstrip("/") + "/catalog.json" collection.add_link(Link( - rel="child", + rel="via", target=catalog_href, media_type="application/json", title="STAC Catalog", diff --git a/deep_code/version.py b/deep_code/version.py index f2ff3c8..3748bea 100644 --- a/deep_code/version.py +++ b/deep_code/version.py @@ -19,4 +19,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
-version = "0.1.7" +version = "0.1.8.dev0" From b8f5653fca60531dc090df531e852265746ea813 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 17 Mar 2026 00:03:22 +0100 Subject: [PATCH 04/12] Made osc_project a configurable parameter --- deep_code/tests/tools/test_publish.py | 74 ++++++++++ .../utils/test_dataset_stac_generator.py | 139 ++++++++++++++++++ deep_code/tools/publish.py | 20 ++- deep_code/utils/dataset_stac_generator.py | 76 +++++++++- 4 files changed, 298 insertions(+), 11 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index ba25029..87f2137 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -285,6 +285,80 @@ def test_publish_writes_zarr_stac_to_s3_when_configured( # Two S3 files written: catalog.json + item.json self.assertEqual(mock_fsspec_open.call_count, 2) + # ------------------------------------------------------------------ + # Project collection create-vs-update branching + # ------------------------------------------------------------------ + + @patch("deep_code.tools.publish.OscDatasetStacGenerator") + def test_publish_dataset_creates_project_collection_when_missing( + self, MockGenerator + ): + """When the project collection does not exist, build_project_collection is + called and projects/catalog.json is updated via _update_and_add_to_file_dict.""" + mock_gen = MagicMock() + mock_gen.osc_project = "test-project" + mock_gen.get_variable_ids.return_value = [] + mock_gen.build_dataset_stac_collection.return_value.to_dict.return_value = {} + mock_gen.build_project_collection.return_value = { + "type": "Collection", + "id": "test-project", + } + MockGenerator.return_value = mock_gen + + self.publisher.dataset_config = { + "dataset_id": "test-dataset", + "collection_id": "test-collection", + "license_type": "CC-BY-4.0", + } + self.publisher.collection_id = "test-collection" + + # Project collection is missing; all other file_exists calls return True 
+ self.publisher.gh_publisher.github_automation.file_exists.return_value = False + + with patch.object(self.publisher, "_update_and_add_to_file_dict") as mock_update, \ + patch.object(self.publisher, "_update_variable_catalogs"): + file_dict = self.publisher.publish_dataset(write_to_file=False) + + mock_gen.build_project_collection.assert_called_once() + self.assertIn("projects/test-project/collection.json", file_dict) + mock_gen.update_deepesdl_collection.assert_not_called() + + # projects/catalog.json must be updated + updated_paths = [call.args[1] for call in mock_update.call_args_list] + self.assertIn("projects/catalog.json", updated_paths) + + @patch("deep_code.tools.publish.OscDatasetStacGenerator") + def test_publish_dataset_updates_project_collection_when_exists( + self, MockGenerator + ): + """When the project collection exists, update_deepesdl_collection is called + via _update_and_add_to_file_dict and build_project_collection is not called.""" + mock_gen = MagicMock() + mock_gen.osc_project = "test-project" + mock_gen.get_variable_ids.return_value = [] + mock_gen.build_dataset_stac_collection.return_value.to_dict.return_value = {} + MockGenerator.return_value = mock_gen + + self.publisher.dataset_config = { + "dataset_id": "test-dataset", + "collection_id": "test-collection", + "license_type": "CC-BY-4.0", + } + self.publisher.collection_id = "test-collection" + + # Project collection already exists + self.publisher.gh_publisher.github_automation.file_exists.return_value = True + + with patch.object(self.publisher, "_update_and_add_to_file_dict") as mock_update, \ + patch.object(self.publisher, "_update_variable_catalogs"): + self.publisher.publish_dataset(write_to_file=False) + + mock_gen.build_project_collection.assert_not_called() + + # update_deepesdl_collection passed to _update_and_add_to_file_dict + update_methods = [call.args[2] for call in mock_update.call_args_list] + self.assertIn(mock_gen.update_deepesdl_collection, update_methods) + 
@patch.object(Publisher, "publish_dataset", return_value={"github_file.json": {}}) def test_publish_skips_zarr_stac_when_not_configured(self, mock_publish_ds): # No stac_catalog_s3_root in config diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index b73ec88..74af547 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -214,6 +214,145 @@ def test_update_variable_base_catalog(self): # self link must remain in place self.assertEqual(result["links"][0]["rel"], "self") + # ------------------------------------------------------------------ + # osc_project parameter + # ------------------------------------------------------------------ + + def test_osc_project_default(self): + """Default osc_project is 'deep-earth-system-data-lab'.""" + self.assertEqual(self.generator.osc_project, "deep-earth-system-data-lab") + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_osc_project_custom(self, mock_open_ds): + """A custom osc_project is stored on the generator.""" + mock_open_ds.return_value = self.mock_dataset + gen = OscDatasetStacGenerator( + dataset_id="mock-dataset-id", + collection_id="mock-collection-id", + workflow_id="dummy", + workflow_title="test", + license_type="proprietary", + osc_project="my-custom-project", + ) + self.assertEqual(gen.osc_project, "my-custom-project") + + def test_build_dataset_stac_collection_osc_project_in_related_link(self): + """The project-related link in the collection uses the configured osc_project.""" + collection = self.generator.build_dataset_stac_collection(mode="dataset") + project_links = [ + lnk + for lnk in collection.links + if lnk.rel == "related" and "projects" in str(lnk.target) + ] + self.assertEqual(len(project_links), 1) + self.assertIn("deep-earth-system-data-lab", project_links[0].target) + + # 
------------------------------------------------------------------ + # build_project_collection + # ------------------------------------------------------------------ + + def test_build_project_collection_structure(self): + """build_project_collection returns a minimal valid STAC Collection dict.""" + result = self.generator.build_project_collection() + + self.assertIsInstance(result, dict) + self.assertEqual(result["type"], "Collection") + self.assertEqual(result["id"], "deep-earth-system-data-lab") + self.assertEqual(result["stac_version"], "1.0.0") + self.assertIn("extent", result) + + rels = [lnk["rel"] for lnk in result["links"]] + self.assertIn("self", rels) + self.assertIn("root", rels) + self.assertIn("parent", rels) + + self_link = next(lnk for lnk in result["links"] if lnk["rel"] == "self") + self.assertIn("deep-earth-system-data-lab", self_link["href"]) + self.assertTrue(self_link["href"].endswith("collection.json")) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_build_project_collection_custom_project(self, mock_open_ds): + """build_project_collection reflects a custom osc_project.""" + mock_open_ds.return_value = self.mock_dataset + gen = OscDatasetStacGenerator( + dataset_id="mock-dataset-id", + collection_id="mock-collection-id", + workflow_id="dummy", + workflow_title="test", + license_type="proprietary", + osc_project="my-project", + ) + result = gen.build_project_collection() + + self.assertEqual(result["id"], "my-project") + self_link = next(lnk for lnk in result["links"] if lnk["rel"] == "self") + self.assertIn("my-project", self_link["href"]) + + # ------------------------------------------------------------------ + # update_project_base_catalog + # ------------------------------------------------------------------ + + def test_update_project_base_catalog(self): + """Child link for the project is appended to the projects base catalog.""" + import json as _json, os, tempfile + + base = { + "type": "Catalog", + "id": 
"projects", + "stac_version": "1.0.0", + "description": "Projects", + "links": [ + { + "rel": "self", + "href": "https://esa-earthcode.github.io/open-science-catalog-metadata/projects/catalog.json", + "type": "application/json", + } + ], + } + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp: + _json.dump(base, tmp) + tmp_path = tmp.name + + result = self.generator.update_project_base_catalog(tmp_path) + os.unlink(tmp_path) + + self.assertIsInstance(result, dict) + child_links = [lnk for lnk in result["links"] if lnk["rel"] == "child"] + self.assertEqual(len(child_links), 1) + self.assertIn("deep-earth-system-data-lab", child_links[0]["href"]) + self.assertTrue(child_links[0]["href"].endswith("collection.json")) + # existing self link is preserved + self.assertEqual(result["links"][0]["rel"], "self") + + def test_update_project_base_catalog_no_duplicate(self): + """Calling update_project_base_catalog when the child link already exists + does not produce a duplicate.""" + import json as _json, os, tempfile + + base = { + "type": "Catalog", + "id": "projects", + "stac_version": "1.0.0", + "description": "Projects", + "links": [ + { + "rel": "child", + "href": "./deep-earth-system-data-lab/collection.json", + "type": "application/json", + "title": "Deep Earth System Data Lab", + } + ], + } + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp: + _json.dump(base, tmp) + tmp_path = tmp.name + + result = self.generator.update_project_base_catalog(tmp_path) + os.unlink(tmp_path) + + child_links = [lnk for lnk in result["links"] if lnk["rel"] == "child"] + self.assertEqual(len(child_links), 1) + def test_update_deepesdl_collection(self): """Child and theme-related links are appended; existing links kept.""" base = { diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 0e59b01..c08ad66 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -318,11 +318,21 @@ def 
publish_dataset( file_dict, product_catalog_path, generator.update_product_base_catalog ) - # Update DeepESDL collection - deepesdl_collection_path = "projects/deep-earth-system-data-lab/collection.json" - self._update_and_add_to_file_dict( - file_dict, deepesdl_collection_path, generator.update_deepesdl_collection - ) + # Update or create project collection + project_collection_path = f"projects/{generator.osc_project}/collection.json" + if not self.gh_publisher.github_automation.file_exists(project_collection_path): + logger.info( + f"Project collection for {generator.osc_project} does not exist. Creating..." + ) + file_dict[project_collection_path] = generator.build_project_collection() + # Add child link in the projects base catalog + self._update_and_add_to_file_dict( + file_dict, "projects/catalog.json", generator.update_project_base_catalog + ) + else: + self._update_and_add_to_file_dict( + file_dict, project_collection_path, generator.update_deepesdl_collection + ) # Write to files if testing if write_to_file: diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 5b503e1..40a086f 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -11,10 +11,7 @@ from pystac import Catalog, Collection, Extent, Item, Asset, Link, SpatialExtent, TemporalExtent from deep_code.constants import ( - DEEPESDL_COLLECTION_SELF_HREF, OSC_THEME_SCHEME, - PRODUCT_BASE_CATALOG_SELF_HREF, - VARIABLE_BASE_CATALOG_SELF_HREF, ZARR_MEDIA_TYPE, ) from deep_code.utils.helper import open_dataset @@ -35,6 +32,7 @@ class OscDatasetStacGenerator: osc_themes: List of themes related to the dataset (e.g., ["climate"]). osc_missions: List of satellite missions associated with the dataset. cf_params: CF metadata parameters for the dataset. + osc_project: OSC project identifier (default: "deep-earth-system-data-lab"). 
""" def __init__( @@ -51,12 +49,14 @@ def __init__( osc_themes: list[str] | None = None, osc_missions: list[str] | None = None, cf_params: list[dict[str]] | None = None, + osc_project: str = "deep-earth-system-data-lab", ): self.dataset_id = dataset_id self.collection_id = collection_id self.workflow_id = workflow_id self.workflow_title = workflow_title self.license_type = license_type + self.osc_project = osc_project self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}" self.documentation_link = documentation_link self.osc_status = osc_status @@ -330,6 +330,70 @@ def add_themes_as_related_links_var_catalog(self, var_catalog): ) ) + def build_project_collection(self) -> dict: + """Build a minimal STAC Collection JSON dict for the OSC project. + + Used when the project collection does not yet exist in the catalog. + + Returns: + A plain dict representing the STAC Collection. + """ + now_iso = datetime.now(timezone.utc).isoformat() + self_href = ( + "https://esa-earthcode.github.io/open-science-catalog-metadata" + f"/projects/{self.osc_project}/collection.json" + ) + return { + "type": "Collection", + "id": self.osc_project, + "stac_version": "1.0.0", + "stac_extensions": [], + "title": self.format_string(self.osc_project), + "description": self.format_string(self.osc_project), + "keywords": [], + "license": "various", + "extent": { + "spatial": {"bbox": [[-180, -90, 180, 90]]}, + "temporal": {"interval": [[None, None]]}, + }, + "created": now_iso, + "updated": now_iso, + "links": [ + { + "rel": "self", + "href": self_href, + "type": "application/json", + }, + { + "rel": "root", + "href": "../../catalog.json", + "type": "application/json", + "title": "Open Science Catalog", + }, + { + "rel": "parent", + "href": "../catalog.json", + "type": "application/json", + "title": "Projects", + }, + ], + } + + def update_project_base_catalog(self, project_base_catalog_path) -> dict: + """Append a child link for the project to the projects base catalog.""" + 
with open(project_base_catalog_path, encoding="utf-8") as f: + data = json.load(f) + self._append_link_if_absent( + data.setdefault("links", []), + { + "rel": "child", + "href": f"./{self.osc_project}/collection.json", + "type": "application/json", + "title": self.format_string(self.osc_project), + }, + ) + return data + def update_deepesdl_collection(self, deepesdl_collection_full_path) -> dict: """Append child and theme-related links to the DeepESDL collection.""" with open(deepesdl_collection_full_path, encoding="utf-8") as f: @@ -546,7 +610,7 @@ def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | N # Add OSC extension metadata osc_extension = OscExtension.add_to(collection) # osc_project and osc_type are fixed constant values - osc_extension.osc_project = "deep-earth-system-data-lab" + osc_extension.osc_project = self.osc_project osc_extension.osc_type = "product" osc_extension.osc_status = self.osc_status osc_extension.osc_region = self.osc_region @@ -623,9 +687,9 @@ def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | N collection.add_link( Link( rel="related", - target="../../projects/deep-earth-system-data-lab/collection.json", + target=f"../../projects/{self.osc_project}/collection.json", media_type="application/json", - title="Project: DeepESDL", + title=f"Project: {self.format_string(self.osc_project)}", ) ) From 58dbf31bd45536040f0b0464d5d15c590f110d31 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 17 Mar 2026 00:05:40 +0100 Subject: [PATCH 05/12] updated change log and docs --- CHANGES.md | 15 +++++++++ docs/configuration.md | 45 +++++++++++++++++++++++++++ docs/python-api.md | 71 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 130 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 9f1abcf..66c6159 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -82,3 +82,18 @@ - S3 write credentials are resolved from `S3_USER_STORAGE_KEY`/`S3_USER_STORAGE_SECRET`, 
`AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY`, or the boto3 default chain (IAM role, `~/.aws/credentials`) — no secrets in config files. + +- Made `osc_project` a configurable parameter on `OscDatasetStacGenerator` (default: + `"deep-earth-system-data-lab"`), replacing the previously hardcoded value. + - The project identifier is now used dynamically when setting `osc:project` on the + OSC extension, generating the `related` link to the project collection, and + resolving the project collection file path during publishing. + +- Publisher now creates the OSC project collection automatically when it does not yet + exist in the catalog repository, instead of failing or silently skipping. + - A minimal but valid STAC Collection is generated for the project with `self`, + `root`, and `parent` links. + - A `child` link for the new project is appended to `projects/catalog.json` so the + project is reachable from the catalog root. + - If the project collection already exists, the existing update path (appending + product `child` and theme `related` links) is used unchanged. diff --git a/docs/configuration.md b/docs/configuration.md index 12d6ffa..3c0b7c5 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -7,10 +7,55 @@ collection_id: your-collection osc_themes: [cryosphere] osc_region: global dataset_status: completed # or ongoing/planned +license_type: CC-BY-4.0 documentation_link: https://example.com/docs access_link: s3://bucket/your-dataset.zarr + +# Optional: publish a STAC Catalog + Item next to the data on S3. +# When set, a lightweight STAC hierarchy (catalog.json → item.json) is written +# directly to S3 and a "via" link is added to the OSC collection pointing to it. +stac_catalog_s3_root: s3://bucket/stac/your-collection/ +``` + +### Field reference + +| Field | Required | Description | +|---|---|---| +| `dataset_id` | Yes | Zarr store identifier (used to open the dataset). 
| +| `collection_id` | Yes | Unique ID for the STAC collection in the OSC catalog. | +| `license_type` | Yes | SPDX license identifier (e.g. `CC-BY-4.0`). | +| `osc_themes` | No | List of OSC theme slugs (e.g. `[cryosphere, oceans]`). | +| `osc_region` | No | Geographical region label (default: `Global`). | +| `dataset_status` | No | One of `ongoing`, `completed`, or `planned` (default: `ongoing`). | +| `access_link` | No | Public S3 URL of the Zarr store. Defaults to `s3://deep-esdl-public/{dataset_id}`. | +| `documentation_link` | No | URL to dataset documentation. | +| `stac_catalog_s3_root` | No | S3 root for the dataset-level STAC Catalog/Item. See [STAC Catalog on S3](#stac-catalog-on-s3). | + +### STAC Catalog on S3 + +Setting `stac_catalog_s3_root` generates a two-file STAC hierarchy on S3 alongside +the data: + +``` +s3://bucket/stac/your-collection/ +├── catalog.json # STAC Catalog (root) +└── your-collection/ + └── item.json # STAC Item covering the full Zarr store ``` +The item has two assets: + +- `zarr-data` — points to the Zarr store (`application/vnd+zarr`). +- `zarr-consolidated-metadata` — points to `.zmetadata` (`application/json`). + +The OSC collection gains a `via` link to `catalog.json` so STAC-aware clients +can discover the data path. `rel="child"` is intentionally avoided because the +OSC validator requires every `child` link to resolve inside the metadata repository. + +S3 credentials are resolved in this order: `S3_USER_STORAGE_KEY` / +`S3_USER_STORAGE_SECRET` env vars, then `AWS_ACCESS_KEY_ID` / +`AWS_SECRET_ACCESS_KEY`, then the boto3 default chain (IAM role, `~/.aws/credentials`). 
+ ## Workflow config (YAML) ```yaml workflow_id: your-workflow diff --git a/docs/python-api.md b/docs/python-api.md index 2695cbb..f16fe8c 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -8,7 +8,7 @@ from deep_code.tools.publish import Publisher publisher = Publisher( dataset_config_path="dataset.yaml", workflow_config_path="workflow.yaml", - environment="staging", + environment="staging", # "production" | "staging" | "testing" ) # Generate files locally (no PR) @@ -18,3 +18,72 @@ publisher.publish(write_to_file=True, mode="all") publisher.publish(write_to_file=False, mode="dataset") ``` +`mode` controls what is published: + +| `mode` | What is published | +|---|---| +| `"dataset"` | OSC STAC collection, variable catalogs, product/variable base catalogs, project collection. | +| `"workflow"` | OGC API workflow and experiment records, workflow/experiment base catalogs. | +| `"all"` | Both of the above (default). | + +--- + +## OscDatasetStacGenerator + +`OscDatasetStacGenerator` can also be used directly when you need more control +over individual artifacts. + +```python +from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator + +generator = OscDatasetStacGenerator( + dataset_id="my-dataset.zarr", + collection_id="my-collection", + workflow_id="my-workflow", + workflow_title="My Workflow", + license_type="CC-BY-4.0", + osc_themes=["cryosphere"], + osc_region="Global", + osc_status="completed", + # Optional: override the default project identifier. + # Controls osc:project on the collection and the link to the project collection. + osc_project="deep-earth-system-data-lab", +) +``` + +### `osc_project` parameter + +`osc_project` defaults to `"deep-earth-system-data-lab"` and is used in three places: + +1. Sets `osc:project` on the OSC extension of the generated STAC collection. +2. Generates the `related` link from the product collection to the project collection + (`../../projects/{osc_project}/collection.json`). +3. 
Determines the file path of the project collection when publishing + (`projects/{osc_project}/collection.json`). + +### Automatic project collection creation + +When `Publisher.publish_dataset()` runs, it checks whether +`projects/{osc_project}/collection.json` already exists in the catalog repository: + +- **Missing** — a minimal STAC Collection is created for the project and a `child` + link is appended to `projects/catalog.json` so it is reachable from the catalog root. +- **Exists** — the existing collection is updated with a `child` link to the new + product and `related` links for its themes (same behaviour as before). + +This means publishing to a new project does not require manual catalog setup. + +### STAC Catalog and Item generation + +```python +# Build the S3 STAC hierarchy (dict keyed by S3 path) +file_dict = generator.build_zarr_stac_catalog_file_dict( + stac_catalog_s3_root="s3://bucket/stac/my-collection/" +) +# file_dict contains: +# "s3://bucket/stac/my-collection/catalog.json" +# "s3://bucket/stac/my-collection/my-collection/item.json" +``` + +See [STAC Catalog on S3](configuration.md#stac-catalog-on-s3) for details on the +generated structure. From 3b5add917201af19ccca4b51bea58709d7a69379 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 17 Mar 2026 00:08:52 +0100 Subject: [PATCH 06/12] refactor --- deep_code/tools/publish.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index c08ad66..1a20d7d 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -537,16 +537,8 @@ def _get_stac_s3_storage_options(self) -> dict: JupyterHub pod, ``~/.aws/credentials`` profile, etc. An empty ``storage_options`` dict lets ``s3fs`` fall through to this chain automatically; no secrets are required in code. - - .. note:: - **JupyterHub best practice**: prefer IAM roles (instance / pod - identity) over env-var credentials. 
IAM roles are scoped to the - specific S3 prefix the user owns, require no secret rotation, and - are never visible to other users on the hub. Per-user env vars - set by the JupyterHub spawner (not server-wide) are an acceptable - fallback — they are private to each user's server process. - Avoid hard-coding credentials in YAML config files. """ + key = os.environ.get("S3_USER_STORAGE_KEY") or os.environ.get( "AWS_ACCESS_KEY_ID" ) From 31bdb6cfae452fa2eb2f25bdc317fbd70a803257 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 17 Mar 2026 00:17:56 +0100 Subject: [PATCH 07/12] refactor --- CHANGES.md | 40 ++++------------------------------------ 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 66c6159..c5172d5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -61,39 +61,7 @@ ## Changes in 0.1.8 (in Development) -- Fixed a crash in workflow publishing when `jupyter_notebook_url` is not provided in - the workflow config. `jupyter_kernel_info`, `application_link`, and `jnb_open_link` - are now only computed when a notebook URL is present, making the field truly optional. - -- Added STAC Item and S3-hosted STAC Catalog generation for Zarr datasets, enabling - a richer `STAC Collection → STAC Catalog (S3) → STAC Item` hierarchy alongside the - existing OSC metadata. - - A single STAC Item is generated per Zarr store, covering the full spatiotemporal - extent with two assets: `zarr-data` (`application/vnd+zarr`) and - `zarr-consolidated-metadata` (`.zmetadata`). - - The S3 STAC catalog and item are written directly to S3 via `fsspec`/`s3fs` - independently of the GitHub PR. - - The OSC STAC Collection gains a `via` link pointing to the S3 catalog root, - connecting the two levels of the hierarchy. (`child` is intentionally avoided - because the OSC validator requires every `child` link to resolve to a file inside - the metadata repository.) - - Opt-in via the new `stac_catalog_s3_root` field in `dataset_config.yaml` - (e.g. 
`stac_catalog_s3_root: s3://my-bucket/stac/my-collection/`). - - S3 write credentials are resolved from `S3_USER_STORAGE_KEY`/`S3_USER_STORAGE_SECRET`, - `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY`, or the boto3 default chain - (IAM role, `~/.aws/credentials`) — no secrets in config files. - -- Made `osc_project` a configurable parameter on `OscDatasetStacGenerator` (default: - `"deep-earth-system-data-lab"`), replacing the previously hardcoded value. - - The project identifier is now used dynamically when setting `osc:project` on the - OSC extension, generating the `related` link to the project collection, and - resolving the project collection file path during publishing. - -- Publisher now creates the OSC project collection automatically when it does not yet - exist in the catalog repository, instead of failing or silently skipping. - - A minimal but valid STAC Collection is generated for the project with `self`, - `root`, and `parent` links. - - A `child` link for the new project is appended to `projects/catalog.json` so the - project is reachable from the catalog root. - - If the project collection already exists, the existing update path (appending - product `child` and theme `related` links) is used unchanged. +- Fixed a crash in workflow publishing when `jupyter_notebook_url` is absent in the config. +- Added STAC Item and S3-hosted STAC Catalog generation for Zarr datasets (opt-in via `stac_catalog_s3_root` in dataset config). +- `osc_project` is now a configurable parameter on `OscDatasetStacGenerator` (default: `"deep-earth-system-data-lab"`). +- Publisher automatically creates the OSC project collection and registers it in `projects/catalog.json` when it does not yet exist. 
From 81f7261e8cf19deada1326abed55241b1e1f2018 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 17 Mar 2026 00:21:57 +0100 Subject: [PATCH 08/12] ruff checks fix --- .../utils/test_dataset_stac_generator.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 74af547..6cbe0c1 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -156,7 +156,8 @@ def test_update_product_base_catalog(self): } ], } - import tempfile, json as _json + import tempfile + import json as _json with tempfile.NamedTemporaryFile( mode="w", suffix=".json", delete=False @@ -194,7 +195,9 @@ def test_update_variable_base_catalog(self): } ], } - import tempfile, json as _json, os + import tempfile + import json as _json + import os with tempfile.NamedTemporaryFile( mode="w", suffix=".json", delete=False @@ -294,7 +297,9 @@ def test_build_project_collection_custom_project(self, mock_open_ds): def test_update_project_base_catalog(self): """Child link for the project is appended to the projects base catalog.""" - import json as _json, os, tempfile + import json as _json + import os + import tempfile base = { "type": "Catalog", @@ -327,7 +332,9 @@ def test_update_project_base_catalog(self): def test_update_project_base_catalog_no_duplicate(self): """Calling update_project_base_catalog when the child link already exists does not produce a duplicate.""" - import json as _json, os, tempfile + import json as _json + import os + import tempfile base = { "type": "Catalog", @@ -369,7 +376,9 @@ def test_update_deepesdl_collection(self): } ], } - import tempfile, json as _json, os + import tempfile + import json as _json + import os with tempfile.NamedTemporaryFile( mode="w", suffix=".json", delete=False From b25a68b66b836d90e0f60d08ae75addc88496478 Mon Sep 17 00:00:00 2001 From: tejas Date: 
Tue, 17 Mar 2026 16:48:57 +0100 Subject: [PATCH 09/12] change env names for STAC catalog and items for products --- deep_code/tests/tools/test_publish.py | 16 ++++----- deep_code/tools/new.py | 2 +- deep_code/tools/publish.py | 11 +++--- docs/configuration.md | 51 +++++++++++++++++++++++---- 4 files changed, 58 insertions(+), 22 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 87f2137..de841f4 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -179,17 +179,17 @@ def test_publish_builds_pr_params(self, mock_wf, mock_ds): # S3 credential resolution # ------------------------------------------------------------------ - def test_get_stac_s3_storage_options_prefers_xcube_env_vars(self): + def test_get_stac_s3_storage_options_prefers_stac_env_vars(self): env = { - "S3_USER_STORAGE_KEY": "xcube-key", - "S3_USER_STORAGE_SECRET": "xcube-secret", + "STAC_S3_KEY": "stac-key", + "STAC_S3_SECRET": "stac-secret", "AWS_ACCESS_KEY_ID": "aws-key", "AWS_SECRET_ACCESS_KEY": "aws-secret", } with patch.dict(os.environ, env): opts = self.publisher._get_stac_s3_storage_options() - self.assertEqual(opts["key"], "xcube-key") - self.assertEqual(opts["secret"], "xcube-secret") + self.assertEqual(opts["key"], "stac-key") + self.assertEqual(opts["secret"], "stac-secret") self.assertEqual(opts["s3_additional_kwargs"], {"ACL": ""}) def test_get_stac_s3_storage_options_falls_back_to_aws_env_vars(self): @@ -197,7 +197,7 @@ def test_get_stac_s3_storage_options_falls_back_to_aws_env_vars(self): patched_env = { k: v for k, v in os.environ.items() - if k not in ("S3_USER_STORAGE_KEY", "S3_USER_STORAGE_SECRET") + if k not in ("STAC_S3_KEY", "STAC_S3_SECRET") } patched_env.update(env) with patch.dict(os.environ, patched_env, clear=True): @@ -212,8 +212,8 @@ def test_get_stac_s3_storage_options_returns_acl_suppression_for_boto3_chain(sel for k, v in os.environ.items() if k not in ( - 
"S3_USER_STORAGE_KEY", - "S3_USER_STORAGE_SECRET", + "STAC_S3_KEY", + "STAC_S3_SECRET", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", ) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index 485d7f0..8e3e67a 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -86,7 +86,7 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str: "# {stac_catalog_s3_root}/{collection_id}/item.json (STAC Item for the whole Zarr)\n" "# and adds a 'child' link from the OSC Collection to this S3 catalog.\n" "# S3 write credentials are resolved in order from:\n" - "# 1. S3_USER_STORAGE_KEY / S3_USER_STORAGE_SECRET env vars\n" + "# 1. STAC_S3_KEY / STAC_S3_SECRET env vars (STAC-specific, any bucket)\n" "# 2. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY env vars\n" "# 3. boto3 default chain (IAM role, ~/.aws/credentials)\n" "# stac_catalog_s3_root: s3://[YOUR-BUCKET]/stac/[collection-id]/\n" diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 1a20d7d..78f06f7 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -529,8 +529,9 @@ def _get_stac_s3_storage_options(self) -> dict: Priority (first match wins): - 1. xcube user-storage env vars — ``S3_USER_STORAGE_KEY`` / - ``S3_USER_STORAGE_SECRET`` (already used by :func:`open_dataset`). + 1. STAC-specific env vars — ``STAC_S3_KEY`` / ``STAC_S3_SECRET``. + Use these to target any S3 bucket independently of the xcube + user-storage bucket. 2. Standard AWS env vars — ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY``. 3. boto3 default credential chain — IAM role attached to the @@ -539,10 +540,8 @@ def _get_stac_s3_storage_options(self) -> dict: to this chain automatically; no secrets are required in code. 
""" - key = os.environ.get("S3_USER_STORAGE_KEY") or os.environ.get( - "AWS_ACCESS_KEY_ID" - ) - secret = os.environ.get("S3_USER_STORAGE_SECRET") or os.environ.get( + key = os.environ.get("STAC_S3_KEY") or os.environ.get("AWS_ACCESS_KEY_ID") + secret = os.environ.get("STAC_S3_SECRET") or os.environ.get( "AWS_SECRET_ACCESS_KEY" ) # s3_additional_kwargs={"ACL": ""} suppresses the ACL header that s3fs diff --git a/docs/configuration.md b/docs/configuration.md index 3c0b7c5..553c46c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -2,18 +2,30 @@ ## Dataset config (YAML) ```yaml +# Required dataset_id: your-dataset.zarr collection_id: your-collection -osc_themes: [cryosphere] -osc_region: global -dataset_status: completed # or ongoing/planned license_type: CC-BY-4.0 + +# Optional +osc_themes: [cryosphere] # must match slugs at opensciencedata.esa.int/themes/catalog +osc_region: global +dataset_status: completed # ongoing | completed | planned (default: ongoing) documentation_link: https://example.com/docs -access_link: s3://bucket/your-dataset.zarr +access_link: s3://bucket/your-dataset.zarr # defaults to s3://deep-esdl-public/{dataset_id} + +# CF parameter overrides (list of {name, units, ...} dicts) +cf_parameter: + - name: sea_surface_temperature + units: kelvin # Optional: publish a STAC Catalog + Item next to the data on S3. # When set, a lightweight STAC hierarchy (catalog.json → item.json) is written # directly to S3 and a "via" link is added to the OSC collection pointing to it. +# S3 write credentials are resolved in order: +# 1. STAC_S3_KEY / STAC_S3_SECRET (STAC-specific, any bucket) +# 2. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY +# 3. boto3 default chain (IAM role, ~/.aws/credentials) stac_catalog_s3_root: s3://bucket/stac/your-collection/ ``` @@ -29,6 +41,7 @@ stac_catalog_s3_root: s3://bucket/stac/your-collection/ | `dataset_status` | No | One of `ongoing`, `completed`, or `planned` (default: `ongoing`). 
| | `access_link` | No | Public S3 URL of the Zarr store. Defaults to `s3://deep-esdl-public/{dataset_id}`. | | `documentation_link` | No | URL to dataset documentation. | +| `cf_parameter` | No | List of CF metadata dicts to override variable attributes (e.g. `name`, `units`). | | `stac_catalog_s3_root` | No | S3 root for the dataset-level STAC Catalog/Item. See [STAC Catalog on S3](#stac-catalog-on-s3). | ### STAC Catalog on S3 @@ -52,12 +65,14 @@ The OSC collection gains a `via` link to `catalog.json` so STAC-aware clients can discover the data path. `rel="child"` is intentionally avoided because the OSC validator requires every `child` link to resolve inside the metadata repository. -S3 credentials are resolved in this order: `S3_USER_STORAGE_KEY` / -`S3_USER_STORAGE_SECRET` env vars, then `AWS_ACCESS_KEY_ID` / -`AWS_SECRET_ACCESS_KEY`, then the boto3 default chain (IAM role, `~/.aws/credentials`). +S3 credentials for writing the STAC catalog are resolved in this order: +`STAC_S3_KEY` / `STAC_S3_SECRET` env vars (STAC-specific, can target any bucket), +then `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY`, +then the boto3 default chain (IAM role, `~/.aws/credentials`). ## Workflow config (YAML) ```yaml +# Required workflow_id: your-workflow properties: title: "My workflow" @@ -69,6 +84,8 @@ properties: name: deepesdl-xcube-1.8.3 python_version: 3.11 env_file: https://example.com/environment.yml + +# Optional jupyter_notebook_url: https://github.com/org/repo/path/to/notebook.ipynb contact: - name: Jane Doe @@ -77,6 +94,26 @@ contact: - rel: about type: text/html href: https://example.org +links: + - rel: related + type: text/html + href: https://example.com/related-resource + title: Related resource ``` +### Field reference + +| Field | Required | Description | +|---|---|---| +| `workflow_id` | Yes | Unique identifier for the workflow (spaces converted to hyphens, lowercased). | +| `properties.title` | Yes | Human-readable title. 
| +| `properties.description` | No | Short summary of what the workflow does. | +| `properties.keywords` | No | List of keyword strings. | +| `properties.themes` | No | List of OSC theme slugs. | +| `properties.license` | No | License identifier (e.g. `proprietary`, `CC-BY-4.0`). | +| `properties.jupyter_kernel_info` | No | Kernel name, Python version, and environment file URL. | +| `jupyter_notebook_url` | No | Link to the source notebook on GitHub. When omitted, kernel and application links are skipped. | +| `contact` | No | List of contact objects with `name`, `organization`, and `links`. | +| `links` | No | Additional OGC API record links (e.g. `related`, `describedby`). | + More templates and examples live in `dataset_config.yaml`, `workflow_config.yaml`, and `example-config/`. From 02eedf6f725a64fa5505127dd18e56f85e8e192f Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 17 Mar 2026 17:06:06 +0100 Subject: [PATCH 10/12] update doc --- docs/configuration.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index 553c46c..bf2b955 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1,5 +1,18 @@ # Configuration +The quickest way to get started is to generate starter templates with the CLI: + +```bash +deep-code generate-config # writes to current directory +deep-code generate-config -o ./configs # custom output folder +``` + +This creates `dataset_config.yaml` and `workflow_config.yaml` with all supported fields and placeholder values. Fill them in, then run [`deep-code publish`](cli.md#publish-metadata). + +The sections below document every field in those templates. 
+ +--- + ## Dataset config (YAML) ```yaml # Required @@ -80,6 +93,7 @@ properties: keywords: ["Earth Science"] themes: ["cryosphere"] license: proprietary + # jupyter_kernel_info is optional — only published when jupyter_notebook_url is set jupyter_kernel_info: name: deepesdl-xcube-1.8.3 python_version: 3.11 @@ -111,8 +125,8 @@ links: | `properties.keywords` | No | List of keyword strings. | | `properties.themes` | No | List of OSC theme slugs. | | `properties.license` | No | License identifier (e.g. `proprietary`, `CC-BY-4.0`). | -| `properties.jupyter_kernel_info` | No | Kernel name, Python version, and environment file URL. | | `jupyter_notebook_url` | No | Link to the source notebook on GitHub. When omitted, kernel and application links are skipped. | +| `properties.jupyter_kernel_info` | No | Kernel name, Python version, and environment file URL. Only published when `jupyter_notebook_url` is set. | | `contact` | No | List of contact objects with `name`, `organization`, and `links`. | | `links` | No | Additional OGC API record links (e.g. `related`, `describedby`). 
| From acfd278c878de052007d32622256e33cd824dc83 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 18 Mar 2026 11:33:23 +0100 Subject: [PATCH 11/12] updated tests to remove temp files --- .../utils/test_dataset_stac_generator.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 6cbe0c1..039bac2 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -165,11 +165,12 @@ def test_update_product_base_catalog(self): _json.dump(base, tmp) tmp_path = tmp.name - result = self.generator.update_product_base_catalog(tmp_path) - import os - os.unlink(tmp_path) + try: + result = self.generator.update_product_base_catalog(tmp_path) + finally: + os.unlink(tmp_path) self.assertIsInstance(result, dict) rels = [lnk["rel"] for lnk in result["links"]] @@ -206,8 +207,10 @@ def test_update_variable_base_catalog(self): tmp_path = tmp.name vars_ = ["v1", "v2"] - result = self.generator.update_variable_base_catalog(tmp_path, vars_) - os.unlink(tmp_path) + try: + result = self.generator.update_variable_base_catalog(tmp_path, vars_) + finally: + os.unlink(tmp_path) self.assertIsInstance(result, dict) child_hrefs = [ @@ -318,8 +321,10 @@ def test_update_project_base_catalog(self): _json.dump(base, tmp) tmp_path = tmp.name - result = self.generator.update_project_base_catalog(tmp_path) - os.unlink(tmp_path) + try: + result = self.generator.update_project_base_catalog(tmp_path) + finally: + os.unlink(tmp_path) self.assertIsInstance(result, dict) child_links = [lnk for lnk in result["links"] if lnk["rel"] == "child"] @@ -354,8 +359,10 @@ def test_update_project_base_catalog_no_duplicate(self): _json.dump(base, tmp) tmp_path = tmp.name - result = self.generator.update_project_base_catalog(tmp_path) - os.unlink(tmp_path) + try: + result = 
self.generator.update_project_base_catalog(tmp_path) + finally: + os.unlink(tmp_path) child_links = [lnk for lnk in result["links"] if lnk["rel"] == "child"] self.assertEqual(len(child_links), 1) From e0cc064957e2673e95dfcfec97a4ede440077776 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 18 Mar 2026 11:41:12 +0100 Subject: [PATCH 12/12] removed the redundant hasattr check --- deep_code/tools/publish.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 78f06f7..dda7c2d 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -590,7 +590,7 @@ def publish( # Publish STAC catalog + item to S3 when stac_catalog_s3_root is configured. # This is independent of the GitHub PR and happens immediately. stac_catalog_s3_root = self.dataset_config.get("stac_catalog_s3_root") - if stac_catalog_s3_root and hasattr(self, "_last_generator"): + if stac_catalog_s3_root: logger.info( f"Publishing STAC catalog to S3: {stac_catalog_s3_root}" )