Skip to content
Merged
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,8 @@
- Fixed a bug in build_child_link_to_related_experiment for the publish mode `"all"`.

## Changes in 0.1.8 (in Development)

- Fixed a crash in workflow publishing when `jupyter_notebook_url` is absent in the config.
- Added STAC Item and S3-hosted STAC Catalog generation for Zarr datasets (opt-in via `stac_catalog_s3_root` in dataset config).
- `osc_project` is now a configurable parameter on `OscDatasetStacGenerator` (default: `"deep-earth-system-data-lab"`).
- Publisher automatically creates the OSC project collection and registers it in `projects/catalog.json` when it does not yet exist.
1 change: 1 addition & 0 deletions deep_code/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,4 @@
APPLICATION_STAC_EXTENSION_SPEC = (
    "https://stac-extensions.github.io/application/v0.1.0/schema.json"
)
# Media type assigned to Zarr assets in generated STAC metadata.
ZARR_MEDIA_TYPE = "application/vnd+zarr"
201 changes: 201 additions & 0 deletions deep_code/tests/tools/test_publish.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import os
import tempfile
import unittest
from pathlib import Path
Expand Down Expand Up @@ -174,6 +175,206 @@ def test_publish_builds_pr_params(self, mock_wf, mock_ds):
assert "workflow/experiment: wf" in kwargs["pr_title"]


# ------------------------------------------------------------------
# S3 credential resolution
# ------------------------------------------------------------------

def test_get_stac_s3_storage_options_prefers_stac_env_vars(self):
env = {
"STAC_S3_KEY": "stac-key",
"STAC_S3_SECRET": "stac-secret",
"AWS_ACCESS_KEY_ID": "aws-key",
"AWS_SECRET_ACCESS_KEY": "aws-secret",
}
with patch.dict(os.environ, env):
opts = self.publisher._get_stac_s3_storage_options()
self.assertEqual(opts["key"], "stac-key")
self.assertEqual(opts["secret"], "stac-secret")
self.assertEqual(opts["s3_additional_kwargs"], {"ACL": ""})

def test_get_stac_s3_storage_options_falls_back_to_aws_env_vars(self):
env = {"AWS_ACCESS_KEY_ID": "aws-key", "AWS_SECRET_ACCESS_KEY": "aws-secret"}
patched_env = {
k: v
for k, v in os.environ.items()
if k not in ("STAC_S3_KEY", "STAC_S3_SECRET")
}
patched_env.update(env)
with patch.dict(os.environ, patched_env, clear=True):
opts = self.publisher._get_stac_s3_storage_options()
self.assertEqual(opts["key"], "aws-key")
self.assertEqual(opts["secret"], "aws-secret")
self.assertEqual(opts["s3_additional_kwargs"], {"ACL": ""})

def test_get_stac_s3_storage_options_returns_acl_suppression_for_boto3_chain(self):
no_cred_env = {
k: v
for k, v in os.environ.items()
if k
not in (
"STAC_S3_KEY",
"STAC_S3_SECRET",
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
)
}
with patch.dict(os.environ, no_cred_env, clear=True):
opts = self.publisher._get_stac_s3_storage_options()
self.assertEqual(opts, {"s3_additional_kwargs": {"ACL": ""}})

# ------------------------------------------------------------------
# S3 write helper
# ------------------------------------------------------------------

@patch("deep_code.tools.publish.fsspec.open")
def test_write_stac_catalog_to_s3(self, mock_fsspec_open):
mock_file = MagicMock()
mock_ctx = MagicMock()
mock_ctx.__enter__ = MagicMock(return_value=mock_file)
mock_ctx.__exit__ = MagicMock(return_value=False)
mock_fsspec_open.return_value = mock_ctx

file_dict = {
"s3://bucket/catalog.json": {"type": "Catalog", "id": "test"},
"s3://bucket/col/item.json": {"type": "Feature", "id": "item"},
}
self.publisher._write_stac_catalog_to_s3(
file_dict, {"key": "k", "secret": "s"}
)

self.assertEqual(mock_fsspec_open.call_count, 2)
mock_fsspec_open.assert_any_call(
"s3://bucket/catalog.json", "w", key="k", secret="s"
)
mock_fsspec_open.assert_any_call(
"s3://bucket/col/item.json", "w", key="k", secret="s"
)

# ------------------------------------------------------------------
# End-to-end zarr STAC publishing wired into publish()
# ------------------------------------------------------------------

@patch("deep_code.tools.publish.fsspec.open")
@patch.object(Publisher, "publish_dataset", return_value={"github_file.json": {}})
def test_publish_writes_zarr_stac_to_s3_when_configured(
self, mock_publish_ds, mock_fsspec_open
):
self.publisher.dataset_config["stac_catalog_s3_root"] = (
"s3://test-bucket/stac/"
)

mock_ctx = MagicMock()
mock_ctx.__enter__ = MagicMock(return_value=MagicMock())
mock_ctx.__exit__ = MagicMock(return_value=False)
mock_fsspec_open.return_value = mock_ctx

mock_generator = MagicMock()
mock_generator.build_zarr_stac_catalog_file_dict.return_value = {
"s3://test-bucket/stac/catalog.json": {"type": "Catalog"},
"s3://test-bucket/stac/test-collection/item.json": {"type": "Feature"},
}
# Simulate what publish_dataset() normally does: store the generator
self.publisher._last_generator = mock_generator
self.publisher.gh_publisher.publish_files.return_value = "PR_URL"

self.publisher.publish(mode="dataset")

mock_generator.build_zarr_stac_catalog_file_dict.assert_called_once_with(
"s3://test-bucket/stac/"
)
# Two S3 files written: catalog.json + item.json
self.assertEqual(mock_fsspec_open.call_count, 2)

# ------------------------------------------------------------------
# Project collection create-vs-update branching
# ------------------------------------------------------------------

@patch("deep_code.tools.publish.OscDatasetStacGenerator")
def test_publish_dataset_creates_project_collection_when_missing(
self, MockGenerator
):
"""When the project collection does not exist, build_project_collection is
called and projects/catalog.json is updated via _update_and_add_to_file_dict."""
mock_gen = MagicMock()
mock_gen.osc_project = "test-project"
mock_gen.get_variable_ids.return_value = []
mock_gen.build_dataset_stac_collection.return_value.to_dict.return_value = {}
mock_gen.build_project_collection.return_value = {
"type": "Collection",
"id": "test-project",
}
MockGenerator.return_value = mock_gen

self.publisher.dataset_config = {
"dataset_id": "test-dataset",
"collection_id": "test-collection",
"license_type": "CC-BY-4.0",
}
self.publisher.collection_id = "test-collection"

# Project collection is missing; all other file_exists calls return True
self.publisher.gh_publisher.github_automation.file_exists.return_value = False

with patch.object(self.publisher, "_update_and_add_to_file_dict") as mock_update, \
patch.object(self.publisher, "_update_variable_catalogs"):
file_dict = self.publisher.publish_dataset(write_to_file=False)

mock_gen.build_project_collection.assert_called_once()
self.assertIn("projects/test-project/collection.json", file_dict)
mock_gen.update_deepesdl_collection.assert_not_called()

# projects/catalog.json must be updated
updated_paths = [call.args[1] for call in mock_update.call_args_list]
self.assertIn("projects/catalog.json", updated_paths)

@patch("deep_code.tools.publish.OscDatasetStacGenerator")
def test_publish_dataset_updates_project_collection_when_exists(
self, MockGenerator
):
"""When the project collection exists, update_deepesdl_collection is called
via _update_and_add_to_file_dict and build_project_collection is not called."""
mock_gen = MagicMock()
mock_gen.osc_project = "test-project"
mock_gen.get_variable_ids.return_value = []
mock_gen.build_dataset_stac_collection.return_value.to_dict.return_value = {}
MockGenerator.return_value = mock_gen

self.publisher.dataset_config = {
"dataset_id": "test-dataset",
"collection_id": "test-collection",
"license_type": "CC-BY-4.0",
}
self.publisher.collection_id = "test-collection"

# Project collection already exists
self.publisher.gh_publisher.github_automation.file_exists.return_value = True

with patch.object(self.publisher, "_update_and_add_to_file_dict") as mock_update, \
patch.object(self.publisher, "_update_variable_catalogs"):
self.publisher.publish_dataset(write_to_file=False)

mock_gen.build_project_collection.assert_not_called()

# update_deepesdl_collection passed to _update_and_add_to_file_dict
update_methods = [call.args[2] for call in mock_update.call_args_list]
self.assertIn(mock_gen.update_deepesdl_collection, update_methods)

@patch.object(Publisher, "publish_dataset", return_value={"github_file.json": {}})
def test_publish_skips_zarr_stac_when_not_configured(self, mock_publish_ds):
# No stac_catalog_s3_root in config
self.publisher.dataset_config = {
"collection_id": "test-collection",
"dataset_id": "test-dataset",
}
self.publisher.gh_publisher.publish_files.return_value = "PR_URL"

with patch.object(
self.publisher, "_write_stac_catalog_to_s3"
) as mock_write:
self.publisher.publish(mode="dataset")
mock_write.assert_not_called()


class TestParseGithubNotebookUrl:
@pytest.mark.parametrize(
"url,repo_url,repo_name,branch,file_path",
Expand Down
Loading
Loading