From 4224fef9ddcba7cd5dc7fbd29fde907a23df15f5 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 23 Jan 2026 15:24:56 +0000 Subject: [PATCH 01/22] Switch to real hashing --- .gitignore | 3 +++ .pre-commit-config.yaml | 3 ++- README.md | 22 +++++++++++++++++ config.EXAMPLE/.env.EXAMPLE | 5 ++++ config.EXAMPLE/exporter.env.EXAMPLE | 6 +++++ config.EXAMPLE/hasher.env.EXAMPLE | 12 ++++++---- docker-compose.yml | 2 +- pyproject.toml | 2 +- src/pseudon/hashing.py | 37 ++++++++++++++++++++++------- src/settings.py | 3 +++ 10 files changed, 79 insertions(+), 16 deletions(-) create mode 100644 config.EXAMPLE/.env.EXAMPLE diff --git a/.gitignore b/.gitignore index be3f5ce..a44b7dd 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,6 @@ wheels/ # settings files (should not be in the source tree anyway, but just in case) *.env + +# snakemake tracking files +.snakemake diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4b0fd9..a217d81 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,8 @@ repos: [ "pandas-stubs", "types-psycopg2", - "types-pika" + "types-pika", + "types-requests", ] files: src/ # a collection of sanity checks: check for merge conflicts, check the end of diff --git a/README.md b/README.md index 4743482..77dc783 100644 --- a/README.md +++ b/README.md @@ -48,18 +48,38 @@ separate to the Emap project root. ### Instructions for achieving this structure + +#### Clone repos Clone this repo (`waveform-controller`) and [PIXL](https://github.com/SAFEHR-data/PIXL), both inside your root directory. +#### make config files Set up the config files as follows: ``` mkdir config cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env +cp waveform-controller/config.EXAMPLE/.env.EXAMPLE waveform-controller/.env ``` From the new config files, remove the comments telling you not to put secrets in it, as instructed. +#### fill in config files +Fill out the config, as appropriate. + +Tip: HASHER_API_LISTEN_PORT should match HASHER_API_PORT if you are running your own instance of the +PIXL hasher (as things stand, we are doing so). + +When updating to a new version of this code, you should diff the .EXAMPLE file against its live version, +eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`. + +This checks if any config options have been added/removed from the .EXAMPLE, and thus should be +added/removed from the live file. + +> [!CAUTION] +> Be careful not to copy sensitive data from the live config file to the .EXAMPLE file! + +#### make necessary directories If it doesn't already exist you should create a directory named `waveform-export` in the parent directory to store the saved waveform messages. @@ -68,6 +88,8 @@ messages. mkdir waveform-export ``` +#### run it! + Build and start the controller and exporter with docker ``` cd waveform-controller diff --git a/config.EXAMPLE/.env.EXAMPLE b/config.EXAMPLE/.env.EXAMPLE new file mode 100644 index 0000000..bd473cc --- /dev/null +++ b/config.EXAMPLE/.env.EXAMPLE @@ -0,0 +1,5 @@ +# This is an EXAMPLE file, do not put real secrets in here. +# Copy it to .env and then DELETE THIS COMMENT. +# (This is a bit different from the others, which live in config. +# This one is just for the variables in the docker compose file) +HASHER_API_LISTEN_PORT= diff --git a/config.EXAMPLE/exporter.env.EXAMPLE b/config.EXAMPLE/exporter.env.EXAMPLE index 7be714c..14a601d 100644 --- a/config.EXAMPLE/exporter.env.EXAMPLE +++ b/config.EXAMPLE/exporter.env.EXAMPLE @@ -2,7 +2,13 @@ # Copy it to ../config/exporter.env and then DELETE THIS COMMENT. # When does the exporter run EXPORTER_CRON_SCHEDULE="14 5 * * *" +# Where to upload via FTPS FTPS_HOST=myftps.example.com FTPS_PORT=990 FTPS_USERNAME= FTPS_PASSWORD= +# only run workflow up to and including the specified rule +SNAKEMAKE_RULE_UNTIL= +# point to the hasher we wish to use +HASHER_API_HOSTNAME=waveform-hasher +HASHER_API_PORT= diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE index 1cd6ad0..ad79b7a 100644 --- a/config.EXAMPLE/hasher.env.EXAMPLE +++ b/config.EXAMPLE/hasher.env.EXAMPLE @@ -1,6 +1,10 @@ # This is an EXAMPLE file, do not put real secrets in here. # Copy it to ../config/hasher.env and then DELETE THIS COMMENT. -HASHER_API_AZ_CLIENT_ID= -HASHER_API_AZ_CLIENT_PASSWORD= -HASHER_API_AZ_TENANT_ID= -HASHER_API_AZ_KEY_VAULT_NAME= +AZURE_CLIENT_ID= +AZURE_CLIENT_SECRET= +AZURE_TENANT_ID= +AZURE_KEY_VAULT_NAME= +# This is the "variable name" of the actual secret, +# and can be fixed and is not secret itself +AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" + diff --git a/docker-compose.yml b/docker-compose.yml index d20153a..56ecbb8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -47,7 +47,7 @@ services: HTTPS_PROXY: ${HTTPS_PROXY} https_proxy: ${https_proxy} ports: - - "127.0.0.1:${HASHER_API_PORT}:8000" + - "127.0.0.1:${HASHER_API_LISTEN_PORT}:8000" env_file: - ../config/hasher.env restart: unless-stopped diff --git a/pyproject.toml b/pyproject.toml index 7ee53db..bb0d238 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ dependencies = [ "snakemake==9.14.5", # need to be compatible with PIXL, which currently pins 2.9.10 (arguably it shouldn't) "psycopg2-binary>=2.9.10", - "stablehash==0.3.0", + "requests==2.32.3", # trick for making a "relative" path, works inside or outside container image "core @ file:///${PROJECT_ROOT}/../PIXL/pixl_core", ] diff --git a/src/pseudon/hashing.py b/src/pseudon/hashing.py index d7ac7b9..35bde98 100644 --- a/src/pseudon/hashing.py +++ b/src/pseudon/hashing.py @@ -1,17 +1,36 @@ +import logging from functools import lru_cache -from stablehash import stablehash +import requests -@lru_cache +import settings + +logger = logging.getLogger(__name__) + + +@lru_cache(maxsize=1000) def do_hash(type_prefix: str, value: str): - """Stub implementation of deidentification function for testing purposes. + """Pass data to the hasher API for de-identification purposes. Not that I think this will happen in practice, but we'd want the CSN "1234" to hash to a different value than the MRN "1234", so prefix each value with its type. """ - # Full implementation of issue #6 must remove this code and call the real hasher!! - SALT = "waveform-exporter" - full_value_to_hash = f"{SALT}:{type_prefix}:{value}" - full_hash = stablehash(full_value_to_hash).hexdigest() - tiny_hash = full_hash[:8] - return tiny_hash + + project_slug = "waveform-exporter" + full_value_to_hash = f"{type_prefix}:{value}" + + hasher_hostname = settings.HASHER_API_HOSTNAME + hasher_port = settings.HASHER_API_PORT + hasher_req_url = f"http://{hasher_hostname}:{hasher_port}/hash" + request_params: dict[str, str | int] = { + "project_slug": project_slug, + "message": full_value_to_hash, + } + # do we need to specify a particular hash length? + # request_params["length"] = hash_len + + response = requests.get(hasher_req_url, params=request_params) + logger.debug("RESPONSE = {}", response.text) + response.raise_for_status() + real_hash = response.text + return real_hash diff --git a/src/settings.py b/src/settings.py index dfbebc1..8aaed85 100644 --- a/src/settings.py +++ b/src/settings.py @@ -29,3 +29,6 @@ def get_from_env(env_var, *, default_value=None, setting_name=None): get_from_env("FTPS_PORT", default_value=990) get_from_env("FTPS_USERNAME") get_from_env("FTPS_PASSWORD") + +get_from_env("HASHER_API_HOSTNAME") +get_from_env("HASHER_API_PORT") From a1e63ab37256369da1708ac53860c7e34af46282 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 23 Jan 2026 16:56:07 +0000 Subject: [PATCH 02/22] We don't actually need to listen on a specified host port because we access directly by service name on the docker network. Just let it pick an ephemeral one. --- README.md | 4 ---- config.EXAMPLE/.env.EXAMPLE | 5 ----- docker-compose.yml | 3 ++- 3 files changed, 2 insertions(+), 10 deletions(-) delete mode 100644 config.EXAMPLE/.env.EXAMPLE diff --git a/README.md b/README.md index 77dc783..97d29d7 100644 --- a/README.md +++ b/README.md @@ -60,16 +60,12 @@ mkdir config cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env -cp waveform-controller/config.EXAMPLE/.env.EXAMPLE waveform-controller/.env ``` From the new config files, remove the comments telling you not to put secrets in it, as instructed. #### fill in config files Fill out the config, as appropriate. -Tip: HASHER_API_LISTEN_PORT should match HASHER_API_PORT if you are running your own instance of the -PIXL hasher (as things stand, we are doing so). - When updating to a new version of this code, you should diff the .EXAMPLE file against its live version, eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`. diff --git a/config.EXAMPLE/.env.EXAMPLE b/config.EXAMPLE/.env.EXAMPLE deleted file mode 100644 index bd473cc..0000000 --- a/config.EXAMPLE/.env.EXAMPLE +++ /dev/null @@ -1,5 +0,0 @@ -# This is an EXAMPLE file, do not put real secrets in here. -# Copy it to .env and then DELETE THIS COMMENT. -# (This is a bit different from the others, which live in config. -# This one is just for the variables in the docker compose file) -HASHER_API_LISTEN_PORT= diff --git a/docker-compose.yml b/docker-compose.yml index 56ecbb8..6c30ca7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -47,7 +47,8 @@ services: HTTPS_PROXY: ${HTTPS_PROXY} https_proxy: ${https_proxy} ports: - - "127.0.0.1:${HASHER_API_LISTEN_PORT}:8000" + # this is only here as a convenience for testing, we don't actually use it from the exporter + - "127.0.0.1::8000" env_file: - ../config/hasher.env restart: unless-stopped From 7da0c5f208952667f5fc9f64599794ce2a2510a0 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 23 Jan 2026 17:00:18 +0000 Subject: [PATCH 03/22] Specify hasher port as default --- config.EXAMPLE/exporter.env.EXAMPLE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.EXAMPLE/exporter.env.EXAMPLE b/config.EXAMPLE/exporter.env.EXAMPLE index 14a601d..134ac38 100644 --- a/config.EXAMPLE/exporter.env.EXAMPLE +++ b/config.EXAMPLE/exporter.env.EXAMPLE @@ -11,4 +11,4 @@ FTPS_PASSWORD= SNAKEMAKE_RULE_UNTIL= # point to the hasher we wish to use HASHER_API_HOSTNAME=waveform-hasher -HASHER_API_PORT= +HASHER_API_PORT=8000 From e6aaf337e9ed4a7121e8e7da6cef2fc1b3dd669e Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 23 Jan 2026 17:20:03 +0000 Subject: [PATCH 04/22] Fix linting --- config.EXAMPLE/hasher.env.EXAMPLE | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE index ad79b7a..0631345 100644 --- a/config.EXAMPLE/hasher.env.EXAMPLE +++ b/config.EXAMPLE/hasher.env.EXAMPLE @@ -6,5 +6,4 @@ AZURE_TENANT_ID= AZURE_KEY_VAULT_NAME= # This is the "variable name" of the actual secret, # and can be fixed and is not secret itself -AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" - +AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret From 8f3c455abdb9322ece197e597ab6cb1b97b665bc Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Mon, 26 Jan 2026 16:24:39 +0000 Subject: [PATCH 05/22] Doc tweak --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 97d29d7..0afc24a 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Set up the config files as follows: ``` mkdir config cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env -cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env +cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/exporter.env cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env ``` From the new config files, remove the comments telling you not to put secrets in it, as instructed. From eb2c16a75d0610da63f3f4b7d5081da71d62f489 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 30 Jan 2026 15:21:23 +0000 Subject: [PATCH 06/22] Document how to config the hasher --- README.md | 6 ++-- config.EXAMPLE/hasher.env.EXAMPLE | 6 ++++ docs/azure_hashing.md | 48 +++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 docs/azure_hashing.md diff --git a/README.md b/README.md index 0afc24a..a7df500 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,11 @@ cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env ``` From the new config files, remove the comments telling you not to put secrets in it, as instructed. -#### fill in config files +#### Fill in config files Fill out the config, as appropriate. +See [azure and hasher setup](docs/azure_hashing.md) to configure the hasher. + When updating to a new version of this code, you should diff the .EXAMPLE file against its live version, eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`. @@ -86,7 +88,7 @@ mkdir waveform-export #### run it! -Build and start the controller and exporter with docker +Build and start the hasher, controller and exporter with docker. ``` cd waveform-controller docker compose build diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE index 0631345..efeae85 100644 --- a/config.EXAMPLE/hasher.env.EXAMPLE +++ b/config.EXAMPLE/hasher.env.EXAMPLE @@ -1,9 +1,15 @@ # This is an EXAMPLE file, do not put real secrets in here. # Copy it to ../config/hasher.env and then DELETE THIS COMMENT. +# Details for the Azure service principal, so it can log in to the keyvault. +# aka "appId" AZURE_CLIENT_ID= +# aka "password" AZURE_CLIENT_SECRET= +# aka "tenant" AZURE_TENANT_ID= +# the name of the key vault, NOT the service principal AZURE_KEY_VAULT_NAME= + # This is the "variable name" of the actual secret, # and can be fixed and is not secret itself AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret diff --git a/docs/azure_hashing.md b/docs/azure_hashing.md new file mode 100644 index 0000000..43898fb --- /dev/null +++ b/docs/azure_hashing.md @@ -0,0 +1,48 @@ +# Setting up Azure + Hashing in Dev and Production + +# Create and configure Azure key vaults + +Azure key vaults for dev and prod already exist. Ask your team-mates how to find the details for these. + +For each, there is an Azure service principal (aka. machine account) that can read/write secrets to +the key vault. + +# Configure hasher +The hasher needs to be given the service principal details so it can create/obtain the +secrets. It also needs to know the name of the manually-created secret (see next section for more details). + +See [hasher example config](config.EXAMPLE/hasher.env.EXAMPLE) for detailed description of required env vars. + +# Manual Azure config + +There is a one-off (per key vault) step that needs to be performed manually. + +First, install the Azure CLI tools in the usual way for your OS. + +Log in using the service principal. +Do not include password on command line; let it prompt you and then paste it in. +``` +az login --service-principal --username --tenant +``` + +Now you can run commands to inspect the existing setup: +``` +# show all keyvaults +az keyvault list + +# Show keyvault details (not secrets). name is "name" key from previous command +az keyvault show --name + +# list all secrets in keyvault +az keyvault secret list --vault-name +``` +As per [PIXL instructions](https://github.com/SAFEHR-data/PIXL/blob/main/docs/setup/azure-keyvault.md#step-4), +you need to manually create a secret project-level key: +``` +az keyvault secret set --vault-name --name --value +``` +Note that you can choose the name of this secret (`` above), and its name (NOT its value) +should be placed in the config env var `AZURE_KEY_VAULT_SECRET_NAME` + +In addition, the PIXL hasher automatically creates a secret named after the "project slug" that you pass +in, the first time that you request a hash using that project slug. From 5ea60e6c8dc943902f489ae2942e8504d341473d Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Tue, 3 Feb 2026 16:17:08 +0000 Subject: [PATCH 07/22] Run hasher service during test and expect longer hash values now that we are using real hashes --- docs/azure_hashing.md | 7 +++ docs/develop.md | 21 +++++++ tests/test_snakemake_integration.py | 86 ++++++++++++++++++++++------- 3 files changed, 93 insertions(+), 21 deletions(-) diff --git a/docs/azure_hashing.md b/docs/azure_hashing.md index 43898fb..ea7223a 100644 --- a/docs/azure_hashing.md +++ b/docs/azure_hashing.md @@ -7,6 +7,13 @@ Azure key vaults for dev and prod already exist. Ask your team-mates how to find For each, there is an Azure service principal (aka. machine account) that can read/write secrets to the key vault. +> ![CAUTION] +> Never use the production key vault credentials anywhere except on the production machines! +> The dev key vault credentials can be used for testing on your dev machine, testing +> on GitHub Actions, etc, but should still be kept safe. +> Also, there is no need to store the actual secret keys themselves outside of the key vaults, +> that's what the key vaults are for! + # Configure hasher The hasher needs to be given the service principal details so it can create/obtain the secrets. It also needs to know the name of the manually-created secret (see next section for more details). diff --git a/docs/develop.md b/docs/develop.md index 03618b1..f5dd3aa 100644 --- a/docs/develop.md +++ b/docs/develop.md @@ -86,6 +86,13 @@ From the repo root, install the software and its deps: uv pip install -e '.[dev]' ``` +Some tests run in Docker and require the hasher to be configured with dev key vault credentials, which +it likely already is if you've followed the steps in [azure and hasher setup](azure_hashing.md) . + +The tests bring up temporary containers as needed, so you don't need to manually pre-start +any containers to make the tests work, and they shouldn't interfere with anything you might already have running. + +When these tests run on GitHub Actions, they use GitHub Secrets to get the creds for the dev key vault. ### API tests @@ -94,3 +101,17 @@ from PROJECT_ROOT directory ```shell script ./tools/run-tests.sh ``` + +## Manual hash lookup + +Use `docker ps` to find the ephemeral port of the hasher you are running locally. It should only be listening on +127.0.0.1. + +Use curl to send it HTTP requests as below. Don't forget the prefix "csn:" that indicates what type of value you +want to hash. + +```bash +# example port 32801 +curl 'http://localhost:32801/hash?project_slug=waveform-exporter&message=csn:SECRET_CSN_1235' +ea2fda353f54926ae9d43fbc0ff4253912c250a137e9bd38bed860abacfe03ef +``` diff --git a/tests/test_snakemake_integration.py b/tests/test_snakemake_integration.py index 59b56b2..c164c65 100644 --- a/tests/test_snakemake_integration.py +++ b/tests/test_snakemake_integration.py @@ -15,13 +15,12 @@ import pytest from stablehash import stablehash -from src.pseudon.hashing import do_hash - def _run_compose( compose_file: Path, args: list[str], cwd: Path ) -> subprocess.CompletedProcess: - cmd = ["docker", "compose", "-f", str(compose_file), *args] + # set project name so as not to interfere with images/containers the user might already have + cmd = ["docker", "compose", "-p", "pytest", "-f", str(compose_file), *args] return subprocess.run(cmd, cwd=str(cwd), capture_output=True, text=True) @@ -37,14 +36,20 @@ def _run_compose( "values", ] +REPO_ROOT = Path(__file__).resolve().parents[1] +COMPOSE_FILE = REPO_ROOT / "docker-compose.yml" + @pytest.fixture(scope="session", autouse=True) -def build_exporter_image(): - repo_root = Path(__file__).resolve().parents[1] - compose_file = repo_root / "docker-compose.yml" - result = _run_compose(compose_file, ["build", "waveform-exporter"], cwd=repo_root) - print(f"stdout:\n{result.stdout}\n" f"stderr:\n{result.stderr}") - result.check_returncode() +def build_required_images(): + for image in ["waveform-exporter", "waveform-hasher"]: + print(f"BUILDING {image}:") + result = _run_compose(COMPOSE_FILE, ["build", image], cwd=REPO_ROOT) + print( + f"{image} build output:\nstdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}END {image} build output [rc={result.returncode}]" + ) + result.check_returncode() @dataclass @@ -62,7 +67,26 @@ class TestFileDescription: _test_values: list = None def get_hashed_csn(self): - return do_hash("csn", self.csn) + """This test runs outside of a docker container and the hasher doesn't expose a + fixed port, so is somewhat hard to find. + + Easier to just hard code the CSNs since we have a limited set of input CSNs. We + are defining expected values here; the test exporter will still use the test + hasher to generate the actual values. + """ + static_lookup = { + # These hashes are not secrets, they're keyed hashes of the input values + "SECRET_CSN_1234": "253d32c67e0d5aa4cdc7e9fc8442710dee8338c92abc3b905ab4b2f03194fc7e", # pragma: allowlist secret + "SECRET_CSN_1235": "ea2fda353f54926ae9d43fbc0ff4253912c250a137e9bd38bed860abacfe03ef", # pragma: allowlist secret + } + try: + return static_lookup[self.csn] + except KeyError as e: + # See develop.md "Manual hash lookup" if you need to add value + raise KeyError( + f"Unknown CSN '{self.csn}' passed to static get_hashed_csn(). " + f"You may need to manually add the known hash for your CSN. See docs for details." + ) from e def get_orig_csv(self): return f"{self.date}.{self.csn}.{self.variable_id}.{self.channel_id}.{self.units}.csv" @@ -135,7 +159,31 @@ def _make_test_input_csv(tmp_path, t: TestFileDescription) -> list[list[Decimal] return test_data -def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path): +@pytest.fixture(scope="function") +def background_hasher(): + # run hasher in background + _run_compose( + COMPOSE_FILE, + [ + "up", + "-d", + # Azure envs should be there already + "waveform-hasher", + ], + cwd=REPO_ROOT, + ).check_returncode() + yield + _run_compose( + COMPOSE_FILE, + [ + "down", + "waveform-hasher", + ], + cwd=REPO_ROOT, + ).check_returncode() + + +def test_snakemake_pipeline(tmp_path: Path, background_hasher): # ARRANGE # all fields that need to be de-IDed should contain the string "SECRET" so we can search for it later @@ -227,7 +275,7 @@ def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path): } # ACT - run_snakemake(tmp_path) + _run_snakemake(tmp_path) # ASSERT (data files) for filename, expected_data in test_data_files: @@ -259,10 +307,8 @@ def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path): assert 2 == len(list((tmp_path / "hash-lookups").iterdir())) -def run_snakemake(tmp_path): - repo_root = Path(__file__).resolve().parents[1] - compose_file = repo_root / "docker-compose.yml" - +def _run_snakemake(tmp_path): + # run system under test (exporter container) in foreground compose_args = [ "run", "--rm", @@ -278,9 +324,9 @@ def run_snakemake(tmp_path): "waveform-exporter", ] result = _run_compose( - compose_file, + COMPOSE_FILE, compose_args, - cwd=repo_root, + cwd=REPO_ROOT, ) # for convenience print the snakemake log files if they exist (on success or error) outer_logs_dir = tmp_path / "snakemake-logs" @@ -329,9 +375,7 @@ def _compare_parquets( else: # pseudon expected, check that it looks like a hash assert all( - # will need lengthening when we use real hashes! - re.match(r"[a-f0-9]{8}$", str(v)) - for v in pseudon_all_values + re.match(r"[a-f0-9]{64}$", str(v)) for v in pseudon_all_values ), f"{pseudon_all_values} in column {column_name} does not appear to be a hash" # orig, all sensitive values contain SECRET assert all( From 974fc389c956918e665f8e969cf4ff3673aa2798 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Tue, 3 Feb 2026 16:34:41 +0000 Subject: [PATCH 08/22] Re-add stablehash as a dev dependency --- pyproject.toml | 5 ++++- uv.lock | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 03e6875..8f09dcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,10 @@ dependencies = [ ] [project.optional-dependencies] -dev = ["pytest>=9.0.2"] +dev = [ + "pytest>=9.0.2", + "stablehash==0.3.0", +] [project.scripts] emap-extract-waveform = "controller:receiver" diff --git a/uv.lock b/uv.lock index 716dfcd..0c78cab 100644 --- a/uv.lock +++ b/uv.lock @@ -2125,13 +2125,14 @@ dependencies = [ { name = "pika" }, { name = "pre-commit" }, { name = "psycopg2-binary" }, + { name = "requests" }, { name = "snakemake" }, - { name = "stablehash" }, ] [package.optional-dependencies] dev = [ { name = "pytest" }, + { name = "stablehash" }, ] [package.metadata] @@ -2142,8 +2143,9 @@ requires-dist = [ { name = "pre-commit", specifier = ">=4.5.0" }, { name = "psycopg2-binary", specifier = ">=2.9.10" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" }, + { name = "requests", specifier = "==2.32.3" }, { name = "snakemake", specifier = "==9.14.5" }, - { name = "stablehash", specifier = "==0.3.0" }, + { name = "stablehash", marker = "extra == 'dev'", specifier = "==0.3.0" }, ] [[package]] From 457fea52c133ea2e4abbfeb4bf00e777c40848cb Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Tue, 3 Feb 2026 16:56:31 +0000 Subject: [PATCH 09/22] Put Azure secrets into hasher config on GHA --- .github/workflows/pytest.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index eb2a872..94d0de7 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -42,11 +42,25 @@ jobs: - name: Prepare config env files for compose working-directory: waveform-controller + env: + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + AZURE_KEY_VAULT_NAME: ${{ secrets.AZURE_KEY_VAULT_NAME }} + AZURE_KEY_VAULT_SECRET_NAME: ${{ vars.AZURE_KEY_VAULT_SECRET_NAME }} run: | mkdir -p ../config cp config.EXAMPLE/exporter.env.EXAMPLE ../config/exporter.env cp config.EXAMPLE/hasher.env.EXAMPLE ../config/hasher.env cp config.EXAMPLE/controller.env.EXAMPLE ../config/controller.env + { + echo "" + echo "AZURE_CLIENT_ID=${AZURE_CLIENT_ID}" + echo "AZURE_CLIENT_SECRET=${AZURE_CLIENT_SECRET}" + echo "AZURE_TENANT_ID=${AZURE_TENANT_ID}" + echo "AZURE_KEY_VAULT_NAME=${AZURE_KEY_VAULT_NAME}" + echo "AZURE_KEY_VAULT_SECRET_NAME=${AZURE_KEY_VAULT_SECRET_NAME}" + } >> ../config/hasher.env - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -62,4 +76,10 @@ jobs: - name: Run the tests working-directory: waveform-controller + env: + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + AZURE_KEY_VAULT_NAME: ${{ secrets.AZURE_KEY_VAULT_NAME }} + AZURE_KEY_VAULT_SECRET_NAME: ${{ vars.AZURE_KEY_VAULT_SECRET_NAME }} run: uv run pytest tests From 6ab0fc8c855e36b68b09380059806c63147b2502 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Tue, 3 Feb 2026 17:08:48 +0000 Subject: [PATCH 10/22] Only need secrets in config file creation step --- .github/workflows/pytest.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 94d0de7..c00c09a 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -76,10 +76,4 @@ jobs: - name: Run the tests working-directory: waveform-controller - env: - AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} - AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} - AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} - AZURE_KEY_VAULT_NAME: ${{ secrets.AZURE_KEY_VAULT_NAME }} - AZURE_KEY_VAULT_SECRET_NAME: ${{ vars.AZURE_KEY_VAULT_SECRET_NAME }} run: uv run pytest tests From 916bb26cd96b1bb2a7e2d89e306ad814a3b1d793 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Tue, 3 Feb 2026 17:12:42 +0000 Subject: [PATCH 11/22] Show hasher logs if pytest failed --- .github/workflows/pytest.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index c00c09a..8fefa5a 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -77,3 +77,8 @@ jobs: - name: Run the tests working-directory: waveform-controller run: uv run pytest tests + + - name: Dump waveform-hasher logs on failure + if: failure() + working-directory: waveform-controller + run: docker compose -p pytest -f docker-compose.yml logs waveform-hasher From 84ad64489f32471e8c2995b1a2b5ddd9c8d8949c Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Tue, 3 Feb 2026 17:21:19 +0000 Subject: [PATCH 12/22] Why is docker logs not showing anything? --- .github/workflows/pytest.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 8fefa5a..b7198bc 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -81,4 +81,11 @@ jobs: - name: Dump waveform-hasher logs on failure if: failure() working-directory: waveform-controller - run: docker compose -p pytest -f docker-compose.yml logs waveform-hasher + run: | + echo FOO1 + docker ps -a + echo FOO2 + docker ps -q | while read c; do echo "$c"; docker logs "$c"; done + echo FOO3 + docker compose -p pytest -f docker-compose.yml logs waveform-hasher + echo FOO4 From 5920f9f667ad3fe5f4d8923d817927fffd764b6b Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Tue, 3 Feb 2026 17:39:04 +0000 Subject: [PATCH 13/22] Print hasher logs in pytest itself, before the container is destroyed --- tests/test_snakemake_integration.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/test_snakemake_integration.py b/tests/test_snakemake_integration.py index c164c65..6c54e8b 100644 --- a/tests/test_snakemake_integration.py +++ b/tests/test_snakemake_integration.py @@ -167,12 +167,23 @@ def background_hasher(): [ "up", "-d", - # Azure envs should be there already + # Assume Azure envs will come from config file "waveform-hasher", ], cwd=REPO_ROOT, ).check_returncode() yield + # print hasher logs whether failed or not + result = _run_compose( + COMPOSE_FILE, + [ + "logs", + "--no-color", + "waveform-hasher", + ], + cwd=REPO_ROOT, + ) + print(f"waveform-hasher logs:\n{result.stdout}\n{result.stderr}") _run_compose( COMPOSE_FILE, [ From d4c9490ed458771d9e58ae2b07be8d90a60e9c88 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Tue, 3 Feb 2026 17:46:03 +0000 Subject: [PATCH 14/22] Secrets/variables are set in an environment, not at repo level, so need to refer to that. --- .github/workflows/pytest.yml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index b7198bc..45dea06 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest env: PROJECT_ROOT: ${{ github.workspace }}/waveform-controller - + environment: hasher steps: - uses: actions/checkout@v5 with: @@ -78,14 +78,3 @@ jobs: working-directory: waveform-controller run: uv run pytest tests - - name: Dump waveform-hasher logs on failure - if: failure() - working-directory: waveform-controller - run: | - echo FOO1 - docker ps -a - echo FOO2 - docker ps -q | while read c; do echo "$c"; docker logs "$c"; done - echo FOO3 - docker compose -p pytest -f docker-compose.yml logs waveform-hasher - echo FOO4 From 25bfc47de97335cc83319617b73fa63f6af958ad Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Wed, 4 Feb 2026 13:03:06 +0000 Subject: [PATCH 15/22] Need to apply defaults *after* we have loaded the config file containing the intended values --- exporter-scripts/scheduled-script.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/exporter-scripts/scheduled-script.sh b/exporter-scripts/scheduled-script.sh index 40b2887..02f9e3b 100755 --- a/exporter-scripts/scheduled-script.sh +++ b/exporter-scripts/scheduled-script.sh @@ -7,9 +7,6 @@ set -euo pipefail # The snakemake output goes to its own log file as defined here. # These files will end up on Windows so be careful about disallowed characters in the names. date_str=$(date --utc +"%Y%m%dT%H%M%S") -SNAKEMAKE_CORES="${SNAKEMAKE_CORES:-1}" -# for temporarily making the pipeline not go all the way -SNAKEMAKE_RULE_UNTIL="${SNAKEMAKE_RULE_UNTIL:-all}" # log file for the overall snakemake run (as opposed to per-job logs, # which are defined in the snakefile) @@ -22,6 +19,10 @@ touch "$outer_log_file" set -a source /config/exporter.env set +a +# Now that we have loaded config file, apply default values +SNAKEMAKE_CORES="${SNAKEMAKE_CORES:-1}" +# For telling the pipeline not to go all the way +SNAKEMAKE_RULE_UNTIL="${SNAKEMAKE_RULE_UNTIL:-all}" set +e snakemake --snakefile /app/src/pipeline/Snakefile \ --cores "$SNAKEMAKE_CORES" \ From 3644975349109028b6b74d19cb8e0ff7a57cf99d Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Wed, 4 Feb 2026 13:25:17 +0000 Subject: [PATCH 16/22] Don't test FTPS from GHA --- .github/workflows/pytest.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 45dea06..8a55137 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -61,6 +61,10 @@ jobs: echo "AZURE_KEY_VAULT_NAME=${AZURE_KEY_VAULT_NAME}" echo "AZURE_KEY_VAULT_SECRET_NAME=${AZURE_KEY_VAULT_SECRET_NAME}" } >> ../config/hasher.env + # Don't attempt to upload FTPS from GHA + { + echo "SNAKEMAKE_RULE_UNTIL=all_daily_hash_lookups" + } >> ../config/exporter.env - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 From f80ef45204dfc7cee934f0a217d8cf29c5f1c3f2 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Wed, 4 Feb 2026 13:25:29 +0000 Subject: [PATCH 17/22] Explicitly mark helper class as not a test, to remove pytest warning --- tests/test_snakemake_integration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_snakemake_integration.py b/tests/test_snakemake_integration.py index 6c54e8b..74b3e58 100644 --- a/tests/test_snakemake_integration.py +++ b/tests/test_snakemake_integration.py @@ -54,6 +54,7 @@ def build_required_images(): @dataclass class TestFileDescription: + __test__ = False date: str start_timestamp: float csn: str From 982e3aad6c210e3c51551fc77f0770dc01006b8a Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Tue, 3 Feb 2026 16:21:18 +0000 Subject: [PATCH 18/22] If we're building images in the pytest test, then surely we don't need to build here too. --- .github/workflows/pytest.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 8a55137..28499ba 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -66,18 +66,6 @@ jobs: echo "SNAKEMAKE_RULE_UNTIL=all_daily_hash_lookups" } >> ../config/exporter.env - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build exporter image (cached) - uses: docker/build-push-action@v6 - with: - context: . - file: waveform-controller/Dockerfile - target: waveform_exporter - cache-from: type=gha - cache-to: type=gha,mode=max - - name: Run the tests working-directory: waveform-controller run: uv run pytest tests From a5a2d8880b478e9cbeb7e08ad830c6c44d8d33b4 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Wed, 4 Feb 2026 15:41:50 +0000 Subject: [PATCH 19/22] Run tests on any PR branch --- .github/workflows/pytest.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 28499ba..08943d6 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -4,7 +4,6 @@ on: push: branches: [main, dev] pull_request: - branches: [main, dev] workflow_dispatch: jobs: @@ -69,4 +68,3 @@ jobs: - name: Run the tests working-directory: waveform-controller run: uv run pytest tests - From 19571ece3e040f36efc91f67a6a31f051bbdf707 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Wed, 4 Feb 2026 15:43:44 +0000 Subject: [PATCH 20/22] Do linting on any PR branch --- .github/workflows/linting.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 4f130d4..582f11d 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -7,8 +7,6 @@ on: push: branches: [dev, main] pull_request: - branches: [dev, main] - types: ["opened", "reopened", "synchronize", "ready_for_review"] workflow_dispatch: concurrency: From 75f71050e3c5d9b919a496217312800909602386 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Thu, 5 Feb 2026 15:05:25 +0000 Subject: [PATCH 21/22] Fix dead link Co-authored-by: Stephen Thompson --- docs/azure_hashing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/azure_hashing.md b/docs/azure_hashing.md index ea7223a..1fafd48 100644 --- a/docs/azure_hashing.md +++ b/docs/azure_hashing.md @@ -18,7 +18,7 @@ the key vault. The hasher needs to be given the service principal details so it can create/obtain the secrets. It also needs to know the name of the manually-created secret (see next section for more details). -See [hasher example config](config.EXAMPLE/hasher.env.EXAMPLE) for detailed description of required env vars. +See [hasher example config](../config.EXAMPLE/hasher.env.EXAMPLE) for detailed description of required env vars. # Manual Azure config From 2400901fdcbfa5f4ecf62d25938c10477b259737 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Thu, 5 Feb 2026 15:04:51 +0000 Subject: [PATCH 22/22] Fix markdown typo --- docs/azure_hashing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/azure_hashing.md b/docs/azure_hashing.md index 1fafd48..5db9928 100644 --- a/docs/azure_hashing.md +++ b/docs/azure_hashing.md @@ -7,7 +7,7 @@ Azure key vaults for dev and prod already exist. Ask your team-mates how to find For each, there is an Azure service principal (aka. machine account) that can read/write secrets to the key vault. -> ![CAUTION] +> [!CAUTION] > Never use the production key vault credentials anywhere except on the production machines! > The dev key vault credentials can be used for testing on your dev machine, testing > on GitHub Actions, etc, but should still be kept safe.