diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 4f130d4..582f11d 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -7,8 +7,6 @@ on:
  push:
    branches: [dev, main]
  pull_request:
-    branches: [dev, main]
-    types: ["opened", "reopened", "synchronize", "ready_for_review"]
  workflow_dispatch:

concurrency:
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index eb2a872..08943d6 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -4,7 +4,6 @@ on:
  push:
    branches: [main, dev]
  pull_request:
-    branches: [main, dev]
  workflow_dispatch:

jobs:
@@ -13,7 +12,7 @@ jobs:
    runs-on: ubuntu-latest
    env:
      PROJECT_ROOT: ${{ github.workspace }}/waveform-controller
-
+    environment: hasher
    steps:
      - uses: actions/checkout@v5
        with:
@@ -42,23 +41,29 @@ jobs:

      - name: Prepare config env files for compose
        working-directory: waveform-controller
+        env:
+          AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
+          AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
+          AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
+          AZURE_KEY_VAULT_NAME: ${{ secrets.AZURE_KEY_VAULT_NAME }}
+          AZURE_KEY_VAULT_SECRET_NAME: ${{ vars.AZURE_KEY_VAULT_SECRET_NAME }}
        run: |
          mkdir -p ../config
          cp config.EXAMPLE/exporter.env.EXAMPLE ../config/exporter.env
          cp config.EXAMPLE/hasher.env.EXAMPLE ../config/hasher.env
          cp config.EXAMPLE/controller.env.EXAMPLE ../config/controller.env
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Build exporter image (cached)
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: waveform-controller/Dockerfile
-          target: waveform_exporter
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
+          {
+            echo ""
+            echo "AZURE_CLIENT_ID=${AZURE_CLIENT_ID}"
+            echo "AZURE_CLIENT_SECRET=${AZURE_CLIENT_SECRET}"
+            echo "AZURE_TENANT_ID=${AZURE_TENANT_ID}"
+            echo "AZURE_KEY_VAULT_NAME=${AZURE_KEY_VAULT_NAME}"
+            echo "AZURE_KEY_VAULT_SECRET_NAME=${AZURE_KEY_VAULT_SECRET_NAME}"
+          } >> ../config/hasher.env
+          # Don't attempt to upload via FTPS from GHA
+          {
+            echo "SNAKEMAKE_RULE_UNTIL=all_daily_hash_lookups"
+          } >> ../config/exporter.env

      - name: Run the tests
        working-directory: waveform-controller
diff --git a/.gitignore b/.gitignore
index cc2903b..4cf281c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,8 @@ wheels/

# settings files (should not be in the source tree anyway, but just in case)
*.env
+# snakemake tracking files
+.snakemake

# pytest tmp paths
.pytest_tmp
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d4b0fd9..a217d81 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,7 +33,8 @@ repos:
          [
            "pandas-stubs",
            "types-psycopg2",
-            "types-pika"
+            "types-pika",
+            "types-requests",
          ]
        files: src/
  # a collection of sanity checks: check for merge conflicts, check the end of
diff --git a/README.md b/README.md
index 209f00a..0b95ef6 100644
--- a/README.md
+++ b/README.md
@@ -54,9 +54,12 @@ separate to the Emap project root.

### Instructions for achieving this structure

+
+#### Clone repos
Clone this repo (`waveform-controller`) and
[PIXL](https://github.com/SAFEHR-data/PIXL), both inside your root directory.

+#### Make config files
Set up the config files as follows:
```
mkdir config
@@ -66,6 +69,21 @@ cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env
```
From the new config files, remove the comments telling you not to put secrets
in it, as instructed.

+#### Fill in config files
+Fill out the config files as appropriate.
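+
+For illustration only, a filled-in `config/exporter.env` might end up looking
+something like this (the host and credentials shown here are made-up
+placeholders, not real values):
+```
+EXPORTER_CRON_SCHEDULE="14 5 * * *"
+FTPS_HOST=ftps.example.org
+FTPS_PORT=990
+FTPS_USERNAME=waveform-upload
+FTPS_PASSWORD=not-a-real-password
+SNAKEMAKE_RULE_UNTIL=
+HASHER_API_HOSTNAME=waveform-hasher
+HASHER_API_PORT=8000
+```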
+
+See [azure and hasher setup](docs/azure_hashing.md) to configure the hasher.
+
+When updating to a new version of this code, you should diff the .EXAMPLE file against its live version,
+e.g. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`.
+
+This shows whether any config options have been added to or removed from the .EXAMPLE, and thus should be
+added to or removed from the live file.
+
+> [!CAUTION]
+> Be careful not to copy sensitive data from the live config file to the .EXAMPLE file!
+
+#### Make necessary directories
If it doesn't already exist you should create a directory named
`waveform-export` in the parent directory to store the saved waveform
messages.
@@ -74,7 +92,9 @@ messages.
mkdir waveform-export
```

-Build and start the controller and exporter with docker
+#### Run it!
+
+Build and start the hasher, controller and exporter with Docker.
```
cd waveform-controller
docker compose build
diff --git a/config.EXAMPLE/exporter.env.EXAMPLE b/config.EXAMPLE/exporter.env.EXAMPLE
index 7be714c..134ac38 100644
--- a/config.EXAMPLE/exporter.env.EXAMPLE
+++ b/config.EXAMPLE/exporter.env.EXAMPLE
@@ -2,7 +2,13 @@
# Copy it to ../config/exporter.env and then DELETE THIS COMMENT.
# When does the exporter run
EXPORTER_CRON_SCHEDULE="14 5 * * *"
+# Where to upload via FTPS
FTPS_HOST=myftps.example.com
FTPS_PORT=990
FTPS_USERNAME=
FTPS_PASSWORD=
+# Only run the workflow up to and including the specified rule
+SNAKEMAKE_RULE_UNTIL=
+# Point to the hasher we wish to use
+HASHER_API_HOSTNAME=waveform-hasher
+HASHER_API_PORT=8000
diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE
index 1cd6ad0..efeae85 100644
--- a/config.EXAMPLE/hasher.env.EXAMPLE
+++ b/config.EXAMPLE/hasher.env.EXAMPLE
@@ -1,6 +1,15 @@
# This is an EXAMPLE file, do not put real secrets in here.
# Copy it to ../config/hasher.env and then DELETE THIS COMMENT.
-HASHER_API_AZ_CLIENT_ID=
-HASHER_API_AZ_CLIENT_PASSWORD=
-HASHER_API_AZ_TENANT_ID=
-HASHER_API_AZ_KEY_VAULT_NAME=
+# Details for the Azure service principal, so it can log in to the key vault.
+# aka "appId"
+AZURE_CLIENT_ID=
+# aka "password"
+AZURE_CLIENT_SECRET=
+# aka "tenant"
+AZURE_TENANT_ID=
+# The name of the key vault, NOT the service principal
+AZURE_KEY_VAULT_NAME=
+
+# This is the "variable name" of the actual secret;
+# it can stay fixed and is not itself a secret
+AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret
diff --git a/docker-compose.yml b/docker-compose.yml
index d20153a..6c30ca7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -47,7 +47,8 @@ services:
      HTTPS_PROXY: ${HTTPS_PROXY}
      https_proxy: ${https_proxy}
    ports:
-      - "127.0.0.1:${HASHER_API_PORT}:8000"
+      # This is only here as a convenience for testing; we don't actually use it from the exporter
+      - "127.0.0.1::8000"
    env_file:
      - ../config/hasher.env
    restart: unless-stopped
diff --git a/docs/azure_hashing.md b/docs/azure_hashing.md
new file mode 100644
index 0000000..5db9928
--- /dev/null
+++ b/docs/azure_hashing.md
@@ -0,0 +1,55 @@
+# Setting up Azure + Hashing in Dev and Production
+
+## Create and configure Azure key vaults
+
+Azure key vaults for dev and prod already exist. Ask your team-mates how to find the details for these.
+
+For each, there is an Azure service principal (aka machine account) that can read/write secrets to
+the key vault.
+
+> [!CAUTION]
+> Never use the production key vault credentials anywhere except on the production machines!
+> The dev key vault credentials can be used for testing on your dev machine, testing
+> on GitHub Actions, etc., but should still be kept safe.
+> Also, there is no need to store the actual secret keys themselves outside of the key vaults;
+> that's what the key vaults are for!
+
+## Configure hasher
+The hasher needs to be given the service principal details so it can create/obtain the
+secrets. It also needs to know the name of the manually-created secret (see next section for more details).
+
+See [hasher example config](../config.EXAMPLE/hasher.env.EXAMPLE) for a detailed description of the required env vars.
+
+## Manual Azure config
+
+There is a one-off (per key vault) step that needs to be performed manually.
+
+First, install the Azure CLI tools in the usual way for your OS.
+
+Log in using the service principal.
+Do not include the password on the command line; let it prompt you and then paste it in.
+```
+az login --service-principal --username <app-id> --tenant <tenant-id>
+```
+
+Now you can run commands to inspect the existing setup:
+```
+# show all keyvaults
+az keyvault list
+
+# Show keyvault details (not secrets). <vault-name> is the "name" key from the previous command
+az keyvault show --name <vault-name>
+
+# list all secrets in keyvault
+az keyvault secret list --vault-name <vault-name>
+```
+As per [PIXL instructions](https://github.com/SAFEHR-data/PIXL/blob/main/docs/setup/azure-keyvault.md#step-4),
+you need to manually create a project-level secret key:
+```
+az keyvault secret set --vault-name <vault-name> --name <secret-name> --value <secret-value>
+```
+Note that you can choose the name of this secret (`<secret-name>` above), and its name (NOT its value)
+should be placed in the config env var `AZURE_KEY_VAULT_SECRET_NAME`.
+
+In addition, the PIXL hasher automatically creates a secret named after the "project slug" that you pass
+in, the first time that you request a hash using that project slug.
diff --git a/docs/develop.md b/docs/develop.md
index 58fdfde..e970eac 100644
--- a/docs/develop.md
+++ b/docs/develop.md
@@ -84,7 +84,30 @@ From the repo root, install the software and its deps:
```
uv pip install -e '.[dev]'
```
+
+Some tests run in Docker and require the hasher to be configured with dev key vault credentials, which
+it likely already is if you've followed the steps in [azure and hasher setup](azure_hashing.md).
+
+The tests bring up temporary containers as needed, so you don't need to pre-start
+any containers manually, and they shouldn't interfere with anything you might already have running.
+
+When these tests run on GitHub Actions, they use GitHub Secrets to get the credentials for the dev key vault.
+
Run tests with:
```
pytest
```
+
+## Manual hash lookup
+
+Use `docker ps` to find the ephemeral port of the hasher you are running locally. It should only be listening on
+127.0.0.1.
+
+Use curl to send it HTTP requests as below. Don't forget the prefix "csn:", which indicates what type of value you
+want to hash.
+
+```bash
+# example port 32801
+curl 'http://localhost:32801/hash?project_slug=waveform-exporter&message=csn:SECRET_CSN_1235'
+ea2fda353f54926ae9d43fbc0ff4253912c250a137e9bd38bed860abacfe03ef
+```
diff --git a/exporter-scripts/scheduled-script.sh b/exporter-scripts/scheduled-script.sh
index 40b2887..02f9e3b 100755
--- a/exporter-scripts/scheduled-script.sh
+++ b/exporter-scripts/scheduled-script.sh
@@ -7,9 +7,6 @@ set -euo pipefail
# The snakemake output goes to its own log file as defined here.
# These files will end up on Windows so be careful about disallowed characters in the names.
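+# ("%Y%m%dT%H%M%S" gives names like 20250101T051400, with no colons, which Windows disallows)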
date_str=$(date --utc +"%Y%m%dT%H%M%S")
-SNAKEMAKE_CORES="${SNAKEMAKE_CORES:-1}"
-# for temporarily making the pipeline not go all the way
-SNAKEMAKE_RULE_UNTIL="${SNAKEMAKE_RULE_UNTIL:-all}"

# log file for the overall snakemake run (as opposed to per-job logs,
# which are defined in the snakefile)
@@ -22,6 +19,10 @@ touch "$outer_log_file"
set -a
source /config/exporter.env
set +a
+# Now that we have loaded the config file, apply default values
+SNAKEMAKE_CORES="${SNAKEMAKE_CORES:-1}"
+# For telling the pipeline not to go all the way
+SNAKEMAKE_RULE_UNTIL="${SNAKEMAKE_RULE_UNTIL:-all}"
set +e
snakemake --snakefile /app/src/pipeline/Snakefile \
    --cores "$SNAKEMAKE_CORES" \
diff --git a/pyproject.toml b/pyproject.toml
index 17f395e..8f09dcc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,13 +12,16 @@ dependencies = [
    "snakemake==9.14.5",
    # need to be compatible with PIXL, which currently pins 2.9.10 (arguably it shouldn't)
    "psycopg2-binary>=2.9.10",
-    "stablehash==0.3.0",
+    "requests==2.32.3",
    # trick for making a "relative" path, works inside or outside container image
    "core @ file:///${PROJECT_ROOT}/../PIXL/pixl_core",
]

[project.optional-dependencies]
-dev = ["pytest>=9.0.2"]
+dev = [
+    "pytest>=9.0.2",
+    "stablehash==0.3.0",
+]

[project.scripts]
emap-extract-waveform = "controller:receiver"
diff --git a/src/pseudon/hashing.py b/src/pseudon/hashing.py
index d7ac7b9..35bde98 100644
--- a/src/pseudon/hashing.py
+++ b/src/pseudon/hashing.py
@@ -1,17 +1,36 @@
+import logging
from functools import lru_cache

-from stablehash import stablehash
+import requests

-@lru_cache
+import settings
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache(maxsize=1000)
def do_hash(type_prefix: str, value: str):
-    """Stub implementation of deidentification function for testing purposes.
+    """Pass data to the hasher API for de-identification purposes.

    Not that I think this will happen in practice, but we'd want the CSN "1234" to hash
    to a different value than the MRN "1234", so prefix each value with its type.
    """
-    # Full implementation of issue #6 must remove this code and call the real hasher!!
-    SALT = "waveform-exporter"
-    full_value_to_hash = f"{SALT}:{type_prefix}:{value}"
-    full_hash = stablehash(full_value_to_hash).hexdigest()
-    tiny_hash = full_hash[:8]
-    return tiny_hash
+
+    project_slug = "waveform-exporter"
+    full_value_to_hash = f"{type_prefix}:{value}"
+
+    hasher_hostname = settings.HASHER_API_HOSTNAME
+    hasher_port = settings.HASHER_API_PORT
+    hasher_req_url = f"http://{hasher_hostname}:{hasher_port}/hash"
+    request_params: dict[str, str | int] = {
+        "project_slug": project_slug,
+        "message": full_value_to_hash,
+    }
+    # do we need to specify a particular hash length?
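+    # (Assumption, based on the tests: the hasher's default output is a 64-hex-char
+    # digest, matching the `[a-f0-9]{64}` check and the static expected values in
+    # test_snakemake_integration.py, so we don't pass an explicit length here.)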
+    # request_params["length"] = hash_len
+
+    response = requests.get(hasher_req_url, params=request_params)
+    logger.debug("RESPONSE = %s", response.text)
+    response.raise_for_status()
+    real_hash = response.text
+    return real_hash
diff --git a/src/settings.py b/src/settings.py
index dfbebc1..8aaed85 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -29,3 +29,6 @@ def get_from_env(env_var, *, default_value=None, setting_name=None):
get_from_env("FTPS_PORT", default_value=990)
get_from_env("FTPS_USERNAME")
get_from_env("FTPS_PASSWORD")
+
+get_from_env("HASHER_API_HOSTNAME")
+get_from_env("HASHER_API_PORT")
diff --git a/tests/test_snakemake_integration.py b/tests/test_snakemake_integration.py
index 59b56b2..74b3e58 100644
--- a/tests/test_snakemake_integration.py
+++ b/tests/test_snakemake_integration.py
@@ -15,13 +15,12 @@ import pytest

from stablehash import stablehash

-from src.pseudon.hashing import do_hash
-

def _run_compose(
    compose_file: Path, args: list[str], cwd: Path
) -> subprocess.CompletedProcess:
-    cmd = ["docker", "compose", "-f", str(compose_file), *args]
+    # Set a project name so as not to interfere with images/containers the user might already have
+    cmd = ["docker", "compose", "-p", "pytest", "-f", str(compose_file), *args]
    return subprocess.run(cmd, cwd=str(cwd), capture_output=True, text=True)


@@ -37,18 +36,25 @@ def _run_compose(
    "values",
]

+REPO_ROOT = Path(__file__).resolve().parents[1]
+COMPOSE_FILE = REPO_ROOT / "docker-compose.yml"
+

@pytest.fixture(scope="session", autouse=True)
-def build_exporter_image():
-    repo_root = Path(__file__).resolve().parents[1]
-    compose_file = repo_root / "docker-compose.yml"
-    result = _run_compose(compose_file, ["build", "waveform-exporter"], cwd=repo_root)
-    print(f"stdout:\n{result.stdout}\n" f"stderr:\n{result.stderr}")
-    result.check_returncode()
+def build_required_images():
+    for image in ["waveform-exporter", "waveform-hasher"]:
+        print(f"BUILDING {image}:")
+        result = _run_compose(COMPOSE_FILE, ["build", image], cwd=REPO_ROOT)
+        print(
+            f"{image} build output:\nstdout:\n{result.stdout}\n"
+            f"stderr:\n{result.stderr}END {image} build output [rc={result.returncode}]"
+        )
+        result.check_returncode()


@dataclass
class TestFileDescription:
+    __test__ = False
    date: str
    start_timestamp: float
    csn: str
@@ -62,7 +68,26 @@ class TestFileDescription:
    _test_values: list = None

    def get_hashed_csn(self):
-        return do_hash("csn", self.csn)
+        """This test runs outside of a docker container, and the hasher doesn't expose
+        a fixed port, so it is somewhat hard to find.
+
+        It is easier to just hard-code the expected hashes, since we have a limited set
+        of input CSNs. We are defining expected values here; the test exporter will
+        still use the test hasher to generate the actual values.
+        """
+        static_lookup = {
+            # These hashes are not secrets; they're keyed hashes of the input values
+            "SECRET_CSN_1234": "253d32c67e0d5aa4cdc7e9fc8442710dee8338c92abc3b905ab4b2f03194fc7e",  # pragma: allowlist secret
+            "SECRET_CSN_1235": "ea2fda353f54926ae9d43fbc0ff4253912c250a137e9bd38bed860abacfe03ef",  # pragma: allowlist secret
+        }
+        try:
+            return static_lookup[self.csn]
+        except KeyError as e:
+            # See develop.md "Manual hash lookup" if you need to add a value
+            raise KeyError(
+                f"Unknown CSN '{self.csn}' passed to static get_hashed_csn(). "
+                f"You may need to manually add the known hash for your CSN. See docs for details."
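+                # A hash for a new CSN can be generated with the curl query described in
+                # docs/develop.md under "Manual hash lookup", e.g. (port and CSN are placeholders):
+                #   curl 'http://localhost:<port>/hash?project_slug=waveform-exporter&message=csn:<csn>'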
+ ) from e def get_orig_csv(self): return f"{self.date}.{self.csn}.{self.variable_id}.{self.channel_id}.{self.units}.csv" @@ -135,7 +160,42 @@ def _make_test_input_csv(tmp_path, t: TestFileDescription) -> list[list[Decimal] return test_data -def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path): +@pytest.fixture(scope="function") +def background_hasher(): + # run hasher in background + _run_compose( + COMPOSE_FILE, + [ + "up", + "-d", + # Assume Azure envs will come from config file + "waveform-hasher", + ], + cwd=REPO_ROOT, + ).check_returncode() + yield + # print hasher logs whether failed or not + result = _run_compose( + COMPOSE_FILE, + [ + "logs", + "--no-color", + "waveform-hasher", + ], + cwd=REPO_ROOT, + ) + print(f"waveform-hasher logs:\n{result.stdout}\n{result.stderr}") + _run_compose( + COMPOSE_FILE, + [ + "down", + "waveform-hasher", + ], + cwd=REPO_ROOT, + ).check_returncode() + + +def test_snakemake_pipeline(tmp_path: Path, background_hasher): # ARRANGE # all fields that need to be de-IDed should contain the string "SECRET" so we can search for it later @@ -227,7 +287,7 @@ def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path): } # ACT - run_snakemake(tmp_path) + _run_snakemake(tmp_path) # ASSERT (data files) for filename, expected_data in test_data_files: @@ -259,10 +319,8 @@ def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path): assert 2 == len(list((tmp_path / "hash-lookups").iterdir())) -def run_snakemake(tmp_path): - repo_root = Path(__file__).resolve().parents[1] - compose_file = repo_root / "docker-compose.yml" - +def _run_snakemake(tmp_path): + # run system under test (exporter container) in foreground compose_args = [ "run", "--rm", @@ -278,9 +336,9 @@ def run_snakemake(tmp_path): "waveform-exporter", ] result = _run_compose( - compose_file, + COMPOSE_FILE, compose_args, - cwd=repo_root, + cwd=REPO_ROOT, ) # for convenience print the snakemake log files if they exist (on success or error) outer_logs_dir = tmp_path / "snakemake-logs" @@ -329,9 +387,7 @@ def _compare_parquets( else: # pseudon expected, check that it looks like a hash assert all( - # will need lengthening when we use real hashes! - re.match(r"[a-f0-9]{8}$", str(v)) - for v in pseudon_all_values + re.match(r"[a-f0-9]{64}$", str(v)) for v in pseudon_all_values ), f"{pseudon_all_values} in column {column_name} does not appear to be a hash" # orig, all sensitive values contain SECRET assert all( diff --git a/uv.lock b/uv.lock index 716dfcd..0c78cab 100644 --- a/uv.lock +++ b/uv.lock @@ -2125,13 +2125,14 @@ dependencies = [ { name = "pika" }, { name = "pre-commit" }, { name = "psycopg2-binary" }, + { name = "requests" }, { name = "snakemake" }, - { name = "stablehash" }, ] [package.optional-dependencies] dev = [ { name = "pytest" }, + { name = "stablehash" }, ] [package.metadata] @@ -2142,8 +2143,9 @@ requires-dist = [ { name = "pre-commit", specifier = ">=4.5.0" }, { name = "psycopg2-binary", specifier = ">=2.9.10" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" }, + { name = "requests", specifier = "==2.32.3" }, { name = "snakemake", specifier = "==9.14.5" }, - { name = "stablehash", specifier = "==0.3.0" }, + { name = "stablehash", marker = "extra == 'dev'", specifier = "==0.3.0" }, ] [[package]]