Merged

Commits (24)
- 4224fef Switch to real hashing (jeremyestein, Jan 23, 2026)
- a1e63ab We don't actually need to listen on a specified host port because we (jeremyestein, Jan 23, 2026)
- 7da0c5f Specify hasher port as default (jeremyestein, Jan 23, 2026)
- e6aaf33 Fix linting (jeremyestein, Jan 23, 2026)
- 8f3c455 Doc tweak (jeremyestein, Jan 26, 2026)
- eb2c16a Document how to config the hasher (jeremyestein, Jan 30, 2026)
- 4a84e17 Bring in Snakemake integration test (jeremyestein, Feb 3, 2026)
- 5ea60e6 Run hasher service during test and expect longer hash values now that we (jeremyestein, Feb 3, 2026)
- 974fc38 Re-add stablehash as a dev dependency (jeremyestein, Feb 3, 2026)
- 457fea5 Put Azure secrets into hasher config on GHA (jeremyestein, Feb 3, 2026)
- 6ab0fc8 Only need secrets in config file creation step (jeremyestein, Feb 3, 2026)
- 916bb26 Show hasher logs if pytest failed (jeremyestein, Feb 3, 2026)
- 84ad644 Why is docker logs not showing anything? (jeremyestein, Feb 3, 2026)
- 5920f9f Print hasher logs in pytest itself, before the container is destroyed (jeremyestein, Feb 3, 2026)
- d4c9490 Secrets/variables are set in an environment, not at repo level, so need (jeremyestein, Feb 3, 2026)
- 25bfc47 Need to apply defaults *after* we have loaded the config file containing (jeremyestein, Feb 4, 2026)
- 3644975 Don't test FTPS from GHA (jeremyestein, Feb 4, 2026)
- f80ef45 Explicitly mark helper class as not a test, to remove pytest warning (jeremyestein, Feb 4, 2026)
- 982e3aa If we're building images in the pytest test, then surely we don't need (jeremyestein, Feb 3, 2026)
- a5a2d88 Run tests on any PR branch (jeremyestein, Feb 4, 2026)
- 19571ec Do linting on any PR branch (jeremyestein, Feb 4, 2026)
- 1f62070 Merge branch 'dev' into jeremy/real-hashing (jeremyestein, Feb 5, 2026)
- 75f7105 Fix dead link (jeremyestein, Feb 5, 2026)
- 2400901 Fix markdown typo (jeremyestein, Feb 5, 2026)
2 changes: 0 additions & 2 deletions .github/workflows/linting.yml
@@ -7,8 +7,6 @@ on:
push:
branches: [dev, main]
pull_request:
branches: [dev, main]
types: ["opened", "reopened", "synchronize", "ready_for_review"]
workflow_dispatch:

concurrency:
33 changes: 19 additions & 14 deletions .github/workflows/pytest.yml
@@ -4,7 +4,6 @@ on:
push:
branches: [main, dev]
pull_request:
branches: [main, dev]
workflow_dispatch:

jobs:
@@ -13,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
env:
PROJECT_ROOT: ${{ github.workspace }}/waveform-controller

environment: hasher
steps:
- uses: actions/checkout@v5
with:
@@ -42,23 +41,29 @@ jobs:

- name: Prepare config env files for compose
working-directory: waveform-controller
env:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
AZURE_KEY_VAULT_NAME: ${{ secrets.AZURE_KEY_VAULT_NAME }}
AZURE_KEY_VAULT_SECRET_NAME: ${{ vars.AZURE_KEY_VAULT_SECRET_NAME }}
run: |
mkdir -p ../config
cp config.EXAMPLE/exporter.env.EXAMPLE ../config/exporter.env
cp config.EXAMPLE/hasher.env.EXAMPLE ../config/hasher.env
cp config.EXAMPLE/controller.env.EXAMPLE ../config/controller.env

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Build exporter image (cached)
uses: docker/build-push-action@v6
with:
context: .
file: waveform-controller/Dockerfile
target: waveform_exporter
cache-from: type=gha
cache-to: type=gha,mode=max
{
echo ""
echo "AZURE_CLIENT_ID=${AZURE_CLIENT_ID}"
echo "AZURE_CLIENT_SECRET=${AZURE_CLIENT_SECRET}"
echo "AZURE_TENANT_ID=${AZURE_TENANT_ID}"
echo "AZURE_KEY_VAULT_NAME=${AZURE_KEY_VAULT_NAME}"
echo "AZURE_KEY_VAULT_SECRET_NAME=${AZURE_KEY_VAULT_SECRET_NAME}"
} >> ../config/hasher.env
# Don't attempt to upload FTPS from GHA
{
echo "SNAKEMAKE_RULE_UNTIL=all_daily_hash_lookups"
} >> ../config/exporter.env

- name: Run the tests
working-directory: waveform-controller
2 changes: 2 additions & 0 deletions .gitignore
@@ -15,6 +15,8 @@ wheels/
# settings files (should not be in the source tree anyway, but just in case)
*.env

# snakemake tracking files
.snakemake

# pytest tmp paths
.pytest_tmp
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -33,7 +33,8 @@ repos:
[
"pandas-stubs",
"types-psycopg2",
"types-pika"
"types-pika",
"types-requests",
]
files: src/
# a collection of sanity checks: check for merge conflicts, check the end of
22 changes: 21 additions & 1 deletion README.md
@@ -54,9 +54,12 @@ separate to the Emap project root.

### Instructions for achieving this structure


#### Clone repos
Clone this repo (`waveform-controller`) and [PIXL](https://github.com/SAFEHR-data/PIXL),
both inside your root directory.

#### Make config files
Set up the config files as follows:
```
mkdir config
@@ -66,6 +69,21 @@ cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env
```
From the new config files, remove the comments telling you not to put secrets in them, as instructed.

#### Fill in config files
Fill out the config, as appropriate.

See [azure and hasher setup](docs/azure_hashing.md) to configure the hasher.

When updating to a new version of this code, you should diff the .EXAMPLE file against its live version,
e.g. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`.

This checks if any config options have been added/removed from the .EXAMPLE, and thus should be
added/removed from the live file.

> [!CAUTION]
> Be careful not to copy sensitive data from the live config file to the .EXAMPLE file!

#### Make necessary directories
If it doesn't already exist you should create a directory named
`waveform-export` in the parent directory to store the saved waveform
messages.
@@ -74,7 +92,9 @@ messages.
mkdir waveform-export
```

Build and start the controller and exporter with docker
#### Run it!

Build and start the hasher, controller and exporter with docker.
```
cd waveform-controller
docker compose build
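The README's "diff the .EXAMPLE against the live config" step can also be scripted across all three env files. A hedged sketch — the files created below are demo stand-ins in a temp directory, not the repo's real config:

```shell
# Demo layout: one option has been added to the .EXAMPLE since the live
# copy was made, so the diff should surface it.
tmp=$(mktemp -d)
mkdir -p "$tmp/waveform-controller/config.EXAMPLE" "$tmp/config"
printf 'FTPS_PORT=990\nNEW_OPTION=\n' > "$tmp/waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE"
printf 'FTPS_PORT=990\n' > "$tmp/config/exporter.env"
cd "$tmp"
for example in waveform-controller/config.EXAMPLE/*.env.EXAMPLE; do
  live="config/$(basename "$example" .EXAMPLE)"
  echo "--- $live"
  diff -u "$example" "$live" || true   # diff exits non-zero when files differ
done
```

Options present only in the .EXAMPLE show up with a leading `-`, telling you they need adding to the live file.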
6 changes: 6 additions & 0 deletions config.EXAMPLE/exporter.env.EXAMPLE
@@ -2,7 +2,13 @@
# Copy it to ../config/exporter.env and then DELETE THIS COMMENT.
# When does the exporter run
EXPORTER_CRON_SCHEDULE="14 5 * * *"
# Where to upload via FTPS
FTPS_HOST=myftps.example.com
FTPS_PORT=990
FTPS_USERNAME=
FTPS_PASSWORD=
# only run workflow up to and including the specified rule
SNAKEMAKE_RULE_UNTIL=
# point to the hasher we wish to use
HASHER_API_HOSTNAME=waveform-hasher
HASHER_API_PORT=8000
17 changes: 13 additions & 4 deletions config.EXAMPLE/hasher.env.EXAMPLE
@@ -1,6 +1,15 @@
# This is an EXAMPLE file, do not put real secrets in here.
# Copy it to ../config/hasher.env and then DELETE THIS COMMENT.
HASHER_API_AZ_CLIENT_ID=
HASHER_API_AZ_CLIENT_PASSWORD=
HASHER_API_AZ_TENANT_ID=
HASHER_API_AZ_KEY_VAULT_NAME=
# Details for the Azure service principal, so it can log in to the keyvault.
# aka "appId"
AZURE_CLIENT_ID=
# aka "password"
AZURE_CLIENT_SECRET=
# aka "tenant"
AZURE_TENANT_ID=
# the name of the key vault, NOT the service principal
AZURE_KEY_VAULT_NAME=

# This is the "variable name" of the actual secret;
# it can be fixed in advance and is not itself secret
AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret
3 changes: 2 additions & 1 deletion docker-compose.yml
@@ -47,7 +47,8 @@ services:
HTTPS_PROXY: ${HTTPS_PROXY}
https_proxy: ${https_proxy}
ports:
- "127.0.0.1:${HASHER_API_PORT}:8000"
# this is only here as a convenience for testing; we don't actually use it from the exporter
- "127.0.0.1::8000"
env_file:
- ../config/hasher.env
restart: unless-stopped
55 changes: 55 additions & 0 deletions docs/azure_hashing.md
@@ -0,0 +1,55 @@
# Setting up Azure + Hashing in Dev and Production

## Create and configure Azure key vaults

Azure key vaults for dev and prod already exist. Ask your team-mates how to find the details for these.

For each, there is an Azure service principal (aka. machine account) that can read/write secrets to
the key vault.

> [!CAUTION]
> Never use the production key vault credentials anywhere except on the production machines!
> The dev key vault credentials can be used for testing on your dev machine, testing
> on GitHub Actions, etc., but they should still be kept safe.
> Also, there is no need to store the actual secret keys themselves outside of the key vaults;
> that's what the key vaults are for!

## Configure hasher
The hasher needs to be given the service principal details so it can create/obtain the
secrets. It also needs to know the name of the manually-created secret (see next section for more details).

See [hasher example config](../config.EXAMPLE/hasher.env.EXAMPLE) for a detailed description of the required env vars.

## Manual Azure config

There is a one-off (per key vault) step that needs to be performed manually.

First, install the Azure CLI tools in the usual way for your OS.

Log in using the service principal.
Do not include the password on the command line; let the CLI prompt you, then paste it in.
```
az login --service-principal --username <APP_ID> --tenant <TENANT_ID>
```

Now you can run commands to inspect the existing setup:
```
# show all keyvaults
az keyvault list

# Show keyvault details (not secrets). name is "name" key from previous command
az keyvault show --name <keyvault_name>

# list all secrets in keyvault
az keyvault secret list --vault-name <keyvault_name>
```
As per [PIXL instructions](https://github.com/SAFEHR-data/PIXL/blob/main/docs/setup/azure-keyvault.md#step-4),
you need to manually create a project-level secret key:
```
az keyvault secret set --vault-name <keyvault_name> --name <secret_name> --value <secret_value>
```
Note that you can choose the name of this secret (`<secret_name>` above); its name (NOT its value)
should be placed in the config env var `AZURE_KEY_VAULT_SECRET_NAME`.

In addition, the PIXL hasher automatically creates a secret named after the "project slug" that you pass
in, the first time that you request a hash using that project slug.
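The service-principal login that the hasher performs with these env vars can be sketched with the Azure SDK for Python. This is a hypothetical illustration, not the PIXL hasher's actual code — only the env var names come from the config above:

```python
import os


def vault_url(vault_name: str) -> str:
    """Key vault REST endpoint derived from AZURE_KEY_VAULT_NAME."""
    return f"https://{vault_name}.vault.azure.net"


def read_project_secret() -> str:
    # Hypothetical helper; requires the azure-identity and
    # azure-keyvault-secrets packages and a reachable key vault.
    from azure.identity import ClientSecretCredential
    from azure.keyvault.secrets import SecretClient

    credential = ClientSecretCredential(
        tenant_id=os.environ["AZURE_TENANT_ID"],
        client_id=os.environ["AZURE_CLIENT_ID"],
        client_secret=os.environ["AZURE_CLIENT_SECRET"],
    )
    client = SecretClient(
        vault_url=vault_url(os.environ["AZURE_KEY_VAULT_NAME"]),
        credential=credential,
    )
    return client.get_secret(os.environ["AZURE_KEY_VAULT_SECRET_NAME"]).value
```

The manually created project-level secret is the one fetched here; the per-project-slug secrets mentioned above are created by the hasher itself on first use.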
23 changes: 23 additions & 0 deletions docs/develop.md
@@ -84,7 +84,30 @@ From the repo root, install the software and its deps:
```
uv pip install -e '.[dev]'
```

Some tests run in Docker and require the hasher to be configured with dev key vault credentials, which
it likely already is if you've followed the steps in [azure and hasher setup](azure_hashing.md).

The tests bring up temporary containers as needed, so you don't need to manually pre-start
any containers to make the tests work, and they shouldn't interfere with anything you might already have running.

When these tests run on GitHub Actions, they use GitHub Secrets to get the creds for the dev key vault.

Run tests with:
```
pytest
```

## Manual hash lookup

Use `docker ps` to find the ephemeral port of the hasher you are running locally. It should only be listening on
127.0.0.1.

Use curl to send it HTTP requests as below. Don't forget the prefix "csn:" that indicates what type of value you
want to hash.

```bash
# example port 32801
curl 'http://localhost:32801/hash?project_slug=waveform-exporter&message=csn:SECRET_CSN_1235'
ea2fda353f54926ae9d43fbc0ff4253912c250a137e9bd38bed860abacfe03ef
```
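The same lookup can be assembled from Python without curl; a stdlib-only sketch of how the request URL is built (32801 is just the example's ephemeral port, and no network call is made here):

```python
from urllib.parse import urlencode


def hash_lookup_url(port: int, project_slug: str, message: str) -> str:
    # Mirrors the curl example: GET /hash with project_slug and message
    # query parameters. The "csn:" type prefix travels inside message.
    query = urlencode({"project_slug": project_slug, "message": message})
    return f"http://localhost:{port}/hash?{query}"


print(hash_lookup_url(32801, "waveform-exporter", "csn:SECRET_CSN_1235"))
```

Note that `urlencode` percent-encodes the `:` in the prefix, which is why pasting the raw value into a browser and using curl's quoting both work.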
7 changes: 4 additions & 3 deletions exporter-scripts/scheduled-script.sh
@@ -7,9 +7,6 @@ set -euo pipefail
# The snakemake output goes to its own log file as defined here.
# These files will end up on Windows so be careful about disallowed characters in the names.
date_str=$(date --utc +"%Y%m%dT%H%M%S")
SNAKEMAKE_CORES="${SNAKEMAKE_CORES:-1}"
# for temporarily making the pipeline not go all the way
SNAKEMAKE_RULE_UNTIL="${SNAKEMAKE_RULE_UNTIL:-all}"

# log file for the overall snakemake run (as opposed to per-job logs,
# which are defined in the snakefile)
@@ -22,6 +19,10 @@ touch "$outer_log_file"
set -a
source /config/exporter.env
set +a
# Now that we have loaded config file, apply default values
SNAKEMAKE_CORES="${SNAKEMAKE_CORES:-1}"
# For telling the pipeline not to go all the way
SNAKEMAKE_RULE_UNTIL="${SNAKEMAKE_RULE_UNTIL:-all}"
set +e
snakemake --snakefile /app/src/pipeline/Snakefile \
--cores "$SNAKEMAKE_CORES" \
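The ordering fix in this diff matters because `${VAR:-default}` only fills in when the variable is unset or empty — apply it before sourcing the config file and the file's value never gets a chance to win. A self-contained demo of the fixed ordering:

```shell
# Start from a clean slate, then simulate /config/exporter.env setting
# SNAKEMAKE_RULE_UNTIL but not SNAKEMAKE_CORES.
unset SNAKEMAKE_CORES SNAKEMAKE_RULE_UNTIL
cfg=$(mktemp)
echo 'SNAKEMAKE_RULE_UNTIL=all_daily_hash_lookups' > "$cfg"
set -a
. "$cfg"
set +a
# Defaults applied only AFTER sourcing, as in the fixed script: the file's
# value survives, and unset values fall back to the default.
SNAKEMAKE_CORES="${SNAKEMAKE_CORES:-1}"
SNAKEMAKE_RULE_UNTIL="${SNAKEMAKE_RULE_UNTIL:-all}"
echo "$SNAKEMAKE_CORES $SNAKEMAKE_RULE_UNTIL"   # prints: 1 all_daily_hash_lookups
```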
7 changes: 5 additions & 2 deletions pyproject.toml
@@ -12,13 +12,16 @@ dependencies = [
"snakemake==9.14.5",
# need to be compatible with PIXL, which currently pins 2.9.10 (arguably it shouldn't)
"psycopg2-binary>=2.9.10",
"stablehash==0.3.0",
"requests==2.32.3",
# trick for making a "relative" path, works inside or outside container image
"core @ file:///${PROJECT_ROOT}/../PIXL/pixl_core",
]

[project.optional-dependencies]
dev = ["pytest>=9.0.2"]
dev = [
"pytest>=9.0.2",
"stablehash==0.3.0",
]

[project.scripts]
emap-extract-waveform = "controller:receiver"
37 changes: 28 additions & 9 deletions src/pseudon/hashing.py
@@ -1,17 +1,36 @@
import logging
from functools import lru_cache
from stablehash import stablehash

import requests

@lru_cache
import settings

logger = logging.getLogger(__name__)


@lru_cache(maxsize=1000)
def do_hash(type_prefix: str, value: str):
"""Stub implementation of deidentification function for testing purposes.
"""Pass data to the hasher API for de-identification purposes.

Not that I think this will happen in practice, but we'd want the CSN "1234" to hash
to a different value than the MRN "1234", so prefix each value with its type.
"""
# Full implementation of issue #6 must remove this code and call the real hasher!!
SALT = "waveform-exporter"
full_value_to_hash = f"{SALT}:{type_prefix}:{value}"
full_hash = stablehash(full_value_to_hash).hexdigest()
tiny_hash = full_hash[:8]
return tiny_hash

project_slug = "waveform-exporter"
full_value_to_hash = f"{type_prefix}:{value}"

hasher_hostname = settings.HASHER_API_HOSTNAME
hasher_port = settings.HASHER_API_PORT
hasher_req_url = f"http://{hasher_hostname}:{hasher_port}/hash"
request_params: dict[str, str | int] = {
"project_slug": project_slug,
"message": full_value_to_hash,
}
# do we need to specify a particular hash length?
# request_params["length"] = hash_len

response = requests.get(hasher_req_url, params=request_params)
logger.debug("RESPONSE = %s", response.text)
response.raise_for_status()
real_hash = response.text
return real_hash
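A side effect of the `@lru_cache(maxsize=1000)` decorator above is that repeated lookups for the same `(type_prefix, value)` pair never hit the hasher twice, while the docstring's CSN-vs-MRN concern is handled because the prefix is part of the cache key. A stdlib-only sketch, with the HTTP round trip replaced by a counter since the real hasher needs a network:

```python
from functools import lru_cache

calls = 0


@lru_cache(maxsize=1000)
def fake_hash(type_prefix: str, value: str) -> str:
    # Stand-in for the HTTP call; counts how often the body actually runs.
    global calls
    calls += 1
    return f"hashed({type_prefix}:{value})"


fake_hash("csn", "1234")
fake_hash("csn", "1234")   # cache hit: no second "request"
fake_hash("mrn", "1234")   # different prefix, so a separate cache entry
```

The cache is per-process, so each exporter run starts cold; it only saves round trips within one run.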
3 changes: 3 additions & 0 deletions src/settings.py
@@ -29,3 +29,6 @@ def get_from_env(env_var, *, default_value=None, setting_name=None):
get_from_env("FTPS_PORT", default_value=990)
get_from_env("FTPS_USERNAME")
get_from_env("FTPS_PASSWORD")

get_from_env("HASHER_API_HOSTNAME")
get_from_env("HASHER_API_PORT")