diff --git a/.Pipelines/template-pipeline-stages.yml b/.Pipelines/template-pipeline-stages.yml
index d62d670f..b763f01a 100644
--- a/.Pipelines/template-pipeline-stages.yml
+++ b/.Pipelines/template-pipeline-stages.yml
@@ -94,7 +94,7 @@ stages:
       ob_outputDirectory: '$(Build.ArtifactStagingDirectory)'
     strategy:
       matrix:
-        Python39:  { python.version: '3.9'  }
+        Python39:  { python.version: '3.9' }
         Python310: { python.version: '3.10' }
         Python311: { python.version: '3.11' }
         Python312: { python.version: '3.12' }
@@ -128,6 +128,7 @@ stages:
           --deselect tests/test_cryptography.py::CryptographyTestCase::test_should_be_run_with_latest_version_of_cryptography \
           2>&1 | tee test-results/pytest-unit.log
       displayName: 'Run pytest (unit)'
+      timeoutInMinutes: 5
       env:
         # Force unbuffered stdout so ADO logs stream in real time through the tee pipe.
         PYTHONUNBUFFERED: '1'
diff --git a/.gitignore b/.gitignore
index 1af10eff..862aea53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,7 +61,7 @@ tests/config.json
 msal_cache.bin
 
 .env
-.perf.baseline
+.perf-baseline/
 
 *.pfx
 .vscode/settings.json
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 8aef40a4..58b821f2 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -70,15 +70,23 @@ extends:
     - stage: Benchmark
       displayName: 'Run benchmarks'
       dependsOn: E2ETests
-      # Only run on post-merge pushes to dev - not on PRs or scheduled runs.
+      # Run on post-merge pushes or manual builds of dev, or on manual builds of any branch when forceBenchmarks=true.
       condition: |
         and(
           succeeded('E2ETests'),
-          eq(variables['Build.SourceBranch'], 'refs/heads/dev'),
           or(
-            eq(variables['Build.Reason'], 'IndividualCI'),
-            eq(variables['Build.Reason'], 'BatchedCI'),
-            eq(variables['Build.Reason'], 'Manual')
+            and(
+              eq(variables['Build.Reason'], 'Manual'),
+              in(variables['forceBenchmarks'], 'true', 'True')
+            ),
+            and(
+              eq(variables['Build.SourceBranch'], 'refs/heads/dev'),
+              or(
+                eq(variables['Build.Reason'], 'IndividualCI'),
+                eq(variables['Build.Reason'], 'BatchedCI'),
+                eq(variables['Build.Reason'], 'Manual')
+              )
+            )
           )
         )
       jobs:
@@ -105,9 +113,10 @@ extends:
           displayName: 'Restore performance baseline cache'
           inputs:
             key: 'perf-baseline | "$(Agent.OS)" | tests/test_benchmark.py'
-            path: .perf.baseline
+            path: $(System.DefaultWorkingDirectory)/.perf-baseline
 
         - bash: |
+            mkdir -p "$(System.DefaultWorkingDirectory)/.perf-baseline"  # quoted: path may contain spaces
             pytest --benchmark-only --benchmark-json benchmark.json --log-cli-level INFO tests/test_benchmark.py
           displayName: 'Run benchmarks'
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 9aaeac05..6eeaae47 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -1,8 +1,14 @@
+import os
+from pathlib import Path
+
 from tests.simulator import ClientCredentialGrantSimulator as CcaTester
 from perf_baseline import Baseline
 
 
-baseline = Baseline(".perf.baseline", threshold=1.5)  # Up to 1.5x slower than baseline
+_REPO_ROOT = Path(__file__).resolve().parent.parent  # parent of the directory holding this file
+_BASELINE_DIR = _REPO_ROOT / ".perf-baseline"  # absolute, so independent of the current working directory
+os.makedirs(_BASELINE_DIR, exist_ok=True)  # runs at import time; no-op when the directory already exists
+baseline = Baseline(str(_BASELINE_DIR / "data"), threshold=1.5)  # Up to 1.5x slower than baseline
 
 # Here come benchmark test cases, powered by pytest-benchmark
 # Func names will become diag names.