From e79df94733a9b24d84dc8c3f955d4bb4d584f7c7 Mon Sep 17 00:00:00 2001
From: Stella Huang <stellahuang@microsoft.com>
Date: Thu, 26 Mar 2026 14:39:56 -0700
Subject: [PATCH 1/3] Add Kusto telemetry dashboard and queries

---
 .gitattributes                                |   3 +
 analysis/kusto/00-telemetry-validation.kql    | 228 +++++++++++++++
 .../kusto/01-overall-setup-success-rate.kql   |  15 +
 analysis/kusto/02-manager-availability.kql    |  38 +++
 analysis/kusto/03-daily-trend.kql             |  20 ++
 analysis/kusto/04-error-type-distribution.kql |  30 ++
 .../kusto/05-hang-failure-correlation.kql     |  26 ++
 analysis/kusto/06-weekly-health-summary.kql   |  50 ++++
 analysis/kusto/dashboard.ipynb                | 274 ++++++++++++++++++
 analysis/kusto/initialize.py                  |  28 ++
 analysis/kusto/query_runner.py                |  77 +++++
 analysis/kusto/requirements.txt               |   4 +
 12 files changed, 793 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 analysis/kusto/00-telemetry-validation.kql
 create mode 100644 analysis/kusto/01-overall-setup-success-rate.kql
 create mode 100644 analysis/kusto/02-manager-availability.kql
 create mode 100644 analysis/kusto/03-daily-trend.kql
 create mode 100644 analysis/kusto/04-error-type-distribution.kql
 create mode 100644 analysis/kusto/05-hang-failure-correlation.kql
 create mode 100644 analysis/kusto/06-weekly-health-summary.kql
 create mode 100644 analysis/kusto/dashboard.ipynb
 create mode 100644 analysis/kusto/initialize.py
 create mode 100644 analysis/kusto/query_runner.py
 create mode 100644 analysis/kusto/requirements.txt

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..f24d9335
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+*.ipynb filter=nbstripout
+*.zpln filter=nbstripout
+*.ipynb diff=ipynb
diff --git a/analysis/kusto/00-telemetry-validation.kql b/analysis/kusto/00-telemetry-validation.kql
new file mode 100644
index 00000000..a2f30fce
--- /dev/null
+++ b/analysis/kusto/00-telemetry-validation.kql
@@ -0,0 +1,228 @@
+// =============================================================================
+// TELEMETRY VALIDATION
+// Checks that PR #1365 telemetry events are arriving with correct properties.
+// Each query is self-contained — just copy-paste into Azure Data Explorer.
+// =============================================================================
+
+
+// =============================================================================
+// CHECK 0: UniqueMachines with Telemetry Changes
+// Shows unique machines per version (last 7 days) for builds >= 1.23.10781012 (all stable + pre-release).
+// Use this to confirm the latest pre-release build has real users.
+// =============================================================================
+let filtered = RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where ExtensionName == "ms-python.vscode-python-envs"
+| where ExtensionVersion != ""
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) // parse "1.<minor>.<patch>"; NOTE(review): regex only matches major version 1
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012); // version gate: 1.23.10781012 or any later 1.x minor
+filtered
+| summarize UniqueMachines = dcount(VSCodeMachineId) by ExtensionVersion
+| extend TotalUniqueMachines = toscalar(filtered | summarize dcount(VSCodeMachineId)) // overall distinct-machine count, repeated on every row
+| order by UniqueMachines desc;
+
+
+// =============================================================================
+// CHECK 1: Are all 4 events arriving? (3 new + 1 enhanced)
+// Expected: at least 1 row per event. A row with EventCount 0 (empty ExtVersion, "NO DATA YET") means that event is not arriving.
+// Broken down by extension version so you can confirm the new build is reporting.
+// =============================================================================
+let allEvents = datatable(EventName: string) [ // expected events; used as the left side of the join so missing ones still show up
+    "ms-python.vscode-python-envs/manager_registration.failed",
+    "ms-python.vscode-python-envs/manager_registration.skipped",
+    "ms-python.vscode-python-envs/setup.hang_detected",
+    "ms-python.vscode-python-envs/extension.manager_registration_duration"
+];
+let observed = RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where EventName in ( // same four names as allEvents above; keep the two lists in sync
+    "ms-python.vscode-python-envs/manager_registration.failed",
+    "ms-python.vscode-python-envs/manager_registration.skipped",
+    "ms-python.vscode-python-envs/setup.hang_detected",
+    "ms-python.vscode-python-envs/extension.manager_registration_duration"
+)
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) // parse "1.<minor>.<patch>"
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| summarize EventCount = count(), UniqueMachines = dcount(VSCodeMachineId) by EventName, ExtVersion;
+allEvents
+| join kind=leftouter observed on EventName // leftouter keeps expected events that have no observed rows
+| project 
+    EventName, 
+    ExtVersion = coalesce(ExtVersion, ""),
+    EventCount = coalesce(EventCount, 0), 
+    UniqueMachines = coalesce(UniqueMachines, 0),
+    Status = iff(coalesce(EventCount, 0) > 0, "✅ RECEIVING DATA", "⚠️ NO DATA YET")
+| order by EventName asc, ExtVersion desc;
+
+
+// =============================================================================
+// CHECK 2: Enhanced event — does "result" property exist?
+// The existing event previously had only "duration". Now it should have "result".
+// Expected: one summary row; ResultSuccess should be the majority, with maybe a few ResultError.
+//           HasResult < TotalEvents means "result" isn't being sent correctly; no events at all reports NO DATA YET.
+// =============================================================================
+RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| summarize 
+    TotalEvents = count(),
+    HasResult = countif(isnotempty(tostring(Properties.result))),
+    ResultSuccess = countif(tostring(Properties.result) == "success"),
+    ResultError = countif(tostring(Properties.result) == "error"),
+    HasDuration = countif(todouble(Properties.duration) > 0)
+| extend 
+    ResultPopulatedPct = iff(TotalEvents > 0, round(todouble(HasResult) / todouble(TotalEvents) * 100, 1), real(null)), // null instead of 0/0 when nothing arrived
+    ErrorPct = iff(HasResult > 0, round(todouble(ResultError) / todouble(HasResult) * 100, 2), real(null)), // share of events carrying "result" that are errors
+    Status = case(TotalEvents == 0, "⚠️ NO DATA YET", HasResult == TotalEvents, "✅ ALL EVENTS HAVE result", "⚠️ SOME EVENTS MISSING result"); // empty input must not read as success (0 == 0)
+
+
+// =============================================================================
+// CHECK 3: MANAGER_REGISTRATION.SKIPPED — property validation
+// Expected: managerName in {conda, pyenv, pipenv, poetry}, reason = "tool_not_found"
+//           These should be common — most users don't have all 4 tools.
+// =============================================================================
+let totalMachines = toscalar( // denominator: distinct machines on gated builds that sent any event with this ExtensionName (last 7 days)
+    RawEventsVSCodeExt
+    | where ServerTimestamp > ago(7d)
+    | where ExtensionName == "ms-python.vscode-python-envs"
+    | where ExtensionVersion != ""
+    | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+    | where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+    | summarize dcount(VSCodeMachineId)
+);
+RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where EventName == "ms-python.vscode-python-envs/manager_registration.skipped"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) // parse "1.<minor>.<patch>"
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| summarize 
+    EventCount = count(),
+    UniqueMachines = dcount(VSCodeMachineId)
+    by ManagerName = tostring(Properties.managername), Reason = tostring(Properties.reason) // property keys are read in lowercase here
+| extend TotalUniqueMachines = totalMachines
+| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1) // % of all gated-build machines that skipped this manager for this reason
+| order by EventCount desc;
+
+
+// =============================================================================
+// CHECK 4: MANAGER_REGISTRATION.FAILED — property validation
+// Expected: managerName in {system, conda, pyenv, pipenv, poetry, shellStartupVars}
+//           errorType in {spawn_timeout, spawn_enoent, permission_denied, canceled, parse_error, unknown}
+//           This may have 0 rows if no one hit errors — that's fine.
+//
+// errorType reference (from errorClassifier.ts):
+//   spawn_timeout    — tool process started but hung or didn't respond in time
+//   spawn_enoent     — OS couldn't find the executable (ENOENT: not installed or not on PATH)
+//   permission_denied — OS returned EACCES/EPERM (exists but no permission to run)
+//   canceled         — operation was stopped (user closed VS Code, workspace changed, CancellationToken fired)
+//   parse_error      — tool ran but returned unparseable output (malformed JSON, unexpected format)
+//   unknown          — didn't match any known pattern (catch-all for unexpected errors)
+// =============================================================================
+let totalMachines = toscalar( // denominator: distinct machines on gated builds that sent any event with this ExtensionName (last 7 days)
+    RawEventsVSCodeExt
+    | where ServerTimestamp > ago(7d)
+    | where ExtensionName == "ms-python.vscode-python-envs"
+    | where ExtensionVersion != ""
+    | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+    | where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+    | summarize dcount(VSCodeMachineId)
+);
+RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where EventName == "ms-python.vscode-python-envs/manager_registration.failed"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) // parse "1.<minor>.<patch>"
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| summarize 
+    EventCount = count(),
+    UniqueMachines = dcount(VSCodeMachineId)
+    by ManagerName = tostring(Properties.managername), ErrorType = tostring(Properties.errortype) // property keys are read in lowercase here
+| extend TotalUniqueMachines = totalMachines
+| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1) // % of all gated-build machines hitting this manager/error pair
+| order by EventCount desc;
+
+
+// =============================================================================
+// CHECK 5: SETUP.HANG_DETECTED — property validation
+// Expected: failureStage in {nativeFinder, managerRegistration, envSelection, terminalWatcher, settingsListener}
+//           This should have very few or 0 rows (hangs are rare). 0 is normal.
+// =============================================================================
+let totalMachines = toscalar( // denominator: distinct machines on gated builds that sent any event with this ExtensionName (last 7 days)
+    RawEventsVSCodeExt
+    | where ServerTimestamp > ago(7d)
+    | where ExtensionName == "ms-python.vscode-python-envs"
+    | where ExtensionVersion != ""
+    | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+    | where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+    | summarize dcount(VSCodeMachineId)
+);
+RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where EventName == "ms-python.vscode-python-envs/setup.hang_detected"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) // parse "1.<minor>.<patch>"
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| summarize 
+    EventCount = count(),
+    UniqueMachines = dcount(VSCodeMachineId)
+    by FailureStage = tostring(Properties.failurestage) // property key is read in lowercase here
+| extend TotalUniqueMachines = totalMachines
+| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1) // % of all gated-build machines that reported a hang at this stage
+| order by EventCount desc;
+
+
+// =============================================================================
+// CHECK 6: Registered vs Skipped — unique machine counts per manager
+// For each manager that can be skipped, compare registered vs skipped machine counts.
+// If a manager shows 0 registered AND 0 skipped, the telemetry path may be broken.
+// =============================================================================
+let totalMachines = toscalar( // denominator: distinct machines on gated builds that sent any event with this ExtensionName (last 7 days)
+    RawEventsVSCodeExt
+    | where ServerTimestamp > ago(7d)
+    | where ExtensionName == "ms-python.vscode-python-envs"
+    | where ExtensionVersion != ""
+    | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+    | where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+    | summarize dcount(VSCodeMachineId)
+);
+let skipped = RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where EventName == "ms-python.vscode-python-envs/manager_registration.skipped"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| summarize SkippedMachines = dcount(VSCodeMachineId) by Manager = tostring(Properties.managername);
+let registered = RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where EventName == "ms-python.vscode-python-envs/environment_manager.registered"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| extend RawManagerId = tostring(Properties.managerid)
+| where RawManagerId has_any ("conda", "pyenv", "pipenv", "poetry") // only the four managers that can be skipped
+| extend Manager = case( // normalize the raw manager id to the short name used by the skipped event
+    RawManagerId has "conda", "conda",
+    RawManagerId has "pyenv", "pyenv",
+    RawManagerId has "pipenv", "pipenv",
+    RawManagerId has "poetry", "poetry",
+    RawManagerId) // fallback; should not be reached after the has_any filter above
+| summarize RegisteredMachines = dcount(VSCodeMachineId) by Manager;
+skipped
+| join kind=fullouter registered on Manager // fullouter keeps managers present on only one side
+| project 
+    Manager = coalesce(Manager, Manager1),
+    RegisteredMachines = coalesce(RegisteredMachines, 0),
+    SkippedMachines = coalesce(SkippedMachines, 0),
+    TotalUniqueMachines = totalMachines
+| extend 
+    RegisteredPct = round(todouble(RegisteredMachines) / todouble(TotalUniqueMachines) * 100, 1),
+    SkippedPct = round(todouble(SkippedMachines) / todouble(TotalUniqueMachines) * 100, 1)
+| order by Manager asc;
+
+
+
diff --git a/analysis/kusto/01-overall-setup-success-rate.kql b/analysis/kusto/01-overall-setup-success-rate.kql
new file mode 100644
index 00000000..335a832a
--- /dev/null
+++ b/analysis/kusto/01-overall-setup-success-rate.kql
@@ -0,0 +1,15 @@
+// Query 1 — Overall Setup Success Rate (Last 28 Days)
+// Of the machines that reported manager setup, the share with at least one result == "success".
+// Top-level health metric: a drop here means something is broken.
+RawEventsVSCodeExt
+| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration"
+| where ServerTimestamp > ago(28d)
+| extend Version = tostring(Properties["common.extversion"]), Outcome = tostring(Properties.result)
+| extend MinorNum = toint(extract("^1\\.(\\d+)", 1, Version)), BuildNum = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, Version))
+| where MinorNum > 23 or (MinorNum == 23 and BuildNum >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| summarize
+    TotalMachines = dcount(VSCodeMachineId),
+    SuccessMachines = dcountif(VSCodeMachineId, Outcome == "success"),
+    ErrorMachines = dcountif(VSCodeMachineId, Outcome == "error")
+| project TotalMachines, SuccessMachines, ErrorMachines,
+    SuccessRate = round(todouble(SuccessMachines) / todouble(TotalMachines) * 100, 1)
diff --git a/analysis/kusto/02-manager-availability.kql b/analysis/kusto/02-manager-availability.kql
new file mode 100644
index 00000000..29950f0c
--- /dev/null
+++ b/analysis/kusto/02-manager-availability.kql
@@ -0,0 +1,38 @@
+// Query 2 — Manager Availability: What Tools Do Users Have Installed?
+// For each manager, counts: registered (tool found), skipped (tool not found), failed (crashed).
+// InstalledRate shows what % of users actually have each tool. Output is one row per manager.
+let endDate = startofday(now()-1d); // window: the 28 days ending at the start of yesterday
+let skipped = RawEventsVSCodeExt
+| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/manager_registration.skipped"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| summarize SkippedMachines = dcount(VSCodeMachineId) by ManagerName = tostring(Properties.managername);
+let registered = RawEventsVSCodeExt
+| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/environment_manager.registered"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| extend RawManagerId = tostring(Properties.managerid)
+| extend ManagerName = extract("[^:]+$", 0, RawManagerId) // text after the last ':' in the manager id
+| summarize RegisteredMachines = dcount(VSCodeMachineId) by ManagerName;
+let failed = RawEventsVSCodeExt
+| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/manager_registration.failed"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| summarize FailedMachines = dcount(VSCodeMachineId) by ManagerName = tostring(Properties.managername);
+skipped
+| join kind=fullouter registered on ManagerName
+| project ManagerName = coalesce(ManagerName, ManagerName1), SkippedMachines, RegisteredMachines // merge both key columns so registered-only managers can still match `failed` below
+| join kind=fullouter failed on ManagerName
+| project Manager = coalesce(ManagerName, ManagerName1), // ManagerName1 is the key coming from `failed`
+    Registered = coalesce(RegisteredMachines, 0),
+    Skipped = coalesce(SkippedMachines, 0),
+    Failed = coalesce(FailedMachines, 0)
+| extend Total = Registered + Skipped + Failed
+| extend InstalledRate = iff(Total > 0, round(todouble(Registered) / todouble(Total) * 100, 1), 0.0)
+| order by Total desc
diff --git a/analysis/kusto/03-daily-trend.kql b/analysis/kusto/03-daily-trend.kql
new file mode 100644
index 00000000..c3e53c7c
--- /dev/null
+++ b/analysis/kusto/03-daily-trend.kql
@@ -0,0 +1,20 @@
+// Query 3 — Daily Trend: Are Failures Increasing or Decreasing?
+// Per-day counts of "success" and "error" setup results, plus the success rate derived from them.
+// Run after shipping a new version: a sudden SuccessRate drop signals a regression, a rise a working fix.
+let windowEnd = startofday(now()-1d);
+let windowStart = windowEnd-14d;
+RawEventsVSCodeExt
+| where ServerTimestamp > windowStart and ServerTimestamp < windowEnd
+| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration"
+| extend Version = tostring(Properties["common.extversion"]), Result = tostring(Properties.result)
+| where Result == "success" or Result == "error"
+| extend MinorNum = toint(extract("^1\\.(\\d+)", 1, Version))
+| extend BuildNum = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, Version))
+| where MinorNum > 23 or (MinorNum == 23 and BuildNum >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| summarize
+    SuccessCount = countif(Result == "success"),
+    ErrorCount = countif(Result == "error")
+    by Day = startofday(ServerTimestamp)
+| extend TotalCount = SuccessCount + ErrorCount
+| extend SuccessRate = round(todouble(SuccessCount) / todouble(TotalCount) * 100, 1)
+| order by Day asc
diff --git a/analysis/kusto/04-error-type-distribution.kql b/analysis/kusto/04-error-type-distribution.kql
new file mode 100644
index 00000000..01d68139
--- /dev/null
+++ b/analysis/kusto/04-error-type-distribution.kql
@@ -0,0 +1,30 @@
+// Query 4 — Error Type Distribution Across All Failures
+// Combines errors from overall setup and individual managers, grouped by error type.
+// Action depends on results:
+//   spawn_timeout → native finder or tool is too slow
+//   spawn_enoent → binary path wrong or not on PATH
+//   permission_denied → file system permission issue
+//   parse_error → a tool returned unexpected output
+//   unknown → needs deeper investigation
+let endDate = startofday(now()-1d); // window: the 28 days ending at the start of yesterday
+let setupErrors = RawEventsVSCodeExt
+| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| where tostring(Properties.result) == "error" // only setup runs that ended in error
+| project ErrorType = tostring(Properties.errortype), Source = "setup_overall", VSCodeMachineId; // NOTE(review): assumes the duration event carries errortype — confirm
+let managerErrors = RawEventsVSCodeExt
+| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/manager_registration.failed"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| project ErrorType = tostring(Properties.errortype), Source = "individual_manager", VSCodeMachineId;
+union setupErrors, managerErrors // both sides share the same three columns
+| summarize 
+    EventCount = count(),
+    AffectedMachines = dcount(VSCodeMachineId)
+    by ErrorType, Source
+| order by AffectedMachines desc
diff --git a/analysis/kusto/05-hang-failure-correlation.kql b/analysis/kusto/05-hang-failure-correlation.kql
new file mode 100644
index 00000000..391397c7
--- /dev/null
+++ b/analysis/kusto/05-hang-failure-correlation.kql
@@ -0,0 +1,26 @@
+// Query 5 — Hang + Failure Correlation
+// Splits machines that reported a hang into those that also reported a setup error and those that did not.
+// Mostly recovered → just slow machines. Mostly also failed → real deadlock or crash.
+let windowEnd = startofday(now()-1d);
+let windowStart = windowEnd-28d;
+let hung = RawEventsVSCodeExt
+| where EventName == "ms-python.vscode-python-envs/setup.hang_detected"
+| where ServerTimestamp > windowStart and ServerTimestamp < windowEnd
+| extend Version = tostring(Properties["common.extversion"])
+| extend MinorNum = toint(extract("^1\\.(\\d+)", 1, Version)), BuildNum = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, Version))
+| where MinorNum > 23 or (MinorNum == 23 and BuildNum >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| distinct VSCodeMachineId;
+let errored = RawEventsVSCodeExt
+| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration" and tostring(Properties.result) == "error"
+| where ServerTimestamp > windowStart and ServerTimestamp < windowEnd
+| extend Version = tostring(Properties["common.extversion"])
+| extend MinorNum = toint(extract("^1\\.(\\d+)", 1, Version)), BuildNum = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, Version))
+| where MinorNum > 23 or (MinorNum == 23 and BuildNum >= 10781012) // same version gate as above
+| distinct VSCodeMachineId;
+let totalHangs = toscalar(hung | count);
+let hangAndFail = toscalar(hung | join kind=inner errored on VSCodeMachineId | count);
+let hangOnly = toscalar(hung | join kind=leftanti errored on VSCodeMachineId | count);
+print
+    TotalHangMachines = totalHangs,
+    HangAndFailed = hangAndFail,
+    HangButRecovered = hangOnly
diff --git a/analysis/kusto/06-weekly-health-summary.kql b/analysis/kusto/06-weekly-health-summary.kql
new file mode 100644
index 00000000..f2eca4d5
--- /dev/null
+++ b/analysis/kusto/06-weekly-health-summary.kql
@@ -0,0 +1,50 @@
+// Query 6 — Weekly Report: Full Health Summary
+// One-stop query for weekly health check. Run every Monday.
+// Returns one row with all key numbers: activations, setup results, manager failures, hangs.
+let endDate = startofday(now()-1d); // window end: start of yesterday
+let startDate = endDate - 7d; // window start: 7 days before endDate
+let totalActivations = toscalar( // distinct machines that sent the activation_duration event on gated builds
+    RawEventsVSCodeExt
+    | where ServerTimestamp > startDate and ServerTimestamp < endDate
+    | where EventName == "ms-python.vscode-python-envs/extension.activation_duration"
+    | where ExtensionVersion != "" // NOTE(review): this file gates on the ExtensionVersion column; queries 01-05 read Properties["common.extversion"] — confirm they agree
+    | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+    | where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+    | summarize dcount(VSCodeMachineId)
+);
+let setupResults = RawEventsVSCodeExt
+| where ServerTimestamp > startDate and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration"
+| where ExtensionVersion != ""
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012) // version gate: 1.23.10781012 or any later 1.x minor
+| where isnotempty(tostring(Properties.result)) // ignore events that carry no result property
+| summarize 
+    SetupSuccess = dcountif(VSCodeMachineId, tostring(Properties.result) == "success"),
+    SetupError = dcountif(VSCodeMachineId, tostring(Properties.result) == "error");
+let managerFailures = RawEventsVSCodeExt
+| where ServerTimestamp > startDate and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/manager_registration.failed"
+| where ExtensionVersion != ""
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| summarize ManagerFailures = count(), ManagerFailMachines = dcount(VSCodeMachineId); // events vs distinct machines
+let hangs = RawEventsVSCodeExt
+| where ServerTimestamp > startDate and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/setup.hang_detected"
+| where ExtensionVersion != ""
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| summarize Hangs = count(), HangMachines = dcount(VSCodeMachineId); // events vs distinct machines
+setupResults | extend _k = 1 // constant key _k stitches the three single-row summaries into one row
+| join (managerFailures | extend _k = 1) on _k
+| join (hangs | extend _k = 1) on _k
+| project 
+    TotalActivations = totalActivations,
+    SetupSuccess,
+    SetupError,
+    SetupSuccessRate = round(todouble(SetupSuccess) / todouble(SetupSuccess + SetupError) * 100, 1), // NOTE(review): denominator is 0 when no setup results arrived — presumably yields NaN; confirm
+    ManagerFailures,
+    ManagerFailMachines,
+    Hangs,
+    HangMachines
diff --git a/analysis/kusto/dashboard.ipynb b/analysis/kusto/dashboard.ipynb
new file mode 100644
index 00000000..88fbbcaf
--- /dev/null
+++ b/analysis/kusto/dashboard.ipynb
@@ -0,0 +1,274 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0",
+   "metadata": {},
+   "source": [
+    "# Python Environments — Telemetry Dashboard\n",
+    "\n",
+    "Interactive notebook for running the Kusto queries in `analysis/kusto/*.kql`.\n",
+    "\n",
+    "**Prerequisites:**\n",
+    "1. `pip install -r requirements.txt`\n",
+    "2. `az login` (Azure CLI authentication)\n",
+    "3. `nbstripout --install` (one-time setup to auto-strip notebook outputs before commits)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from initialize import initialize\n",
+    "from query_runner import run_kql, run_kql_file, load_kql_sections\n",
+    "from IPython.display import display, Markdown, HTML\n",
+    "import pandas as pd\n",
+    "import altair as alt\n",
+    "\n",
+    "# Show all rows and wrap long column text\n",
+    "pd.set_option(\"display.max_rows\", None)\n",
+    "pd.set_option(\"display.max_columns\", None)\n",
+    "pd.set_option(\"display.max_colwidth\", None)\n",
+    "\n",
+    "\n",
+    "client = initialize()\n",
+    "print(\"Connected to Kusto.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 0. Telemetry Validation\n",
+    "\n",
+    "Checks that all telemetry events are arriving with correct properties."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for title, query in load_kql_sections(\"00-telemetry-validation.kql\"):\n",
+    "    display(Markdown(f\"### {title}\"))\n",
+    "    try:\n",
+    "        df = run_kql(client, query)\n",
+    "        display(df)\n",
+    "        if \"TotalUniqueMachines\" in df.columns:\n",
+    "            print(f\"Total unique machines: {df['TotalUniqueMachines'].iloc[0]:,}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"  ⚠️ {e}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 1. Overall Setup Success Rate (28 days)\n",
+    "\n",
+    "Top-level health metric. If this drops, something is broken."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = run_kql_file(client, \"01-overall-setup-success-rate.kql\")\n",
+    "display(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 2. Manager Availability\n",
+    "\n",
+    "What tools do users actually have installed? Shows registered vs skipped vs failed per manager."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = run_kql_file(client, \"02-manager-availability.kql\")\n",
+    "display(df)\n",
+    "\n",
+    "if not df.empty:\n",
+    "    melted = df.melt(\n",
+    "        id_vars=\"Manager\",\n",
+    "        value_vars=[\"Registered\", \"Skipped\", \"Failed\"],\n",
+    "        var_name=\"Status\",\n",
+    "        value_name=\"Machines\",\n",
+    "    )\n",
+    "    chart = (\n",
+    "        alt.Chart(melted)\n",
+    "        .mark_bar()\n",
+    "        .encode(\n",
+    "            x=alt.X(\"Manager:N\", sort=\"-y\"),\n",
+    "            y=alt.Y(\"Machines:Q\"),\n",
+    "            color=alt.Color(\"Status:N\", scale=alt.Scale(\n",
+    "                domain=[\"Registered\", \"Skipped\", \"Failed\"],\n",
+    "                range=[\"#4c78a8\", \"#e45756\", \"#f58518\"],\n",
+    "            )),\n",
+    "            tooltip=[\"Manager\", \"Status\", \"Machines\"],\n",
+    "        )\n",
+    "        .properties(width=600, height=300, title=\"Manager Availability\")\n",
+    "    )\n",
+    "    display(chart)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 3. Daily Trend (14 days)\n",
+    "\n",
+    "Day-by-day trend of setup success rate. Check after shipping a new version."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = run_kql_file(client, \"03-daily-trend.kql\")\n",
+    "display(df)\n",
+    "\n",
+    "if not df.empty:\n",
+    "    chart = (\n",
+    "        alt.Chart(df)\n",
+    "        .mark_line(point=True)\n",
+    "        .encode(\n",
+    "            x=alt.X(\"Day:T\", title=\"Date\"),\n",
+    "            y=alt.Y(\"SuccessRate:Q\", title=\"Success Rate (%)\", scale=alt.Scale(domain=[0, 100])),\n",
+    "            tooltip=[\"Day:T\", \"SuccessRate:Q\", \"SuccessCount:Q\", \"ErrorCount:Q\"],\n",
+    "        )\n",
+    "        .properties(width=700, height=300, title=\"Daily Setup Success Rate\")\n",
+    "    )\n",
+    "    display(chart)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 4. Error Type Distribution\n",
+    "\n",
+    "Groups all failures by error type across setup and individual managers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = run_kql_file(client, \"04-error-type-distribution.kql\")\n",
+    "display(df)\n",
+    "\n",
+    "if not df.empty:\n",
+    "    chart = (\n",
+    "        alt.Chart(df)\n",
+    "        .mark_bar()\n",
+    "        .encode(\n",
+    "            x=alt.X(\"ErrorType:N\", sort=\"-y\", title=\"Error Type\"),\n",
+    "            y=alt.Y(\"AffectedMachines:Q\", title=\"Affected Machines\"),\n",
+    "            color=\"Source:N\",\n",
+    "            tooltip=[\"ErrorType\", \"Source\", \"EventCount\", \"AffectedMachines\"],\n",
+    "        )\n",
+    "        .properties(width=600, height=300, title=\"Error Type Distribution\")\n",
+    "    )\n",
+    "    display(chart)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 5. Hang ↔ Failure Correlation\n",
+    "\n",
+    "Do hangs always cause failures, or do some self-recover?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = run_kql_file(client, \"05-hang-failure-correlation.kql\")\n",
+    "display(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 6. Weekly Health Summary\n",
+    "\n",
+    "One-stop query for weekly check. Returns all key numbers in a single row."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = run_kql_file(client, \"06-weekly-health-summary.kql\")\n",
+    "display(df)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv (3.13.12)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis/kusto/initialize.py b/analysis/kusto/initialize.py
new file mode 100644
index 00000000..19b5f8ef
--- /dev/null
+++ b/analysis/kusto/initialize.py
@@ -0,0 +1,28 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""Kusto authentication for the telemetry dashboard.
+
+Prerequisites:
+    1. Install: pip install azure-kusto-data
+    2. Authenticate: az login
+"""
+
+import shutil
+
+from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
+
+CLUSTER = "ddtelvscode.kusto.windows.net"
+DATABASE = "VSCodeExt"
+
+
+def initialize(cluster: str = CLUSTER) -> KustoClient:
+    """Return a KustoClient for ``https://<cluster>``.
+
+    Azure CLI auth is used when ``az`` is on PATH, interactive login otherwise.
+    """
+    kcsb = KustoConnectionStringBuilder
+    # Choose the connection-string factory first, then build the client once.
+    use_cli = shutil.which("az")
+    login = kcsb.with_az_cli_authentication if use_cli else kcsb.with_interactive_login
+    return KustoClient(login(f"https://{cluster}"))
diff --git a/analysis/kusto/query_runner.py b/analysis/kusto/query_runner.py
new file mode 100644
index 00000000..8a87050c
--- /dev/null
+++ b/analysis/kusto/query_runner.py
@@ -0,0 +1,77 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""Load and execute .kql files against Azure Data Explorer.
+
+Keeps the existing .kql files as the single source of truth — they can still
+be copy-pasted into the ADX web UI, AND this module can run them from Python.
+"""
+
+import pathlib
+import re
+from typing import List, Tuple
+
+import pandas as pd
+from azure.kusto.data import KustoClient
+from azure.kusto.data.helpers import dataframe_from_result_table
+
+KQL_DIR = pathlib.Path(__file__).parent
+DATABASE = "VSCodeExt"
+
+# Separator pattern used in multi-query files like 00-telemetry-validation.kql
+_SECTION_SEP = re.compile(r"^// =====+", re.MULTILINE)
+
+
+def load_kql(filename: str) -> str:
+    """Return the text of a .kql file with every comment-only (``//``) line removed."""
+    raw = (KQL_DIR / filename).read_text(encoding="utf-8")
+    kept = [ln for ln in raw.splitlines() if not ln.strip().startswith("//")]
+    return "\n".join(kept)
+
+
+def load_kql_sections(filename: str) -> List[Tuple[str, str]]:
+    """Break a multi-query .kql file into ``(title, query)`` tuples.
+
+    The file is cut at every ``// =====`` separator line.  A chunk holding
+    only comments supplies the title (its first comment line) for the next
+    chunk that contains query text; comment lines never reach the query.
+    """
+    contents = (KQL_DIR / filename).read_text(encoding="utf-8")
+
+    pairs: List[Tuple[str, str]] = []
+    carried_title: str = ""
+    for chunk in _SECTION_SEP.split(contents):
+        # Sort each line of the chunk into comment text or query text;
+        # blank lines are dropped.
+        titles: List[str] = []
+        statements: List[str] = []
+        for row in chunk.strip().splitlines():
+            if row.strip().startswith("//"):
+                # Drop the comment marker and padding to get the bare text.
+                titles.append(row.lstrip("/ ").strip())
+            elif row.strip():
+                statements.append(row)
+        if statements:
+            # Prefer a title carried over from a comment-only chunk.
+            heading = carried_title or (titles[0] if titles else "Untitled")
+            pairs.append((heading, "\n".join(statements)))
+            carried_title = ""
+        elif titles:
+            # Comment-only chunk: remember its first line for the next query.
+            carried_title = titles[0]
+    return pairs
+
+
+def run_kql(
+    client: KustoClient, query: str, database: str = DATABASE
+) -> pd.DataFrame:
+    """Run ``query`` against ``database``; return the first primary result as a DataFrame."""
+    primary_table = client.execute(database, query).primary_results[0]
+    return dataframe_from_result_table(primary_table)
+
+
+def run_kql_file(
+    client: KustoClient, filename: str, database: str = DATABASE
+) -> pd.DataFrame:
+    """Execute the comment-stripped text of ``filename`` through ``run_kql``."""
+    return run_kql(client=client, query=load_kql(filename), database=database)
diff --git a/analysis/kusto/requirements.txt b/analysis/kusto/requirements.txt
new file mode 100644
index 00000000..ac8385bf
--- /dev/null
+++ b/analysis/kusto/requirements.txt
@@ -0,0 +1,4 @@
+azure-kusto-data>=4.0.0
+pandas>=2.0.0
+altair>=5.0.0
+nbstripout>=0.7.0

From c118ccc2531fdd9461494dda8c5e95db9d2bc5bd Mon Sep 17 00:00:00 2001
From: Stella Huang <stellahuang@microsoft.com>
Date: Tue, 31 Mar 2026 16:43:42 -0700
Subject: [PATCH 2/3] update

---
 analysis/kusto/00-telemetry-validation.kql    |  15 +--
 analysis/kusto/01-failure-stage-breakdown.kql |  18 +++
 .../kusto/02-error-type-x-failure-stage.kql   |  20 ++++
 .../kusto/03-failure-stage-daily-trend.kql    |  19 +++
 ....kql => 04-overall-setup-success-rate.kql} |   0
 ...bility.kql => 05-manager-availability.kql} |   0
 ...{03-daily-trend.kql => 06-daily-trend.kql} |   0
 ...ion.kql => 07-error-type-distribution.kql} |   0
 ...on.kql => 08-hang-failure-correlation.kql} |   0
 ...mmary.kql => 09-weekly-health-summary.kql} |   0
 analysis/kusto/dashboard.ipynb                | 112 ++++++++++++++----
 11 files changed, 155 insertions(+), 29 deletions(-)
 create mode 100644 analysis/kusto/01-failure-stage-breakdown.kql
 create mode 100644 analysis/kusto/02-error-type-x-failure-stage.kql
 create mode 100644 analysis/kusto/03-failure-stage-daily-trend.kql
 rename analysis/kusto/{01-overall-setup-success-rate.kql => 04-overall-setup-success-rate.kql} (100%)
 rename analysis/kusto/{02-manager-availability.kql => 05-manager-availability.kql} (100%)
 rename analysis/kusto/{03-daily-trend.kql => 06-daily-trend.kql} (100%)
 rename analysis/kusto/{04-error-type-distribution.kql => 07-error-type-distribution.kql} (100%)
 rename analysis/kusto/{05-hang-failure-correlation.kql => 08-hang-failure-correlation.kql} (100%)
 rename analysis/kusto/{06-weekly-health-summary.kql => 09-weekly-health-summary.kql} (100%)

diff --git a/analysis/kusto/00-telemetry-validation.kql b/analysis/kusto/00-telemetry-validation.kql
index a2f30fce..609fec56 100644
--- a/analysis/kusto/00-telemetry-validation.kql
+++ b/analysis/kusto/00-telemetry-validation.kql
@@ -6,7 +6,7 @@
 
 
 // =============================================================================
-// CHECK 0: UniqueMachines with Telemetry Changes
+// CHECK 0: UniqueMachines with new telemetry
 // Shows unique machines per version for builds >= 1.23.10781012 (all stable + pre-release).
 // Use this to confirm the latest pre-release build has real users.
 // =============================================================================
@@ -23,9 +23,7 @@ filtered
 
 
 // =============================================================================
-// CHECK 1: Are all 4 events arriving? (3 new + 1 enhanced)
-// Expected: at least 1 row per event. If any row shows 0, that event is broken.
-// Broken down by extension version so you can confirm the new build is reporting.
+// CHECK 1: Are all 4 events arriving? (Only versions with ⚠️ NO DATA YET are displayed)
 // =============================================================================
 let allEvents = datatable(EventName: string) [
     "ms-python.vscode-python-envs/manager_registration.failed",
@@ -53,11 +51,12 @@ allEvents
     EventCount = coalesce(EventCount, 0), 
     UniqueMachines = coalesce(UniqueMachines, 0),
     Status = iff(coalesce(EventCount, 0) > 0, "✅ RECEIVING DATA", "⚠️ NO DATA YET")
+| where Status == "⚠️ NO DATA YET"
 | order by EventName asc, ExtVersion desc;
 
 
 // =============================================================================
-// CHECK 2: Enhanced event — does "result" property exist?
+// CHECK 2: MANAGER_REGISTRATION_DURATION event success rate
 // The existing event previously had only "duration". Now it should have "result".
 // Expected: rows with result = "success" (should be the majority) and maybe a few "error".
 //           If result is empty/"", the property isn't being sent correctly.
@@ -81,7 +80,7 @@ RawEventsVSCodeExt
 
 
 // =============================================================================
-// CHECK 3: MANAGER_REGISTRATION.SKIPPED — property validation
+// CHECK 3: MANAGER_REGISTRATION.SKIPPED (percentage of users who don't have each tool)
 // Expected: managerName in {conda, pyenv, pipenv, poetry}, reason = "tool_not_found"
 //           These should be common — most users don't have all 4 tools.
 // =============================================================================
@@ -100,10 +99,12 @@ RawEventsVSCodeExt
 | extend ExtVersion = tostring(Properties["common.extversion"])
 | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
 | where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| extend ManagerName = tostring(Properties.managername), Reason = tostring(Properties.reason)
+| where isnotempty(ManagerName) and isnotempty(Reason)
 | summarize 
     EventCount = count(),
     UniqueMachines = dcount(VSCodeMachineId)
-    by ManagerName = tostring(Properties.managername), Reason = tostring(Properties.reason)
+    by ManagerName, Reason
 | extend TotalUniqueMachines = totalMachines
 | extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1)
 | order by EventCount desc;
diff --git a/analysis/kusto/01-failure-stage-breakdown.kql b/analysis/kusto/01-failure-stage-breakdown.kql
new file mode 100644
index 00000000..d0416ddf
--- /dev/null
+++ b/analysis/kusto/01-failure-stage-breakdown.kql
@@ -0,0 +1,18 @@
+// Query 1 — Registration Failure Stage Breakdown per Manager (28 days)
+// For each manager that failed, shows WHICH stage in the registration flow broke.
+// failureStage is hierarchical: "getPipenv:nativeFinderRefresh", "constructCondaSourcingStatus", etc.
+// High counts at a specific stage → that code path is the priority fix target.
+let endDate = startofday(now()-1d);
+RawEventsVSCodeExt
+| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/manager_registration.failed"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| extend ManagerName = tostring(Properties.managername)
+| extend FailureStage = tostring(Properties.failurestage)
+| summarize
+    FailureCount = count(),
+    AffectedMachines = dcount(VSCodeMachineId)
+    by ManagerName, FailureStage
+| order by ManagerName asc, AffectedMachines desc
diff --git a/analysis/kusto/02-error-type-x-failure-stage.kql b/analysis/kusto/02-error-type-x-failure-stage.kql
new file mode 100644
index 00000000..69e58fdf
--- /dev/null
+++ b/analysis/kusto/02-error-type-x-failure-stage.kql
@@ -0,0 +1,20 @@
+// Query 2 — Error Type × Failure Stage Matrix (28 days)
+// Cross-tabulates errorType (what kind of error) with failureStage (where it happened).
+// This is the key diagnostic view: e.g., "connection_error at nativeFinderRefresh" means
+// PET process dies during native finder, while "tool_not_found at pathLookup" means the
+// tool binary wasn't on PATH. Prioritize cells with the highest AffectedMachines.
+let endDate = startofday(now()-1d);
+RawEventsVSCodeExt
+| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/manager_registration.failed"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| extend ManagerName = tostring(Properties.managername)
+| extend ErrorType = tostring(Properties.errortype)
+| extend FailureStage = tostring(Properties.failurestage)
+| summarize
+    FailureCount = count(),
+    AffectedMachines = dcount(VSCodeMachineId)
+    by ManagerName, ErrorType, FailureStage
+| order by AffectedMachines desc, ManagerName asc
diff --git a/analysis/kusto/03-failure-stage-daily-trend.kql b/analysis/kusto/03-failure-stage-daily-trend.kql
new file mode 100644
index 00000000..9413064a
--- /dev/null
+++ b/analysis/kusto/03-failure-stage-daily-trend.kql
@@ -0,0 +1,19 @@
+// Query 3 — Daily Registration Failures by Manager and Stage (14 days)
+// Day-by-day failure counts per manager + failureStage combination.
+// Use this to detect if a specific stage started failing more after a release.
+// A spike in one stage on a specific day → regression in that code path.
+let endDate = startofday(now()-1d);
+RawEventsVSCodeExt
+| where ServerTimestamp > endDate-14d and ServerTimestamp < endDate
+| where EventName == "ms-python.vscode-python-envs/manager_registration.failed"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| extend Day = startofday(ServerTimestamp)
+| extend ManagerName = tostring(Properties.managername)
+| extend FailureStage = tostring(Properties.failurestage)
+| summarize
+    FailureCount = count(),
+    AffectedMachines = dcount(VSCodeMachineId)
+    by Day, ManagerName, FailureStage
+| order by Day asc, ManagerName asc, AffectedMachines desc
diff --git a/analysis/kusto/01-overall-setup-success-rate.kql b/analysis/kusto/04-overall-setup-success-rate.kql
similarity index 100%
rename from analysis/kusto/01-overall-setup-success-rate.kql
rename to analysis/kusto/04-overall-setup-success-rate.kql
diff --git a/analysis/kusto/02-manager-availability.kql b/analysis/kusto/05-manager-availability.kql
similarity index 100%
rename from analysis/kusto/02-manager-availability.kql
rename to analysis/kusto/05-manager-availability.kql
diff --git a/analysis/kusto/03-daily-trend.kql b/analysis/kusto/06-daily-trend.kql
similarity index 100%
rename from analysis/kusto/03-daily-trend.kql
rename to analysis/kusto/06-daily-trend.kql
diff --git a/analysis/kusto/04-error-type-distribution.kql b/analysis/kusto/07-error-type-distribution.kql
similarity index 100%
rename from analysis/kusto/04-error-type-distribution.kql
rename to analysis/kusto/07-error-type-distribution.kql
diff --git a/analysis/kusto/05-hang-failure-correlation.kql b/analysis/kusto/08-hang-failure-correlation.kql
similarity index 100%
rename from analysis/kusto/05-hang-failure-correlation.kql
rename to analysis/kusto/08-hang-failure-correlation.kql
diff --git a/analysis/kusto/06-weekly-health-summary.kql b/analysis/kusto/09-weekly-health-summary.kql
similarity index 100%
rename from analysis/kusto/06-weekly-health-summary.kql
rename to analysis/kusto/09-weekly-health-summary.kql
diff --git a/analysis/kusto/dashboard.ipynb b/analysis/kusto/dashboard.ipynb
index 88fbbcaf..278a46b8 100644
--- a/analysis/kusto/dashboard.ipynb
+++ b/analysis/kusto/dashboard.ipynb
@@ -73,9 +73,10 @@
    "metadata": {},
    "source": [
     "---\n",
-    "## 1. Overall Setup Success Rate (28 days)\n",
+    "## 1. Registration Failure Stage Breakdown (28 days)\n",
     "\n",
-    "Top-level health metric. If this drops, something is broken."
+    "For each manager that failed, shows **which stage** in the registration flow broke.\n",
+    "`failureStage` is hierarchical (e.g. `getPipenv:nativeFinderRefresh`). High counts at a specific stage = priority fix target."
    ]
   },
   {
@@ -85,7 +86,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = run_kql_file(client, \"01-overall-setup-success-rate.kql\")\n",
+    "df = run_kql_file(client, \"01-failure-stage-breakdown.kql\")\n",
     "display(df)"
    ]
   },
@@ -95,9 +96,10 @@
    "metadata": {},
    "source": [
     "---\n",
-    "## 2. Manager Availability\n",
+    "## 2. Error Type × Failure Stage Matrix (28 days)\n",
     "\n",
-    "What tools do users actually have installed? Shows registered vs skipped vs failed per manager."
+    "Cross-tabulates **error type** (what kind of error) with **failure stage** (where it happened).\n",
+    "This is the key diagnostic view — e.g. `connection_error` at `nativeFinderRefresh` means PET died during native finder."
    ]
   },
   {
@@ -107,7 +109,73 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = run_kql_file(client, \"02-manager-availability.kql\")\n",
+    "df = run_kql_file(client, \"02-error-type-x-failure-stage.kql\")\n",
+    "display(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 3. Daily Registration Failures by Manager & Stage (14 days)\n",
+    "\n",
+    "Day-by-day failure counts per manager + stage. A spike in one stage on a specific day = regression in that code path."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = run_kql_file(client, \"03-failure-stage-daily-trend.kql\")\n",
+    "display(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 4. Overall Setup Success Rate (28 days)\n",
+    "\n",
+    "Top-level health metric. If this drops, something is broken."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = run_kql_file(client, \"04-overall-setup-success-rate.kql\")\n",
+    "display(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 5. Manager Availability\n",
+    "\n",
+    "What tools do users actually have installed? Shows registered vs skipped vs failed per manager."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = run_kql_file(client, \"05-manager-availability.kql\")\n",
     "display(df)\n",
     "\n",
     "if not df.empty:\n",
@@ -136,11 +204,11 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8",
+   "id": "14",
    "metadata": {},
    "source": [
     "---\n",
-    "## 3. Daily Trend (14 days)\n",
+    "## 6. Daily Trend (14 days)\n",
     "\n",
     "Day-by-day trend of setup success rate. Check after shipping a new version."
    ]
@@ -148,11 +216,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9",
+   "id": "15",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = run_kql_file(client, \"03-daily-trend.kql\")\n",
+    "df = run_kql_file(client, \"06-daily-trend.kql\")\n",
     "display(df)\n",
     "\n",
     "if not df.empty:\n",
@@ -171,11 +239,11 @@
   },
   {
    "cell_type": "markdown",
-   "id": "10",
+   "id": "16",
    "metadata": {},
    "source": [
     "---\n",
-    "## 4. Error Type Distribution\n",
+    "## 7. Error Type Distribution\n",
     "\n",
     "Groups all failures by error type across setup and individual managers."
    ]
@@ -183,11 +251,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "11",
+   "id": "17",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = run_kql_file(client, \"04-error-type-distribution.kql\")\n",
+    "df = run_kql_file(client, \"07-error-type-distribution.kql\")\n",
     "display(df)\n",
     "\n",
     "if not df.empty:\n",
@@ -207,11 +275,11 @@
   },
   {
    "cell_type": "markdown",
-   "id": "12",
+   "id": "18",
    "metadata": {},
    "source": [
     "---\n",
-    "## 5. Hang ↔ Failure Correlation\n",
+    "## 8. Hang ↔ Failure Correlation\n",
     "\n",
     "Do hangs always cause failures, or do some self-recover?"
    ]
@@ -219,21 +287,21 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "13",
+   "id": "19",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = run_kql_file(client, \"05-hang-failure-correlation.kql\")\n",
+    "df = run_kql_file(client, \"08-hang-failure-correlation.kql\")\n",
     "display(df)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "14",
+   "id": "20",
    "metadata": {},
    "source": [
     "---\n",
-    "## 6. Weekly Health Summary\n",
+    "## 9. Weekly Health Summary\n",
     "\n",
     "One-stop query for weekly check. Returns all key numbers in a single row."
    ]
@@ -241,11 +309,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "15",
+   "id": "21",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = run_kql_file(client, \"06-weekly-health-summary.kql\")\n",
+    "df = run_kql_file(client, \"09-weekly-health-summary.kql\")\n",
     "display(df)"
    ]
   }

From c69a893f55d622f606f16ebee6e902ef2e8c0c42 Mon Sep 17 00:00:00 2001
From: Stella Huang <stellahuang@microsoft.com>
Date: Tue, 31 Mar 2026 17:35:21 -0700
Subject: [PATCH 3/3] update

---
 analysis/kusto/00-telemetry-validation.kql | 60 +++++++++++++++++++++-
 analysis/kusto/dashboard.ipynb             | 57 +++++++++++++-------
 2 files changed, 97 insertions(+), 20 deletions(-)

diff --git a/analysis/kusto/00-telemetry-validation.kql b/analysis/kusto/00-telemetry-validation.kql
index 609fec56..aec7c5ad 100644
--- a/analysis/kusto/00-telemetry-validation.kql
+++ b/analysis/kusto/00-telemetry-validation.kql
@@ -144,10 +144,68 @@ RawEventsVSCodeExt
     UniqueMachines = dcount(VSCodeMachineId)
     by ManagerName = tostring(Properties.managername), ErrorType = tostring(Properties.errortype)
 | extend TotalUniqueMachines = totalMachines
-| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1)
+| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 2)
 | order by EventCount desc;
 
 
+// =============================================================================
+// CHECK 4a: SPAWN_TIMEOUT failures broken down by manager × extension version
+// Shows whether spawn_timeout is improving or worsening across versions.
+// If a new version shows higher MachinePct → that release regressed timeout handling.
+// =============================================================================
+let totalByVersion = RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where ExtensionName == "ms-python.vscode-python-envs"
+| where ExtensionVersion != ""
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| summarize TotalMachines = dcount(VSCodeMachineId) by ExtVersion = ExtensionVersion;
+RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where EventName == "ms-python.vscode-python-envs/manager_registration.failed"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| where tostring(Properties.errortype) == "spawn_timeout"
+| summarize
+    EventCount = count(),
+    UniqueMachines = dcount(VSCodeMachineId)
+    by ManagerName = tostring(Properties.managername), ExtVersion
+| join kind=inner totalByVersion on ExtVersion
+| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalMachines) * 100, 2)
+| project ManagerName, ExtVersion, EventCount, UniqueMachines, TotalMachines, MachinePct
+| order by ManagerName asc, ExtVersion desc;
+
+
+// =============================================================================
+// CHECK 4b: UNKNOWN failures broken down by manager × extension version
+// Shows whether unknown errors are improving or worsening across versions.
+// High counts in the latest version → new unclassified error paths need investigation.
+// =============================================================================
+let totalByVersion = RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where ExtensionName == "ms-python.vscode-python-envs"
+| where ExtensionVersion != ""
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| summarize TotalMachines = dcount(VSCodeMachineId) by ExtVersion = ExtensionVersion;
+RawEventsVSCodeExt
+| where ServerTimestamp > ago(7d)
+| where EventName == "ms-python.vscode-python-envs/manager_registration.failed"
+| extend ExtVersion = tostring(Properties["common.extversion"])
+| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion))
+| where _minor > 23 or (_minor == 23 and _patch >= 10781012)
+| where tostring(Properties.errortype) == "unknown"
+| summarize
+    EventCount = count(),
+    UniqueMachines = dcount(VSCodeMachineId)
+    by ManagerName = tostring(Properties.managername), ExtVersion
+| join kind=inner totalByVersion on ExtVersion
+| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalMachines) * 100, 2)
+| project ManagerName, ExtVersion, EventCount, UniqueMachines, TotalMachines, MachinePct
+| order by ManagerName asc, ExtVersion desc;
+
+
 // =============================================================================
 // CHECK 5: SETUP.HANG_DETECTED — property validation
 // Expected: failureStage in {nativeFinder, managerRegistration, envSelection, terminalWatcher, settingsListener}
diff --git a/analysis/kusto/dashboard.ipynb b/analysis/kusto/dashboard.ipynb
index 278a46b8..378a44f7 100644
--- a/analysis/kusto/dashboard.ipynb
+++ b/analysis/kusto/dashboard.ipynb
@@ -56,7 +56,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for title, query in load_kql_sections(\"00-telemetry-validation.kql\"):\n",
+    "sections = load_kql_sections(\"00-telemetry-validation.kql\")\n",
+    "for title, query in sections[:4]:\n",
     "    display(Markdown(f\"### {title}\"))\n",
     "    try:\n",
     "        df = run_kql(client, query)\n",
@@ -68,9 +69,27 @@
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "id": "4",
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "for title, query in sections[4:]:\n",
+    "    display(Markdown(f\"### {title}\"))\n",
+    "    try:\n",
+    "        df = run_kql(client, query)\n",
+    "        display(df)\n",
+    "        if \"TotalUniqueMachines\" in df.columns:\n",
+    "            print(f\"Total unique machines: {df['TotalUniqueMachines'].iloc[0]:,}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"  ⚠️ {e}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5",
+   "metadata": {},
    "source": [
     "---\n",
     "## 1. Registration Failure Stage Breakdown (28 days)\n",
@@ -82,7 +101,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5",
+   "id": "6",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -92,7 +111,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6",
+   "id": "7",
    "metadata": {},
    "source": [
     "---\n",
@@ -105,7 +124,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7",
+   "id": "8",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -115,7 +134,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8",
+   "id": "9",
    "metadata": {},
    "source": [
     "---\n",
@@ -127,7 +146,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9",
+   "id": "10",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -137,7 +156,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "10",
+   "id": "11",
    "metadata": {},
    "source": [
     "---\n",
@@ -149,7 +168,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "11",
+   "id": "12",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -159,7 +178,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "12",
+   "id": "13",
    "metadata": {},
    "source": [
     "---\n",
@@ -171,7 +190,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "13",
+   "id": "14",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -204,7 +223,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "14",
+   "id": "15",
    "metadata": {},
    "source": [
     "---\n",
@@ -216,7 +235,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "15",
+   "id": "16",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -239,7 +258,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "16",
+   "id": "17",
    "metadata": {},
    "source": [
     "---\n",
@@ -251,7 +270,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "17",
+   "id": "18",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -275,7 +294,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "18",
+   "id": "19",
    "metadata": {},
    "source": [
     "---\n",
@@ -287,7 +306,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "19",
+   "id": "20",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -297,7 +316,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "20",
+   "id": "21",
    "metadata": {},
    "source": [
     "---\n",
@@ -309,7 +328,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "21",
+   "id": "22",
    "metadata": {},
    "outputs": [],
    "source": [