From e79df94733a9b24d84dc8c3f955d4bb4d584f7c7 Mon Sep 17 00:00:00 2001 From: Stella Huang Date: Thu, 26 Mar 2026 14:39:56 -0700 Subject: [PATCH 1/3] Add Kusto telemetry dashboard and queries --- .gitattributes | 3 + analysis/kusto/00-telemetry-validation.kql | 228 +++++++++++++++ .../kusto/01-overall-setup-success-rate.kql | 15 + analysis/kusto/02-manager-availability.kql | 38 +++ analysis/kusto/03-daily-trend.kql | 20 ++ analysis/kusto/04-error-type-distribution.kql | 30 ++ .../kusto/05-hang-failure-correlation.kql | 26 ++ analysis/kusto/06-weekly-health-summary.kql | 50 ++++ analysis/kusto/dashboard.ipynb | 274 ++++++++++++++++++ analysis/kusto/initialize.py | 28 ++ analysis/kusto/query_runner.py | 77 +++++ analysis/kusto/requirements.txt | 4 + 12 files changed, 793 insertions(+) create mode 100644 .gitattributes create mode 100644 analysis/kusto/00-telemetry-validation.kql create mode 100644 analysis/kusto/01-overall-setup-success-rate.kql create mode 100644 analysis/kusto/02-manager-availability.kql create mode 100644 analysis/kusto/03-daily-trend.kql create mode 100644 analysis/kusto/04-error-type-distribution.kql create mode 100644 analysis/kusto/05-hang-failure-correlation.kql create mode 100644 analysis/kusto/06-weekly-health-summary.kql create mode 100644 analysis/kusto/dashboard.ipynb create mode 100644 analysis/kusto/initialize.py create mode 100644 analysis/kusto/query_runner.py create mode 100644 analysis/kusto/requirements.txt diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..f24d9335 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +*.ipynb filter=nbstripout +*.zpln filter=nbstripout +*.ipynb diff=ipynb diff --git a/analysis/kusto/00-telemetry-validation.kql b/analysis/kusto/00-telemetry-validation.kql new file mode 100644 index 00000000..a2f30fce --- /dev/null +++ b/analysis/kusto/00-telemetry-validation.kql @@ -0,0 +1,228 @@ +// ============================================================================= +// 
TELEMETRY VALIDATION +// Checks that PR #1365 telemetry events are arriving with correct properties. +// Each query is self-contained — just copy-paste into Azure Data Explorer. +// ============================================================================= + + +// ============================================================================= +// CHECK 0: UniqueMachines with Telemetry Changes +// Shows unique machines per version for builds >= 1.23.10781012 (all stable + pre-release). +// Use this to confirm the latest pre-release build has real users. +// ============================================================================= +let filtered = RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where ExtensionName == "ms-python.vscode-python-envs" +| where ExtensionVersion != "" +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012); +filtered +| summarize UniqueMachines = dcount(VSCodeMachineId) by ExtensionVersion +| extend TotalUniqueMachines = toscalar(filtered | summarize dcount(VSCodeMachineId)) +| order by UniqueMachines desc; + + +// ============================================================================= +// CHECK 1: Are all 4 events arriving? (3 new + 1 enhanced) +// Expected: at least 1 row per event. If any row shows 0, that event is broken. +// Broken down by extension version so you can confirm the new build is reporting. 
+// ============================================================================= +let allEvents = datatable(EventName: string) [ + "ms-python.vscode-python-envs/manager_registration.failed", + "ms-python.vscode-python-envs/manager_registration.skipped", + "ms-python.vscode-python-envs/setup.hang_detected", + "ms-python.vscode-python-envs/extension.manager_registration_duration" +]; +let observed = RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where EventName in ( + "ms-python.vscode-python-envs/manager_registration.failed", + "ms-python.vscode-python-envs/manager_registration.skipped", + "ms-python.vscode-python-envs/setup.hang_detected", + "ms-python.vscode-python-envs/extension.manager_registration_duration" +) +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize EventCount = count(), UniqueMachines = dcount(VSCodeMachineId) by EventName, ExtVersion; +allEvents +| join kind=leftouter observed on EventName +| project + EventName, + ExtVersion = coalesce(ExtVersion, ""), + EventCount = coalesce(EventCount, 0), + UniqueMachines = coalesce(UniqueMachines, 0), + Status = iff(coalesce(EventCount, 0) > 0, "✅ RECEIVING DATA", "⚠️ NO DATA YET") +| order by EventName asc, ExtVersion desc; + + +// ============================================================================= +// CHECK 2: Enhanced event — does "result" property exist? +// The existing event previously had only "duration". Now it should have "result". +// Expected: rows with result = "success" (should be the majority) and maybe a few "error". +// If result is empty/"", the property isn't being sent correctly. 
+// ============================================================================= +RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize + TotalEvents = count(), + HasResult = countif(isnotempty(tostring(Properties.result))), + ResultSuccess = countif(tostring(Properties.result) == "success"), + ResultError = countif(tostring(Properties.result) == "error"), + HasDuration = countif(todouble(Properties.duration) > 0) +| extend // zero-count guards: an empty window must report NO DATA YET instead of NaN percentages and a false "ALL EVENTS HAVE result" pass + ResultPopulatedPct = iff(TotalEvents > 0, round(todouble(HasResult) / todouble(TotalEvents) * 100, 1), 0.0), + ErrorPct = iff(HasResult > 0, round(todouble(ResultError) / todouble(HasResult) * 100, 2), 0.0), + Status = case(TotalEvents == 0, "⚠️ NO DATA YET", HasResult == TotalEvents, "✅ ALL EVENTS HAVE result", "⚠️ SOME EVENTS MISSING result"); + + +// ============================================================================= +// CHECK 3: MANAGER_REGISTRATION.SKIPPED — property validation +// Expected: managerName in {conda, pyenv, pipenv, poetry}, reason = "tool_not_found" +// These should be common — most users don't have all 4 tools. 
+// ============================================================================= +let totalMachines = toscalar( + RawEventsVSCodeExt + | where ServerTimestamp > ago(7d) + | where ExtensionName == "ms-python.vscode-python-envs" + | where ExtensionVersion != "" + | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) + | where _minor > 23 or (_minor == 23 and _patch >= 10781012) + | summarize dcount(VSCodeMachineId) +); +RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where EventName == "ms-python.vscode-python-envs/manager_registration.skipped" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize + EventCount = count(), + UniqueMachines = dcount(VSCodeMachineId) + by ManagerName = tostring(Properties.managername), Reason = tostring(Properties.reason) +| extend TotalUniqueMachines = totalMachines +| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1) +| order by EventCount desc; + + +// ============================================================================= +// CHECK 4: MANAGER_REGISTRATION.FAILED — property validation +// Expected: managerName in {system, conda, pyenv, pipenv, poetry, shellStartupVars} +// errorType in {spawn_timeout, spawn_enoent, permission_denied, canceled, parse_error, unknown} +// This may have 0 rows if no one hit errors — that's fine. 
+// +// errorType reference (from errorClassifier.ts): +// spawn_timeout — tool process started but hung or didn't respond in time +// spawn_enoent — OS couldn't find the executable (ENOENT: not installed or not on PATH) +// permission_denied — OS returned EACCES/EPERM (exists but no permission to run) +// canceled — operation was stopped (user closed VS Code, workspace changed, CancellationToken fired) +// parse_error — tool ran but returned unparseable output (malformed JSON, unexpected format) +// unknown — didn't match any known pattern (catch-all for unexpected errors) +// ============================================================================= +let totalMachines = toscalar( + RawEventsVSCodeExt + | where ServerTimestamp > ago(7d) + | where ExtensionName == "ms-python.vscode-python-envs" + | where ExtensionVersion != "" + | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) + | where _minor > 23 or (_minor == 23 and _patch >= 10781012) + | summarize dcount(VSCodeMachineId) +); +RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where EventName == "ms-python.vscode-python-envs/manager_registration.failed" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize + EventCount = count(), + UniqueMachines = dcount(VSCodeMachineId) + by ManagerName = tostring(Properties.managername), ErrorType = tostring(Properties.errortype) +| extend TotalUniqueMachines = totalMachines +| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1) +| order by EventCount desc; + + +// ============================================================================= +// CHECK 5: SETUP.HANG_DETECTED — property validation +// Expected: failureStage in 
{nativeFinder, managerRegistration, envSelection, terminalWatcher, settingsListener} +// This should have very few or 0 rows (hangs are rare). 0 is normal. +// ============================================================================= +let totalMachines = toscalar( + RawEventsVSCodeExt + | where ServerTimestamp > ago(7d) + | where ExtensionName == "ms-python.vscode-python-envs" + | where ExtensionVersion != "" + | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) + | where _minor > 23 or (_minor == 23 and _patch >= 10781012) + | summarize dcount(VSCodeMachineId) +); +RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where EventName == "ms-python.vscode-python-envs/setup.hang_detected" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize + EventCount = count(), + UniqueMachines = dcount(VSCodeMachineId) + by FailureStage = tostring(Properties.failurestage) +| extend TotalUniqueMachines = totalMachines +| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1) +| order by EventCount desc; + + +// ============================================================================= +// CHECK 6: Registered vs Skipped — unique machine counts per manager +// For each manager that can be skipped, compare registered vs skipped machine counts. +// If a manager shows 0 registered AND 0 skipped, the telemetry path may be broken. 
+// ============================================================================= +let totalMachines = toscalar( + RawEventsVSCodeExt + | where ServerTimestamp > ago(7d) + | where ExtensionName == "ms-python.vscode-python-envs" + | where ExtensionVersion != "" + | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) + | where _minor > 23 or (_minor == 23 and _patch >= 10781012) + | summarize dcount(VSCodeMachineId) +); +let skipped = RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where EventName == "ms-python.vscode-python-envs/manager_registration.skipped" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize SkippedMachines = dcount(VSCodeMachineId) by Manager = tostring(Properties.managername); +let registered = RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where EventName == "ms-python.vscode-python-envs/environment_manager.registered" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| extend RawManagerId = tostring(Properties.managerid) +| where RawManagerId has_any ("conda", "pyenv", "pipenv", "poetry") +| extend Manager = case( + RawManagerId has "conda", "conda", + RawManagerId has "pyenv", "pyenv", + RawManagerId has "pipenv", "pipenv", + RawManagerId has "poetry", "poetry", + RawManagerId) +| summarize RegisteredMachines = dcount(VSCodeMachineId) by Manager; +skipped +| join kind=fullouter registered on Manager +| project + Manager = coalesce(Manager, Manager1), + RegisteredMachines = coalesce(RegisteredMachines, 0), + SkippedMachines = 
coalesce(SkippedMachines, 0), + TotalUniqueMachines = totalMachines +| extend + RegisteredPct = round(todouble(RegisteredMachines) / todouble(TotalUniqueMachines) * 100, 1), + SkippedPct = round(todouble(SkippedMachines) / todouble(TotalUniqueMachines) * 100, 1) +| order by Manager asc; + + + diff --git a/analysis/kusto/01-overall-setup-success-rate.kql b/analysis/kusto/01-overall-setup-success-rate.kql new file mode 100644 index 00000000..335a832a --- /dev/null +++ b/analysis/kusto/01-overall-setup-success-rate.kql @@ -0,0 +1,15 @@ +// Query 1 — Overall Setup Success Rate (Last 28 Days) +// Of all machines that started manager setup, what % completed successfully? +// This is the top-level health metric. If this drops, something is broken. +RawEventsVSCodeExt +| where ServerTimestamp > ago(28d) +| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize + TotalMachines = dcount(VSCodeMachineId), + SuccessMachines = dcountif(VSCodeMachineId, tostring(Properties.result) == "success"), + ErrorMachines = dcountif(VSCodeMachineId, tostring(Properties.result) == "error") +| extend SuccessRate = round(todouble(SuccessMachines) / todouble(TotalMachines) * 100, 1) +| project TotalMachines, SuccessMachines, ErrorMachines, SuccessRate diff --git a/analysis/kusto/02-manager-availability.kql b/analysis/kusto/02-manager-availability.kql new file mode 100644 index 00000000..29950f0c --- /dev/null +++ b/analysis/kusto/02-manager-availability.kql @@ -0,0 +1,38 @@ +// Query 2 — Manager Availability: What Tools Do Users Have Installed? +// For each manager, counts: registered (tool found), skipped (tool not found), failed (crashed). 
+// InstalledRate shows what % of users actually have each tool. +let endDate = startofday(now()-1d); +let skipped = RawEventsVSCodeExt +| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/manager_registration.skipped" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize SkippedMachines = dcount(VSCodeMachineId) by ManagerName = tostring(Properties.managername); +let registered = RawEventsVSCodeExt +| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/environment_manager.registered" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| extend RawManagerId = tostring(Properties.managerid) +| extend ManagerName = extract("[^:]+$", 0, RawManagerId) +| summarize RegisteredMachines = dcount(VSCodeMachineId) by ManagerName; +let failed = RawEventsVSCodeExt +| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/manager_registration.failed" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize FailedMachines = dcount(VSCodeMachineId) by ManagerName = tostring(Properties.managername); +skipped +| join kind=fullouter registered on ManagerName | extend ManagerName = coalesce(ManagerName, ManagerName1) // registered-only rows have an empty left key; fill it so the next join matches them instead of emitting a duplicate manager row +| join kind=fullouter failed on ManagerName +| project + Manager = coalesce(ManagerName, ManagerName1, ManagerName2), 
+ Registered = coalesce(RegisteredMachines, 0), + Skipped = coalesce(SkippedMachines, 0), + Failed = coalesce(FailedMachines, 0) +| extend Total = Registered + Skipped + Failed +| extend InstalledRate = iff(Total > 0, round(todouble(Registered) / todouble(Total) * 100, 1), 0.0) +| order by Total desc diff --git a/analysis/kusto/03-daily-trend.kql b/analysis/kusto/03-daily-trend.kql new file mode 100644 index 00000000..c3e53c7c --- /dev/null +++ b/analysis/kusto/03-daily-trend.kql @@ -0,0 +1,20 @@ +// Query 3 — Daily Trend: Are Failures Increasing or Decreasing? +// Day-by-day trend of setup results. Check this after shipping a new version. +// A sudden SuccessRate drop means a regression; a rise means a fix is working. +let endDate = startofday(now()-1d); +RawEventsVSCodeExt +| where ServerTimestamp > endDate-14d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| extend Day = startofday(ServerTimestamp) +| extend Result = tostring(Properties.result) +| where Result in ("success", "error") +| summarize + SuccessCount = countif(Result == "success"), + ErrorCount = countif(Result == "error") + by Day +| extend TotalCount = SuccessCount + ErrorCount +| extend SuccessRate = round(todouble(SuccessCount) / todouble(TotalCount) * 100, 1) +| order by Day asc diff --git a/analysis/kusto/04-error-type-distribution.kql b/analysis/kusto/04-error-type-distribution.kql new file mode 100644 index 00000000..01d68139 --- /dev/null +++ b/analysis/kusto/04-error-type-distribution.kql @@ -0,0 +1,30 @@ +// Query 4 — Error Type Distribution Across All Failures +// Combines errors from overall setup and individual managers, grouped by error type. 
+// Action depends on results: +// spawn_timeout → native finder or tool is too slow +// spawn_enoent → binary path wrong or not on PATH +// permission_denied → file system permission issue +// parse_error → a tool returned unexpected output +// unknown → needs deeper investigation +let endDate = startofday(now()-1d); +let setupErrors = RawEventsVSCodeExt +| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| where tostring(Properties.result) == "error" +| project ErrorType = tostring(Properties.errortype), Source = "setup_overall", VSCodeMachineId; +let managerErrors = RawEventsVSCodeExt +| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/manager_registration.failed" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| project ErrorType = tostring(Properties.errortype), Source = "individual_manager", VSCodeMachineId; +union setupErrors, managerErrors +| summarize + EventCount = count(), + AffectedMachines = dcount(VSCodeMachineId) + by ErrorType, Source +| order by AffectedMachines desc diff --git a/analysis/kusto/05-hang-failure-correlation.kql b/analysis/kusto/05-hang-failure-correlation.kql new file mode 100644 index 00000000..391397c7 --- /dev/null +++ b/analysis/kusto/05-hang-failure-correlation.kql @@ -0,0 +1,26 @@ +// Query 5 — Hang + Failure Correlation +// Do hangs always result in failures, or do some self-recover? 
+// If most hangs recover → just slow machines. If most also fail → real deadlock or crash. +let endDate = startofday(now()-1d); +let hangMachines = RawEventsVSCodeExt +| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/setup.hang_detected" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| distinct VSCodeMachineId; +let failedMachines = RawEventsVSCodeExt +| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| where tostring(Properties.result) == "error" +| distinct VSCodeMachineId; +let totalHangs = toscalar(hangMachines | count); +let hangAndFail = hangMachines | join kind=inner failedMachines on VSCodeMachineId | count; +let hangOnly = toscalar(hangMachines | join kind=leftanti failedMachines on VSCodeMachineId | count); +print + TotalHangMachines = totalHangs, + HangAndFailed = toscalar(hangAndFail), + HangButRecovered = hangOnly diff --git a/analysis/kusto/06-weekly-health-summary.kql b/analysis/kusto/06-weekly-health-summary.kql new file mode 100644 index 00000000..f2eca4d5 --- /dev/null +++ b/analysis/kusto/06-weekly-health-summary.kql @@ -0,0 +1,50 @@ +// Query 6 — Weekly Report: Full Health Summary +// One-stop query for weekly health check. Run every Monday. +// Returns one row with all key numbers: activations, setup results, manager failures, hangs. 
+let endDate = startofday(now()-1d); +let startDate = endDate - 7d; +let totalActivations = toscalar( + RawEventsVSCodeExt + | where ServerTimestamp > startDate and ServerTimestamp < endDate + | where EventName == "ms-python.vscode-python-envs/extension.activation_duration" + | where ExtensionVersion != "" + | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) + | where _minor > 23 or (_minor == 23 and _patch >= 10781012) + | summarize dcount(VSCodeMachineId) +); +let setupResults = RawEventsVSCodeExt +| where ServerTimestamp > startDate and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/extension.manager_registration_duration" +| where ExtensionVersion != "" +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| where isnotempty(tostring(Properties.result)) +| summarize + SetupSuccess = dcountif(VSCodeMachineId, tostring(Properties.result) == "success"), + SetupError = dcountif(VSCodeMachineId, tostring(Properties.result) == "error"); +let managerFailures = RawEventsVSCodeExt +| where ServerTimestamp > startDate and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/manager_registration.failed" +| where ExtensionVersion != "" +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize ManagerFailures = count(), ManagerFailMachines = dcount(VSCodeMachineId); +let hangs = RawEventsVSCodeExt +| where ServerTimestamp > startDate and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/setup.hang_detected" +| where ExtensionVersion != "" +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), 
_patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize Hangs = count(), HangMachines = dcount(VSCodeMachineId); +setupResults | extend _k = 1 +| join (managerFailures | extend _k = 1) on _k +| join (hangs | extend _k = 1) on _k +| project + TotalActivations = totalActivations, + SetupSuccess, + SetupError, + SetupSuccessRate = round(todouble(SetupSuccess) / todouble(SetupSuccess + SetupError) * 100, 1), + ManagerFailures, + ManagerFailMachines, + Hangs, + HangMachines diff --git a/analysis/kusto/dashboard.ipynb b/analysis/kusto/dashboard.ipynb new file mode 100644 index 00000000..88fbbcaf --- /dev/null +++ b/analysis/kusto/dashboard.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# Python Environments — Telemetry Dashboard\n", + "\n", + "Interactive notebook for running the Kusto queries in `analysis/kusto/*.kql`.\n", + "\n", + "**Prerequisites:**\n", + "1. `pip install -r requirements.txt`\n", + "2. `az login` (Azure CLI authentication)\n", + "3. `nbstripout --install` (one-time setup to auto-strip notebook outputs before commits)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "from initialize import initialize\n", + "from query_runner import run_kql, run_kql_file, load_kql_sections\n", + "from IPython.display import display, Markdown, HTML\n", + "import pandas as pd\n", + "import altair as alt\n", + "\n", + "# Show all rows and wrap long column text\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_columns\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)\n", + "\n", + "\n", + "client = initialize()\n", + "print(\"Connected to Kusto.\")" + ] + }, + { + "cell_type": "markdown", + "id": "2", + "metadata": {}, + "source": [ + "---\n", + "## 0. 
Telemetry Validation\n", + "\n", + "Checks that all telemetry events are arriving with correct properties." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3", + "metadata": {}, + "outputs": [], + "source": [ + "for title, query in load_kql_sections(\"00-telemetry-validation.kql\"):\n", + " display(Markdown(f\"### {title}\"))\n", + " try:\n", + " df = run_kql(client, query)\n", + " display(df)\n", + " if \"TotalUniqueMachines\" in df.columns:\n", + " print(f\"Total unique machines: {df['TotalUniqueMachines'].iloc[0]:,}\")\n", + " except Exception as e:\n", + " print(f\" ⚠️ {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4", + "metadata": {}, + "source": [ + "---\n", + "## 1. Overall Setup Success Rate (28 days)\n", + "\n", + "Top-level health metric. If this drops, something is broken." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": {}, + "outputs": [], + "source": [ + "df = run_kql_file(client, \"01-overall-setup-success-rate.kql\")\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": {}, + "source": [ + "---\n", + "## 2. Manager Availability\n", + "\n", + "What tools do users actually have installed? Shows registered vs skipped vs failed per manager." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": {}, + "outputs": [], + "source": [ + "df = run_kql_file(client, \"02-manager-availability.kql\")\n", + "display(df)\n", + "\n", + "if not df.empty:\n", + " melted = df.melt(\n", + " id_vars=\"Manager\",\n", + " value_vars=[\"Registered\", \"Skipped\", \"Failed\"],\n", + " var_name=\"Status\",\n", + " value_name=\"Machines\",\n", + " )\n", + " chart = (\n", + " alt.Chart(melted)\n", + " .mark_bar()\n", + " .encode(\n", + " x=alt.X(\"Manager:N\", sort=\"-y\"),\n", + " y=alt.Y(\"Machines:Q\"),\n", + " color=alt.Color(\"Status:N\", scale=alt.Scale(\n", + " domain=[\"Registered\", \"Skipped\", \"Failed\"],\n", + " range=[\"#4c78a8\", \"#e45756\", \"#f58518\"],\n", + " )),\n", + " tooltip=[\"Manager\", \"Status\", \"Machines\"],\n", + " )\n", + " .properties(width=600, height=300, title=\"Manager Availability\")\n", + " )\n", + " display(chart)" + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": {}, + "source": [ + "---\n", + "## 3. Daily Trend (14 days)\n", + "\n", + "Day-by-day trend of setup success rate. Check after shipping a new version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], + "source": [ + "df = run_kql_file(client, \"03-daily-trend.kql\")\n", + "display(df)\n", + "\n", + "if not df.empty:\n", + " chart = (\n", + " alt.Chart(df)\n", + " .mark_line(point=True)\n", + " .encode(\n", + " x=alt.X(\"Day:T\", title=\"Date\"),\n", + " y=alt.Y(\"SuccessRate:Q\", title=\"Success Rate (%)\", scale=alt.Scale(domain=[0, 100])),\n", + " tooltip=[\"Day:T\", \"SuccessRate:Q\", \"SuccessCount:Q\", \"ErrorCount:Q\"],\n", + " )\n", + " .properties(width=700, height=300, title=\"Daily Setup Success Rate\")\n", + " )\n", + " display(chart)" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": {}, + "source": [ + "---\n", + "## 4. 
Error Type Distribution\n", + "\n", + "Groups all failures by error type across setup and individual managers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": {}, + "outputs": [], + "source": [ + "df = run_kql_file(client, \"04-error-type-distribution.kql\")\n", + "display(df)\n", + "\n", + "if not df.empty:\n", + " chart = (\n", + " alt.Chart(df)\n", + " .mark_bar()\n", + " .encode(\n", + " x=alt.X(\"ErrorType:N\", sort=\"-y\", title=\"Error Type\"),\n", + " y=alt.Y(\"AffectedMachines:Q\", title=\"Affected Machines\"),\n", + " color=\"Source:N\",\n", + " tooltip=[\"ErrorType\", \"Source\", \"EventCount\", \"AffectedMachines\"],\n", + " )\n", + " .properties(width=600, height=300, title=\"Error Type Distribution\")\n", + " )\n", + " display(chart)" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": {}, + "source": [ + "---\n", + "## 5. Hang ↔ Failure Correlation\n", + "\n", + "Do hangs always cause failures, or do some self-recover?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": {}, + "outputs": [], + "source": [ + "df = run_kql_file(client, \"05-hang-failure-correlation.kql\")\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "id": "14", + "metadata": {}, + "source": [ + "---\n", + "## 6. Weekly Health Summary\n", + "\n", + "One-stop query for weekly check. Returns all key numbers in a single row." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": {}, + "outputs": [], + "source": [ + "df = run_kql_file(client, \"06-weekly-health-summary.kql\")\n", + "display(df)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv (3.13.12)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/kusto/initialize.py b/analysis/kusto/initialize.py new file mode 100644 index 00000000..19b5f8ef --- /dev/null +++ b/analysis/kusto/initialize.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Kusto authentication for the telemetry dashboard. + +Prerequisites: + 1. Install: pip install azure-kusto-data + 2. Authenticate: az login +""" + +import shutil + +from azure.kusto.data import KustoClient, KustoConnectionStringBuilder + +CLUSTER = "ddtelvscode.kusto.windows.net" +DATABASE = "VSCodeExt" + + +def initialize(cluster: str = CLUSTER) -> KustoClient: + """Return an authenticated KustoClient. + + Uses Azure CLI authentication if ``az`` is on PATH, otherwise falls back + to interactive browser login. + """ + url = f"https://{cluster}" + if shutil.which("az"): + return KustoClient(KustoConnectionStringBuilder.with_az_cli_authentication(url)) + return KustoClient(KustoConnectionStringBuilder.with_interactive_login(url)) diff --git a/analysis/kusto/query_runner.py b/analysis/kusto/query_runner.py new file mode 100644 index 00000000..8a87050c --- /dev/null +++ b/analysis/kusto/query_runner.py @@ -0,0 +1,77 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +"""Load and execute .kql files against Azure Data Explorer. + +Keeps the existing .kql files as the single source of truth — they can still +be copy-pasted into the ADX web UI, AND this module can run them from Python. +""" + +import pathlib +import re +from typing import List, Tuple + +import pandas as pd +from azure.kusto.data import KustoClient +from azure.kusto.data.helpers import dataframe_from_result_table + +KQL_DIR = pathlib.Path(__file__).parent +DATABASE = "VSCodeExt" + +# Separator pattern used in multi-query files like 00-telemetry-validation.kql +_SECTION_SEP = re.compile(r"^// =====+", re.MULTILINE) + + +def load_kql(filename: str) -> str: + """Read a .kql file and strip all comment-only lines.""" + path = KQL_DIR / filename + lines = path.read_text(encoding="utf-8").splitlines() + return "\n".join(line for line in lines if not line.strip().startswith("//")) + + +def load_kql_sections(filename: str) -> List[Tuple[str, str]]: + """Split a multi-query .kql file into ``(title, query)`` pairs. + + Sections are delimited by ``// ====...====`` separator lines. The first + comment line after a separator is used as the section title. + """ + text = (KQL_DIR / filename).read_text(encoding="utf-8") + raw_sections = _SECTION_SEP.split(text) + + # Each section is either a "header" (comments only) or a "body" (has KQL). + # Headers set the title for the next body section.
+ results: List[Tuple[str, str]] = [] + pending_title: str = "" + for section in raw_sections: + lines = section.strip().splitlines() + if not lines: + continue + comment_lines = [ + ln.lstrip("/ ").strip() for ln in lines if ln.strip().startswith("//") + ] + query_lines = [ + ln for ln in lines if not ln.strip().startswith("//") and ln.strip() + ] + if query_lines: + title = pending_title or (comment_lines[0] if comment_lines else "Untitled") + results.append((title, "\n".join(query_lines))) + pending_title = "" + elif comment_lines: + # Comment-only section — use first line as the title for the next query + pending_title = comment_lines[0] + return results + + +def run_kql( + client: KustoClient, query: str, database: str = DATABASE +) -> pd.DataFrame: + """Execute a KQL query string and return results as a DataFrame.""" + response = client.execute(database, query) + return dataframe_from_result_table(response.primary_results[0]) + + +def run_kql_file( + client: KustoClient, filename: str, database: str = DATABASE +) -> pd.DataFrame: + """Load a .kql file and execute it.""" + return run_kql(client, load_kql(filename), database) diff --git a/analysis/kusto/requirements.txt b/analysis/kusto/requirements.txt new file mode 100644 index 00000000..ac8385bf --- /dev/null +++ b/analysis/kusto/requirements.txt @@ -0,0 +1,4 @@ +azure-kusto-data>=4.0.0 +pandas>=2.0.0 +altair>=5.0.0 +nbstripout>=0.7.0 From c118ccc2531fdd9461494dda8c5e95db9d2bc5bd Mon Sep 17 00:00:00 2001 From: Stella Huang Date: Tue, 31 Mar 2026 16:43:42 -0700 Subject: [PATCH 2/3] update --- analysis/kusto/00-telemetry-validation.kql | 15 +-- analysis/kusto/01-failure-stage-breakdown.kql | 18 +++ .../kusto/02-error-type-x-failure-stage.kql | 20 ++++ .../kusto/03-failure-stage-daily-trend.kql | 19 +++ ....kql => 04-overall-setup-success-rate.kql} | 0 ...bility.kql => 05-manager-availability.kql} | 0 ...{03-daily-trend.kql => 06-daily-trend.kql} | 0 ...ion.kql => 07-error-type-distribution.kql} | 0 ...on.kql 
=> 08-hang-failure-correlation.kql} | 0 ...mmary.kql => 09-weekly-health-summary.kql} | 0 analysis/kusto/dashboard.ipynb | 112 ++++++++++++++---- 11 files changed, 155 insertions(+), 29 deletions(-) create mode 100644 analysis/kusto/01-failure-stage-breakdown.kql create mode 100644 analysis/kusto/02-error-type-x-failure-stage.kql create mode 100644 analysis/kusto/03-failure-stage-daily-trend.kql rename analysis/kusto/{01-overall-setup-success-rate.kql => 04-overall-setup-success-rate.kql} (100%) rename analysis/kusto/{02-manager-availability.kql => 05-manager-availability.kql} (100%) rename analysis/kusto/{03-daily-trend.kql => 06-daily-trend.kql} (100%) rename analysis/kusto/{04-error-type-distribution.kql => 07-error-type-distribution.kql} (100%) rename analysis/kusto/{05-hang-failure-correlation.kql => 08-hang-failure-correlation.kql} (100%) rename analysis/kusto/{06-weekly-health-summary.kql => 09-weekly-health-summary.kql} (100%) diff --git a/analysis/kusto/00-telemetry-validation.kql b/analysis/kusto/00-telemetry-validation.kql index a2f30fce..609fec56 100644 --- a/analysis/kusto/00-telemetry-validation.kql +++ b/analysis/kusto/00-telemetry-validation.kql @@ -6,7 +6,7 @@ // ============================================================================= -// CHECK 0: UniqueMachines with Telemetry Changes +// CHECK 0: UniqueMachines with new telemetry // Shows unique machines per version for builds >= 1.23.10781012 (all stable + pre-release). // Use this to confirm the latest pre-release build has real users. // ============================================================================= @@ -23,9 +23,7 @@ filtered // ============================================================================= -// CHECK 1: Are all 4 events arriving? (3 new + 1 enhanced) -// Expected: at least 1 row per event. If any row shows 0, that event is broken. -// Broken down by extension version so you can confirm the new build is reporting. +// CHECK 1: Are all 4 events arriving? 
(Only versions with ⚠️ NO DATA YET are displayed) // ============================================================================= let allEvents = datatable(EventName: string) [ "ms-python.vscode-python-envs/manager_registration.failed", @@ -53,11 +51,12 @@ allEvents EventCount = coalesce(EventCount, 0), UniqueMachines = coalesce(UniqueMachines, 0), Status = iff(coalesce(EventCount, 0) > 0, "✅ RECEIVING DATA", "⚠️ NO DATA YET") +| where Status == "⚠️ NO DATA YET" | order by EventName asc, ExtVersion desc; // ============================================================================= -// CHECK 2: Enhanced event — does "result" property exist? +// CHECK 2: MANAGER_REGISTRATION_DURATION event success rate // The existing event previously had only "duration". Now it should have "result". // Expected: rows with result = "success" (should be the majority) and maybe a few "error". // If result is empty/"", the property isn't being sent correctly. @@ -81,7 +80,7 @@ RawEventsVSCodeExt // ============================================================================= -// CHECK 3: MANAGER_REGISTRATION.SKIPPED — property validation +// CHECK 3: MANAGER_REGISTRATION.SKIPPED (Percentage of users who don't have each tool) // Expected: managerName in {conda, pyenv, pipenv, poetry}, reason = "tool_not_found" // These should be common — most users don't have all 4 tools.
// ============================================================================= @@ -100,10 +99,12 @@ RawEventsVSCodeExt | extend ExtVersion = tostring(Properties["common.extversion"]) | extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) | where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| extend ManagerName = tostring(Properties.managername), Reason = tostring(Properties.reason) +| where isnotempty(ManagerName) and isnotempty(Reason) | summarize EventCount = count(), UniqueMachines = dcount(VSCodeMachineId) - by ManagerName = tostring(Properties.managername), Reason = tostring(Properties.reason) + by ManagerName, Reason | extend TotalUniqueMachines = totalMachines | extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1) | order by EventCount desc; diff --git a/analysis/kusto/01-failure-stage-breakdown.kql b/analysis/kusto/01-failure-stage-breakdown.kql new file mode 100644 index 00000000..d0416ddf --- /dev/null +++ b/analysis/kusto/01-failure-stage-breakdown.kql @@ -0,0 +1,18 @@ +// Query 1 — Registration Failure Stage Breakdown per Manager (28 days) +// For each manager that failed, shows WHICH stage in the registration flow broke. +// failureStage is hierarchical: "getPipenv:nativeFinderRefresh", "constructCondaSourcingStatus", etc. +// High counts at a specific stage → that code path is the priority fix target. 
+let endDate = startofday(now()-1d); +RawEventsVSCodeExt +| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/manager_registration.failed" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| extend ManagerName = tostring(Properties.managername) +| extend FailureStage = tostring(Properties.failurestage) +| summarize + FailureCount = count(), + AffectedMachines = dcount(VSCodeMachineId) + by ManagerName, FailureStage +| order by ManagerName asc, AffectedMachines desc diff --git a/analysis/kusto/02-error-type-x-failure-stage.kql b/analysis/kusto/02-error-type-x-failure-stage.kql new file mode 100644 index 00000000..69e58fdf --- /dev/null +++ b/analysis/kusto/02-error-type-x-failure-stage.kql @@ -0,0 +1,20 @@ +// Query 2 — Error Type × Failure Stage Matrix (28 days) +// Cross-tabulates errorType (what kind of error) with failureStage (where it happened). +// This is the key diagnostic view: e.g., "connection_error at nativeFinderRefresh" means +// PET process dies during native finder, while "tool_not_found at pathLookup" means the +// tool binary wasn't on PATH. Prioritize cells with the highest AffectedMachines. 
+let endDate = startofday(now()-1d); +RawEventsVSCodeExt +| where ServerTimestamp > endDate-28d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/manager_registration.failed" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| extend ManagerName = tostring(Properties.managername) +| extend ErrorType = tostring(Properties.errortype) +| extend FailureStage = tostring(Properties.failurestage) +| summarize + FailureCount = count(), + AffectedMachines = dcount(VSCodeMachineId) + by ManagerName, ErrorType, FailureStage +| order by AffectedMachines desc, ManagerName asc diff --git a/analysis/kusto/03-failure-stage-daily-trend.kql b/analysis/kusto/03-failure-stage-daily-trend.kql new file mode 100644 index 00000000..9413064a --- /dev/null +++ b/analysis/kusto/03-failure-stage-daily-trend.kql @@ -0,0 +1,19 @@ +// Query 3 — Daily Registration Failures by Manager and Stage (14 days) +// Day-by-day failure counts per manager + failureStage combination. +// Use this to detect if a specific stage started failing more after a release. +// A spike in one stage on a specific day → regression in that code path. 
+let endDate = startofday(now()-1d); +RawEventsVSCodeExt +| where ServerTimestamp > endDate-14d and ServerTimestamp < endDate +| where EventName == "ms-python.vscode-python-envs/manager_registration.failed" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| extend Day = startofday(ServerTimestamp) +| extend ManagerName = tostring(Properties.managername) +| extend FailureStage = tostring(Properties.failurestage) +| summarize + FailureCount = count(), + AffectedMachines = dcount(VSCodeMachineId) + by Day, ManagerName, FailureStage +| order by Day asc, ManagerName asc, AffectedMachines desc diff --git a/analysis/kusto/01-overall-setup-success-rate.kql b/analysis/kusto/04-overall-setup-success-rate.kql similarity index 100% rename from analysis/kusto/01-overall-setup-success-rate.kql rename to analysis/kusto/04-overall-setup-success-rate.kql diff --git a/analysis/kusto/02-manager-availability.kql b/analysis/kusto/05-manager-availability.kql similarity index 100% rename from analysis/kusto/02-manager-availability.kql rename to analysis/kusto/05-manager-availability.kql diff --git a/analysis/kusto/03-daily-trend.kql b/analysis/kusto/06-daily-trend.kql similarity index 100% rename from analysis/kusto/03-daily-trend.kql rename to analysis/kusto/06-daily-trend.kql diff --git a/analysis/kusto/04-error-type-distribution.kql b/analysis/kusto/07-error-type-distribution.kql similarity index 100% rename from analysis/kusto/04-error-type-distribution.kql rename to analysis/kusto/07-error-type-distribution.kql diff --git a/analysis/kusto/05-hang-failure-correlation.kql b/analysis/kusto/08-hang-failure-correlation.kql similarity index 100% rename from analysis/kusto/05-hang-failure-correlation.kql rename to analysis/kusto/08-hang-failure-correlation.kql diff --git 
a/analysis/kusto/06-weekly-health-summary.kql b/analysis/kusto/09-weekly-health-summary.kql similarity index 100% rename from analysis/kusto/06-weekly-health-summary.kql rename to analysis/kusto/09-weekly-health-summary.kql diff --git a/analysis/kusto/dashboard.ipynb b/analysis/kusto/dashboard.ipynb index 88fbbcaf..278a46b8 100644 --- a/analysis/kusto/dashboard.ipynb +++ b/analysis/kusto/dashboard.ipynb @@ -73,9 +73,10 @@ "metadata": {}, "source": [ "---\n", - "## 1. Overall Setup Success Rate (28 days)\n", + "## 1. Registration Failure Stage Breakdown (28 days)\n", "\n", - "Top-level health metric. If this drops, something is broken." + "For each manager that failed, shows **which stage** in the registration flow broke.\n", + "`failureStage` is hierarchical (e.g. `getPipenv:nativeFinderRefresh`). High counts at a specific stage = priority fix target." ] }, { @@ -85,7 +86,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = run_kql_file(client, \"01-overall-setup-success-rate.kql\")\n", + "df = run_kql_file(client, \"01-failure-stage-breakdown.kql\")\n", "display(df)" ] }, @@ -95,9 +96,10 @@ "metadata": {}, "source": [ "---\n", - "## 2. Manager Availability\n", + "## 2. Error Type × Failure Stage Matrix (28 days)\n", "\n", - "What tools do users actually have installed? Shows registered vs skipped vs failed per manager." + "Cross-tabulates **error type** (what kind of error) with **failure stage** (where it happened).\n", + "This is the key diagnostic view — e.g. `connection_error` at `nativeFinderRefresh` means PET died during native finder." ] }, { @@ -107,7 +109,73 @@ "metadata": {}, "outputs": [], "source": [ - "df = run_kql_file(client, \"02-manager-availability.kql\")\n", + "df = run_kql_file(client, \"02-error-type-x-failure-stage.kql\")\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": {}, + "source": [ + "---\n", + "## 3. 
Daily Registration Failures by Manager & Stage (14 days)\n", + "\n", + "Day-by-day failure counts per manager + stage. A spike in one stage on a specific day = regression in that code path." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], + "source": [ + "df = run_kql_file(client, \"03-failure-stage-daily-trend.kql\")\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": {}, + "source": [ + "---\n", + "## 4. Overall Setup Success Rate (28 days)\n", + "\n", + "Top-level health metric. If this drops, something is broken." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": {}, + "outputs": [], + "source": [ + "df = run_kql_file(client, \"04-overall-setup-success-rate.kql\")\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": {}, + "source": [ + "---\n", + "## 5. Manager Availability\n", + "\n", + "What tools do users actually have installed? Shows registered vs skipped vs failed per manager." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": {}, + "outputs": [], + "source": [ + "df = run_kql_file(client, \"05-manager-availability.kql\")\n", "display(df)\n", "\n", "if not df.empty:\n", @@ -136,11 +204,11 @@ }, { "cell_type": "markdown", - "id": "8", + "id": "14", "metadata": {}, "source": [ "---\n", - "## 3. Daily Trend (14 days)\n", + "## 6. Daily Trend (14 days)\n", "\n", "Day-by-day trend of setup success rate. Check after shipping a new version." 
] @@ -148,11 +216,11 @@ { "cell_type": "code", "execution_count": null, - "id": "9", + "id": "15", "metadata": {}, "outputs": [], "source": [ - "df = run_kql_file(client, \"03-daily-trend.kql\")\n", + "df = run_kql_file(client, \"06-daily-trend.kql\")\n", "display(df)\n", "\n", "if not df.empty:\n", @@ -171,11 +239,11 @@ }, { "cell_type": "markdown", - "id": "10", + "id": "16", "metadata": {}, "source": [ "---\n", - "## 4. Error Type Distribution\n", + "## 7. Error Type Distribution\n", "\n", "Groups all failures by error type across setup and individual managers." ] @@ -183,11 +251,11 @@ { "cell_type": "code", "execution_count": null, - "id": "11", + "id": "17", "metadata": {}, "outputs": [], "source": [ - "df = run_kql_file(client, \"04-error-type-distribution.kql\")\n", + "df = run_kql_file(client, \"07-error-type-distribution.kql\")\n", "display(df)\n", "\n", "if not df.empty:\n", @@ -207,11 +275,11 @@ }, { "cell_type": "markdown", - "id": "12", + "id": "18", "metadata": {}, "source": [ "---\n", - "## 5. Hang ↔ Failure Correlation\n", + "## 8. Hang ↔ Failure Correlation\n", "\n", "Do hangs always cause failures, or do some self-recover?" ] @@ -219,21 +287,21 @@ { "cell_type": "code", "execution_count": null, - "id": "13", + "id": "19", "metadata": {}, "outputs": [], "source": [ - "df = run_kql_file(client, \"05-hang-failure-correlation.kql\")\n", + "df = run_kql_file(client, \"08-hang-failure-correlation.kql\")\n", "display(df)" ] }, { "cell_type": "markdown", - "id": "14", + "id": "20", "metadata": {}, "source": [ "---\n", - "## 6. Weekly Health Summary\n", + "## 9. Weekly Health Summary\n", "\n", "One-stop query for weekly check. Returns all key numbers in a single row." 
] @@ -241,11 +309,11 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "21", "metadata": {}, "outputs": [], "source": [ - "df = run_kql_file(client, \"06-weekly-health-summary.kql\")\n", + "df = run_kql_file(client, \"09-weekly-health-summary.kql\")\n", "display(df)" ] } From c69a893f55d622f606f16ebee6e902ef2e8c0c42 Mon Sep 17 00:00:00 2001 From: Stella Huang Date: Tue, 31 Mar 2026 17:35:21 -0700 Subject: [PATCH 3/3] update --- analysis/kusto/00-telemetry-validation.kql | 60 +++++++++++++++++++++- analysis/kusto/dashboard.ipynb | 57 +++++++++++++------- 2 files changed, 97 insertions(+), 20 deletions(-) diff --git a/analysis/kusto/00-telemetry-validation.kql b/analysis/kusto/00-telemetry-validation.kql index 609fec56..aec7c5ad 100644 --- a/analysis/kusto/00-telemetry-validation.kql +++ b/analysis/kusto/00-telemetry-validation.kql @@ -144,10 +144,68 @@ RawEventsVSCodeExt UniqueMachines = dcount(VSCodeMachineId) by ManagerName = tostring(Properties.managername), ErrorType = tostring(Properties.errortype) | extend TotalUniqueMachines = totalMachines -| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 1) +| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalUniqueMachines) * 100, 2) | order by EventCount desc; +// ============================================================================= +// CHECK 4a: SPAWN_TIMEOUT failures broken down by manager × extension version +// Shows whether spawn_timeout is improving or worsening across versions. +// If a new version shows higher MachinePct → that release regressed timeout handling. 
+// ============================================================================= +let totalByVersion = RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where ExtensionName == "ms-python.vscode-python-envs" +| where ExtensionVersion != "" +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize TotalMachines = dcount(VSCodeMachineId) by ExtVersion = ExtensionVersion; +RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where EventName == "ms-python.vscode-python-envs/manager_registration.failed" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| where tostring(Properties.errortype) == "spawn_timeout" +| summarize + EventCount = count(), + UniqueMachines = dcount(VSCodeMachineId) + by ManagerName = tostring(Properties.managername), ExtVersion +| join kind=inner totalByVersion on ExtVersion +| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalMachines) * 100, 2) +| project ManagerName, ExtVersion, EventCount, UniqueMachines, TotalMachines, MachinePct +| order by ManagerName asc, ExtVersion desc; + + +// ============================================================================= +// CHECK 4b: UNKNOWN failures broken down by manager × extension version +// Shows whether unknown errors are improving or worsening across versions. +// High counts in the latest version → new unclassified error paths need investigation. 
+// ============================================================================= +let totalByVersion = RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where ExtensionName == "ms-python.vscode-python-envs" +| where ExtensionVersion != "" +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtensionVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtensionVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| summarize TotalMachines = dcount(VSCodeMachineId) by ExtVersion = ExtensionVersion; +RawEventsVSCodeExt +| where ServerTimestamp > ago(7d) +| where EventName == "ms-python.vscode-python-envs/manager_registration.failed" +| extend ExtVersion = tostring(Properties["common.extversion"]) +| extend _minor = toint(extract("^1\\.(\\d+)", 1, ExtVersion)), _patch = tolong(extract("^1\\.\\d+\\.(\\d+)", 1, ExtVersion)) +| where _minor > 23 or (_minor == 23 and _patch >= 10781012) +| where tostring(Properties.errortype) == "unknown" +| summarize + EventCount = count(), + UniqueMachines = dcount(VSCodeMachineId) + by ManagerName = tostring(Properties.managername), ExtVersion +| join kind=inner totalByVersion on ExtVersion +| extend MachinePct = round(todouble(UniqueMachines) / todouble(TotalMachines) * 100, 2) +| project ManagerName, ExtVersion, EventCount, UniqueMachines, TotalMachines, MachinePct +| order by ManagerName asc, ExtVersion desc; + + // ============================================================================= // CHECK 5: SETUP.HANG_DETECTED — property validation // Expected: failureStage in {nativeFinder, managerRegistration, envSelection, terminalWatcher, settingsListener} diff --git a/analysis/kusto/dashboard.ipynb b/analysis/kusto/dashboard.ipynb index 278a46b8..378a44f7 100644 --- a/analysis/kusto/dashboard.ipynb +++ b/analysis/kusto/dashboard.ipynb @@ -56,7 +56,8 @@ "metadata": {}, "outputs": [], "source": [ - "for title, query in load_kql_sections(\"00-telemetry-validation.kql\"):\n", + "sections = 
load_kql_sections(\"00-telemetry-validation.kql\")\n", + "for title, query in sections[:4]:\n", " display(Markdown(f\"### {title}\"))\n", " try:\n", " df = run_kql(client, query)\n", @@ -68,9 +69,27 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "id": "4", "metadata": {}, + "outputs": [], + "source": [ + "for title, query in sections[4:]:\n", + " display(Markdown(f\"### {title}\"))\n", + " try:\n", + " df = run_kql(client, query)\n", + " display(df)\n", + " if \"TotalUniqueMachines\" in df.columns:\n", + " print(f\"Total unique machines: {df['TotalUniqueMachines'].iloc[0]:,}\")\n", + " except Exception as e:\n", + " print(f\" ⚠️ {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "5", + "metadata": {}, "source": [ "---\n", "## 1. Registration Failure Stage Breakdown (28 days)\n", @@ -82,7 +101,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5", + "id": "6", "metadata": {}, "outputs": [], "source": [ @@ -92,7 +111,7 @@ }, { "cell_type": "markdown", - "id": "6", + "id": "7", "metadata": {}, "source": [ "---\n", @@ -105,7 +124,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7", + "id": "8", "metadata": {}, "outputs": [], "source": [ @@ -115,7 +134,7 @@ }, { "cell_type": "markdown", - "id": "8", + "id": "9", "metadata": {}, "source": [ "---\n", @@ -127,7 +146,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9", + "id": "10", "metadata": {}, "outputs": [], "source": [ @@ -137,7 +156,7 @@ }, { "cell_type": "markdown", - "id": "10", + "id": "11", "metadata": {}, "source": [ "---\n", @@ -149,7 +168,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11", + "id": "12", "metadata": {}, "outputs": [], "source": [ @@ -159,7 +178,7 @@ }, { "cell_type": "markdown", - "id": "12", + "id": "13", "metadata": {}, "source": [ "---\n", @@ -171,7 +190,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13", + "id": "14", "metadata": {}, "outputs": [], "source": [ @@ -204,7 +223,7 @@ 
}, { "cell_type": "markdown", - "id": "14", + "id": "15", "metadata": {}, "source": [ "---\n", @@ -216,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "16", "metadata": {}, "outputs": [], "source": [ @@ -239,7 +258,7 @@ }, { "cell_type": "markdown", - "id": "16", + "id": "17", "metadata": {}, "source": [ "---\n", @@ -251,7 +270,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "18", "metadata": {}, "outputs": [], "source": [ @@ -275,7 +294,7 @@ }, { "cell_type": "markdown", - "id": "18", + "id": "19", "metadata": {}, "source": [ "---\n", @@ -287,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19", + "id": "20", "metadata": {}, "outputs": [], "source": [ @@ -297,7 +316,7 @@ }, { "cell_type": "markdown", - "id": "20", + "id": "21", "metadata": {}, "source": [ "---\n", @@ -309,7 +328,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21", + "id": "22", "metadata": {}, "outputs": [], "source": [