diff --git a/.gitignore b/.gitignore index f618cda..498ce89 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ # Build tree /build + +# Python junk +__pycache__ +*.py[co] diff --git a/README.md b/README.md index 7d0a7f7..151d804 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ The archive may include: - Kernel logs (`dmesg`), systemd unit list, optional systemd journal - Network configuration (interfaces, routes, iptables rules) - Edera Protect daemon configuration (`/var/lib/edera/protect/daemon.toml`) +- Edera Protect zone list and per-zone logs (`protect-ctl zone list`, `protect-ctl zone logs`) **Please inspect the ZIP contents yourself** to ensure you are comfortable with the data before sending it to Edera. You can open it with any ZIP tool. @@ -46,12 +47,13 @@ Optional Privacy Flags ----------------------------------------------------------------------- You may exclude certain data if desired: - --no-acpi Skip ACPI tables - --no-dmi Skip DMI/SMBIOS data - --no-journal Skip systemd journal logs for the whole system - --no-network Skip all network configuration - --no-systemd-units Skip 'systemctl list-units` (all unit state) - --no-unit-journal Skip systemd journal logs for Edera-specific systemd units + --no-acpi Skip ACPI tables + --no-dmi Skip DMI/SMBIOS data + --no-journal Skip systemd journal logs for the whole system + --no-network Skip all network configuration + --no-protect-zone-logs Skip Edera Protect zone list and per-zone logs + --no-systemd-units Skip 'systemctl list-units` (all unit state) + --no-unit-journal Skip systemd journal logs for Edera-specific systemd units Example: diff --git a/edera-debug-report b/edera-debug-report index 6edd88c..b576cd7 100644 --- a/edera-debug-report +++ b/edera-debug-report @@ -12,9 +12,11 @@ Notes: import argparse import datetime as _dt import io +import json import os import shlex import shutil +import signal import subprocess import sys import tempfile @@ -22,7 +24,7 @@ import threading import zipfile from dataclasses import dataclass from pathlib import Path -from typing import List, Optional, Tuple, Iterable +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple # --------------------------- config --------------------------- @@ -35,6 +37,7 @@ class Config: unit_journal: bool = True network: bool = True systemctl: bool = True + protect_zone_logs: bool = True # ------------------------ env / basics ------------------------ @@ -52,6 +55,13 @@ def which(cmd: str) -> Optional[str]: return shutil.which(cmd) +def _progress(msg: str) -> None: + """Single-line progress to stderr. When something hangs, the last line + printed identifies the offending command.""" + if os.environ.get("EDR_DEBUG", "0") == "1": + print(f"[edr] {msg}", file=sys.stderr, flush=True) + + def stable_env() -> dict: env = os.environ.copy() for k in ( @@ -79,6 +89,154 @@ def set_environment() -> None: os.chdir(scriptDir) +# ------------------------ subprocess timeout helpers ------------------------ +# +# This tool runs in possibly-broken environments (a wedged protect-daemon, +# a stuck hypervisor, a hung xenstored, etc.). To keep collection bounded +# we run risky commands in their own process group and use a Python-side +# watchdog (TERM, then KILL after a grace period) to signal the whole +# group via os.killpg. The grouping matters because tools like protect-ctl +# may spawn helpers; killing only the parent could leave grandchildren +# holding the stdout pipe open and hanging our read loop. +# +# Side effect of start_new_session=True: the child no longer shares our +# foreground process group, so terminal-delivered SIGINT/SIGTERM/SIGHUP +# stop reaching it automatically. We compensate by tracking active +# child pgids and installing top-level signal handlers that forward to +# every live group before re-raising default behavior in the parent. + + +_active_pgids: Set[int] = set() +_active_pgids_lock = threading.Lock() +_signal_handlers_installed = False +_orig_signal_handlers: Dict[int, Any] = {} + + +def _signal_forwarder(signum, frame): + """Forward SIGINT/SIGTERM/SIGHUP to every tracked child pgid, then + restore the prior handler and re-raise so the parent dies normally.""" + with _active_pgids_lock: + pgids = list(_active_pgids) + for pgid in pgids: + try: + os.killpg(pgid, signal.SIGTERM) + except (ProcessLookupError, PermissionError, OSError): + pass + prior = _orig_signal_handlers.get(signum, signal.SIG_DFL) + try: + signal.signal(signum, prior) + except (TypeError, ValueError, OSError): + signal.signal(signum, signal.SIG_DFL) + os.kill(os.getpid(), signum) + + +def _install_signal_forwarders_once() -> None: + global _signal_handlers_installed + if _signal_handlers_installed: + return + for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGHUP): + try: + _orig_signal_handlers[sig] = signal.signal(sig, _signal_forwarder) + except (ValueError, OSError): + # signal.signal is main-thread-only on some platforms; if we + # can't install, the rest of the timeout machinery still works. + pass + _signal_handlers_installed = True + + +def _track_pgid(pgid: int) -> None: + with _active_pgids_lock: + _active_pgids.add(pgid) + + +def _untrack_pgid(pgid: int) -> None: + with _active_pgids_lock: + _active_pgids.discard(pgid) + + +@dataclass +class TimeoutResult: + rc: int + stdout: bytes + stderr: bytes + timed_out: bool + + +def run_with_timeout( + cmd: List[str], + *, + timeout_s: float, + kill_after_s: float = 1.0, + env: Optional[dict] = None, +) -> TimeoutResult: + """ + Run a command with a SIGTERM-then-SIGKILL watchdog and process-group + isolation. Captures stdout and stderr in memory. + + Use this for short, captured-output commands where the producer might + be hung talking to a wedged daemon/hypervisor. For streaming output + into a ZIP entry, use ZipArchiveWriter.add_stream_from_proc(timeout_s=). + """ + _install_signal_forwarders_once() + + cmd_str = shlex.join(cmd) + _progress(f"running: {cmd_str}") + + p = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env if env is not None else stable_env(), + start_new_session=True, + ) + + try: + pgid = os.getpgid(p.pid) + except ProcessLookupError: + # Child exited before we could read its pgid. + stdout, stderr = p.communicate() + return TimeoutResult( + rc=p.returncode, stdout=stdout or b"", stderr=stderr or b"", timed_out=False + ) + + _track_pgid(pgid) + timed_out = [False] + + def _term() -> None: + timed_out[0] = True + _progress(f"timeout ({timeout_s}s): SIGTERM -> {cmd_str}") + try: + os.killpg(pgid, signal.SIGTERM) + except (ProcessLookupError, PermissionError, OSError): + pass + + def _kill() -> None: + _progress(f"timeout ({timeout_s + kill_after_s}s): SIGKILL -> {cmd_str}") + try: + os.killpg(pgid, signal.SIGKILL) + except (ProcessLookupError, PermissionError, OSError): + pass + + term_timer = threading.Timer(timeout_s, _term) + kill_timer = threading.Timer(timeout_s + kill_after_s, _kill) + term_timer.start() + kill_timer.start() + + try: + stdout, stderr = p.communicate() + finally: + term_timer.cancel() + kill_timer.cancel() + _untrack_pgid(pgid) + + return TimeoutResult( + rc=p.returncode, + stdout=stdout or b"", + stderr=stderr or b"", + timed_out=timed_out[0], + ) + + # ------------------------ ZIP writer ------------------------ @@ -162,34 +320,86 @@ class ZipArchiveWriter: pipe_zstd: bool = False, zstd_level: int = 13, chunk: int = 1024 * 1024, - ) -> Tuple[int, str]: + timeout_s: Optional[float] = None, + kill_after_s: float = 1.0, + ) -> Tuple[int, str, bool]: """ Run a command (optionally piped through zstd) and stream stdout to a ZIP entry. - Returns (rc_of_last, combined_stderr_text). + Returns (rc_of_last, combined_stderr_text, timed_out). Uses DEFLATE unless pipe_zstd=True and zstd is available, in which case the entry is STORED. + + If timeout_s is set, the producer (first pipeline stage) runs in its + own session/process group. A watchdog sends SIGTERM at timeout_s and + SIGKILL at timeout_s + kill_after_s. Killing the producer closes its + stdout, which propagates EOF down the pipeline and unblocks the read + loop here without further intervention. """ + cmd_str = shlex.join(cmd) + _progress(f"running: {cmd_str}") + pipeline: List[List[str]] = [cmd] stored = False if pipe_zstd and which("zstd"): pipeline.append(["zstd", f"-{int(zstd_level)}", "-T0", "-c"]) stored = True # avoid double-compressing the zstd stream - # Build the pipeline + # Build the pipeline. Only the producer (stage 0) runs in a new + # session when a timeout is requested — downstream stages stay in + # our session so they die with us on parent SIGINT. procs: List[subprocess.Popen] = [] prev_stdout = None + producer_pgid: Optional[int] = None for stage, c in enumerate(pipeline): + new_session = stage == 0 and timeout_s is not None p = subprocess.Popen( c, stdin=prev_stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=stable_env(), + start_new_session=new_session, ) + if new_session: + try: + producer_pgid = os.getpgid(p.pid) + _track_pgid(producer_pgid) + except ProcessLookupError: + producer_pgid = None if prev_stdout is not None: prev_stdout.close() # allow SIGPIPE propagation prev_stdout = p.stdout # type: ignore[assignment] procs.append(p) + # Watchdog: TERM-then-KILL the producer's process group on timeout. + timed_out = [False] + term_timer: Optional[threading.Timer] = None + kill_timer: Optional[threading.Timer] = None + if timeout_s is not None and producer_pgid is not None: + _install_signal_forwarders_once() + pg = producer_pgid + + def _term() -> None: + timed_out[0] = True + _progress(f"timeout ({timeout_s}s): SIGTERM -> {cmd_str}") + try: + os.killpg(pg, signal.SIGTERM) + except (ProcessLookupError, PermissionError, OSError): + pass + + def _kill() -> None: + _progress( + f"timeout ({timeout_s + kill_after_s}s): SIGKILL -> {cmd_str}" + ) + try: + os.killpg(pg, signal.SIGKILL) + except (ProcessLookupError, PermissionError, OSError): + pass + + term_timer = threading.Timer(timeout_s, _term) + kill_timer = threading.Timer(timeout_s + kill_after_s, _kill) + term_timer.start() + kill_timer.start() + errs: List[Tuple[str, str]] = [] # Drain stderr concurrently to avoid deadlocks @@ -228,12 +438,20 @@ class ZipArchiveWriter: # Wait for processes and stderr drain rc = 0 - for p in procs: - p.wait() - rc = p.returncode # rc of last will overwrite earlier rc - - for t in threads: - t.join(timeout=1.0) + try: + for p in procs: + p.wait() + rc = p.returncode # rc of last will overwrite earlier rc + + for t in threads: + t.join(timeout=1.0) + finally: + if term_timer is not None: + term_timer.cancel() + if kill_timer is not None: + kill_timer.cancel() + if producer_pgid is not None: + _untrack_pgid(producer_pgid) # Build combined stderr (preserve stage order) combined_err = "" @@ -247,15 +465,16 @@ class ZipArchiveWriter: for cmd in pipeline: cmds.append(" ".join([shlex.quote(x) for x in cmd])) cmd_str = " | ".join(cmds) + status = "TIMEOUT" if timed_out[0] else "OK" log.append( - f"OK: {cmd_str} -> {arcname} ({total} bytes, {'STORED' if stored else 'DEFLATE'})" + f"{status}: {cmd_str} -> {arcname} ({total} bytes, {'STORED' if stored else 'DEFLATE'})" ) for i, c in enumerate(pipeline): log.append( f" {'OK' if procs[i].returncode == 0 else f'FAIL({procs[i].returncode})'}: {' '.join(shlex.quote(x) for x in c)}" ) - return rc, combined_err + return rc, combined_err, timed_out[0] # ------------------------ helpers for names/compression ------------------------ @@ -386,17 +605,27 @@ def run_and_write( pipe_zstd: bool = False, zstd_level: int = 13, stderr_sidecar: bool = True, + timeout_s: Optional[float] = None, + kill_after_s: float = 1.0, ) -> None: try: - rc, err = aw.add_stream_from_proc( - log, arcname, cmd, pipe_zstd=pipe_zstd, zstd_level=zstd_level + rc, err, timed_out = aw.add_stream_from_proc( + log, + arcname, + cmd, + pipe_zstd=pipe_zstd, + zstd_level=zstd_level, + timeout_s=timeout_s, + kill_after_s=kill_after_s, ) except FileNotFoundError: log.append(f"WARN: {cmd[0]}: not found, skipping") return if stderr_sidecar and err.strip(): aw.add_text(log, arcname + ".stderr.txt", err, stored=False) - if rc != 0: + if timed_out: + log.append(f"TIMEOUT({timeout_s}s): {shlex.join(cmd)} -> {arcname}") + elif rc != 0: log.append(f"FAIL({rc}): {shlex.join(cmd)} -> {arcname}") @@ -530,7 +759,7 @@ def run_json_then_fallback_text( Writes stderr sidecars when present (for both attempts). """ try: - rc, err = aw.add_stream_from_proc(log, arc_json, json_cmd) + rc, err, _ = aw.add_stream_from_proc(log, arc_json, json_cmd) except FileNotFoundError: log.append(f"WARN: {json_cmd[0]}: not found, skipping") return @@ -540,7 +769,7 @@ def run_json_then_fallback_text( return # Fallback to text try: - rc2, err2 = aw.add_stream_from_proc(log, arc_text, text_cmd) + rc2, err2, _ = aw.add_stream_from_proc(log, arc_text, text_cmd) except FileNotFoundError: log.append(f"WARN: {text_cmd[0]}: not found, skipping") return @@ -671,6 +900,102 @@ def collect_network( run_and_write(log, aw, f"{base}/ethtool_i.txt", [etht, "-i", nic]) +# ------------------------ Edera Protect zone logs ------------------------ + + +def collect_protect_zone_logs( + log: List[str], + aw: ZipArchiveWriter, + top_name: str, +) -> None: + """ + Collect `protect-ctl zone list -o json` and per-zone `protect-ctl zone logs`. + + Both calls are bounded by the in-process timeout helper (10s SIGTERM, then + SIGKILL after 1s grace), since protect-daemon being wedged is exactly the + kind of failure this report is meant to capture. Output goes under + `{top_name}/protect/`. Should run before journal collection so the journal + capture includes any activity from these calls. + """ + protect_ctl = which("protect-ctl") + if not protect_ctl: + log.append("INFO: protect-ctl not found; skipping protect zone log collection") + return + + arc_dir = f"{top_name}/protect" + + # zone list -o json: capture stdout in memory so we can both archive it + # and parse it for zone IDs. + zone_list_arc = f"{arc_dir}/zone-list.json" + try: + r = run_with_timeout( + [protect_ctl, "zone", "list", "-o", "json"], + timeout_s=10.0, + ) + except FileNotFoundError: + log.append("WARN: protect-ctl: not found, skipping zone log collection") + return + except Exception as e: + log.append(f"FAIL: protect-ctl zone list: {e}") + return + + if r.stdout: + write_bytes(log, aw, zone_list_arc, r.stdout, stored=False) + if r.stderr: + aw.add_text( + log, + zone_list_arc + ".stderr.txt", + r.stderr.decode("utf-8", errors="replace"), + stored=False, + ) + if r.timed_out: + log.append("FAIL: protect-ctl zone list timed out; cannot enumerate zones") + return + if r.rc != 0: + log.append(f"FAIL({r.rc}): protect-ctl zone list; cannot enumerate zones") + return + if not r.stdout: + log.append( + "FAIL: protect-ctl zone list produced empty output; cannot enumerate zones" + ) + return + + try: + zone_data = json.loads(r.stdout.decode("utf-8", errors="replace")) + zones = [z["id"] for z in zone_data] + except Exception as e: + log.append(f"FAIL: could not parse zone list JSON: {e}") + return + + if not zones: + log.append("INFO: no zones found") + return + + log.append(f"INFO: found {len(zones)} zone(s): {', '.join(zones)}") + + for zone in zones: + zone_log_arc = f"{arc_dir}/zone-{zone}.log" + try: + rc, err, timed_out = aw.add_stream_from_proc( + log, + zone_log_arc, + [protect_ctl, "zone", "logs", zone], + timeout_s=10.0, + kill_after_s=1.0, + ) + except FileNotFoundError: + log.append( + f"WARN: protect-ctl not found while collecting zone {zone}; skipping" + ) + continue + if err.strip(): + aw.add_text(log, zone_log_arc + ".stderr.txt", err, stored=False) + if timed_out: + log.append(f"WARN: zone {zone}: log collection timed out") + elif rc != 0: + log.append(f"FAIL({rc}): protect-ctl zone logs {zone}") + + # ------------------------ high-level collection ------------------------ @@ -695,6 +1020,13 @@ def collect_all( # dmesg run_and_write(log, aw, f"{top_name}/dmesg.txt", ["dmesg"]) + # Edera Protect zone logs: gather BEFORE journal so the journal capture + # picks up any activity these calls trigger. + if cfg.protect_zone_logs: + collect_protect_zone_logs(log, aw, top_name) + else: + log.append("SKIP: protect zone logs skipped at user request") + if which("journalctl"): use_zstd = bool(which("zstd")) if cfg.journal: @@ -711,6 +1043,7 @@ def collect_all( ["journalctl", "--system", "-n", "40000", "-b", "-0", "--output=json"], pipe_zstd=use_zstd, zstd_level=13, + timeout_s=300.0, ) else: log.append("SKIP: systemd journal skipped at user request") @@ -742,6 +1075,8 @@ def collect_all( [ "journalctl", "--system", + "-n", + "10000", "-b", "-0", "-u", @@ -750,6 +1085,7 @@ def collect_all( ], pipe_zstd=use_zstd, zstd_level=13, + timeout_s=120.0, ) else: log.append("SKIP: systemd unit journal skipped at user request") @@ -775,19 +1111,28 @@ def collect_all( if detect_xen_dom0(log): xl_bin = which("xl") if xl_bin: - run_and_write(log, aw, f"{top_name}/xl_dmesg.txt", [xl_bin, "dmesg"]) - run_and_write(log, aw, f"{top_name}/xl_info.txt", [xl_bin, "info"]) + run_and_write( + log, aw, f"{top_name}/xl_dmesg.txt", [xl_bin, "dmesg"], timeout_s=30.0 + ) + run_and_write( + log, aw, f"{top_name}/xl_info.txt", [xl_bin, "info"], timeout_s=30.0 + ) else: prot = which("protect") if prot: run_and_write( - log, aw, f"{top_name}/hv_console.txt", [prot, "host", "hv-console"] + log, + aw, + f"{top_name}/hv_console.txt", + [prot, "host", "hv-console"], + timeout_s=30.0, ) run_and_write( log, aw, f"{top_name}/hv_debug_info.txt", [prot, "host", "hv-debug-info"], + timeout_s=30.0, ) else: log.append( @@ -798,7 +1143,11 @@ def collect_all( xenstore_ls_bin = which("xenstore-ls") if xenstore_ls_bin: run_and_write( - log, aw, f"{top_name}/xenstore_ls.txt", [xenstore_ls_bin, "-f"] + log, + aw, + f"{top_name}/xenstore_ls.txt", + [xenstore_ls_bin, "-f"], + timeout_s=30.0, ) else: log.append("SKIP: xenstore-ls not found, skipping dump of xenstore state") @@ -810,7 +1159,13 @@ def collect_all( dmidecode_bin = which("dmidecode") if dmidecode_bin: # full dmidecode text (stderr is used for warnings; capture sidecar) - run_and_write(log, aw, f"{top_name}/dmidecode.txt", [dmidecode_bin]) + run_and_write( + log, + aw, + f"{top_name}/dmidecode.txt", + [dmidecode_bin], + timeout_s=30.0, + ) # dump-bin: needs a filename; use a temp file and add it with tempfile.TemporaryDirectory(prefix="dbg-dmi-") as td: tmp = Path(td) / "dmi.dump" @@ -819,6 +1174,7 @@ def collect_all( aw, f"{top_name}/dmi-bin.txt", [dmidecode_bin, f"--dump-bin={tmp}"], + timeout_s=30.0, ) if tmp.exists(): copy_file(log, aw, tmp, f"{top_name}/dmi.dump", stored=False) @@ -866,7 +1222,7 @@ def collect_all( run_and_write(log, aw, f"{top_name}/free_m.txt", ["free", "-m"]) # lsusb -v - run_and_write(log, aw, f"{top_name}/lsusb_v.txt", ["lsusb", "-v"]) + run_and_write(log, aw, f"{top_name}/lsusb_v.txt", ["lsusb", "-v"], timeout_s=30.0) # sysctl -a run_and_write(log, aw, f"{top_name}/sysctl.txt", ["sysctl", "-a"]) @@ -948,6 +1304,12 @@ def main(argv: Optional[List[str]] = None) -> int: action="store_true", default=False, ) + ap.add_argument( + "--no-protect-zone-logs", + help="Do not include Edera Protect zone list and per-zone logs in the debug report", + action="store_true", + default=False, + ) ap.add_argument( "--name", help="Top-level directory name inside archive (defaults to archive base name).", @@ -967,6 +1329,7 @@ def main(argv: Optional[List[str]] = None) -> int: unit_journal=not args.no_unit_journal, network=not args.no_network, systemctl=not args.no_systemd_units, + protect_zone_logs=not args.no_protect_zone_logs, ) set_environment() @@ -980,6 +1343,8 @@ def main(argv: Optional[List[str]] = None) -> int: out_path = out_path.with_suffix(".zip") top_name = ensure_topdir_name(out_path, args.name) + print(f"Creating: {out_path}") + log: List[str] = [] try: aw = ZipArchiveWriter(out_path)