From ed6173f0042fd6ecd4a01915e531debc0cd0d17f Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Mon, 2 Feb 2026 12:30:37 -0800 Subject: [PATCH 01/15] tools: Add traffic_grapher for real-time ATS metrics visualization A Python tool that displays ATS metrics inline in iTerm2 using imgcat with live updates and multi-host comparison. Features: - Real-time graphs of RPS, latency, cache hit rate, connections - Support for 1-4 hosts with different line styles for comparison - Collects metrics via JSONRPC Unix socket (batch collection) - Dark theme optimized for terminal display - Keyboard navigation between metric pages (4 pages) - Configurable refresh interval and history window Requirements: - Python 3 with matplotlib - iTerm2 (or compatible terminal for inline images) - SSH access to remote ATS hosts Usage: traffic_grapher.py ats-server1.example.com traffic_grapher.py ats{1..4}.example.com --interval 2 --- tools/traffic_grapher/traffic_grapher.py | 1411 ++++++++++++++++++++++ 1 file changed, 1411 insertions(+) create mode 100755 tools/traffic_grapher/traffic_grapher.py diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py new file mode 100755 index 00000000000..b6b6015db3c --- /dev/null +++ b/tools/traffic_grapher/traffic_grapher.py @@ -0,0 +1,1411 @@ +#!/usr/bin/env python3 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Traffic Grapher - Real-time ATS metrics visualization. + +Displays metrics inline in iTerm using imgcat with multi-host overlay comparison, +keyboard navigation between pages, and flicker-free live updates. + +Usage: + traffic_grapher.py ats-server1.example.com + traffic_grapher.py ats-server{1..4}.example.com + traffic_grapher.py --interval 2 --history 120 ats-server1.example.com ats-server2.example.com +""" + +import argparse +import base64 +import fcntl +import gc +import io +import os +import select +import shutil +import struct +import subprocess +import sys +import termios +import time +import tty +from collections import deque +from datetime import datetime, timezone +from typing import Optional, Tuple + +try: + from zoneinfo import ZoneInfo +except ImportError: + from datetime import tzinfo + ZoneInfo = None # Will fall back to UTC only + +import matplotlib +# Check for --gui in sys.argv early to set backend before importing pyplot +if '--gui' in sys.argv: + # Use MacOSX on macOS, fall back to TkAgg on other platforms + import platform + if platform.system() == 'Darwin': + matplotlib.use('MacOSX') + else: + matplotlib.use('TkAgg') +else: + matplotlib.use('Agg') +import matplotlib.pyplot as plt +from matplotlib.ticker import FuncFormatter +import yaml + +# ============================================================================= +# Built-in Default Pages - 2x2 grid per page +# ============================================================================= + +DEFAULT_PAGES = [ + { + "name": "Traffic & Cache", + "panels": + [ + { + "title": 
"Requests/sec", + "metrics": + [ + { + "name": "Client", + "type": "counter", + "key": "proxy.process.http.incoming_requests", + "color": "#FFFF00" + }, # Yellow + { + "name": "Origin", + "type": "counter", + "key": "proxy.process.http.outgoing_requests", + "color": "#BF00FF" + }, # Electric Purple (complement) + ] + }, + { + "title": "Latency (ms)", + "metrics": + [ + { + "name": "Cache Hit", + "type": "latency", + "key": "proxy.process.http.transaction_totaltime.hit_fresh", + "key2": "proxy.process.http.transaction_counts.hit_fresh", + "color": "#00FF00" + }, # Lime Green + { + "name": "Origin", + "type": "latency", + "key": "proxy.process.http.transaction_totaltime.miss_cold", + "key2": "proxy.process.http.transaction_counts.miss_cold", + "color": "#FF00FF" + }, # Magenta (complement) + ] + }, + { + "title": "Cache Hit Rate %", + "metrics": + [ + { + "name": "Hit Rate", + "type": "hit_rate", + "key": "proxy.process.cache_total_hits", + "key2": "proxy.process.cache_total_misses", + "color": "#00FF7F" + }, # Spring Green + ] + }, + { + "title": "Connections", + "metrics": + [ + { + "name": "Client", + "type": "gauge", + "key": "proxy.process.http.current_client_connections", + "color": "#00FFFF" + }, # Cyan + { + "name": "Origin", + "type": "gauge", + "key": "proxy.process.http.current_server_connections", + "color": "#FF4040" + }, # Bright Red (complement) + ] + }, + ] + }, + { + "name": "Response Codes", + "panels": + [ + { + "title": "2xx Responses/sec", + "metrics": [{ + "name": "2xx", + "type": "counter", + "key": "proxy.process.http.2xx_responses", + "color": "#00FF00" + },] + }, + { + "title": "3xx Responses/sec", + "metrics": [{ + "name": "3xx", + "type": "counter", + "key": "proxy.process.http.3xx_responses", + "color": "#00FFFF" + },] + }, + { + "title": "4xx Responses/sec", + "metrics": [{ + "name": "4xx", + "type": "counter", + "key": "proxy.process.http.4xx_responses", + "color": "#FFA500" + },] + }, + { + "title": "5xx Responses/sec", + "metrics": [{ 
+ "name": "5xx", + "type": "counter", + "key": "proxy.process.http.5xx_responses", + "color": "#FF0000" + },] + }, + ] + }, + { + "name": "TLS & HTTP/2", + "panels": + [ + { + "title": "SSL Handshakes/sec", + "metrics": + [ + { + "name": "Success", + "type": "counter", + "key": "proxy.process.ssl.total_success_handshake_count_in", + "color": "#00FF00" + }, + { + "name": "Failed", + "type": "counter", + "key": "proxy.process.ssl.ssl_error_ssl", + "color": "#FF0000" + }, + ] + }, + { + "title": "SSL Connections", + "metrics": + [{ + "name": "Active", + "type": "gauge", + "key": "proxy.process.ssl.user_agent_sessions", + "color": "#00FFFF" + },] + }, + { + "title": "HTTP/2 Connections", + "metrics": + [ + { + "name": "Total", + "type": "counter", + "key": "proxy.process.http2.total_client_connections", + "color": "#FF00FF" + }, + { + "name": "Active", + "type": "gauge", + "key": "proxy.process.http2.current_client_sessions", + "color": "#00FFFF" + }, + ] + }, + { + "title": "HTTP/2 Errors/sec", + "metrics": + [ + { + "name": "Errors", + "type": "counter", + "key": "proxy.process.http2.connection_errors", + "color": "#FF0000" + }, + ] + }, + ] + }, + { + "name": "Network & Errors", + "panels": + [ + { + "title": "Client Bytes/sec", + "metrics": + [ + { + "name": "Read", + "type": "counter", + "key": "proxy.process.http.user_agent_total_request_bytes", + "color": "#00FFFF" + }, + { + "name": "Write", + "type": "counter", + "key": "proxy.process.http.user_agent_total_response_bytes", + "color": "#FF00FF" + }, + ] + }, + { + "title": "Origin Bytes/sec", + "metrics": + [ + { + "name": "Read", + "type": "counter", + "key": "proxy.process.http.origin_server_total_response_bytes", + "color": "#00FF00" + }, + { + "name": "Write", + "type": "counter", + "key": "proxy.process.http.origin_server_total_request_bytes", + "color": "#FFA500" + }, + ] + }, + { + "title": "Connection Errors/sec", + "metrics": + [ + { + "name": "Connect Fail", + "type": "counter", + "key": 
def cursor_home():
    """Jump the cursor back to row 1, column 1 without clearing anything."""
    print('\033[H', end='', flush=True)


def clear_screen():
    """Erase the visible screen and home the cursor."""
    print('\033[2J\033[H', end='', flush=True)


def clear_scrollback():
    """Clear scrollback to free memory from accumulated inline images.

    Emits the iTerm2-proprietary ClearScrollback OSC first, then the
    standard CSI 3J that some other terminals honor.
    """
    print('\033]1337;ClearScrollback\007\033[3J', end='', flush=True)


def hide_cursor():
    """Hide the terminal cursor (DECTCEM off)."""
    print('\033[?25l', end='', flush=True)


def show_cursor():
    """Show the terminal cursor (DECTCEM on)."""
    print('\033[?25h', end='', flush=True)


def imgcat_display(fig):
    """
    Display a matplotlib figure inline in iTerm2 using the imgcat protocol.
    """
    # Render at the exact figure size (no bbox trimming) so the image
    # fills the terminal precisely.
    with io.BytesIO() as buf:
        fig.savefig(buf, format='png', dpi=100, facecolor=fig.get_facecolor(), edgecolor='none')
        payload = base64.b64encode(buf.getvalue()).decode('ascii')

    # iTerm2 inline image protocol: OSC 1337 File=... BEL
    sys.stdout.write(f'\033]1337;File=inline=1:{payload}\a\n')
    sys.stdout.flush()
+ """ + buf = io.BytesIO() + # Don't use bbox_inches='tight' - we want exact figure dimensions to fill terminal + fig.savefig(buf, format='png', dpi=100, facecolor=fig.get_facecolor(), edgecolor='none') + buf.seek(0) + image_data = base64.b64encode(buf.read()).decode('ascii') + buf.close() + + # iTerm2 inline image protocol + sys.stdout.write(f'\033]1337;File=inline=1:{image_data}\a\n') + sys.stdout.flush() + + +def format_value(value: float, is_percent: bool = False, is_latency: bool = False) -> str: + """Format a value with K/M suffix for readability, or as percentage/latency.""" + if value is None: + return "N/A" + if is_percent: + return f"{value:.0f}%" + if is_latency: + # Format latency in milliseconds + if value >= 1000: + return f"{value/1000:.1f}s" + elif value >= 1: + return f"{value:.0f}ms" + else: + return f"{value:.1f}ms" + if abs(value) >= 1_000_000_000_000: + return f"{value/1_000_000_000_000:.1f}T" + if abs(value) >= 1_000_000_000: + return f"{value/1_000_000_000:.1f}G" + if abs(value) >= 1_000_000: + return f"{value/1_000_000:.1f}M" + elif abs(value) >= 1_000: + return f"{value/1_000:.1f}K" + elif abs(value) >= 1: + return f"{value:.0f}" + else: + return f"{value:.2f}" + + +def format_ytick(value, pos): + """Format Y-axis tick values with K/M/G/T suffixes, no decimals.""" + if value == 0: + return '0' + if abs(value) < 1: + # Very small values - show with appropriate precision + if abs(value) < 0.01: + return '0' + return f'{value:.1f}' + if abs(value) >= 1e12: + return f'{int(value/1e12)}T' + if abs(value) >= 1e9: + return f'{int(value/1e9)}G' + if abs(value) >= 1e6: + return f'{int(value/1e6)}M' + if abs(value) >= 1e3: + return f'{int(value/1e3)}K' + return f'{int(value)}' + + +def get_terminal_pixel_size() -> Tuple[int, int]: + """ + Get terminal size in pixels using TIOCGWINSZ ioctl. + Returns (width, height) in pixels, or estimated size from rows/cols. 
+ """ + try: + # Try to get pixel size from terminal + result = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, b'\x00' * 8) + rows, cols, xpixel, ypixel = struct.unpack('HHHH', result) + + if xpixel > 0 and ypixel > 0: + return (xpixel, ypixel) + + # Estimate from rows/cols (typical cell: 9x18 pixels) + return (cols * 9, rows * 18) + except: + # Fallback: assume reasonable terminal size + size = shutil.get_terminal_size((80, 24)) + return (size.columns * 9, size.lines * 18) + + +def get_figure_size_for_terminal() -> Tuple[float, float]: + """ + Calculate matplotlib figure size to fill terminal. + Returns (width, height) in inches for matplotlib. + """ + pixel_width, pixel_height = get_terminal_pixel_size() + + # imgcat displays at native resolution, so we want pixel dimensions + # that match the terminal. Use 100 DPI as our reference. + dpi = 100 + + # Use nearly full terminal width/height - imgcat will display at native size + # Leave minimal margin for terminal chrome + width_inches = (pixel_width * 0.99) / dpi + height_inches = (pixel_height * 0.92) / dpi # Leave room for status bar + + # Minimum bounds for readability, no max - let it fill the terminal + width_inches = max(12, width_inches) + height_inches = max(8, height_inches) + + return (width_inches, height_inches) + + +class KeyboardHandler: + """Non-blocking keyboard input handler.""" + + def __init__(self): + self.old_settings = None + self.fd = None + + def __enter__(self): + self.fd = sys.stdin.fileno() + self.old_settings = termios.tcgetattr(self.fd) + # Set raw mode for better key detection + new_settings = termios.tcgetattr(self.fd) + new_settings[3] = new_settings[3] & ~(termios.ICANON | termios.ECHO) + new_settings[6][termios.VMIN] = 0 + new_settings[6][termios.VTIME] = 0 + termios.tcsetattr(self.fd, termios.TCSADRAIN, new_settings) + return self + + def __exit__(self, *args): + if self.old_settings and self.fd is not None: + termios.tcsetattr(self.fd, termios.TCSADRAIN, self.old_settings) 
+ + def get_key(self) -> Optional[str]: + """Get a keypress if available, non-blocking.""" + try: + ch = os.read(self.fd, 1) + if not ch: + return None + + if ch == b'\x1b': # Escape sequence + # Read more bytes for arrow key sequences + try: + ch2 = os.read(self.fd, 1) + if ch2 == b'[': + ch3 = os.read(self.fd, 1) + if ch3 == b'D': + return 'left' + elif ch3 == b'C': + return 'right' + elif ch3 == b'A': + return 'up' + elif ch3 == b'B': + return 'down' + except: + pass + return 'escape' + elif ch in (b'q', b'Q'): + return 'quit' + elif ch in (b'h', b'H'): # vim-style left + return 'left' + elif ch in (b'l', b'L'): # vim-style right + return 'right' + except (OSError, IOError): + pass + return None + + +# ============================================================================= +# Metric Collection +# ============================================================================= + +# JSONRPC script template - runs on remote host via SSH +JSONRPC_SCRIPT = ''' +import socket +import json + +sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) +sock.connect("{socket_path}") + +request = {{ + "jsonrpc": "2.0", + "method": "admin_lookup_records", + "params": [{{"record_name_regex": "{pattern}"}}], + "id": 1 +}} +sock.sendall(json.dumps(request).encode() + b"\\n") +response = sock.recv(1048576).decode() +sock.close() + +# Parse and output simple key=value format +data = json.loads(response) +for rec in data.get("result", {{}}).get("recordList", []): + r = rec.get("record", {{}}) + name = r.get("record_name", "") + value = r.get("current_value", "") + dtype = r.get("data_type", "") + print(f"{{name}}={{value}}={{dtype}}") +''' + +# Default JSONRPC socket path (adjust for your installation) +JSONRPC_SOCKET_PATH = "/opt/edge/trafficserver/10.0/var/trafficserver/jsonrpc20.sock" + + +class JSONRPCBatchCollector: + """ + Batch metric collector using JSONRPC Unix socket. + Collects all metrics for a host in a single SSH call. 
class JSONRPCBatchCollector:
    """
    Batch metric collector using JSONRPC Unix socket.
    Collects all metrics for a host in a single SSH call.
    """

    def __init__(self, hostname: str, metric_keys: list, socket_path: str = JSONRPC_SOCKET_PATH):
        """
        Args:
            hostname: bare SSH target like "ats-server1.example.com".
            metric_keys: exact metric names fetched on every collect().
            socket_path: JSONRPC Unix socket path on the remote host.
        """
        self.hostname = hostname  # Bare hostname like "ats-server1.example.com"
        self.metric_keys = metric_keys
        self.socket_path = socket_path

        # Build a regex matching each metric key exactly:
        # escape dots and join the anchored alternatives with |
        escaped_keys = [k.replace('.', r'\.') for k in metric_keys]
        self.pattern = '|'.join(f"^{k}$" for k in escaped_keys)

        # Cached results from the last successful collection; returned on
        # failure so a transient SSH error does not blank the graphs.
        self.last_values: dict = {}  # key -> (value, data_type)

    def collect(self) -> dict:
        """
        Collect all metrics in one SSH call.

        Returns:
            dict of {metric_key: (float value, data_type string)}. On any
            failure (non-zero exit, timeout, OS error) the previous
            successful result is returned instead.
        """
        script = JSONRPC_SCRIPT.format(socket_path=self.socket_path, pattern=self.pattern)

        # Encode the helper script as base64 to avoid nested shell quoting.
        # NOTE(security): hostname is interpolated into a shell=True command
        # string; only pass trusted hostnames (e.g. your own CLI arguments).
        script_b64 = base64.b64encode(script.encode()).decode()
        cmd = f"ssh {self.hostname} \"echo '{script_b64}' | base64 -d | python3\""

        try:
            result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=10)

            if result.returncode != 0:
                return self.last_values

            # Parse output: key=value=dtype per line
            values = {}
            for line in result.stdout.strip().split('\n'):
                if '=' in line:
                    parts = line.split('=', 2)
                    if len(parts) >= 2:
                        key = parts[0]
                        try:
                            value = float(parts[1])
                        except ValueError:
                            # Was a bare `except:`; only a malformed number
                            # should skip the line.
                            continue
                        dtype = parts[2] if len(parts) > 2 else "INT"
                        values[key] = (value, dtype)

            self.last_values = values
            return values

        except (subprocess.TimeoutExpired, OSError):
            return self.last_values
class MetricCollector:
    """Collects metric values from a host via shell commands.

    Gauges return the sampled value directly; counters return a per-second
    rate computed from consecutive samples.
    """

    def __init__(self, name: str, key: str, metric_type: str, color: str, host_prefix: str = "", host_name: str = ""):
        self.name = name
        self.key = key
        self.metric_type = metric_type.lower()
        self.color = color
        self.host_prefix = host_prefix
        self.host_name = host_name

        # Remote hosts run traffic_ctl over SSH with the awk post-processing
        # done locally (avoids SSH quote escaping); local runs pipe directly.
        if host_prefix:
            remote_cmd = METRIC_COMMAND_REMOTE.format(key=key)
            self.command = f"{host_prefix} '{remote_cmd}' | awk '{{print $2}}'"
        else:
            self.command = METRIC_COMMAND_LOCAL.format(key=key)

        # Previous sample (value, timestamp) used for counter rate deltas.
        self._prev_value: Optional[float] = None
        self._prev_time: Optional[float] = None

        # Most recent computed value, kept around for display.
        self.latest_value: Optional[float] = None

    def _get_raw_value(self) -> Optional[float]:
        """Run the shell command and parse its stdout as a float, or None."""
        try:
            proc = subprocess.run(self.command, shell=True, capture_output=True, text=True, timeout=5)
            if proc.returncode != 0:
                return None
            text = proc.stdout.strip()
            if not text:
                return None
            return float(text)
        except (subprocess.TimeoutExpired, ValueError, OSError):
            return None

    def collect(self) -> Optional[float]:
        """Sample the metric: raw value for gauges, rate for counters."""
        sample = self._get_raw_value()
        now = time.time()

        if self.metric_type == 'gauge':
            self.latest_value = sample
            return sample

        # Counter semantics from here on: two samples needed for a rate.
        if sample is None:
            return None

        if self._prev_value is None or self._prev_time is None:
            # First sample only primes the baseline.
            self._prev_value, self._prev_time = sample, now
            return None

        elapsed = now - self._prev_time
        if elapsed <= 0:
            return None

        if sample < self._prev_value:
            # Counter reset (e.g. process restart): re-prime and skip.
            self._prev_value, self._prev_time = sample, now
            return None

        rate = (sample - self._prev_value) / elapsed
        self._prev_value, self._prev_time = sample, now

        self.latest_value = rate
        return rate
class ATSGrapher:
    """Main grapher class with 2x2 grid layout and dark theme."""

    # Dark theme colors
    FIG_BG_COLOR = '#000000'  # Pure black for figure background (titles, labels area)
    PLOT_BG_COLOR = '#1a1a1a'  # Dark grey for inside the plots
    TEXT_COLOR = '#ffffff'
    GRID_COLOR = '#555555'  # Lighter grid for better visibility
    AXIS_COLOR = '#888888'  # Visible axis/spine color

    # Line styles for differentiating hosts (up to 4)
    LINE_STYLES = ['-', '--', '-.', ':']  # solid, dashed, dash-dot, dotted

    def __init__(
            self,
            hostnames: list,
            interval: float,
            history_seconds: int,
            pages: list,
            gui_mode: bool = False,
            save_png: Optional[str] = None,
            log_stats: Optional[str] = None,
            run_for: Optional[int] = None,
            no_keyboard: bool = False,
            tz_name: str = "UTC"):
        """Set up collectors, per-page data structures, and display state.

        Args:
            hostnames: bare hostnames, one batch collector is created per host.
            interval: refresh interval in seconds.
            history_seconds: sliding window of data kept and plotted.
            pages: page/panel/metric configuration (see DEFAULT_PAGES).
            gui_mode: use a GUI matplotlib backend instead of imgcat.
            save_png: optional PNG path; '{iter}' is replaced per iteration.
            log_stats: optional path for a diagnostic stats log (truncated here).
            run_for: optional total run time in seconds.
            no_keyboard: disable interactive key handling (non-TTY use).
            tz_name: IANA timezone name for timestamps; falls back to UTC.
        """
        self.hostnames = hostnames  # Bare hostnames like ["ats-server1.example.com"]
        self.interval = interval
        self.history_seconds = history_seconds
        self.pages = pages
        self.gui_mode = gui_mode
        self.save_png = save_png
        self.log_stats = log_stats
        self.run_for = run_for
        self.no_keyboard = no_keyboard
        self.tz_name = tz_name
        self.num_hosts = len(hostnames)

        # Set up timezone; any ZoneInfo failure (bad name, missing tzdata)
        # silently falls back to UTC and the displayed tz_name tracks that.
        if ZoneInfo and tz_name != "UTC":
            try:
                self.tz = ZoneInfo(tz_name)
            except:
                self.tz = timezone.utc
                self.tz_name = "UTC"
        else:
            self.tz = timezone.utc
            if tz_name != "UTC":
                self.tz_name = "UTC"  # Fallback if zoneinfo not available

        self.current_page = 0
        self.running = True
        self.start_time = time.time()
        self.iteration = 0

        # Extract short host names for display (e.g., "e1" from "ats-server1.example.com")
        self.host_names = [self._extract_short_name(h) for h in hostnames]

        # Initialize log file if specified (truncates any existing file)
        if self.log_stats:
            with open(self.log_stats, 'w') as f:
                f.write(f"# Traffic Grapher Stats Log - {datetime.now()}\n")
                f.write(f"# Hosts: {', '.join(hostnames)}\n")
                f.write("#\n")

        # Collect ALL unique metric keys across ALL pages for combined batch collection
        all_metric_keys = []
        for page in pages:
            for panel in page["panels"]:
                for metric in panel["metrics"]:
                    if metric["key"] not in all_metric_keys:
                        all_metric_keys.append(metric["key"])
                    # Also collect key2 for hit_rate metrics
                    if "key2" in metric and metric["key2"] not in all_metric_keys:
                        all_metric_keys.append(metric["key2"])

        # Create ONE combined batch collector per host (collects all metrics at once)
        self.combined_collectors = [JSONRPCBatchCollector(hostname, all_metric_keys) for hostname in hostnames]

        # Initialize metric info and data for all pages
        # metric_info[page][panel][metric] = {name, key, type, color}
        # data[page][panel][metric][host] = deque of (elapsed_time, value)
        self.metric_info: list = []
        self.data: list = []

        # Track previous values for counter rate calculation
        # prev_values[page][panel][metric][host] = (prev_raw, prev_time)
        self.prev_values: list = []

        # Track latest values for display
        # latest_values[page][panel][metric][host] = value
        self.latest_values: list = []

        # Build the parallel 4-level nested lists; the four structures are
        # always indexed identically as [page][panel][metric][host].
        for page in pages:
            page_info = []
            page_data = []
            page_prev = []
            page_latest = []
            for panel in page["panels"]:
                panel_info = []
                panel_data = []
                panel_prev = []
                panel_latest = []
                for metric in panel["metrics"]:
                    info = {"name": metric["name"], "key": metric["key"], "type": metric["type"].lower(), "color": metric["color"]}
                    # Store key2 for hit_rate metrics
                    if "key2" in metric:
                        info["key2"] = metric["key2"]
                    panel_info.append(info)

                    # Initialize data structures for each host
                    metric_data = [deque() for _ in range(self.num_hosts)]
                    metric_prev = [None for _ in range(self.num_hosts)]
                    metric_latest = [None for _ in range(self.num_hosts)]
                    panel_data.append(metric_data)
                    panel_prev.append(metric_prev)
                    panel_latest.append(metric_latest)
                page_info.append(panel_info)
                page_data.append(panel_data)
                page_prev.append(panel_prev)
                page_latest.append(panel_latest)
            self.metric_info.append(page_info)
            self.data.append(page_data)
            self.prev_values.append(page_prev)
            self.latest_values.append(page_latest)

        # Terminal size tracking (used by _check_resize to detect changes)
        self.last_terminal_size = shutil.get_terminal_size()
self.data.append(page_data) + self.prev_values.append(page_prev) + self.latest_values.append(page_latest) + + # Terminal size tracking + self.last_terminal_size = shutil.get_terminal_size() + + def _extract_short_name(self, hostname: str) -> str: + """Extract a short display name from a full hostname.""" + if not hostname: + return "local" + # Take first part before dot (e.g., "e1" from "ats-server1.example.com") + if '.' in hostname: + return hostname.split('.')[0] + return hostname[:15] + + def _check_resize(self) -> bool: + """Check if terminal was resized.""" + current_size = shutil.get_terminal_size() + if current_size != self.last_terminal_size: + self.last_terminal_size = current_size + return True + return False + + def _trim_history(self): + """Remove old data points outside the history window.""" + cutoff = time.time() - self.start_time - self.history_seconds + for page_data in self.data: + for panel_data in page_data: + for metric_data in panel_data: + for host_data in metric_data: + while host_data and host_data[0][0] < cutoff: + host_data.popleft() + + def collect_all_pages(self): + """Collect data for ALL pages in a single batch per host.""" + self.iteration += 1 + + log_lines = [] + if self.log_stats: + elapsed = time.time() - self.start_time + log_lines.append(f"\n=== Iteration {self.iteration} at {elapsed:.1f}s ===\n") + + # Batch collect from each host - capture time PER HOST + batch_results = [] + host_times = [] + + for host_idx, batch_collector in enumerate(self.combined_collectors): + host_label = self.host_names[host_idx] + + if self.log_stats: + log_lines.append(f" Collecting from {host_label}...\n") + + collect_time = time.time() + results = batch_collector.collect() + batch_results.append(results) + host_times.append(collect_time) + + if self.log_stats: + log_lines.append(f" Got {len(results)} metrics\n") + + # Distribute values to ALL pages + for page_idx in range(len(self.pages)): + for panel_idx, panel_info in 
    def collect_all_pages(self):
        """Collect one sample of every metric on every page, one batch per host.

        Each host is queried exactly once via its combined batch collector;
        the raw readings are then fanned out to every page/panel/metric
        series. Delta-based metric types (counter, hit_rate, latency) need a
        previous sample, so their first iteration only primes prev_values and
        appends no data point. Old points are trimmed at the end.
        """
        self.iteration += 1

        log_lines = []
        if self.log_stats:
            elapsed = time.time() - self.start_time
            log_lines.append(f"\n=== Iteration {self.iteration} at {elapsed:.1f}s ===\n")

        # Batch collect from each host - capture time PER HOST, since each
        # SSH round trip lands at a different moment and the per-host
        # timestamp is what the rate/delta math below divides by.
        batch_results = []
        host_times = []

        for host_idx, batch_collector in enumerate(self.combined_collectors):
            host_label = self.host_names[host_idx]

            if self.log_stats:
                log_lines.append(f"  Collecting from {host_label}...\n")

            collect_time = time.time()
            results = batch_collector.collect()
            batch_results.append(results)
            host_times.append(collect_time)

            if self.log_stats:
                log_lines.append(f"    Got {len(results)} metrics\n")

        # Distribute values to ALL pages (not just the visible one) so that
        # switching pages shows history immediately.
        for page_idx in range(len(self.pages)):
            for panel_idx, panel_info in enumerate(self.metric_info[page_idx]):
                panel = self.pages[page_idx]["panels"][panel_idx]

                for metric_idx, metric in enumerate(panel_info):
                    key = metric["key"]
                    metric_type = metric["type"]

                    for host_idx in range(self.num_hosts):
                        host_label = self.host_names[host_idx]
                        current_time = host_times[host_idx]

                        if key in batch_results[host_idx]:
                            raw_value, dtype = batch_results[host_idx][key]

                            # Calculate value based on metric type
                            if metric_type == 'gauge':
                                # Gauges are used as-is.
                                value = raw_value
                                self.latest_values[page_idx][panel_idx][metric_idx][host_idx] = value
                            elif metric_type == 'hit_rate':
                                # Calculate hit rate percentage over the last
                                # interval: delta_hits / (delta_hits + delta_misses) * 100
                                key2 = metric.get("key2")
                                if key2 and key2 in batch_results[host_idx]:
                                    hits_raw = raw_value
                                    misses_raw, _ = batch_results[host_idx][key2]

                                    prev = self.prev_values[page_idx][panel_idx][metric_idx][host_idx]
                                    if prev is None:
                                        # First sample: store both hits and misses as prev value
                                        self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (
                                            (hits_raw, misses_raw), current_time)
                                        value = None
                                    else:
                                        (prev_hits, prev_misses), prev_time = prev
                                        time_delta = current_time - prev_time
                                        if time_delta > 0:
                                            delta_hits = hits_raw - prev_hits
                                            delta_misses = misses_raw - prev_misses
                                            total = delta_hits + delta_misses
                                            if total > 0:
                                                value = (delta_hits / total) * 100.0
                                                self.latest_values[page_idx][panel_idx][metric_idx][host_idx] = value
                                            else:
                                                # No transactions this interval: no point plotted.
                                                value = None
                                        else:
                                            value = None
                                        self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (
                                            (hits_raw, misses_raw), current_time)
                                else:
                                    value = None
                            elif metric_type == 'latency':
                                # Calculate average latency: delta_time / delta_count
                                # key = total time counter (milliseconds), key2 = count counter
                                key2 = metric.get("key2")
                                if key2 and key2 in batch_results[host_idx]:
                                    time_raw = raw_value  # Total time in milliseconds
                                    count_raw, _ = batch_results[host_idx][key2]

                                    prev = self.prev_values[page_idx][panel_idx][metric_idx][host_idx]
                                    if prev is None:
                                        # First sample: just prime the baseline.
                                        self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (
                                            (time_raw, count_raw), current_time)
                                        value = None
                                    else:
                                        (prev_time_raw, prev_count), prev_time = prev
                                        delta_time_ms = time_raw - prev_time_raw
                                        delta_count = count_raw - prev_count
                                        if delta_count > 0 and delta_time_ms >= 0:
                                            # Average latency already in milliseconds
                                            value = delta_time_ms / delta_count
                                            self.latest_values[page_idx][panel_idx][metric_idx][host_idx] = value
                                        else:
                                            value = None
                                        self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (
                                            (time_raw, count_raw), current_time)
                                else:
                                    value = None
                            else:
                                # Counter: calculate rate per second.
                                prev = self.prev_values[page_idx][panel_idx][metric_idx][host_idx]
                                if prev is None:
                                    self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (raw_value, current_time)
                                    value = None
                                else:
                                    prev_raw, prev_time = prev
                                    time_delta = current_time - prev_time
                                    # raw_value >= prev_raw guards against counter resets.
                                    if time_delta > 0 and raw_value >= prev_raw:
                                        value = (raw_value - prev_raw) / time_delta
                                        self.latest_values[page_idx][panel_idx][metric_idx][host_idx] = value
                                    else:
                                        value = None
                                    self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (raw_value, current_time)

                            if value is not None:
                                # Store as (elapsed-since-start, value); render_page
                                # converts elapsed to "seconds ago" for plotting.
                                elapsed = current_time - self.start_time
                                self.data[page_idx][panel_idx][metric_idx][host_idx].append((elapsed, value))

        if self.log_stats:
            # Log current page data point counts only
            page_idx = self.current_page
            log_lines.append(f"  Data points (page {page_idx + 1}):\n")
            for panel_idx, panel_data in enumerate(self.data[page_idx]):
                panel = self.pages[page_idx]["panels"][panel_idx]
                for metric_idx, metric_data in enumerate(panel_data):
                    metric = self.metric_info[page_idx][panel_idx][metric_idx]
                    for host_idx, host_data in enumerate(metric_data):
                        host_label = self.host_names[host_idx]
                        log_lines.append(f"    [{panel['title']}] {metric['name']} ({host_label}): {len(host_data)} points\n")

            with open(self.log_stats, 'a') as f:
                f.writelines(log_lines)

        self._trim_history()
    def render_page(self, fig: plt.Figure = None) -> plt.Figure:
        """Render the current page as a matplotlib figure with a 2x2 grid.

        Args:
            fig: an existing figure to reuse (resized, new axes created);
                 a fresh figure is created when None.

        Returns:
            The rendered figure. If save_png is set, the figure is also
            written to that path ('{iter}' substituted with the iteration).
        """
        page = self.pages[self.current_page]
        current_elapsed = time.time() - self.start_time

        # Set up dark theme - dynamic figure size to fill terminal
        plt.style.use('dark_background')
        fig_width, fig_height = get_figure_size_for_terminal()

        if fig is None:
            # Create new figure
            fig, axes = plt.subplots(2, 2, figsize=(fig_width, fig_height))
        else:
            # Reuse existing figure, create new axes
            fig.set_size_inches(fig_width, fig_height)
            axes = fig.subplots(2, 2)

        fig.patch.set_facecolor(self.FIG_BG_COLOR)  # Pure black outside graphs

        # Flatten axes for easier iteration
        axes_flat = axes.flatten()

        # Y-axis formatter for K/M/G/T
        y_formatter = FuncFormatter(format_ytick)

        num_hosts = self.num_hosts

        for panel_idx, panel in enumerate(page["panels"]):
            # Only 4 axes exist in the 2x2 grid; extra panels are ignored.
            if panel_idx >= 4:
                break
            ax = axes_flat[panel_idx]
            ax.set_facecolor(self.PLOT_BG_COLOR)  # Dark grey inside plots

            panel_data = self.data[self.current_page][panel_idx]
            panel_info = self.metric_info[self.current_page][panel_idx]
            panel_latest = self.latest_values[self.current_page][panel_idx]

            # Build title with current values
            value_parts = []
            metric_values = {}  # Group values by metric for compact display
            has_data = False

            for metric_idx, metric in enumerate(panel_info):
                for host_idx in range(num_hosts):
                    host_data = panel_data[metric_idx][host_idx]
                    host_name = self.host_names[host_idx]

                    # Plot the data - transform X to "seconds ago"
                    if host_data:
                        has_data = True
                        # Transform: x = current_elapsed - data_elapsed = age of data point
                        # Newest data (age ~0) on RIGHT, oldest data on LEFT
                        times = [current_elapsed - t for t, _ in host_data]
                        values = [v for _, v in host_data]

                        # Color encodes the metric; line style encodes the host.
                        color = metric["color"]
                        linestyle = self.LINE_STYLES[host_idx % len(self.LINE_STYLES)]
                        linewidth = 2.5 - (host_idx * 0.2)  # Slightly thinner for each host

                        label = metric["name"]
                        if self.num_hosts > 1:
                            label = f"{metric['name']} ({host_name})"

                        ax.plot(times, values, color=color, linewidth=linewidth, linestyle=linestyle, label=label)

                    # Collect values per metric for compact display
                    latest = panel_latest[metric_idx][host_idx]
                    if latest is not None:
                        if metric_idx not in metric_values:
                            metric_values[metric_idx] = {'name': metric['name'], 'type': metric['type'], 'values': []}
                        is_percent = metric['type'] == 'hit_rate'
                        is_latency = metric['type'] == 'latency'
                        metric_values[metric_idx]['values'].append(format_value(latest, is_percent, is_latency))

            # Build compact title: "Requests/sec - Client: 3.6K/4.0K | Origin: 2.5K/2.7K"
            for m_idx, m_data in metric_values.items():
                if m_data['values']:
                    if self.num_hosts > 1:
                        value_parts.append(f"{m_data['name']}: {'/'.join(m_data['values'])}")
                    else:
                        value_parts.append(f"{m_data['name']}: {m_data['values'][0]}")

            # Set title with values
            if value_parts:
                title = f"{panel['title']} - {' | '.join(value_parts)}"
            else:
                title = panel["title"]
            ax.set_title(title, color=self.TEXT_COLOR, fontsize=12, fontweight='bold')

            # Configure axes - more visible grid and borders
            ax.grid(True, alpha=0.6, color=self.GRID_COLOR, linewidth=0.8)
            ax.tick_params(colors=self.TEXT_COLOR, labelsize=12)
            for spine in ax.spines.values():
                spine.set_color(self.AXIS_COLOR)
                spine.set_linewidth(1.5)

            # X-axis: oldest on left, 0 (now) on right - data flows right to left
            ax.set_xlim(self.history_seconds, 0)  # Reversed: oldest on left, 0=now on right
            ax.set_xlabel('')  # No label - self explanatory
            ax.tick_params(axis='x', labelsize=14)

            # Check if this is a percentage or latency panel
            is_percent_panel = any(m['type'] == 'hit_rate' for m in panel_info)
            is_latency_panel = any(m['type'] == 'latency' for m in panel_info)

            if is_percent_panel:
                # Percentage panel: fixed 0-100% range, no K/M/G formatter
                ax.set_ylim(0, 100)
                ax.set_ylabel("Hit Rate %", color=self.TEXT_COLOR, fontsize=14, fontweight='bold')
                ax.tick_params(axis='y', labelsize=14)
            elif is_latency_panel:
                # Latency panel: auto-scale, Y-axis label in ms
                ax.set_ylabel("Latency (ms)", color=self.TEXT_COLOR, fontsize=14, fontweight='bold')
                ax.tick_params(axis='y', labelsize=14)
            else:
                # Y-axis: use K/M/G/T formatter, no decimals, add label
                ax.yaxis.set_major_formatter(y_formatter)
                ax.tick_params(axis='y', labelsize=14)

                # Y-axis label based on panel title (text before the '/')
                ylabel = panel["title"].split('/')[0].strip() if '/' in panel["title"] else ""
                if ylabel:
                    ax.set_ylabel(ylabel, color=self.TEXT_COLOR, fontsize=14, fontweight='bold')

            # Legend if multiple metrics or hosts (only if we have data)
            if has_data and (len(panel["metrics"]) > 1 or self.num_hosts > 1):
                ax.legend(
                    loc='upper left',
                    fontsize=13,
                    facecolor=self.PLOT_BG_COLOR,
                    edgecolor=self.AXIS_COLOR,
                    labelcolor=self.TEXT_COLOR)

        # Overall title with date, time, and timezone
        now = datetime.now(self.tz)
        timestamp = now.strftime('%Y-%m-%d %H:%M:%S')
        title = f"ATS Dashboard - {timestamp} {self.tz_name} - Page {self.current_page + 1}/{len(self.pages)}: {page['name']}"
        if self.num_hosts > 1:
            title += f"\n{' vs '.join(self.host_names)}"
        else:
            title += f" ({self.host_names[0]})"
        fig.suptitle(title, color=self.TEXT_COLOR, fontsize=18, fontweight='bold')

        # Status bar
        status = f"[←/→ or h/l pages, q quit] | {self.interval}s refresh | {self.history_seconds}s history"
        fig.text(0.5, 0.01, status, ha='center', fontsize=13, color='#808080')

        plt.tight_layout()
        plt.subplots_adjust(top=0.92, bottom=0.06, left=0.06, right=0.98, hspace=0.35, wspace=0.22)

        # Save PNG if requested
        if self.save_png:
            png_path = self.save_png
            if '{iter}' in png_path:
                png_path = png_path.replace('{iter}', str(self.iteration))
            fig.savefig(png_path, facecolor=fig.get_facecolor(), edgecolor='none', dpi=100)

        return fig
self.save_png: + png_path = self.save_png + if '{iter}' in png_path: + png_path = png_path.replace('{iter}', str(self.iteration)) + fig.savefig(png_path, facecolor=fig.get_facecolor(), edgecolor='none', dpi=100) + + return fig + + def run_imgcat(self): + """Run the grapher in imgcat mode.""" + run_until = None + if self.run_for: + run_until = time.time() + self.run_for + + # Non-keyboard mode (for non-TTY environments) + if self.no_keyboard: + try: + while self.running: + if run_until and time.time() >= run_until: + print(f"\n\nRun time of {self.run_for}s reached. Exiting.") + break + + self.collect_data() + fig = self.render_page() + + # In no-keyboard mode, just save the PNG, don't try imgcat + if self.save_png: + print(f"Iteration {self.iteration}: saved to {self.save_png}") + plt.close(fig) + plt.close('all') # Close any remaining figures + gc.collect() # Force garbage collection to reduce memory + + # Clear iTerm scrollback every 60 iterations to prevent memory buildup + if self.iteration % 60 == 0: + clear_scrollback() + + time.sleep(self.interval) + except KeyboardInterrupt: + print("\nStopped by user") + return + + # Normal interactive mode + hide_cursor() + clear_screen() + + try: + with KeyboardHandler() as kbd: + while self.running: + # Check run_for timeout + if run_until and time.time() >= run_until: + print(f"\n\nRun time of {self.run_for}s reached. 
Exiting.") + break + + # Check for resize + if self._check_resize(): + clear_screen() + else: + cursor_home() + + # Collect data and render + self.collect_data() + fig = self.render_page() + imgcat_display(fig) + plt.close(fig) + plt.close('all') + gc.collect() + + # Clear iTerm scrollback every 60 iterations to prevent memory buildup + if self.iteration % 60 == 0: + clear_scrollback() + + # Handle keyboard input during sleep + sleep_until = time.time() + self.interval + while time.time() < sleep_until and self.running: + # Check run_for timeout during sleep too + if run_until and time.time() >= run_until: + break + key = kbd.get_key() + if key == 'quit': + self.running = False + break + elif key == 'left': + self.current_page = (self.current_page - 1) % len(self.pages) + break + elif key == 'right': + self.current_page = (self.current_page + 1) % len(self.pages) + break + time.sleep(0.05) + finally: + show_cursor() + clear_screen() + + def run_gui(self): + """Run the grapher in GUI mode with matplotlib window.""" + from matplotlib.animation import FuncAnimation + + # Collect initial data for all pages + self.collect_all_pages() + + # Create initial figure + fig = self.render_page() + + # Calculate number of frames if run_for specified + if self.run_for: + num_frames = int(self.run_for / self.interval) + repeat = False + else: + num_frames = None # Infinite + repeat = True + + def update(frame): + self.collect_all_pages() + fig.clf() # Clear the existing figure, don't create new one + # Re-render onto the same figure + self.render_page(fig=fig) + fig.canvas.draw_idle() + return [] + + anim = FuncAnimation( + fig, update, interval=int(self.interval * 1000), frames=num_frames, repeat=repeat, blit=False, cache_frame_data=False) + plt.show() + + def run_once(self): + """Run a single snapshot and exit.""" + self.collect_data() + time.sleep(self.interval) + self.collect_data() + + fig = self.render_page() + imgcat_display(fig) + plt.close(fig) + + def run(self, once: 
bool = False): + """Run the grapher.""" + if once: + self.run_once() + elif self.gui_mode: + self.run_gui() + else: + self.run_imgcat() + + +# ============================================================================= +# Configuration Loading +# ============================================================================= + + +def load_config(config_path: str) -> dict: + """Load optional config file for layout customization.""" + if not os.path.exists(config_path): + return {} + + with open(config_path, 'r') as f: + return yaml.safe_load(f) or {} + + +# ============================================================================= +# Main Entry Point +# ============================================================================= + + +def main(): + parser = argparse.ArgumentParser( + description='Traffic Grapher - Real-time ATS metrics visualization', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s ats-server1.example.com + %(prog)s ats-server{1..3}.example.com # bash expansion + %(prog)s --interval 2 --history 120 ats-server{1..4}.example.com + %(prog)s ats1.dc1.example.com ats1.dc2.example.com # compare POPs +""") + + parser.add_argument( + 'hosts', nargs='+', metavar='HOSTNAME', help='Hostnames to monitor (1-4 hosts, e.g., ats-server1.example.com)') + parser.add_argument('--interval', type=float, default=1.0, help='Refresh interval in seconds (default: 1.0)') + parser.add_argument('--history', type=int, default=60, help='History window in seconds (default: 60)') + parser.add_argument('--gui', action='store_true', help='Use matplotlib GUI window instead of imgcat') + parser.add_argument('--once', action='store_true', help='Single snapshot, then exit') + parser.add_argument('-c', '--config', default=None, help='Optional config file for layout customization') + + # Display options + parser.add_argument('--timezone', '-tz', default='UTC', help='Timezone for display (default: UTC, e.g., America/Los_Angeles)') + + # Debug 
options + parser.add_argument( + '--save-png', default=None, metavar='FILE', help='Save PNG to file after each render (use {iter} for iteration number)') + parser.add_argument('--log-stats', default=None, metavar='FILE', help='Log raw stats to file for debugging') + parser.add_argument('--run-for', type=int, default=None, metavar='SECONDS', help='Run for N seconds then exit (for debugging)') + parser.add_argument('--no-keyboard', action='store_true', help='Disable keyboard handling (for non-TTY environments)') + + args = parser.parse_args() + + # Load optional config + config = {} + if args.config: + config = load_config(args.config) + + # Use custom pages from config, or defaults + pages = config.get('pages', DEFAULT_PAGES) + + # Override history from config if specified + history = args.history + if 'history' in config: + history = config['history'].get('seconds', args.history) + + # Get timezone from config or CLI + tz_name = args.timezone + if 'timezone' in config: + tz_name = config['timezone'] + + # Validate host count + if len(args.hosts) > 4: + parser.error("Maximum 4 hosts supported (limited by line styles)") + + # Create and run grapher + grapher = ATSGrapher( + hostnames=args.hosts, + interval=args.interval, + history_seconds=history, + pages=pages, + gui_mode=args.gui, + save_png=args.save_png, + log_stats=args.log_stats, + run_for=args.run_for, + no_keyboard=args.no_keyboard, + tz_name=tz_name) + + print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") + if len(args.hosts) > 1: + print(f"Comparing: {' vs '.join(grapher.host_names)}") + else: + print(f"Monitoring: {grapher.host_names[0]}") + print("Starting in 2 seconds... 
(press Ctrl+C to cancel)") + time.sleep(2) + + # Clear scrollback buffer at startup to free any previous accumulated images + clear_scrollback() + + try: + grapher.run(once=args.once) + except KeyboardInterrupt: + print("\nStopped by user") + return 0 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) From 436784e71d8d948351becc909e26c7668af5c135 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Tue, 10 Feb 2026 14:53:38 -0800 Subject: [PATCH 02/15] traffic_grapher: Add uv project and fix code quality issues Address review feedback: - Add pyproject.toml and PEP 723 inline script metadata so the tool can be run via 'uv run traffic_grapher.py' without manual pip installs - Make paths configurable: --traffic-ctl, --socket-path CLI args with TRAFFIC_CTL_PATH and TRAFFICSERVER_JSONRPC_SOCKET env var fallbacks (replaces hard-coded /opt/edge/... paths) - Fix command injection: use subprocess.run with list args instead of shell=True, add hostname validation via regex - Replace bare except clauses with specific exception types - Remove unused imports (select, tty) - Parse traffic_ctl output in Python instead of piping through awk --- tools/traffic_grapher/pyproject.toml | 29 +++++++ tools/traffic_grapher/traffic_grapher.py | 97 ++++++++++++++++-------- 2 files changed, 94 insertions(+), 32 deletions(-) create mode 100644 tools/traffic_grapher/pyproject.toml diff --git a/tools/traffic_grapher/pyproject.toml b/tools/traffic_grapher/pyproject.toml new file mode 100644 index 00000000000..21746fba35b --- /dev/null +++ b/tools/traffic_grapher/pyproject.toml @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[project] +name = "traffic-grapher" +version = "1.0.0" +description = "Real-time ATS metrics visualization for iTerm2" +requires-python = ">=3.9" +license = "Apache-2.0" +dependencies = [ + "matplotlib>=3.7", + "pyyaml>=6.0", +] + +[project.scripts] +traffic-grapher = "traffic_grapher:main" diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index b6b6015db3c..68a1e474cf9 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -16,6 +16,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # +# /// script +# dependencies = ["matplotlib>=3.7", "pyyaml>=6.0"] +# requires-python = ">=3.9" +# /// """ Traffic Grapher - Real-time ATS metrics visualization. 
@@ -34,14 +38,13 @@ import gc import io import os -import select +import re import shutil import struct import subprocess import sys import termios import time -import tty from collections import deque from datetime import datetime, timezone from typing import Optional, Tuple @@ -321,11 +324,8 @@ }, ] -# Command template for traffic_ctl -# Default path (adjust for your installation) -# Note: awk runs locally to avoid SSH quote escaping issues -METRIC_COMMAND_REMOTE = "/opt/edge/trafficserver/10.0/bin/traffic_ctl metric get {key}" -METRIC_COMMAND_LOCAL = "/opt/edge/trafficserver/10.0/bin/traffic_ctl metric get {key} | awk '{{print $2}}'" +# Default traffic_ctl path (configurable via --traffic-ctl or TRAFFIC_CTL_PATH env var) +DEFAULT_TRAFFIC_CTL_PATH = os.environ.get("TRAFFIC_CTL_PATH", "traffic_ctl") # ============================================================================= # Terminal Utilities @@ -444,7 +444,7 @@ def get_terminal_pixel_size() -> Tuple[int, int]: # Estimate from rows/cols (typical cell: 9x18 pixels) return (cols * 9, rows * 18) - except: + except (OSError, io.UnsupportedOperation, struct.error, ValueError): # Fallback: assume reasonable terminal size size = shutil.get_terminal_size((80, 24)) return (size.columns * 9, size.lines * 18) @@ -516,7 +516,7 @@ def get_key(self) -> Optional[str]: return 'up' elif ch3 == b'B': return 'down' - except: + except (OSError, IOError): pass return 'escape' elif ch in (b'q', b'Q'): @@ -562,8 +562,18 @@ def get_key(self) -> Optional[str]: print(f"{{name}}={{value}}={{dtype}}") ''' -# Default JSONRPC socket path (adjust for your installation) -JSONRPC_SOCKET_PATH = "/opt/edge/trafficserver/10.0/var/trafficserver/jsonrpc20.sock" +# Default JSONRPC socket path (configurable via --socket-path or TRAFFICSERVER_JSONRPC_SOCKET env var) +DEFAULT_JSONRPC_SOCKET_PATH = os.environ.get( + "TRAFFICSERVER_JSONRPC_SOCKET", + "/usr/local/var/trafficserver/jsonrpc20.sock", +) + + +def _validate_hostname(hostname: str) -> str: + 
"""Validate hostname to prevent command injection.""" + if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9._:-]*$', hostname): + raise ValueError(f"Invalid hostname: {hostname!r}") + return hostname class JSONRPCBatchCollector: @@ -572,8 +582,8 @@ class JSONRPCBatchCollector: Collects all metrics for a host in a single SSH call. """ - def __init__(self, hostname: str, metric_keys: list, socket_path: str = JSONRPC_SOCKET_PATH): - self.hostname = hostname # Bare hostname like "ats-server1.example.com" + def __init__(self, hostname: str, metric_keys: list, socket_path: str = DEFAULT_JSONRPC_SOCKET_PATH): + self.hostname = _validate_hostname(hostname) self.metric_keys = metric_keys self.socket_path = socket_path @@ -592,13 +602,13 @@ def collect(self) -> dict: """ script = JSONRPC_SCRIPT.format(socket_path=self.socket_path, pattern=self.pattern) - # Build SSH command - hostname is passed directly, we add "ssh" prefix - # Encode script as base64 to avoid quoting issues + # Encode script as base64 to avoid quoting issues. + # Use subprocess with list args (no shell=True) to prevent command injection. 
script_b64 = base64.b64encode(script.encode()).decode() - cmd = f"ssh {self.hostname} \"echo '{script_b64}' | base64 -d | python3\"" + remote_cmd = f"echo '{script_b64}' | base64 -d | python3" try: - result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=10) + result = subprocess.run(["ssh", self.hostname, remote_cmd], capture_output=True, text=True, timeout=10) if result.returncode != 0: return self.last_values @@ -612,7 +622,7 @@ def collect(self) -> dict: key = parts[0] try: value = float(parts[1]) - except: + except (ValueError, TypeError): continue dtype = parts[2] if len(parts) > 2 else "INT" values[key] = (value, dtype) @@ -627,22 +637,27 @@ def collect(self) -> dict: class MetricCollector: """Collects metric values from a host via shell commands.""" - def __init__(self, name: str, key: str, metric_type: str, color: str, host_prefix: str = "", host_name: str = ""): + def __init__( + self, + name: str, + key: str, + metric_type: str, + color: str, + hostname: str = "", + host_name: str = "", + traffic_ctl_path: str = DEFAULT_TRAFFIC_CTL_PATH): self.name = name self.key = key self.metric_type = metric_type.lower() self.color = color - self.host_prefix = host_prefix self.host_name = host_name - # Build the full command - if host_prefix: - # For remote hosts: run traffic_ctl on remote, awk locally - remote_cmd = METRIC_COMMAND_REMOTE.format(key=key) - self.command = f"{host_prefix} '{remote_cmd}' | awk '{{print $2}}'" + # Build the command as a list (no shell=True needed) + if hostname: + _validate_hostname(hostname) + self.command = ["ssh", hostname, traffic_ctl_path, "metric", "get", key] else: - # Local: run everything locally - self.command = METRIC_COMMAND_LOCAL.format(key=key) + self.command = [traffic_ctl_path, "metric", "get", key] # For counter metrics, track previous value and time self._prev_value: Optional[float] = None @@ -654,7 +669,7 @@ def __init__(self, name: str, key: str, metric_type: str, color: str, host_prefi def 
_get_raw_value(self) -> Optional[float]: """Run the command and return the raw numeric value.""" try: - result = subprocess.run(self.command, shell=True, capture_output=True, text=True, timeout=5) + result = subprocess.run(self.command, capture_output=True, text=True, timeout=5) if result.returncode != 0: return None @@ -662,6 +677,10 @@ def _get_raw_value(self) -> Optional[float]: if not output: return None + # Parse "key value" output from traffic_ctl + parts = output.split() + if len(parts) >= 2: + return float(parts[1]) return float(output) except (subprocess.TimeoutExpired, ValueError, OSError): return None @@ -731,7 +750,8 @@ def __init__( log_stats: Optional[str] = None, run_for: Optional[int] = None, no_keyboard: bool = False, - tz_name: str = "UTC"): + tz_name: str = "UTC", + socket_path: str = DEFAULT_JSONRPC_SOCKET_PATH): self.hostnames = hostnames # Bare hostnames like ["ats-server1.example.com"] self.interval = interval self.history_seconds = history_seconds @@ -748,7 +768,7 @@ def __init__( if ZoneInfo and tz_name != "UTC": try: self.tz = ZoneInfo(tz_name) - except: + except Exception: self.tz = timezone.utc self.tz_name = "UTC" else: @@ -783,7 +803,9 @@ def __init__( all_metric_keys.append(metric["key2"]) # Create ONE combined batch collector per host (collects all metrics at once) - self.combined_collectors = [JSONRPCBatchCollector(hostname, all_metric_keys) for hostname in hostnames] + self.combined_collectors = [ + JSONRPCBatchCollector(hostname, all_metric_keys, socket_path=socket_path) for hostname in hostnames + ] # Initialize metric info and data for all pages # metric_info[page][panel][metric] = {name, key, type, color} @@ -1343,6 +1365,16 @@ def main(): # Display options parser.add_argument('--timezone', '-tz', default='UTC', help='Timezone for display (default: UTC, e.g., America/Los_Angeles)') + # ATS paths + parser.add_argument( + '--traffic-ctl', + default=DEFAULT_TRAFFIC_CTL_PATH, + help='Path to traffic_ctl binary (default: 
$TRAFFIC_CTL_PATH or "traffic_ctl")') + parser.add_argument( + '--socket-path', + default=DEFAULT_JSONRPC_SOCKET_PATH, + help='Path to JSONRPC Unix socket (default: $TRAFFICSERVER_JSONRPC_SOCKET)') + # Debug options parser.add_argument( '--save-png', default=None, metavar='FILE', help='Save PNG to file after each render (use {iter} for iteration number)') @@ -1385,7 +1417,8 @@ def main(): log_stats=args.log_stats, run_for=args.run_for, no_keyboard=args.no_keyboard, - tz_name=tz_name) + tz_name=tz_name, + socket_path=args.socket_path) print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") if len(args.hosts) > 1: From cf15b508d11603c12f5daaa02ec3bf56b7587f53 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Tue, 10 Feb 2026 15:03:00 -0800 Subject: [PATCH 03/15] traffic_grapher: Add README, rename --socket-path to --socket, remove --traffic-ctl - Add README.md with usage, quick start, configuration, and CLI reference - Rename --socket-path to --socket for brevity - Remove unused --traffic-ctl CLI option (main code uses JSONRPC socket, not traffic_ctl) --- tools/traffic_grapher/README.md | 112 +++++++++++++++++++++++ tools/traffic_grapher/traffic_grapher.py | 10 +- 2 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 tools/traffic_grapher/README.md diff --git a/tools/traffic_grapher/README.md b/tools/traffic_grapher/README.md new file mode 100644 index 00000000000..f89d697d7bc --- /dev/null +++ b/tools/traffic_grapher/README.md @@ -0,0 +1,112 @@ + + +# Traffic Grapher + +Real-time ATS metrics visualization for iTerm2. Displays live graphs of +requests/sec, latency, cache hit rate, connections, and more — inline in +your terminal using imgcat. 
+ +## Features + +- Real-time graphs of RPS, latency, cache hit rate, connections +- Support for 1–4 hosts with different line styles for comparison +- Collects metrics via JSONRPC Unix socket (batch collection per host) +- Dark theme optimized for terminal display +- Keyboard navigation between 4 metric pages +- Configurable refresh interval and history window +- Optional GUI mode via matplotlib window + +## Requirements + +- Python 3.9+ +- [uv](https://docs.astral.sh/uv/) (recommended) or pip +- iTerm2 (or compatible terminal for inline images) +- SSH access to remote ATS hosts + +## Quick Start + +```bash +# With uv (handles dependencies automatically) +uv run traffic_grapher.py ats-server1.example.com + +# Multiple hosts for comparison +uv run traffic_grapher.py ats-server{1..4}.example.com + +# Custom interval and history +uv run traffic_grapher.py --interval 2 --history 120 ats-server1.example.com +``` + +## Installation (Alternative) + +```bash +# Install as a project +uv sync +uv run traffic-grapher ats-server1.example.com + +# Or with pip +pip install . +traffic-grapher ats-server1.example.com +``` + +## Configuration + +### ATS Paths + +The tool needs to know where the JSONRPC socket is on the remote host. +Configure via CLI flag or environment variable: + +| Option | Env Var | Default | +|--------|---------|---------| +| `--socket` | `TRAFFICSERVER_JSONRPC_SOCKET` | `/usr/local/var/trafficserver/jsonrpc20.sock` | + +### Custom Dashboards + +Create a YAML config file to customize which metrics are displayed: + +```bash +uv run traffic_grapher.py -c my_dashboard.yaml ats-server1.example.com +``` + +## Keyboard Controls + +| Key | Action | +|-----|--------| +| `h` / `←` | Previous page | +| `l` / `→` | Next page | +| `q` | Quit | + +## Pages + +1. **Traffic & Cache** — Requests/sec, latency, cache hit rate, connections +2. **Response Codes** — 2xx, 3xx, 4xx, 5xx breakdown +3. **TLS & HTTP/2** — SSL handshakes, connections, HTTP/2 stats +4. 
**Network & Errors** — Bandwidth, connection errors, transaction errors + +## Options + +``` +--interval SEC Refresh interval in seconds (default: 1.0) +--history SEC History window in seconds (default: 60) +--socket PATH Path to JSONRPC Unix socket on remote host +--gui Use matplotlib GUI window instead of imgcat +--once Single snapshot, then exit +--timezone TZ Timezone for display (default: UTC) +--save-png FILE Save PNG after each render (use {iter} for iteration) +--no-keyboard Disable keyboard handling (for non-TTY environments) +``` diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 68a1e474cf9..f62ab8a0738 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -324,7 +324,7 @@ }, ] -# Default traffic_ctl path (configurable via --traffic-ctl or TRAFFIC_CTL_PATH env var) +# Default traffic_ctl path (used by MetricCollector, configurable via TRAFFIC_CTL_PATH env var) DEFAULT_TRAFFIC_CTL_PATH = os.environ.get("TRAFFIC_CTL_PATH", "traffic_ctl") # ============================================================================= @@ -1367,11 +1367,7 @@ def main(): # ATS paths parser.add_argument( - '--traffic-ctl', - default=DEFAULT_TRAFFIC_CTL_PATH, - help='Path to traffic_ctl binary (default: $TRAFFIC_CTL_PATH or "traffic_ctl")') - parser.add_argument( - '--socket-path', + '--socket', default=DEFAULT_JSONRPC_SOCKET_PATH, help='Path to JSONRPC Unix socket (default: $TRAFFICSERVER_JSONRPC_SOCKET)') @@ -1418,7 +1414,7 @@ def main(): run_for=args.run_for, no_keyboard=args.no_keyboard, tz_name=tz_name, - socket_path=args.socket_path) + socket_path=args.socket) print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") if len(args.hosts) > 1: From 7a4fd39d6ef6f17817daf1823e3659e86f0e5cb4 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 14:52:20 -0700 Subject: [PATCH 04/15] traffic_grapher: Add localhost support and 
connection error feedback - Detect localhost/127.0.0.1/local and connect directly to the JSONRPC Unix socket instead of SSH, so 'traffic_grapher.py localhost' works without any SSH setup - Add startup connection test that reports success/failure for each host before entering the graph loop - Track and display collection errors: show in red on the dashboard status bar and print to stderr for the first few failures - Give clear error messages for common failures: socket not found, permission denied, connection refused, SSH failures, timeouts, empty responses Addresses review feedback from cmcfarlen. --- tools/traffic_grapher/README.md | 14 +- tools/traffic_grapher/traffic_grapher.py | 193 ++++++++++++++++++++--- 2 files changed, 184 insertions(+), 23 deletions(-) diff --git a/tools/traffic_grapher/README.md b/tools/traffic_grapher/README.md index f89d697d7bc..ed67bec4084 100644 --- a/tools/traffic_grapher/README.md +++ b/tools/traffic_grapher/README.md @@ -37,12 +37,15 @@ your terminal using imgcat. - Python 3.9+ - [uv](https://docs.astral.sh/uv/) (recommended) or pip - iTerm2 (or compatible terminal for inline images) -- SSH access to remote ATS hosts +- SSH access to remote ATS hosts (not needed for localhost) ## Quick Start ```bash -# With uv (handles dependencies automatically) +# Monitor ATS on the local machine (connects directly to JSONRPC socket) +uv run traffic_grapher.py localhost + +# Monitor a remote host (connects via SSH) uv run traffic_grapher.py ats-server1.example.com # Multiple hosts for comparison @@ -68,8 +71,11 @@ traffic-grapher ats-server1.example.com ### ATS Paths -The tool needs to know where the JSONRPC socket is on the remote host. -Configure via CLI flag or environment variable: +For localhost, the tool connects directly to the JSONRPC Unix socket. +For remote hosts, it sends a Python script via SSH that connects to +the socket on the remote machine. 
+ +Configure the socket path via CLI flag or environment variable: | Option | Env Var | Default | |--------|---------|---------| diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index f62ab8a0738..444b8393aa2 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -37,9 +37,11 @@ import fcntl import gc import io +import json import os import re import shutil +import socket as socket_module import struct import subprocess import sys @@ -568,6 +570,8 @@ def get_key(self) -> Optional[str]: "/usr/local/var/trafficserver/jsonrpc20.sock", ) +LOCAL_HOSTNAMES = {'localhost', '127.0.0.1', 'local', '::1'} + def _validate_hostname(hostname: str) -> str: """Validate hostname to prevent command injection.""" @@ -579,31 +583,130 @@ def _validate_hostname(hostname: str) -> str: class JSONRPCBatchCollector: """ Batch metric collector using JSONRPC Unix socket. - Collects all metrics for a host in a single SSH call. + Connects directly for localhost, uses SSH for remote hosts. """ def __init__(self, hostname: str, metric_keys: list, socket_path: str = DEFAULT_JSONRPC_SOCKET_PATH): - self.hostname = _validate_hostname(hostname) + self.is_local = hostname.lower() in LOCAL_HOSTNAMES + if self.is_local: + self.hostname = 'localhost' + else: + self.hostname = _validate_hostname(hostname) self.metric_keys = metric_keys self.socket_path = socket_path - # Build regex pattern matching all metric keys - # Escape dots and join with | escaped_keys = [k.replace('.', r'\.') for k in metric_keys] self.pattern = '|'.join(f"^{k}$" for k in escaped_keys) - # Cached results from last collection - self.last_values: dict = {} # key -> (value, data_type) + self.last_values: dict = {} + self.last_error: Optional[str] = None + self.consecutive_errors: int = 0 def collect(self) -> dict: - """ - Collect all metrics in one SSH call. - Returns dict of {metric_key: (value, data_type)}. 
- """ + """Collect all metrics. Uses local socket or SSH depending on host.""" + if self.is_local: + return self._collect_local() + return self._collect_remote() + + def test_connection(self) -> Tuple[bool, str]: + """Test connectivity and return (success, message).""" + if self.is_local: + return self._test_local() + return self._test_remote() + + def _test_local(self) -> Tuple[bool, str]: + """Test local JSONRPC socket connectivity.""" + if not os.path.exists(self.socket_path): + return False, f"Socket not found: {self.socket_path}" + try: + s = socket_module.socket(socket_module.AF_UNIX, socket_module.SOCK_STREAM) + s.settimeout(3) + s.connect(self.socket_path) + s.close() + return True, "OK" + except PermissionError: + return False, f"Permission denied: {self.socket_path}" + except ConnectionRefusedError: + return False, f"Connection refused: {self.socket_path} (is ATS running?)" + except OSError as e: + return False, f"Socket error: {e}" + + def _test_remote(self) -> Tuple[bool, str]: + """Test SSH connectivity to remote host.""" + try: + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout=5", "-o", "BatchMode=yes", self.hostname, "echo ok"], + capture_output=True, + text=True, + timeout=10) + if result.returncode != 0: + stderr = result.stderr.strip() + return False, f"SSH failed: {stderr or 'unknown error'}" + return True, "OK" + except subprocess.TimeoutExpired: + return False, "SSH timed out" + except OSError as e: + return False, f"SSH error: {e}" + + def _collect_local(self) -> dict: + """Connect directly to the local JSONRPC Unix socket.""" + try: + s = socket_module.socket(socket_module.AF_UNIX, socket_module.SOCK_STREAM) + s.settimeout(5) + s.connect(self.socket_path) + + request = {"jsonrpc": "2.0", "method": "admin_lookup_records", "params": [{"record_name_regex": self.pattern}], "id": 1} + s.sendall(json.dumps(request).encode() + b"\n") + + chunks = [] + while True: + chunk = s.recv(1048576) + if not chunk: + break + chunks.append(chunk) + 
try: + json.loads(b''.join(chunks)) + break + except json.JSONDecodeError: + continue + + s.close() + response_data = json.loads(b''.join(chunks).decode()) + + values = {} + for rec in response_data.get("result", {}).get("recordList", []): + r = rec.get("record", {}) + name = r.get("record_name", "") + raw_val = r.get("current_value", "") + dtype = r.get("data_type", "") + try: + values[name] = (float(raw_val), dtype) + except (ValueError, TypeError): + continue + + self.last_values = values + self.last_error = None + self.consecutive_errors = 0 + return values + + except FileNotFoundError: + self.last_error = f"Socket not found: {self.socket_path}" + except PermissionError: + self.last_error = f"Permission denied: {self.socket_path}" + except ConnectionRefusedError: + self.last_error = f"Connection refused (is ATS running?)" + except json.JSONDecodeError as e: + self.last_error = f"Invalid JSON response: {e}" + except OSError as e: + self.last_error = f"Socket error: {e}" + + self.consecutive_errors += 1 + return self.last_values + + def _collect_remote(self) -> dict: + """Collect via SSH to remote host.""" script = JSONRPC_SCRIPT.format(socket_path=self.socket_path, pattern=self.pattern) - # Encode script as base64 to avoid quoting issues. - # Use subprocess with list args (no shell=True) to prevent command injection. 
script_b64 = base64.b64encode(script.encode()).decode() remote_cmd = f"echo '{script_b64}' | base64 -d | python3" @@ -611,9 +714,11 @@ def collect(self) -> dict: result = subprocess.run(["ssh", self.hostname, remote_cmd], capture_output=True, text=True, timeout=10) if result.returncode != 0: + stderr = result.stderr.strip() + self.last_error = f"SSH error: {stderr[:200]}" if stderr else f"SSH exited with code {result.returncode}" + self.consecutive_errors += 1 return self.last_values - # Parse output: key=value=dtype per line values = {} for line in result.stdout.strip().split('\n'): if '=' in line: @@ -627,10 +732,26 @@ def collect(self) -> dict: dtype = parts[2] if len(parts) > 2 else "INT" values[key] = (value, dtype) + if not values and result.stdout.strip(): + self.last_error = f"No metrics parsed from output ({len(result.stdout)} bytes)" + self.consecutive_errors += 1 + elif not values: + self.last_error = "Empty response from remote host" + self.consecutive_errors += 1 + else: + self.last_error = None + self.consecutive_errors = 0 + self.last_values = values return values - except (subprocess.TimeoutExpired, OSError): + except subprocess.TimeoutExpired: + self.last_error = "SSH timed out (10s)" + self.consecutive_errors += 1 + return self.last_values + except OSError as e: + self.last_error = f"SSH error: {e}" + self.consecutive_errors += 1 return self.last_values @@ -908,6 +1029,9 @@ def collect_all_pages(self): batch_results.append(results) host_times.append(collect_time) + if batch_collector.last_error and batch_collector.consecutive_errors <= 3: + print(f"[{host_label}] {batch_collector.last_error}", file=sys.stderr) + if self.log_stats: log_lines.append(f" Got {len(results)} metrics\n") @@ -1173,9 +1297,19 @@ def render_page(self, fig: plt.Figure = None) -> plt.Figure: title += f" ({self.host_names[0]})" fig.suptitle(title, color=self.TEXT_COLOR, fontsize=18, fontweight='bold') - # Status bar - status = f"[←/→ or h/l pages, q quit] | {self.interval}s 
refresh | {self.history_seconds}s history" - fig.text(0.5, 0.01, status, ha='center', fontsize=13, color='#808080') + # Status bar - show errors if any hosts are failing + error_parts = [] + for host_idx, collector in enumerate(self.combined_collectors): + if collector.last_error: + host_label = self.host_names[host_idx] + error_parts.append(f"{host_label}: {collector.last_error}") + + if error_parts: + error_text = " | ".join(error_parts) + fig.text(0.5, 0.01, f"ERROR: {error_text}", ha='center', fontsize=13, color='#FF4040', fontweight='bold') + else: + status = f"[←/→ or h/l pages, q quit] | {self.interval}s refresh | {self.history_seconds}s history" + fig.text(0.5, 0.01, status, ha='center', fontsize=13, color='#808080') plt.tight_layout() plt.subplots_adjust(top=0.92, bottom=0.06, left=0.06, right=0.98, hspace=0.35, wspace=0.22) @@ -1421,8 +1555,29 @@ def main(): print(f"Comparing: {' vs '.join(grapher.host_names)}") else: print(f"Monitoring: {grapher.host_names[0]}") - print("Starting in 2 seconds... (press Ctrl+C to cancel)") - time.sleep(2) + + # Test connectivity to all hosts before starting + print("\nTesting connections...") + all_ok = True + for i, collector in enumerate(grapher.combined_collectors): + host_label = grapher.host_names[i] + mode = "local socket" if collector.is_local else "SSH" + print(f" {host_label} ({mode}): ", end="", flush=True) + ok, msg = collector.test_connection() + if ok: + print(f"OK") + else: + print(f"FAILED - {msg}") + all_ok = False + + if not all_ok: + print("\nWARNING: Some hosts failed connectivity test.") + print("The grapher will start anyway - errors will be shown on the dashboard.") + print("Press Ctrl+C to cancel, or wait 3 seconds to continue...") + time.sleep(3) + else: + print("\nAll hosts connected. 
Starting in 1 second...") + time.sleep(1) # Clear scrollback buffer at startup to free any previous accumulated images clear_scrollback() From 9cc7aa34d93dff9434bf433e0fe902da7b1d0fdd Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 14:57:16 -0700 Subject: [PATCH 05/15] traffic_grapher: Fix SyntaxWarning for regex pattern on Python 3.12+ Use raw string prefix for the regex pattern in the JSONRPC script template so backslash-dot escape sequences do not trigger SyntaxWarning/SyntaxError on remote hosts running Python 3.12+. --- tools/traffic_grapher/traffic_grapher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 444b8393aa2..b16e4f6dccb 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -547,7 +547,7 @@ def get_key(self) -> Optional[str]: request = {{ "jsonrpc": "2.0", "method": "admin_lookup_records", - "params": [{{"record_name_regex": "{pattern}"}}], + "params": [{{"record_name_regex": r"{pattern}"}}], "id": 1 }} sock.sendall(json.dumps(request).encode() + b"\\n") From ec6d723ae1c4d230ae5acc25ebaa43ed01d688b3 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 15:46:45 -0700 Subject: [PATCH 06/15] traffic_grapher: Auto-discover JSONRPC socket path on target host Instead of requiring the user to know where the JSONRPC socket is, auto-discover it by finding traffic_ctl (via PATH or common install prefixes like /usr/local, /opt/ats, /opt/trafficserver) and querying the runtime directory. Falls back to checking common socket locations. Discovery runs once at startup per host. Users can still override with --socket or TRAFFICSERVER_JSONRPC_SOCKET env var. 
--- tools/traffic_grapher/traffic_grapher.py | 127 +++++++++++++++++++++-- 1 file changed, 116 insertions(+), 11 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index b16e4f6dccb..649e6daf02e 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -564,14 +564,94 @@ def get_key(self) -> Optional[str]: print(f"{{name}}={{value}}={{dtype}}") ''' -# Default JSONRPC socket path (configurable via --socket-path or TRAFFICSERVER_JSONRPC_SOCKET env var) -DEFAULT_JSONRPC_SOCKET_PATH = os.environ.get( - "TRAFFICSERVER_JSONRPC_SOCKET", - "/usr/local/var/trafficserver/jsonrpc20.sock", -) +# Default JSONRPC socket path (configurable via --socket or TRAFFICSERVER_JSONRPC_SOCKET env var) +DEFAULT_JSONRPC_SOCKET_PATH = os.environ.get("TRAFFICSERVER_JSONRPC_SOCKET", None) LOCAL_HOSTNAMES = {'localhost', '127.0.0.1', 'local', '::1'} +# Discovery script: finds the JSONRPC socket on the target host by locating +# traffic_ctl (via PATH or common prefixes) and querying the runtime directory. 
# Discovery script: finds the JSONRPC socket on the target host by locating
# traffic_ctl (via PATH or common prefixes) and querying the local state
# directory.  Shipped to remote hosts base64-encoded and run with python3,
# so it must stay self-contained and stdlib-only.
DISCOVER_SOCKET_SCRIPT = r'''
import subprocess, os, sys

COMMON_PREFIXES = [
    "/usr/local",
    "/opt/ats",
    "/opt/trafficserver",
    "/usr",
    "/opt/ts",
]

def find_traffic_ctl():
    """Find traffic_ctl binary."""
    # Check PATH first
    for d in os.environ.get("PATH", "").split(":"):
        candidate = os.path.join(d, "traffic_ctl")
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    # Check common install prefixes
    for prefix in COMMON_PREFIXES:
        candidate = os.path.join(prefix, "bin", "traffic_ctl")
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None

def get_socket_via_traffic_ctl(tc):
    """Ask traffic_ctl for the state dir and build the socket path."""
    try:
        r = subprocess.run([tc, "config", "get", "proxy.config.local_state_dir"],
                           capture_output=True, text=True, timeout=5)
        if r.returncode == 0:
            parts = r.stdout.strip().split()
            # BUGFIX: empty output would raise IndexError on parts[0]
            if not parts:
                return False
            state_dir = parts[1] if len(parts) >= 2 else parts[0]
            # Could be relative to the install prefix -- check if absolute
            if not os.path.isabs(state_dir):
                prefix = os.path.dirname(os.path.dirname(tc))
                state_dir = os.path.join(prefix, state_dir)
            sock = os.path.join(state_dir, "jsonrpc20.sock")
            if os.path.exists(sock):
                print(sock)
                return True
    except Exception:
        # Best-effort probe: any failure just falls through to the
        # common-path scan below.
        pass
    return False

def check_common_paths():
    """Check common socket locations."""
    for prefix in COMMON_PREFIXES:
        sock = os.path.join(prefix, "var", "trafficserver", "jsonrpc20.sock")
        if os.path.exists(sock):
            print(sock)
            return True
    return False

tc = find_traffic_ctl()
if tc:
    if get_socket_via_traffic_ctl(tc):
        sys.exit(0)
if check_common_paths():
    sys.exit(0)
# Not found
sys.exit(1)
'''


def discover_socket_path(hostname: str, is_local: bool) -> Optional[str]:
    """Auto-discover the JSONRPC socket path on the target host.

    Runs DISCOVER_SOCKET_SCRIPT with the local python3, or pipes it over
    SSH for remote hosts (base64-encoded to sidestep shell quoting).

    Returns the discovered absolute socket path, or None when discovery
    fails, times out, or the script exits non-zero.
    """
    try:
        if is_local:
            result = subprocess.run(["python3", "-c", DISCOVER_SOCKET_SCRIPT],
                                    capture_output=True, text=True, timeout=10)
        else:
            script_b64 = base64.b64encode(DISCOVER_SOCKET_SCRIPT.encode()).decode()
            remote_cmd = f"echo '{script_b64}' | base64 -d | python3"
            result = subprocess.run(["ssh", hostname, remote_cmd],
                                    capture_output=True, text=True, timeout=10)

        if result.returncode == 0 and result.stdout.strip():
            # BUGFIX: take only the last non-empty line.  SSH banners or
            # login noise can precede the path on stdout; the original
            # returned all of stdout, yielding a bogus multi-line "path".
            return result.stdout.strip().splitlines()[-1].strip()
    except (subprocess.TimeoutExpired, OSError):
        pass
    return None
+791,10 @@ def _collect_local(self) -> dict: def _collect_remote(self) -> dict: """Collect via SSH to remote host.""" + if self.socket_path is None: + self.last_error = "No socket path (use --socket or set TRAFFICSERVER_JSONRPC_SOCKET)" + self.consecutive_errors += 1 + return self.last_values script = JSONRPC_SCRIPT.format(socket_path=self.socket_path, pattern=self.pattern) script_b64 = base64.b64encode(script.encode()).decode() @@ -1501,9 +1591,7 @@ def main(): # ATS paths parser.add_argument( - '--socket', - default=DEFAULT_JSONRPC_SOCKET_PATH, - help='Path to JSONRPC Unix socket (default: $TRAFFICSERVER_JSONRPC_SOCKET)') + '--socket', default=DEFAULT_JSONRPC_SOCKET_PATH, help='Path to JSONRPC Unix socket (auto-discovered if not set)') # Debug options parser.add_argument( @@ -1556,16 +1644,33 @@ def main(): else: print(f"Monitoring: {grapher.host_names[0]}") + # Auto-discover socket path for hosts that don't have one explicitly set + for i, collector in enumerate(grapher.combined_collectors): + if collector.socket_path is not None: + continue + host_label = grapher.host_names[i] + print(f"\nDiscovering JSONRPC socket on {host_label}...", end=" ", flush=True) + path = discover_socket_path(collector.hostname, collector.is_local) + if path: + print(f"found: {path}") + collector.socket_path = path + else: + print("not found (use --socket to specify)") + # Test connectivity to all hosts before starting print("\nTesting connections...") all_ok = True for i, collector in enumerate(grapher.combined_collectors): host_label = grapher.host_names[i] mode = "local socket" if collector.is_local else "SSH" + if collector.socket_path is None: + print(f" {host_label} ({mode}): SKIPPED - no socket path") + all_ok = False + continue print(f" {host_label} ({mode}): ", end="", flush=True) ok, msg = collector.test_connection() if ok: - print(f"OK") + print(f"OK ({collector.socket_path})") else: print(f"FAILED - {msg}") all_ok = False From 7b42c274a800541d9b202340f1f0e1d13e92bdeb Mon 
Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 15:54:32 -0700 Subject: [PATCH 07/15] traffic_grapher: Fix GUI mode window sizing In GUI mode, start with a large default size (16x10 inches) and maximize the window on startup. On subsequent frames, don't override the figure size so user resizes are preserved. The terminal-based size calculation is only used for imgcat mode. --- tools/traffic_grapher/traffic_grapher.py | 29 +++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 649e6daf02e..5a163775b57 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1249,16 +1249,18 @@ def render_page(self, fig: plt.Figure = None) -> plt.Figure: page = self.pages[self.current_page] current_elapsed = time.time() - self.start_time - # Set up dark theme - dynamic figure size to fill terminal plt.style.use('dark_background') - fig_width, fig_height = get_figure_size_for_terminal() if fig is None: - # Create new figure - fig, axes = plt.subplots(2, 2, figsize=(fig_width, fig_height)) + if self.gui_mode: + fig, axes = plt.subplots(2, 2, figsize=(16, 10)) + else: + fig_width, fig_height = get_figure_size_for_terminal() + fig, axes = plt.subplots(2, 2, figsize=(fig_width, fig_height)) else: - # Reuse existing figure, create new axes - fig.set_size_inches(fig_width, fig_height) + if not self.gui_mode: + fig_width, fig_height = get_figure_size_for_terminal() + fig.set_size_inches(fig_width, fig_height) axes = fig.subplots(2, 2) fig.patch.set_facecolor(self.FIG_BG_COLOR) # Pure black outside graphs @@ -1501,12 +1503,23 @@ def run_gui(self): """Run the grapher in GUI mode with matplotlib window.""" from matplotlib.animation import FuncAnimation - # Collect initial data for all pages self.collect_all_pages() - # Create initial figure fig = self.render_page() + # Maximize the window on startup + try: + 
manager = plt.get_current_fig_manager() + backend = matplotlib.get_backend().lower() + if 'macosx' in backend: + manager.full_screen_toggle() + elif 'tk' in backend: + manager.window.state('zoomed') + elif 'qt' in backend: + manager.window.showMaximized() + except Exception: + pass + # Calculate number of frames if run_for specified if self.run_for: num_frames = int(self.run_for / self.interval) From 3cf6b482a4e8f731eff6ff914d3e46b8b34163cf Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:08:48 -0700 Subject: [PATCH 08/15] traffic_grapher: Default to localhost when no hosts specified Running 'traffic_grapher.py' with no arguments now monitors the local ATS instance instead of requiring 'localhost' to be passed explicitly. --- tools/traffic_grapher/traffic_grapher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 5a163775b57..515b4140f4f 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1585,6 +1585,7 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: + %(prog)s # monitor localhost %(prog)s ats-server1.example.com %(prog)s ats-server{1..3}.example.com # bash expansion %(prog)s --interval 2 --history 120 ats-server{1..4}.example.com @@ -1592,7 +1593,7 @@ def main(): """) parser.add_argument( - 'hosts', nargs='+', metavar='HOSTNAME', help='Hostnames to monitor (1-4 hosts, e.g., ats-server1.example.com)') + 'hosts', nargs='*', default=['localhost'], metavar='HOSTNAME', help='Hostnames to monitor (default: localhost, max 4)') parser.add_argument('--interval', type=float, default=1.0, help='Refresh interval in seconds (default: 1.0)') parser.add_argument('--history', type=int, default=60, help='History window in seconds (default: 60)') parser.add_argument('--gui', action='store_true', help='Use matplotlib GUI window instead of imgcat') From 
c3571e98d9d8a6fd38606a0cdc1b29a56417c756 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:13:17 -0700 Subject: [PATCH 09/15] traffic_grapher: Don't maximize GUI window on startup The 16x10 default size is reasonable and the user can resize freely. --- tools/traffic_grapher/traffic_grapher.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 515b4140f4f..9456bc0e4f8 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1507,19 +1507,6 @@ def run_gui(self): fig = self.render_page() - # Maximize the window on startup - try: - manager = plt.get_current_fig_manager() - backend = matplotlib.get_backend().lower() - if 'macosx' in backend: - manager.full_screen_toggle() - elif 'tk' in backend: - manager.window.state('zoomed') - elif 'qt' in backend: - manager.window.showMaximized() - except Exception: - pass - # Calculate number of frames if run_for specified if self.run_for: num_frames = int(self.run_for / self.interval) From 4e59a7257516d232e496ba311857d6444d1a6a44 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:13:56 -0700 Subject: [PATCH 10/15] traffic_grapher: Use 14x8 default GUI size to fit within screen --- tools/traffic_grapher/traffic_grapher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 9456bc0e4f8..4c284b86735 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1253,7 +1253,7 @@ def render_page(self, fig: plt.Figure = None) -> plt.Figure: if fig is None: if self.gui_mode: - fig, axes = plt.subplots(2, 2, figsize=(16, 10)) + fig, axes = plt.subplots(2, 2, figsize=(14, 8)) else: fig_width, fig_height = get_figure_size_for_terminal() fig, axes = plt.subplots(2, 2, figsize=(fig_width, fig_height)) From 
e6e59ffd4c87209b906f13ba4ceda540eb2503d4 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:16:23 -0700 Subject: [PATCH 11/15] traffic_grapher: Add keyboard navigation and hide toolbar in GUI mode Hook into matplotlib key_press_event so h/l and arrow keys switch pages in GUI mode. Hide the default navigation toolbar since pan/zoom controls are not useful for a live dashboard. --- tools/traffic_grapher/traffic_grapher.py | 26 ++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 4c284b86735..09cf6ecd9ad 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1507,18 +1507,36 @@ def run_gui(self): fig = self.render_page() - # Calculate number of frames if run_for specified + # Hide the default matplotlib toolbar + try: + fig.canvas.manager.toolbar.pack_forget() + except Exception: + try: + fig.canvas.toolbar.setVisible(False) + except Exception: + pass + + # Keyboard navigation for page switching + def on_key(event): + if event.key in ('left', 'h'): + self.current_page = (self.current_page - 1) % len(self.pages) + elif event.key in ('right', 'l'): + self.current_page = (self.current_page + 1) % len(self.pages) + elif event.key == 'q': + plt.close(fig) + + fig.canvas.mpl_connect('key_press_event', on_key) + if self.run_for: num_frames = int(self.run_for / self.interval) repeat = False else: - num_frames = None # Infinite + num_frames = None repeat = True def update(frame): self.collect_all_pages() - fig.clf() # Clear the existing figure, don't create new one - # Re-render onto the same figure + fig.clf() self.render_page(fig=fig) fig.canvas.draw_idle() return [] From 1afdc66f5447e6e1c26ff1afcb02140d6fb4c737 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:17:38 -0700 Subject: [PATCH 12/15] traffic_grapher: Fix toolbar hiding on MacOSX backend Set rcParams toolbar 
to None before figure creation instead of trying to remove it after. --- tools/traffic_grapher/traffic_grapher.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 09cf6ecd9ad..c8d9943da5e 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1505,17 +1505,9 @@ def run_gui(self): self.collect_all_pages() + matplotlib.rcParams['toolbar'] = 'None' fig = self.render_page() - # Hide the default matplotlib toolbar - try: - fig.canvas.manager.toolbar.pack_forget() - except Exception: - try: - fig.canvas.toolbar.setVisible(False) - except Exception: - pass - # Keyboard navigation for page switching def on_key(event): if event.key in ('left', 'h'): From 3fedbc04af5aea88e72c17840e021d1c6383282a Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:18:43 -0700 Subject: [PATCH 13/15] traffic_grapher: Revert toolbar hiding, MacOSX backend ignores it --- tools/traffic_grapher/traffic_grapher.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index c8d9943da5e..d6a80ee5b4a 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -60,7 +60,6 @@ import matplotlib # Check for --gui in sys.argv early to set backend before importing pyplot if '--gui' in sys.argv: - # Use MacOSX on macOS, fall back to TkAgg on other platforms import platform if platform.system() == 'Darwin': matplotlib.use('MacOSX') @@ -1505,7 +1504,6 @@ def run_gui(self): self.collect_all_pages() - matplotlib.rcParams['toolbar'] = 'None' fig = self.render_page() # Keyboard navigation for page switching From 27edb7bbbe8e73b4fda8a7c643d6704d51275f82 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:19:22 -0700 Subject: [PATCH 14/15] traffic_grapher: Hide toolbar in GUI mode via rcParams Set 
toolbar to None before pyplot import so the matplotlib window does not show the default navigation toolbar. --- tools/traffic_grapher/traffic_grapher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index d6a80ee5b4a..3d4ab54cacb 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -65,6 +65,7 @@ matplotlib.use('MacOSX') else: matplotlib.use('TkAgg') + matplotlib.rcParams['toolbar'] = 'None' else: matplotlib.use('Agg') import matplotlib.pyplot as plt From 1624504a47324a8b550b91863e4f92f489ca7690 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:22:08 -0700 Subject: [PATCH 15/15] traffic_grapher: Suppress terminal output in GUI mode In GUI mode, skip all startup prints and stderr collection errors since the dashboard already shows errors visually. This allows running in the background without output spilling into the terminal. --- tools/traffic_grapher/traffic_grapher.py | 44 +++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 3d4ab54cacb..769fdfb4a4c 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1119,7 +1119,7 @@ def collect_all_pages(self): batch_results.append(results) host_times.append(collect_time) - if batch_collector.last_error and batch_collector.consecutive_errors <= 3: + if batch_collector.last_error and batch_collector.consecutive_errors <= 3 and not self.gui_mode: print(f"[{host_label}] {batch_collector.last_error}", file=sys.stderr) if self.log_stats: @@ -1648,49 +1648,61 @@ def main(): tz_name=tz_name, socket_path=args.socket) - print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") - if len(args.hosts) > 1: - print(f"Comparing: {' vs '.join(grapher.host_names)}") - else: - 
print(f"Monitoring: {grapher.host_names[0]}") + # In GUI mode, suppress terminal output -- errors show on the dashboard + verbose = not args.gui + + if verbose: + print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") + if len(args.hosts) > 1: + print(f"Comparing: {' vs '.join(grapher.host_names)}") + else: + print(f"Monitoring: {grapher.host_names[0]}") # Auto-discover socket path for hosts that don't have one explicitly set for i, collector in enumerate(grapher.combined_collectors): if collector.socket_path is not None: continue host_label = grapher.host_names[i] - print(f"\nDiscovering JSONRPC socket on {host_label}...", end=" ", flush=True) + if verbose: + print(f"\nDiscovering JSONRPC socket on {host_label}...", end=" ", flush=True) path = discover_socket_path(collector.hostname, collector.is_local) if path: - print(f"found: {path}") + if verbose: + print(f"found: {path}") collector.socket_path = path else: - print("not found (use --socket to specify)") + if verbose: + print("not found (use --socket to specify)") # Test connectivity to all hosts before starting - print("\nTesting connections...") + if verbose: + print("\nTesting connections...") all_ok = True for i, collector in enumerate(grapher.combined_collectors): host_label = grapher.host_names[i] mode = "local socket" if collector.is_local else "SSH" if collector.socket_path is None: - print(f" {host_label} ({mode}): SKIPPED - no socket path") + if verbose: + print(f" {host_label} ({mode}): SKIPPED - no socket path") all_ok = False continue - print(f" {host_label} ({mode}): ", end="", flush=True) + if verbose: + print(f" {host_label} ({mode}): ", end="", flush=True) ok, msg = collector.test_connection() if ok: - print(f"OK ({collector.socket_path})") + if verbose: + print(f"OK ({collector.socket_path})") else: - print(f"FAILED - {msg}") + if verbose: + print(f"FAILED - {msg}") all_ok = False - if not all_ok: + if not all_ok and verbose: print("\nWARNING: Some hosts 
failed connectivity test.") print("The grapher will start anyway - errors will be shown on the dashboard.") print("Press Ctrl+C to cancel, or wait 3 seconds to continue...") time.sleep(3) - else: + elif verbose: print("\nAll hosts connected. Starting in 1 second...") time.sleep(1)