From ed6173f0042fd6ecd4a01915e531debc0cd0d17f Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Mon, 2 Feb 2026 12:30:37 -0800 Subject: [PATCH 01/15] tools: Add traffic_grapher for real-time ATS metrics visualization A Python tool that displays ATS metrics inline in iTerm2 using imgcat with live updates and multi-host comparison. Features: - Real-time graphs of RPS, latency, cache hit rate, connections - Support for 1-4 hosts with different line styles for comparison - Collects metrics via JSONRPC Unix socket (batch collection) - Dark theme optimized for terminal display - Keyboard navigation between metric pages (4 pages) - Configurable refresh interval and history window Requirements: - Python 3 with matplotlib - iTerm2 (or compatible terminal for inline images) - SSH access to remote ATS hosts Usage: traffic_grapher.py ats-server1.example.com traffic_grapher.py ats{1..4}.example.com --interval 2 --- tools/traffic_grapher/traffic_grapher.py | 1411 ++++++++++++++++++++++ 1 file changed, 1411 insertions(+) create mode 100755 tools/traffic_grapher/traffic_grapher.py diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py new file mode 100755 index 00000000000..b6b6015db3c --- /dev/null +++ b/tools/traffic_grapher/traffic_grapher.py @@ -0,0 +1,1411 @@ +#!/usr/bin/env python3 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Traffic Grapher - Real-time ATS metrics visualization. + +Displays metrics inline in iTerm using imgcat with multi-host overlay comparison, +keyboard navigation between pages, and flicker-free live updates. + +Usage: + traffic_grapher.py ats-server1.example.com + traffic_grapher.py ats-server{1..4}.example.com + traffic_grapher.py --interval 2 --history 120 ats-server1.example.com ats-server2.example.com +""" + +import argparse +import base64 +import fcntl +import gc +import io +import os +import select +import shutil +import struct +import subprocess +import sys +import termios +import time +import tty +from collections import deque +from datetime import datetime, timezone +from typing import Optional, Tuple + +try: + from zoneinfo import ZoneInfo +except ImportError: + from datetime import tzinfo + ZoneInfo = None # Will fall back to UTC only + +import matplotlib +# Check for --gui in sys.argv early to set backend before importing pyplot +if '--gui' in sys.argv: + # Use MacOSX on macOS, fall back to TkAgg on other platforms + import platform + if platform.system() == 'Darwin': + matplotlib.use('MacOSX') + else: + matplotlib.use('TkAgg') +else: + matplotlib.use('Agg') +import matplotlib.pyplot as plt +from matplotlib.ticker import FuncFormatter +import yaml + +# ============================================================================= +# Built-in Default Pages - 2x2 grid per page +# ============================================================================= + +DEFAULT_PAGES = [ + { + "name": "Traffic & Cache", + "panels": + [ + { + "title": 
"Requests/sec", + "metrics": + [ + { + "name": "Client", + "type": "counter", + "key": "proxy.process.http.incoming_requests", + "color": "#FFFF00" + }, # Yellow + { + "name": "Origin", + "type": "counter", + "key": "proxy.process.http.outgoing_requests", + "color": "#BF00FF" + }, # Electric Purple (complement) + ] + }, + { + "title": "Latency (ms)", + "metrics": + [ + { + "name": "Cache Hit", + "type": "latency", + "key": "proxy.process.http.transaction_totaltime.hit_fresh", + "key2": "proxy.process.http.transaction_counts.hit_fresh", + "color": "#00FF00" + }, # Lime Green + { + "name": "Origin", + "type": "latency", + "key": "proxy.process.http.transaction_totaltime.miss_cold", + "key2": "proxy.process.http.transaction_counts.miss_cold", + "color": "#FF00FF" + }, # Magenta (complement) + ] + }, + { + "title": "Cache Hit Rate %", + "metrics": + [ + { + "name": "Hit Rate", + "type": "hit_rate", + "key": "proxy.process.cache_total_hits", + "key2": "proxy.process.cache_total_misses", + "color": "#00FF7F" + }, # Spring Green + ] + }, + { + "title": "Connections", + "metrics": + [ + { + "name": "Client", + "type": "gauge", + "key": "proxy.process.http.current_client_connections", + "color": "#00FFFF" + }, # Cyan + { + "name": "Origin", + "type": "gauge", + "key": "proxy.process.http.current_server_connections", + "color": "#FF4040" + }, # Bright Red (complement) + ] + }, + ] + }, + { + "name": "Response Codes", + "panels": + [ + { + "title": "2xx Responses/sec", + "metrics": [{ + "name": "2xx", + "type": "counter", + "key": "proxy.process.http.2xx_responses", + "color": "#00FF00" + },] + }, + { + "title": "3xx Responses/sec", + "metrics": [{ + "name": "3xx", + "type": "counter", + "key": "proxy.process.http.3xx_responses", + "color": "#00FFFF" + },] + }, + { + "title": "4xx Responses/sec", + "metrics": [{ + "name": "4xx", + "type": "counter", + "key": "proxy.process.http.4xx_responses", + "color": "#FFA500" + },] + }, + { + "title": "5xx Responses/sec", + "metrics": [{ 
+ "name": "5xx", + "type": "counter", + "key": "proxy.process.http.5xx_responses", + "color": "#FF0000" + },] + }, + ] + }, + { + "name": "TLS & HTTP/2", + "panels": + [ + { + "title": "SSL Handshakes/sec", + "metrics": + [ + { + "name": "Success", + "type": "counter", + "key": "proxy.process.ssl.total_success_handshake_count_in", + "color": "#00FF00" + }, + { + "name": "Failed", + "type": "counter", + "key": "proxy.process.ssl.ssl_error_ssl", + "color": "#FF0000" + }, + ] + }, + { + "title": "SSL Connections", + "metrics": + [{ + "name": "Active", + "type": "gauge", + "key": "proxy.process.ssl.user_agent_sessions", + "color": "#00FFFF" + },] + }, + { + "title": "HTTP/2 Connections", + "metrics": + [ + { + "name": "Total", + "type": "counter", + "key": "proxy.process.http2.total_client_connections", + "color": "#FF00FF" + }, + { + "name": "Active", + "type": "gauge", + "key": "proxy.process.http2.current_client_sessions", + "color": "#00FFFF" + }, + ] + }, + { + "title": "HTTP/2 Errors/sec", + "metrics": + [ + { + "name": "Errors", + "type": "counter", + "key": "proxy.process.http2.connection_errors", + "color": "#FF0000" + }, + ] + }, + ] + }, + { + "name": "Network & Errors", + "panels": + [ + { + "title": "Client Bytes/sec", + "metrics": + [ + { + "name": "Read", + "type": "counter", + "key": "proxy.process.http.user_agent_total_request_bytes", + "color": "#00FFFF" + }, + { + "name": "Write", + "type": "counter", + "key": "proxy.process.http.user_agent_total_response_bytes", + "color": "#FF00FF" + }, + ] + }, + { + "title": "Origin Bytes/sec", + "metrics": + [ + { + "name": "Read", + "type": "counter", + "key": "proxy.process.http.origin_server_total_response_bytes", + "color": "#00FF00" + }, + { + "name": "Write", + "type": "counter", + "key": "proxy.process.http.origin_server_total_request_bytes", + "color": "#FFA500" + }, + ] + }, + { + "title": "Connection Errors/sec", + "metrics": + [ + { + "name": "Connect Fail", + "type": "counter", + "key": 
def cursor_home():
    """Jump the cursor back to row 1, column 1 without clearing anything."""
    print('\033[H', end='', flush=True)


def clear_screen():
    """Erase the visible screen and home the cursor."""
    print('\033[2J\033[H', end='', flush=True)


def clear_scrollback():
    """Clear scrollback to free memory from accumulated inline images.

    Emits the iTerm2-proprietary ClearScrollback OSC first, then the
    standard CSI 3J that some other terminals honor.
    """
    print('\033]1337;ClearScrollback\007\033[3J', end='', flush=True)


def hide_cursor():
    """Hide the terminal cursor (DECTCEM off)."""
    print('\033[?25l', end='', flush=True)


def show_cursor():
    """Show the terminal cursor (DECTCEM on)."""
    print('\033[?25h', end='', flush=True)


def imgcat_display(fig):
    """
    Display a matplotlib figure inline in iTerm2 using the imgcat protocol.
    """
    # Render at the exact figure size (no bbox trimming) so the image
    # fills the terminal precisely.
    with io.BytesIO() as buf:
        fig.savefig(buf, format='png', dpi=100, facecolor=fig.get_facecolor(), edgecolor='none')
        payload = base64.b64encode(buf.getvalue()).decode('ascii')

    # iTerm2 inline image protocol: OSC 1337 File=... BEL
    sys.stdout.write(f'\033]1337;File=inline=1:{payload}\a\n')
    sys.stdout.flush()
+ """ + buf = io.BytesIO() + # Don't use bbox_inches='tight' - we want exact figure dimensions to fill terminal + fig.savefig(buf, format='png', dpi=100, facecolor=fig.get_facecolor(), edgecolor='none') + buf.seek(0) + image_data = base64.b64encode(buf.read()).decode('ascii') + buf.close() + + # iTerm2 inline image protocol + sys.stdout.write(f'\033]1337;File=inline=1:{image_data}\a\n') + sys.stdout.flush() + + +def format_value(value: float, is_percent: bool = False, is_latency: bool = False) -> str: + """Format a value with K/M suffix for readability, or as percentage/latency.""" + if value is None: + return "N/A" + if is_percent: + return f"{value:.0f}%" + if is_latency: + # Format latency in milliseconds + if value >= 1000: + return f"{value/1000:.1f}s" + elif value >= 1: + return f"{value:.0f}ms" + else: + return f"{value:.1f}ms" + if abs(value) >= 1_000_000_000_000: + return f"{value/1_000_000_000_000:.1f}T" + if abs(value) >= 1_000_000_000: + return f"{value/1_000_000_000:.1f}G" + if abs(value) >= 1_000_000: + return f"{value/1_000_000:.1f}M" + elif abs(value) >= 1_000: + return f"{value/1_000:.1f}K" + elif abs(value) >= 1: + return f"{value:.0f}" + else: + return f"{value:.2f}" + + +def format_ytick(value, pos): + """Format Y-axis tick values with K/M/G/T suffixes, no decimals.""" + if value == 0: + return '0' + if abs(value) < 1: + # Very small values - show with appropriate precision + if abs(value) < 0.01: + return '0' + return f'{value:.1f}' + if abs(value) >= 1e12: + return f'{int(value/1e12)}T' + if abs(value) >= 1e9: + return f'{int(value/1e9)}G' + if abs(value) >= 1e6: + return f'{int(value/1e6)}M' + if abs(value) >= 1e3: + return f'{int(value/1e3)}K' + return f'{int(value)}' + + +def get_terminal_pixel_size() -> Tuple[int, int]: + """ + Get terminal size in pixels using TIOCGWINSZ ioctl. + Returns (width, height) in pixels, or estimated size from rows/cols. 
+ """ + try: + # Try to get pixel size from terminal + result = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, b'\x00' * 8) + rows, cols, xpixel, ypixel = struct.unpack('HHHH', result) + + if xpixel > 0 and ypixel > 0: + return (xpixel, ypixel) + + # Estimate from rows/cols (typical cell: 9x18 pixels) + return (cols * 9, rows * 18) + except: + # Fallback: assume reasonable terminal size + size = shutil.get_terminal_size((80, 24)) + return (size.columns * 9, size.lines * 18) + + +def get_figure_size_for_terminal() -> Tuple[float, float]: + """ + Calculate matplotlib figure size to fill terminal. + Returns (width, height) in inches for matplotlib. + """ + pixel_width, pixel_height = get_terminal_pixel_size() + + # imgcat displays at native resolution, so we want pixel dimensions + # that match the terminal. Use 100 DPI as our reference. + dpi = 100 + + # Use nearly full terminal width/height - imgcat will display at native size + # Leave minimal margin for terminal chrome + width_inches = (pixel_width * 0.99) / dpi + height_inches = (pixel_height * 0.92) / dpi # Leave room for status bar + + # Minimum bounds for readability, no max - let it fill the terminal + width_inches = max(12, width_inches) + height_inches = max(8, height_inches) + + return (width_inches, height_inches) + + +class KeyboardHandler: + """Non-blocking keyboard input handler.""" + + def __init__(self): + self.old_settings = None + self.fd = None + + def __enter__(self): + self.fd = sys.stdin.fileno() + self.old_settings = termios.tcgetattr(self.fd) + # Set raw mode for better key detection + new_settings = termios.tcgetattr(self.fd) + new_settings[3] = new_settings[3] & ~(termios.ICANON | termios.ECHO) + new_settings[6][termios.VMIN] = 0 + new_settings[6][termios.VTIME] = 0 + termios.tcsetattr(self.fd, termios.TCSADRAIN, new_settings) + return self + + def __exit__(self, *args): + if self.old_settings and self.fd is not None: + termios.tcsetattr(self.fd, termios.TCSADRAIN, self.old_settings) 
+ + def get_key(self) -> Optional[str]: + """Get a keypress if available, non-blocking.""" + try: + ch = os.read(self.fd, 1) + if not ch: + return None + + if ch == b'\x1b': # Escape sequence + # Read more bytes for arrow key sequences + try: + ch2 = os.read(self.fd, 1) + if ch2 == b'[': + ch3 = os.read(self.fd, 1) + if ch3 == b'D': + return 'left' + elif ch3 == b'C': + return 'right' + elif ch3 == b'A': + return 'up' + elif ch3 == b'B': + return 'down' + except: + pass + return 'escape' + elif ch in (b'q', b'Q'): + return 'quit' + elif ch in (b'h', b'H'): # vim-style left + return 'left' + elif ch in (b'l', b'L'): # vim-style right + return 'right' + except (OSError, IOError): + pass + return None + + +# ============================================================================= +# Metric Collection +# ============================================================================= + +# JSONRPC script template - runs on remote host via SSH +JSONRPC_SCRIPT = ''' +import socket +import json + +sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) +sock.connect("{socket_path}") + +request = {{ + "jsonrpc": "2.0", + "method": "admin_lookup_records", + "params": [{{"record_name_regex": "{pattern}"}}], + "id": 1 +}} +sock.sendall(json.dumps(request).encode() + b"\\n") +response = sock.recv(1048576).decode() +sock.close() + +# Parse and output simple key=value format +data = json.loads(response) +for rec in data.get("result", {{}}).get("recordList", []): + r = rec.get("record", {{}}) + name = r.get("record_name", "") + value = r.get("current_value", "") + dtype = r.get("data_type", "") + print(f"{{name}}={{value}}={{dtype}}") +''' + +# Default JSONRPC socket path (adjust for your installation) +JSONRPC_SOCKET_PATH = "/opt/edge/trafficserver/10.0/var/trafficserver/jsonrpc20.sock" + + +class JSONRPCBatchCollector: + """ + Batch metric collector using JSONRPC Unix socket. + Collects all metrics for a host in a single SSH call. 
class JSONRPCBatchCollector:
    """
    Batch metric collector using JSONRPC Unix socket.
    Collects all metrics for a host in a single SSH call.
    """

    def __init__(self, hostname: str, metric_keys: list, socket_path: str = JSONRPC_SOCKET_PATH):
        """
        Args:
            hostname: bare SSH target like "ats-server1.example.com".
            metric_keys: exact metric names fetched on every collect().
            socket_path: JSONRPC Unix socket path on the remote host.
        """
        self.hostname = hostname  # Bare hostname like "ats-server1.example.com"
        self.metric_keys = metric_keys
        self.socket_path = socket_path

        # Build a regex matching each metric key exactly:
        # escape dots and join the anchored alternatives with |
        escaped_keys = [k.replace('.', r'\.') for k in metric_keys]
        self.pattern = '|'.join(f"^{k}$" for k in escaped_keys)

        # Cached results from the last successful collection; returned on
        # failure so a transient SSH error does not blank the graphs.
        self.last_values: dict = {}  # key -> (value, data_type)

    def collect(self) -> dict:
        """
        Collect all metrics in one SSH call.

        Returns:
            dict of {metric_key: (float value, data_type string)}. On any
            failure (non-zero exit, timeout, OS error) the previous
            successful result is returned instead.
        """
        script = JSONRPC_SCRIPT.format(socket_path=self.socket_path, pattern=self.pattern)

        # Encode the helper script as base64 to avoid nested shell quoting.
        # NOTE(security): hostname is interpolated into a shell=True command
        # string; only pass trusted hostnames (e.g. your own CLI arguments).
        script_b64 = base64.b64encode(script.encode()).decode()
        cmd = f"ssh {self.hostname} \"echo '{script_b64}' | base64 -d | python3\""

        try:
            result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=10)

            if result.returncode != 0:
                return self.last_values

            # Parse output: key=value=dtype per line
            values = {}
            for line in result.stdout.strip().split('\n'):
                if '=' in line:
                    parts = line.split('=', 2)
                    if len(parts) >= 2:
                        key = parts[0]
                        try:
                            value = float(parts[1])
                        except ValueError:
                            # Was a bare `except:`; only a malformed number
                            # should skip the line.
                            continue
                        dtype = parts[2] if len(parts) > 2 else "INT"
                        values[key] = (value, dtype)

            self.last_values = values
            return values

        except (subprocess.TimeoutExpired, OSError):
            return self.last_values
class MetricCollector:
    """Collects metric values from a host via shell commands.

    Gauges return the sampled value directly; counters return a per-second
    rate computed from consecutive samples.
    """

    def __init__(self, name: str, key: str, metric_type: str, color: str, host_prefix: str = "", host_name: str = ""):
        self.name = name
        self.key = key
        self.metric_type = metric_type.lower()
        self.color = color
        self.host_prefix = host_prefix
        self.host_name = host_name

        # Remote hosts run traffic_ctl over SSH with the awk post-processing
        # done locally (avoids SSH quote escaping); local runs pipe directly.
        if host_prefix:
            remote_cmd = METRIC_COMMAND_REMOTE.format(key=key)
            self.command = f"{host_prefix} '{remote_cmd}' | awk '{{print $2}}'"
        else:
            self.command = METRIC_COMMAND_LOCAL.format(key=key)

        # Previous sample (value, timestamp) used for counter rate deltas.
        self._prev_value: Optional[float] = None
        self._prev_time: Optional[float] = None

        # Most recent computed value, kept around for display.
        self.latest_value: Optional[float] = None

    def _get_raw_value(self) -> Optional[float]:
        """Run the shell command and parse its stdout as a float, or None."""
        try:
            proc = subprocess.run(self.command, shell=True, capture_output=True, text=True, timeout=5)
            if proc.returncode != 0:
                return None
            text = proc.stdout.strip()
            if not text:
                return None
            return float(text)
        except (subprocess.TimeoutExpired, ValueError, OSError):
            return None

    def collect(self) -> Optional[float]:
        """Sample the metric: raw value for gauges, rate for counters."""
        sample = self._get_raw_value()
        now = time.time()

        if self.metric_type == 'gauge':
            self.latest_value = sample
            return sample

        # Counter semantics from here on: two samples needed for a rate.
        if sample is None:
            return None

        if self._prev_value is None or self._prev_time is None:
            # First sample only primes the baseline.
            self._prev_value, self._prev_time = sample, now
            return None

        elapsed = now - self._prev_time
        if elapsed <= 0:
            return None

        if sample < self._prev_value:
            # Counter reset (e.g. process restart): re-prime and skip.
            self._prev_value, self._prev_time = sample, now
            return None

        rate = (sample - self._prev_value) / elapsed
        self._prev_value, self._prev_time = sample, now

        self.latest_value = rate
        return rate
class ATSGrapher:
    """Main grapher class with 2x2 grid layout and dark theme."""

    # Dark theme colors
    FIG_BG_COLOR = '#000000'  # Pure black for figure background (titles, labels area)
    PLOT_BG_COLOR = '#1a1a1a'  # Dark grey for inside the plots
    TEXT_COLOR = '#ffffff'
    GRID_COLOR = '#555555'  # Lighter grid for better visibility
    AXIS_COLOR = '#888888'  # Visible axis/spine color

    # Line styles for differentiating hosts (up to 4)
    LINE_STYLES = ['-', '--', '-.', ':']  # solid, dashed, dash-dot, dotted

    def __init__(
            self,
            hostnames: list,
            interval: float,
            history_seconds: int,
            pages: list,
            gui_mode: bool = False,
            save_png: Optional[str] = None,
            log_stats: Optional[str] = None,
            run_for: Optional[int] = None,
            no_keyboard: bool = False,
            tz_name: str = "UTC"):
        """Set up collectors, per-page data structures, and display state.

        Args:
            hostnames: bare hostnames, one batch collector is created per host.
            interval: refresh interval in seconds.
            history_seconds: sliding window of data kept and plotted.
            pages: page/panel/metric configuration (see DEFAULT_PAGES).
            gui_mode: use a GUI matplotlib backend instead of imgcat.
            save_png: optional PNG path; '{iter}' is replaced per iteration.
            log_stats: optional path for a diagnostic stats log (truncated here).
            run_for: optional total run time in seconds.
            no_keyboard: disable interactive key handling (non-TTY use).
            tz_name: IANA timezone name for timestamps; falls back to UTC.
        """
        self.hostnames = hostnames  # Bare hostnames like ["ats-server1.example.com"]
        self.interval = interval
        self.history_seconds = history_seconds
        self.pages = pages
        self.gui_mode = gui_mode
        self.save_png = save_png
        self.log_stats = log_stats
        self.run_for = run_for
        self.no_keyboard = no_keyboard
        self.tz_name = tz_name
        self.num_hosts = len(hostnames)

        # Set up timezone; any ZoneInfo failure (bad name, missing tzdata)
        # silently falls back to UTC and the displayed tz_name tracks that.
        if ZoneInfo and tz_name != "UTC":
            try:
                self.tz = ZoneInfo(tz_name)
            except:
                self.tz = timezone.utc
                self.tz_name = "UTC"
        else:
            self.tz = timezone.utc
            if tz_name != "UTC":
                self.tz_name = "UTC"  # Fallback if zoneinfo not available

        self.current_page = 0
        self.running = True
        self.start_time = time.time()
        self.iteration = 0

        # Extract short host names for display (e.g., "e1" from "ats-server1.example.com")
        self.host_names = [self._extract_short_name(h) for h in hostnames]

        # Initialize log file if specified (truncates any existing file)
        if self.log_stats:
            with open(self.log_stats, 'w') as f:
                f.write(f"# Traffic Grapher Stats Log - {datetime.now()}\n")
                f.write(f"# Hosts: {', '.join(hostnames)}\n")
                f.write("#\n")

        # Collect ALL unique metric keys across ALL pages for combined batch collection
        all_metric_keys = []
        for page in pages:
            for panel in page["panels"]:
                for metric in panel["metrics"]:
                    if metric["key"] not in all_metric_keys:
                        all_metric_keys.append(metric["key"])
                    # Also collect key2 for hit_rate metrics
                    if "key2" in metric and metric["key2"] not in all_metric_keys:
                        all_metric_keys.append(metric["key2"])

        # Create ONE combined batch collector per host (collects all metrics at once)
        self.combined_collectors = [JSONRPCBatchCollector(hostname, all_metric_keys) for hostname in hostnames]

        # Initialize metric info and data for all pages
        # metric_info[page][panel][metric] = {name, key, type, color}
        # data[page][panel][metric][host] = deque of (elapsed_time, value)
        self.metric_info: list = []
        self.data: list = []

        # Track previous values for counter rate calculation
        # prev_values[page][panel][metric][host] = (prev_raw, prev_time)
        self.prev_values: list = []

        # Track latest values for display
        # latest_values[page][panel][metric][host] = value
        self.latest_values: list = []

        # Build the parallel 4-level nested lists; the four structures are
        # always indexed identically as [page][panel][metric][host].
        for page in pages:
            page_info = []
            page_data = []
            page_prev = []
            page_latest = []
            for panel in page["panels"]:
                panel_info = []
                panel_data = []
                panel_prev = []
                panel_latest = []
                for metric in panel["metrics"]:
                    info = {"name": metric["name"], "key": metric["key"], "type": metric["type"].lower(), "color": metric["color"]}
                    # Store key2 for hit_rate metrics
                    if "key2" in metric:
                        info["key2"] = metric["key2"]
                    panel_info.append(info)

                    # Initialize data structures for each host
                    metric_data = [deque() for _ in range(self.num_hosts)]
                    metric_prev = [None for _ in range(self.num_hosts)]
                    metric_latest = [None for _ in range(self.num_hosts)]
                    panel_data.append(metric_data)
                    panel_prev.append(metric_prev)
                    panel_latest.append(metric_latest)
                page_info.append(panel_info)
                page_data.append(panel_data)
                page_prev.append(panel_prev)
                page_latest.append(panel_latest)
            self.metric_info.append(page_info)
            self.data.append(page_data)
            self.prev_values.append(page_prev)
            self.latest_values.append(page_latest)

        # Terminal size tracking (used by _check_resize to detect changes)
        self.last_terminal_size = shutil.get_terminal_size()
self.data.append(page_data) + self.prev_values.append(page_prev) + self.latest_values.append(page_latest) + + # Terminal size tracking + self.last_terminal_size = shutil.get_terminal_size() + + def _extract_short_name(self, hostname: str) -> str: + """Extract a short display name from a full hostname.""" + if not hostname: + return "local" + # Take first part before dot (e.g., "e1" from "ats-server1.example.com") + if '.' in hostname: + return hostname.split('.')[0] + return hostname[:15] + + def _check_resize(self) -> bool: + """Check if terminal was resized.""" + current_size = shutil.get_terminal_size() + if current_size != self.last_terminal_size: + self.last_terminal_size = current_size + return True + return False + + def _trim_history(self): + """Remove old data points outside the history window.""" + cutoff = time.time() - self.start_time - self.history_seconds + for page_data in self.data: + for panel_data in page_data: + for metric_data in panel_data: + for host_data in metric_data: + while host_data and host_data[0][0] < cutoff: + host_data.popleft() + + def collect_all_pages(self): + """Collect data for ALL pages in a single batch per host.""" + self.iteration += 1 + + log_lines = [] + if self.log_stats: + elapsed = time.time() - self.start_time + log_lines.append(f"\n=== Iteration {self.iteration} at {elapsed:.1f}s ===\n") + + # Batch collect from each host - capture time PER HOST + batch_results = [] + host_times = [] + + for host_idx, batch_collector in enumerate(self.combined_collectors): + host_label = self.host_names[host_idx] + + if self.log_stats: + log_lines.append(f" Collecting from {host_label}...\n") + + collect_time = time.time() + results = batch_collector.collect() + batch_results.append(results) + host_times.append(collect_time) + + if self.log_stats: + log_lines.append(f" Got {len(results)} metrics\n") + + # Distribute values to ALL pages + for page_idx in range(len(self.pages)): + for panel_idx, panel_info in 
    def collect_all_pages(self):
        """Collect one sample of every metric on every page, one batch per host.

        Each host is queried exactly once via its combined batch collector;
        the raw readings are then fanned out to every page/panel/metric
        series. Delta-based metric types (counter, hit_rate, latency) need a
        previous sample, so their first iteration only primes prev_values and
        appends no data point. Old points are trimmed at the end.
        """
        self.iteration += 1

        log_lines = []
        if self.log_stats:
            elapsed = time.time() - self.start_time
            log_lines.append(f"\n=== Iteration {self.iteration} at {elapsed:.1f}s ===\n")

        # Batch collect from each host - capture time PER HOST, since each
        # SSH round trip lands at a different moment and the per-host
        # timestamp is what the rate/delta math below divides by.
        batch_results = []
        host_times = []

        for host_idx, batch_collector in enumerate(self.combined_collectors):
            host_label = self.host_names[host_idx]

            if self.log_stats:
                log_lines.append(f"  Collecting from {host_label}...\n")

            collect_time = time.time()
            results = batch_collector.collect()
            batch_results.append(results)
            host_times.append(collect_time)

            if self.log_stats:
                log_lines.append(f"    Got {len(results)} metrics\n")

        # Distribute values to ALL pages (not just the visible one) so that
        # switching pages shows history immediately.
        for page_idx in range(len(self.pages)):
            for panel_idx, panel_info in enumerate(self.metric_info[page_idx]):
                panel = self.pages[page_idx]["panels"][panel_idx]

                for metric_idx, metric in enumerate(panel_info):
                    key = metric["key"]
                    metric_type = metric["type"]

                    for host_idx in range(self.num_hosts):
                        host_label = self.host_names[host_idx]
                        current_time = host_times[host_idx]

                        if key in batch_results[host_idx]:
                            raw_value, dtype = batch_results[host_idx][key]

                            # Calculate value based on metric type
                            if metric_type == 'gauge':
                                # Gauges are used as-is.
                                value = raw_value
                                self.latest_values[page_idx][panel_idx][metric_idx][host_idx] = value
                            elif metric_type == 'hit_rate':
                                # Calculate hit rate percentage over the last
                                # interval: delta_hits / (delta_hits + delta_misses) * 100
                                key2 = metric.get("key2")
                                if key2 and key2 in batch_results[host_idx]:
                                    hits_raw = raw_value
                                    misses_raw, _ = batch_results[host_idx][key2]

                                    prev = self.prev_values[page_idx][panel_idx][metric_idx][host_idx]
                                    if prev is None:
                                        # First sample: store both hits and misses as prev value
                                        self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (
                                            (hits_raw, misses_raw), current_time)
                                        value = None
                                    else:
                                        (prev_hits, prev_misses), prev_time = prev
                                        time_delta = current_time - prev_time
                                        if time_delta > 0:
                                            delta_hits = hits_raw - prev_hits
                                            delta_misses = misses_raw - prev_misses
                                            total = delta_hits + delta_misses
                                            if total > 0:
                                                value = (delta_hits / total) * 100.0
                                                self.latest_values[page_idx][panel_idx][metric_idx][host_idx] = value
                                            else:
                                                # No transactions this interval: no point plotted.
                                                value = None
                                        else:
                                            value = None
                                        self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (
                                            (hits_raw, misses_raw), current_time)
                                else:
                                    value = None
                            elif metric_type == 'latency':
                                # Calculate average latency: delta_time / delta_count
                                # key = total time counter (milliseconds), key2 = count counter
                                key2 = metric.get("key2")
                                if key2 and key2 in batch_results[host_idx]:
                                    time_raw = raw_value  # Total time in milliseconds
                                    count_raw, _ = batch_results[host_idx][key2]

                                    prev = self.prev_values[page_idx][panel_idx][metric_idx][host_idx]
                                    if prev is None:
                                        # First sample: just prime the baseline.
                                        self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (
                                            (time_raw, count_raw), current_time)
                                        value = None
                                    else:
                                        (prev_time_raw, prev_count), prev_time = prev
                                        delta_time_ms = time_raw - prev_time_raw
                                        delta_count = count_raw - prev_count
                                        if delta_count > 0 and delta_time_ms >= 0:
                                            # Average latency already in milliseconds
                                            value = delta_time_ms / delta_count
                                            self.latest_values[page_idx][panel_idx][metric_idx][host_idx] = value
                                        else:
                                            value = None
                                        self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (
                                            (time_raw, count_raw), current_time)
                                else:
                                    value = None
                            else:
                                # Counter: calculate rate per second.
                                prev = self.prev_values[page_idx][panel_idx][metric_idx][host_idx]
                                if prev is None:
                                    self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (raw_value, current_time)
                                    value = None
                                else:
                                    prev_raw, prev_time = prev
                                    time_delta = current_time - prev_time
                                    # raw_value >= prev_raw guards against counter resets.
                                    if time_delta > 0 and raw_value >= prev_raw:
                                        value = (raw_value - prev_raw) / time_delta
                                        self.latest_values[page_idx][panel_idx][metric_idx][host_idx] = value
                                    else:
                                        value = None
                                    self.prev_values[page_idx][panel_idx][metric_idx][host_idx] = (raw_value, current_time)

                            if value is not None:
                                # Store as (elapsed-since-start, value); render_page
                                # converts elapsed to "seconds ago" for plotting.
                                elapsed = current_time - self.start_time
                                self.data[page_idx][panel_idx][metric_idx][host_idx].append((elapsed, value))

        if self.log_stats:
            # Log current page data point counts only
            page_idx = self.current_page
            log_lines.append(f"  Data points (page {page_idx + 1}):\n")
            for panel_idx, panel_data in enumerate(self.data[page_idx]):
                panel = self.pages[page_idx]["panels"][panel_idx]
                for metric_idx, metric_data in enumerate(panel_data):
                    metric = self.metric_info[page_idx][panel_idx][metric_idx]
                    for host_idx, host_data in enumerate(metric_data):
                        host_label = self.host_names[host_idx]
                        log_lines.append(f"    [{panel['title']}] {metric['name']} ({host_label}): {len(host_data)} points\n")

            with open(self.log_stats, 'a') as f:
                f.writelines(log_lines)

        self._trim_history()
    def render_page(self, fig: plt.Figure = None) -> plt.Figure:
        """Render the current page as a matplotlib figure with a 2x2 grid.

        Args:
            fig: an existing figure to reuse (resized, new axes created);
                 a fresh figure is created when None.

        Returns:
            The rendered figure. If save_png is set, the figure is also
            written to that path ('{iter}' substituted with the iteration).
        """
        page = self.pages[self.current_page]
        current_elapsed = time.time() - self.start_time

        # Set up dark theme - dynamic figure size to fill terminal
        plt.style.use('dark_background')
        fig_width, fig_height = get_figure_size_for_terminal()

        if fig is None:
            # Create new figure
            fig, axes = plt.subplots(2, 2, figsize=(fig_width, fig_height))
        else:
            # Reuse existing figure, create new axes
            fig.set_size_inches(fig_width, fig_height)
            axes = fig.subplots(2, 2)

        fig.patch.set_facecolor(self.FIG_BG_COLOR)  # Pure black outside graphs

        # Flatten axes for easier iteration
        axes_flat = axes.flatten()

        # Y-axis formatter for K/M/G/T
        y_formatter = FuncFormatter(format_ytick)

        num_hosts = self.num_hosts

        for panel_idx, panel in enumerate(page["panels"]):
            # Only 4 axes exist in the 2x2 grid; extra panels are ignored.
            if panel_idx >= 4:
                break
            ax = axes_flat[panel_idx]
            ax.set_facecolor(self.PLOT_BG_COLOR)  # Dark grey inside plots

            panel_data = self.data[self.current_page][panel_idx]
            panel_info = self.metric_info[self.current_page][panel_idx]
            panel_latest = self.latest_values[self.current_page][panel_idx]

            # Build title with current values
            value_parts = []
            metric_values = {}  # Group values by metric for compact display
            has_data = False

            for metric_idx, metric in enumerate(panel_info):
                for host_idx in range(num_hosts):
                    host_data = panel_data[metric_idx][host_idx]
                    host_name = self.host_names[host_idx]

                    # Plot the data - transform X to "seconds ago"
                    if host_data:
                        has_data = True
                        # Transform: x = current_elapsed - data_elapsed = age of data point
                        # Newest data (age ~0) on RIGHT, oldest data on LEFT
                        times = [current_elapsed - t for t, _ in host_data]
                        values = [v for _, v in host_data]

                        # Color encodes the metric; line style encodes the host.
                        color = metric["color"]
                        linestyle = self.LINE_STYLES[host_idx % len(self.LINE_STYLES)]
                        linewidth = 2.5 - (host_idx * 0.2)  # Slightly thinner for each host

                        label = metric["name"]
                        if self.num_hosts > 1:
                            label = f"{metric['name']} ({host_name})"

                        ax.plot(times, values, color=color, linewidth=linewidth, linestyle=linestyle, label=label)

                    # Collect values per metric for compact display
                    latest = panel_latest[metric_idx][host_idx]
                    if latest is not None:
                        if metric_idx not in metric_values:
                            metric_values[metric_idx] = {'name': metric['name'], 'type': metric['type'], 'values': []}
                        is_percent = metric['type'] == 'hit_rate'
                        is_latency = metric['type'] == 'latency'
                        metric_values[metric_idx]['values'].append(format_value(latest, is_percent, is_latency))

            # Build compact title: "Requests/sec - Client: 3.6K/4.0K | Origin: 2.5K/2.7K"
            for m_idx, m_data in metric_values.items():
                if m_data['values']:
                    if self.num_hosts > 1:
                        value_parts.append(f"{m_data['name']}: {'/'.join(m_data['values'])}")
                    else:
                        value_parts.append(f"{m_data['name']}: {m_data['values'][0]}")

            # Set title with values
            if value_parts:
                title = f"{panel['title']} - {' | '.join(value_parts)}"
            else:
                title = panel["title"]
            ax.set_title(title, color=self.TEXT_COLOR, fontsize=12, fontweight='bold')

            # Configure axes - more visible grid and borders
            ax.grid(True, alpha=0.6, color=self.GRID_COLOR, linewidth=0.8)
            ax.tick_params(colors=self.TEXT_COLOR, labelsize=12)
            for spine in ax.spines.values():
                spine.set_color(self.AXIS_COLOR)
                spine.set_linewidth(1.5)

            # X-axis: oldest on left, 0 (now) on right - data flows right to left
            ax.set_xlim(self.history_seconds, 0)  # Reversed: oldest on left, 0=now on right
            ax.set_xlabel('')  # No label - self explanatory
            ax.tick_params(axis='x', labelsize=14)

            # Check if this is a percentage or latency panel
            is_percent_panel = any(m['type'] == 'hit_rate' for m in panel_info)
            is_latency_panel = any(m['type'] == 'latency' for m in panel_info)

            if is_percent_panel:
                # Percentage panel: fixed 0-100% range, no K/M/G formatter
                ax.set_ylim(0, 100)
                ax.set_ylabel("Hit Rate %", color=self.TEXT_COLOR, fontsize=14, fontweight='bold')
                ax.tick_params(axis='y', labelsize=14)
            elif is_latency_panel:
                # Latency panel: auto-scale, Y-axis label in ms
                ax.set_ylabel("Latency (ms)", color=self.TEXT_COLOR, fontsize=14, fontweight='bold')
                ax.tick_params(axis='y', labelsize=14)
            else:
                # Y-axis: use K/M/G/T formatter, no decimals, add label
                ax.yaxis.set_major_formatter(y_formatter)
                ax.tick_params(axis='y', labelsize=14)

                # Y-axis label based on panel title (text before the '/')
                ylabel = panel["title"].split('/')[0].strip() if '/' in panel["title"] else ""
                if ylabel:
                    ax.set_ylabel(ylabel, color=self.TEXT_COLOR, fontsize=14, fontweight='bold')

            # Legend if multiple metrics or hosts (only if we have data)
            if has_data and (len(panel["metrics"]) > 1 or self.num_hosts > 1):
                ax.legend(
                    loc='upper left',
                    fontsize=13,
                    facecolor=self.PLOT_BG_COLOR,
                    edgecolor=self.AXIS_COLOR,
                    labelcolor=self.TEXT_COLOR)

        # Overall title with date, time, and timezone
        now = datetime.now(self.tz)
        timestamp = now.strftime('%Y-%m-%d %H:%M:%S')
        title = f"ATS Dashboard - {timestamp} {self.tz_name} - Page {self.current_page + 1}/{len(self.pages)}: {page['name']}"
        if self.num_hosts > 1:
            title += f"\n{' vs '.join(self.host_names)}"
        else:
            title += f" ({self.host_names[0]})"
        fig.suptitle(title, color=self.TEXT_COLOR, fontsize=18, fontweight='bold')

        # Status bar
        status = f"[←/→ or h/l pages, q quit] | {self.interval}s refresh | {self.history_seconds}s history"
        fig.text(0.5, 0.01, status, ha='center', fontsize=13, color='#808080')

        plt.tight_layout()
        plt.subplots_adjust(top=0.92, bottom=0.06, left=0.06, right=0.98, hspace=0.35, wspace=0.22)

        # Save PNG if requested
        if self.save_png:
            png_path = self.save_png
            if '{iter}' in png_path:
                png_path = png_path.replace('{iter}', str(self.iteration))
            fig.savefig(png_path, facecolor=fig.get_facecolor(), edgecolor='none', dpi=100)

        return fig
self.save_png: + png_path = self.save_png + if '{iter}' in png_path: + png_path = png_path.replace('{iter}', str(self.iteration)) + fig.savefig(png_path, facecolor=fig.get_facecolor(), edgecolor='none', dpi=100) + + return fig + + def run_imgcat(self): + """Run the grapher in imgcat mode.""" + run_until = None + if self.run_for: + run_until = time.time() + self.run_for + + # Non-keyboard mode (for non-TTY environments) + if self.no_keyboard: + try: + while self.running: + if run_until and time.time() >= run_until: + print(f"\n\nRun time of {self.run_for}s reached. Exiting.") + break + + self.collect_data() + fig = self.render_page() + + # In no-keyboard mode, just save the PNG, don't try imgcat + if self.save_png: + print(f"Iteration {self.iteration}: saved to {self.save_png}") + plt.close(fig) + plt.close('all') # Close any remaining figures + gc.collect() # Force garbage collection to reduce memory + + # Clear iTerm scrollback every 60 iterations to prevent memory buildup + if self.iteration % 60 == 0: + clear_scrollback() + + time.sleep(self.interval) + except KeyboardInterrupt: + print("\nStopped by user") + return + + # Normal interactive mode + hide_cursor() + clear_screen() + + try: + with KeyboardHandler() as kbd: + while self.running: + # Check run_for timeout + if run_until and time.time() >= run_until: + print(f"\n\nRun time of {self.run_for}s reached. 
Exiting.") + break + + # Check for resize + if self._check_resize(): + clear_screen() + else: + cursor_home() + + # Collect data and render + self.collect_data() + fig = self.render_page() + imgcat_display(fig) + plt.close(fig) + plt.close('all') + gc.collect() + + # Clear iTerm scrollback every 60 iterations to prevent memory buildup + if self.iteration % 60 == 0: + clear_scrollback() + + # Handle keyboard input during sleep + sleep_until = time.time() + self.interval + while time.time() < sleep_until and self.running: + # Check run_for timeout during sleep too + if run_until and time.time() >= run_until: + break + key = kbd.get_key() + if key == 'quit': + self.running = False + break + elif key == 'left': + self.current_page = (self.current_page - 1) % len(self.pages) + break + elif key == 'right': + self.current_page = (self.current_page + 1) % len(self.pages) + break + time.sleep(0.05) + finally: + show_cursor() + clear_screen() + + def run_gui(self): + """Run the grapher in GUI mode with matplotlib window.""" + from matplotlib.animation import FuncAnimation + + # Collect initial data for all pages + self.collect_all_pages() + + # Create initial figure + fig = self.render_page() + + # Calculate number of frames if run_for specified + if self.run_for: + num_frames = int(self.run_for / self.interval) + repeat = False + else: + num_frames = None # Infinite + repeat = True + + def update(frame): + self.collect_all_pages() + fig.clf() # Clear the existing figure, don't create new one + # Re-render onto the same figure + self.render_page(fig=fig) + fig.canvas.draw_idle() + return [] + + anim = FuncAnimation( + fig, update, interval=int(self.interval * 1000), frames=num_frames, repeat=repeat, blit=False, cache_frame_data=False) + plt.show() + + def run_once(self): + """Run a single snapshot and exit.""" + self.collect_data() + time.sleep(self.interval) + self.collect_data() + + fig = self.render_page() + imgcat_display(fig) + plt.close(fig) + + def run(self, once: 
bool = False): + """Run the grapher.""" + if once: + self.run_once() + elif self.gui_mode: + self.run_gui() + else: + self.run_imgcat() + + +# ============================================================================= +# Configuration Loading +# ============================================================================= + + +def load_config(config_path: str) -> dict: + """Load optional config file for layout customization.""" + if not os.path.exists(config_path): + return {} + + with open(config_path, 'r') as f: + return yaml.safe_load(f) or {} + + +# ============================================================================= +# Main Entry Point +# ============================================================================= + + +def main(): + parser = argparse.ArgumentParser( + description='Traffic Grapher - Real-time ATS metrics visualization', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s ats-server1.example.com + %(prog)s ats-server{1..3}.example.com # bash expansion + %(prog)s --interval 2 --history 120 ats-server{1..4}.example.com + %(prog)s ats1.dc1.example.com ats1.dc2.example.com # compare POPs +""") + + parser.add_argument( + 'hosts', nargs='+', metavar='HOSTNAME', help='Hostnames to monitor (1-4 hosts, e.g., ats-server1.example.com)') + parser.add_argument('--interval', type=float, default=1.0, help='Refresh interval in seconds (default: 1.0)') + parser.add_argument('--history', type=int, default=60, help='History window in seconds (default: 60)') + parser.add_argument('--gui', action='store_true', help='Use matplotlib GUI window instead of imgcat') + parser.add_argument('--once', action='store_true', help='Single snapshot, then exit') + parser.add_argument('-c', '--config', default=None, help='Optional config file for layout customization') + + # Display options + parser.add_argument('--timezone', '-tz', default='UTC', help='Timezone for display (default: UTC, e.g., America/Los_Angeles)') + + # Debug 
options + parser.add_argument( + '--save-png', default=None, metavar='FILE', help='Save PNG to file after each render (use {iter} for iteration number)') + parser.add_argument('--log-stats', default=None, metavar='FILE', help='Log raw stats to file for debugging') + parser.add_argument('--run-for', type=int, default=None, metavar='SECONDS', help='Run for N seconds then exit (for debugging)') + parser.add_argument('--no-keyboard', action='store_true', help='Disable keyboard handling (for non-TTY environments)') + + args = parser.parse_args() + + # Load optional config + config = {} + if args.config: + config = load_config(args.config) + + # Use custom pages from config, or defaults + pages = config.get('pages', DEFAULT_PAGES) + + # Override history from config if specified + history = args.history + if 'history' in config: + history = config['history'].get('seconds', args.history) + + # Get timezone from config or CLI + tz_name = args.timezone + if 'timezone' in config: + tz_name = config['timezone'] + + # Validate host count + if len(args.hosts) > 4: + parser.error("Maximum 4 hosts supported (limited by line styles)") + + # Create and run grapher + grapher = ATSGrapher( + hostnames=args.hosts, + interval=args.interval, + history_seconds=history, + pages=pages, + gui_mode=args.gui, + save_png=args.save_png, + log_stats=args.log_stats, + run_for=args.run_for, + no_keyboard=args.no_keyboard, + tz_name=tz_name) + + print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") + if len(args.hosts) > 1: + print(f"Comparing: {' vs '.join(grapher.host_names)}") + else: + print(f"Monitoring: {grapher.host_names[0]}") + print("Starting in 2 seconds... 
(press Ctrl+C to cancel)") + time.sleep(2) + + # Clear scrollback buffer at startup to free any previous accumulated images + clear_scrollback() + + try: + grapher.run(once=args.once) + except KeyboardInterrupt: + print("\nStopped by user") + return 0 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) From 436784e71d8d948351becc909e26c7668af5c135 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Tue, 10 Feb 2026 14:53:38 -0800 Subject: [PATCH 02/15] traffic_grapher: Add uv project and fix code quality issues Address review feedback: - Add pyproject.toml and PEP 723 inline script metadata so the tool can be run via 'uv run traffic_grapher.py' without manual pip installs - Make paths configurable: --traffic-ctl, --socket-path CLI args with TRAFFIC_CTL_PATH and TRAFFICSERVER_JSONRPC_SOCKET env var fallbacks (replaces hard-coded /opt/edge/... paths) - Fix command injection: use subprocess.run with list args instead of shell=True, add hostname validation via regex - Replace bare except clauses with specific exception types - Remove unused imports (select, tty) - Parse traffic_ctl output in Python instead of piping through awk --- tools/traffic_grapher/pyproject.toml | 29 +++++++ tools/traffic_grapher/traffic_grapher.py | 97 ++++++++++++++++-------- 2 files changed, 94 insertions(+), 32 deletions(-) create mode 100644 tools/traffic_grapher/pyproject.toml diff --git a/tools/traffic_grapher/pyproject.toml b/tools/traffic_grapher/pyproject.toml new file mode 100644 index 00000000000..21746fba35b --- /dev/null +++ b/tools/traffic_grapher/pyproject.toml @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[project] +name = "traffic-grapher" +version = "1.0.0" +description = "Real-time ATS metrics visualization for iTerm2" +requires-python = ">=3.9" +license = "Apache-2.0" +dependencies = [ + "matplotlib>=3.7", + "pyyaml>=6.0", +] + +[project.scripts] +traffic-grapher = "traffic_grapher:main" diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index b6b6015db3c..68a1e474cf9 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -16,6 +16,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # +# /// script +# dependencies = ["matplotlib>=3.7", "pyyaml>=6.0"] +# requires-python = ">=3.9" +# /// """ Traffic Grapher - Real-time ATS metrics visualization. 
@@ -34,14 +38,13 @@ import gc import io import os -import select +import re import shutil import struct import subprocess import sys import termios import time -import tty from collections import deque from datetime import datetime, timezone from typing import Optional, Tuple @@ -321,11 +324,8 @@ }, ] -# Command template for traffic_ctl -# Default path (adjust for your installation) -# Note: awk runs locally to avoid SSH quote escaping issues -METRIC_COMMAND_REMOTE = "/opt/edge/trafficserver/10.0/bin/traffic_ctl metric get {key}" -METRIC_COMMAND_LOCAL = "/opt/edge/trafficserver/10.0/bin/traffic_ctl metric get {key} | awk '{{print $2}}'" +# Default traffic_ctl path (configurable via --traffic-ctl or TRAFFIC_CTL_PATH env var) +DEFAULT_TRAFFIC_CTL_PATH = os.environ.get("TRAFFIC_CTL_PATH", "traffic_ctl") # ============================================================================= # Terminal Utilities @@ -444,7 +444,7 @@ def get_terminal_pixel_size() -> Tuple[int, int]: # Estimate from rows/cols (typical cell: 9x18 pixels) return (cols * 9, rows * 18) - except: + except (OSError, io.UnsupportedOperation, struct.error, ValueError): # Fallback: assume reasonable terminal size size = shutil.get_terminal_size((80, 24)) return (size.columns * 9, size.lines * 18) @@ -516,7 +516,7 @@ def get_key(self) -> Optional[str]: return 'up' elif ch3 == b'B': return 'down' - except: + except (OSError, IOError): pass return 'escape' elif ch in (b'q', b'Q'): @@ -562,8 +562,18 @@ def get_key(self) -> Optional[str]: print(f"{{name}}={{value}}={{dtype}}") ''' -# Default JSONRPC socket path (adjust for your installation) -JSONRPC_SOCKET_PATH = "/opt/edge/trafficserver/10.0/var/trafficserver/jsonrpc20.sock" +# Default JSONRPC socket path (configurable via --socket-path or TRAFFICSERVER_JSONRPC_SOCKET env var) +DEFAULT_JSONRPC_SOCKET_PATH = os.environ.get( + "TRAFFICSERVER_JSONRPC_SOCKET", + "/usr/local/var/trafficserver/jsonrpc20.sock", +) + + +def _validate_hostname(hostname: str) -> str: + 
"""Validate hostname to prevent command injection.""" + if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9._:-]*$', hostname): + raise ValueError(f"Invalid hostname: {hostname!r}") + return hostname class JSONRPCBatchCollector: @@ -572,8 +582,8 @@ class JSONRPCBatchCollector: Collects all metrics for a host in a single SSH call. """ - def __init__(self, hostname: str, metric_keys: list, socket_path: str = JSONRPC_SOCKET_PATH): - self.hostname = hostname # Bare hostname like "ats-server1.example.com" + def __init__(self, hostname: str, metric_keys: list, socket_path: str = DEFAULT_JSONRPC_SOCKET_PATH): + self.hostname = _validate_hostname(hostname) self.metric_keys = metric_keys self.socket_path = socket_path @@ -592,13 +602,13 @@ def collect(self) -> dict: """ script = JSONRPC_SCRIPT.format(socket_path=self.socket_path, pattern=self.pattern) - # Build SSH command - hostname is passed directly, we add "ssh" prefix - # Encode script as base64 to avoid quoting issues + # Encode script as base64 to avoid quoting issues. + # Use subprocess with list args (no shell=True) to prevent command injection. 
script_b64 = base64.b64encode(script.encode()).decode() - cmd = f"ssh {self.hostname} \"echo '{script_b64}' | base64 -d | python3\"" + remote_cmd = f"echo '{script_b64}' | base64 -d | python3" try: - result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=10) + result = subprocess.run(["ssh", self.hostname, remote_cmd], capture_output=True, text=True, timeout=10) if result.returncode != 0: return self.last_values @@ -612,7 +622,7 @@ def collect(self) -> dict: key = parts[0] try: value = float(parts[1]) - except: + except (ValueError, TypeError): continue dtype = parts[2] if len(parts) > 2 else "INT" values[key] = (value, dtype) @@ -627,22 +637,27 @@ def collect(self) -> dict: class MetricCollector: """Collects metric values from a host via shell commands.""" - def __init__(self, name: str, key: str, metric_type: str, color: str, host_prefix: str = "", host_name: str = ""): + def __init__( + self, + name: str, + key: str, + metric_type: str, + color: str, + hostname: str = "", + host_name: str = "", + traffic_ctl_path: str = DEFAULT_TRAFFIC_CTL_PATH): self.name = name self.key = key self.metric_type = metric_type.lower() self.color = color - self.host_prefix = host_prefix self.host_name = host_name - # Build the full command - if host_prefix: - # For remote hosts: run traffic_ctl on remote, awk locally - remote_cmd = METRIC_COMMAND_REMOTE.format(key=key) - self.command = f"{host_prefix} '{remote_cmd}' | awk '{{print $2}}'" + # Build the command as a list (no shell=True needed) + if hostname: + _validate_hostname(hostname) + self.command = ["ssh", hostname, traffic_ctl_path, "metric", "get", key] else: - # Local: run everything locally - self.command = METRIC_COMMAND_LOCAL.format(key=key) + self.command = [traffic_ctl_path, "metric", "get", key] # For counter metrics, track previous value and time self._prev_value: Optional[float] = None @@ -654,7 +669,7 @@ def __init__(self, name: str, key: str, metric_type: str, color: str, host_prefi def 
_get_raw_value(self) -> Optional[float]: """Run the command and return the raw numeric value.""" try: - result = subprocess.run(self.command, shell=True, capture_output=True, text=True, timeout=5) + result = subprocess.run(self.command, capture_output=True, text=True, timeout=5) if result.returncode != 0: return None @@ -662,6 +677,10 @@ def _get_raw_value(self) -> Optional[float]: if not output: return None + # Parse "key value" output from traffic_ctl + parts = output.split() + if len(parts) >= 2: + return float(parts[1]) return float(output) except (subprocess.TimeoutExpired, ValueError, OSError): return None @@ -731,7 +750,8 @@ def __init__( log_stats: Optional[str] = None, run_for: Optional[int] = None, no_keyboard: bool = False, - tz_name: str = "UTC"): + tz_name: str = "UTC", + socket_path: str = DEFAULT_JSONRPC_SOCKET_PATH): self.hostnames = hostnames # Bare hostnames like ["ats-server1.example.com"] self.interval = interval self.history_seconds = history_seconds @@ -748,7 +768,7 @@ def __init__( if ZoneInfo and tz_name != "UTC": try: self.tz = ZoneInfo(tz_name) - except: + except Exception: self.tz = timezone.utc self.tz_name = "UTC" else: @@ -783,7 +803,9 @@ def __init__( all_metric_keys.append(metric["key2"]) # Create ONE combined batch collector per host (collects all metrics at once) - self.combined_collectors = [JSONRPCBatchCollector(hostname, all_metric_keys) for hostname in hostnames] + self.combined_collectors = [ + JSONRPCBatchCollector(hostname, all_metric_keys, socket_path=socket_path) for hostname in hostnames + ] # Initialize metric info and data for all pages # metric_info[page][panel][metric] = {name, key, type, color} @@ -1343,6 +1365,16 @@ def main(): # Display options parser.add_argument('--timezone', '-tz', default='UTC', help='Timezone for display (default: UTC, e.g., America/Los_Angeles)') + # ATS paths + parser.add_argument( + '--traffic-ctl', + default=DEFAULT_TRAFFIC_CTL_PATH, + help='Path to traffic_ctl binary (default: 
$TRAFFIC_CTL_PATH or "traffic_ctl")') + parser.add_argument( + '--socket-path', + default=DEFAULT_JSONRPC_SOCKET_PATH, + help='Path to JSONRPC Unix socket (default: $TRAFFICSERVER_JSONRPC_SOCKET)') + # Debug options parser.add_argument( '--save-png', default=None, metavar='FILE', help='Save PNG to file after each render (use {iter} for iteration number)') @@ -1385,7 +1417,8 @@ def main(): log_stats=args.log_stats, run_for=args.run_for, no_keyboard=args.no_keyboard, - tz_name=tz_name) + tz_name=tz_name, + socket_path=args.socket_path) print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") if len(args.hosts) > 1: From cf15b508d11603c12f5daaa02ec3bf56b7587f53 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Tue, 10 Feb 2026 15:03:00 -0800 Subject: [PATCH 03/15] traffic_grapher: Add README, rename --socket-path to --socket, remove --traffic-ctl - Add README.md with usage, quick start, configuration, and CLI reference - Rename --socket-path to --socket for brevity - Remove unused --traffic-ctl CLI option (main code uses JSONRPC socket, not traffic_ctl) --- tools/traffic_grapher/README.md | 112 +++++++++++++++++++++++ tools/traffic_grapher/traffic_grapher.py | 10 +- 2 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 tools/traffic_grapher/README.md diff --git a/tools/traffic_grapher/README.md b/tools/traffic_grapher/README.md new file mode 100644 index 00000000000..f89d697d7bc --- /dev/null +++ b/tools/traffic_grapher/README.md @@ -0,0 +1,112 @@ + + +# Traffic Grapher + +Real-time ATS metrics visualization for iTerm2. Displays live graphs of +requests/sec, latency, cache hit rate, connections, and more — inline in +your terminal using imgcat. 
+ +## Features + +- Real-time graphs of RPS, latency, cache hit rate, connections +- Support for 1–4 hosts with different line styles for comparison +- Collects metrics via JSONRPC Unix socket (batch collection per host) +- Dark theme optimized for terminal display +- Keyboard navigation between 4 metric pages +- Configurable refresh interval and history window +- Optional GUI mode via matplotlib window + +## Requirements + +- Python 3.9+ +- [uv](https://docs.astral.sh/uv/) (recommended) or pip +- iTerm2 (or compatible terminal for inline images) +- SSH access to remote ATS hosts + +## Quick Start + +```bash +# With uv (handles dependencies automatically) +uv run traffic_grapher.py ats-server1.example.com + +# Multiple hosts for comparison +uv run traffic_grapher.py ats-server{1..4}.example.com + +# Custom interval and history +uv run traffic_grapher.py --interval 2 --history 120 ats-server1.example.com +``` + +## Installation (Alternative) + +```bash +# Install as a project +uv sync +uv run traffic-grapher ats-server1.example.com + +# Or with pip +pip install . +traffic-grapher ats-server1.example.com +``` + +## Configuration + +### ATS Paths + +The tool needs to know where the JSONRPC socket is on the remote host. +Configure via CLI flag or environment variable: + +| Option | Env Var | Default | +|--------|---------|---------| +| `--socket` | `TRAFFICSERVER_JSONRPC_SOCKET` | `/usr/local/var/trafficserver/jsonrpc20.sock` | + +### Custom Dashboards + +Create a YAML config file to customize which metrics are displayed: + +```bash +uv run traffic_grapher.py -c my_dashboard.yaml ats-server1.example.com +``` + +## Keyboard Controls + +| Key | Action | +|-----|--------| +| `h` / `←` | Previous page | +| `l` / `→` | Next page | +| `q` | Quit | + +## Pages + +1. **Traffic & Cache** — Requests/sec, latency, cache hit rate, connections +2. **Response Codes** — 2xx, 3xx, 4xx, 5xx breakdown +3. **TLS & HTTP/2** — SSL handshakes, connections, HTTP/2 stats +4. 
**Network & Errors** — Bandwidth, connection errors, transaction errors + +## Options + +``` +--interval SEC Refresh interval in seconds (default: 1.0) +--history SEC History window in seconds (default: 60) +--socket PATH Path to JSONRPC Unix socket on remote host +--gui Use matplotlib GUI window instead of imgcat +--once Single snapshot, then exit +--timezone TZ Timezone for display (default: UTC) +--save-png FILE Save PNG after each render (use {iter} for iteration) +--no-keyboard Disable keyboard handling (for non-TTY environments) +``` diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 68a1e474cf9..f62ab8a0738 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -324,7 +324,7 @@ }, ] -# Default traffic_ctl path (configurable via --traffic-ctl or TRAFFIC_CTL_PATH env var) +# Default traffic_ctl path (used by MetricCollector, configurable via TRAFFIC_CTL_PATH env var) DEFAULT_TRAFFIC_CTL_PATH = os.environ.get("TRAFFIC_CTL_PATH", "traffic_ctl") # ============================================================================= @@ -1367,11 +1367,7 @@ def main(): # ATS paths parser.add_argument( - '--traffic-ctl', - default=DEFAULT_TRAFFIC_CTL_PATH, - help='Path to traffic_ctl binary (default: $TRAFFIC_CTL_PATH or "traffic_ctl")') - parser.add_argument( - '--socket-path', + '--socket', default=DEFAULT_JSONRPC_SOCKET_PATH, help='Path to JSONRPC Unix socket (default: $TRAFFICSERVER_JSONRPC_SOCKET)') @@ -1418,7 +1414,7 @@ def main(): run_for=args.run_for, no_keyboard=args.no_keyboard, tz_name=tz_name, - socket_path=args.socket_path) + socket_path=args.socket) print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") if len(args.hosts) > 1: From 7a4fd39d6ef6f17817daf1823e3659e86f0e5cb4 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 14:52:20 -0700 Subject: [PATCH 04/15] traffic_grapher: Add localhost support and 
connection error feedback - Detect localhost/127.0.0.1/local and connect directly to the JSONRPC Unix socket instead of SSH, so 'traffic_grapher.py localhost' works without any SSH setup - Add startup connection test that reports success/failure for each host before entering the graph loop - Track and display collection errors: show in red on the dashboard status bar and print to stderr for the first few failures - Give clear error messages for common failures: socket not found, permission denied, connection refused, SSH failures, timeouts, empty responses Addresses review feedback from cmcfarlen. --- tools/traffic_grapher/README.md | 14 +- tools/traffic_grapher/traffic_grapher.py | 193 ++++++++++++++++++++--- 2 files changed, 184 insertions(+), 23 deletions(-) diff --git a/tools/traffic_grapher/README.md b/tools/traffic_grapher/README.md index f89d697d7bc..ed67bec4084 100644 --- a/tools/traffic_grapher/README.md +++ b/tools/traffic_grapher/README.md @@ -37,12 +37,15 @@ your terminal using imgcat. - Python 3.9+ - [uv](https://docs.astral.sh/uv/) (recommended) or pip - iTerm2 (or compatible terminal for inline images) -- SSH access to remote ATS hosts +- SSH access to remote ATS hosts (not needed for localhost) ## Quick Start ```bash -# With uv (handles dependencies automatically) +# Monitor ATS on the local machine (connects directly to JSONRPC socket) +uv run traffic_grapher.py localhost + +# Monitor a remote host (connects via SSH) uv run traffic_grapher.py ats-server1.example.com # Multiple hosts for comparison @@ -68,8 +71,11 @@ traffic-grapher ats-server1.example.com ### ATS Paths -The tool needs to know where the JSONRPC socket is on the remote host. -Configure via CLI flag or environment variable: +For localhost, the tool connects directly to the JSONRPC Unix socket. +For remote hosts, it sends a Python script via SSH that connects to +the socket on the remote machine. 
+ +Configure the socket path via CLI flag or environment variable: | Option | Env Var | Default | |--------|---------|---------| diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index f62ab8a0738..444b8393aa2 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -37,9 +37,11 @@ import fcntl import gc import io +import json import os import re import shutil +import socket as socket_module import struct import subprocess import sys @@ -568,6 +570,8 @@ def get_key(self) -> Optional[str]: "/usr/local/var/trafficserver/jsonrpc20.sock", ) +LOCAL_HOSTNAMES = {'localhost', '127.0.0.1', 'local', '::1'} + def _validate_hostname(hostname: str) -> str: """Validate hostname to prevent command injection.""" @@ -579,31 +583,130 @@ def _validate_hostname(hostname: str) -> str: class JSONRPCBatchCollector: """ Batch metric collector using JSONRPC Unix socket. - Collects all metrics for a host in a single SSH call. + Connects directly for localhost, uses SSH for remote hosts. """ def __init__(self, hostname: str, metric_keys: list, socket_path: str = DEFAULT_JSONRPC_SOCKET_PATH): - self.hostname = _validate_hostname(hostname) + self.is_local = hostname.lower() in LOCAL_HOSTNAMES + if self.is_local: + self.hostname = 'localhost' + else: + self.hostname = _validate_hostname(hostname) self.metric_keys = metric_keys self.socket_path = socket_path - # Build regex pattern matching all metric keys - # Escape dots and join with | escaped_keys = [k.replace('.', r'\.') for k in metric_keys] self.pattern = '|'.join(f"^{k}$" for k in escaped_keys) - # Cached results from last collection - self.last_values: dict = {} # key -> (value, data_type) + self.last_values: dict = {} + self.last_error: Optional[str] = None + self.consecutive_errors: int = 0 def collect(self) -> dict: - """ - Collect all metrics in one SSH call. - Returns dict of {metric_key: (value, data_type)}. 
- """ + """Collect all metrics. Uses local socket or SSH depending on host.""" + if self.is_local: + return self._collect_local() + return self._collect_remote() + + def test_connection(self) -> Tuple[bool, str]: + """Test connectivity and return (success, message).""" + if self.is_local: + return self._test_local() + return self._test_remote() + + def _test_local(self) -> Tuple[bool, str]: + """Test local JSONRPC socket connectivity.""" + if not os.path.exists(self.socket_path): + return False, f"Socket not found: {self.socket_path}" + try: + s = socket_module.socket(socket_module.AF_UNIX, socket_module.SOCK_STREAM) + s.settimeout(3) + s.connect(self.socket_path) + s.close() + return True, "OK" + except PermissionError: + return False, f"Permission denied: {self.socket_path}" + except ConnectionRefusedError: + return False, f"Connection refused: {self.socket_path} (is ATS running?)" + except OSError as e: + return False, f"Socket error: {e}" + + def _test_remote(self) -> Tuple[bool, str]: + """Test SSH connectivity to remote host.""" + try: + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout=5", "-o", "BatchMode=yes", self.hostname, "echo ok"], + capture_output=True, + text=True, + timeout=10) + if result.returncode != 0: + stderr = result.stderr.strip() + return False, f"SSH failed: {stderr or 'unknown error'}" + return True, "OK" + except subprocess.TimeoutExpired: + return False, "SSH timed out" + except OSError as e: + return False, f"SSH error: {e}" + + def _collect_local(self) -> dict: + """Connect directly to the local JSONRPC Unix socket.""" + try: + s = socket_module.socket(socket_module.AF_UNIX, socket_module.SOCK_STREAM) + s.settimeout(5) + s.connect(self.socket_path) + + request = {"jsonrpc": "2.0", "method": "admin_lookup_records", "params": [{"record_name_regex": self.pattern}], "id": 1} + s.sendall(json.dumps(request).encode() + b"\n") + + chunks = [] + while True: + chunk = s.recv(1048576) + if not chunk: + break + chunks.append(chunk) + 
try: + json.loads(b''.join(chunks)) + break + except json.JSONDecodeError: + continue + + s.close() + response_data = json.loads(b''.join(chunks).decode()) + + values = {} + for rec in response_data.get("result", {}).get("recordList", []): + r = rec.get("record", {}) + name = r.get("record_name", "") + raw_val = r.get("current_value", "") + dtype = r.get("data_type", "") + try: + values[name] = (float(raw_val), dtype) + except (ValueError, TypeError): + continue + + self.last_values = values + self.last_error = None + self.consecutive_errors = 0 + return values + + except FileNotFoundError: + self.last_error = f"Socket not found: {self.socket_path}" + except PermissionError: + self.last_error = f"Permission denied: {self.socket_path}" + except ConnectionRefusedError: + self.last_error = f"Connection refused (is ATS running?)" + except json.JSONDecodeError as e: + self.last_error = f"Invalid JSON response: {e}" + except OSError as e: + self.last_error = f"Socket error: {e}" + + self.consecutive_errors += 1 + return self.last_values + + def _collect_remote(self) -> dict: + """Collect via SSH to remote host.""" script = JSONRPC_SCRIPT.format(socket_path=self.socket_path, pattern=self.pattern) - # Encode script as base64 to avoid quoting issues. - # Use subprocess with list args (no shell=True) to prevent command injection. 
script_b64 = base64.b64encode(script.encode()).decode() remote_cmd = f"echo '{script_b64}' | base64 -d | python3" @@ -611,9 +714,11 @@ def collect(self) -> dict: result = subprocess.run(["ssh", self.hostname, remote_cmd], capture_output=True, text=True, timeout=10) if result.returncode != 0: + stderr = result.stderr.strip() + self.last_error = f"SSH error: {stderr[:200]}" if stderr else f"SSH exited with code {result.returncode}" + self.consecutive_errors += 1 return self.last_values - # Parse output: key=value=dtype per line values = {} for line in result.stdout.strip().split('\n'): if '=' in line: @@ -627,10 +732,26 @@ def collect(self) -> dict: dtype = parts[2] if len(parts) > 2 else "INT" values[key] = (value, dtype) + if not values and result.stdout.strip(): + self.last_error = f"No metrics parsed from output ({len(result.stdout)} bytes)" + self.consecutive_errors += 1 + elif not values: + self.last_error = "Empty response from remote host" + self.consecutive_errors += 1 + else: + self.last_error = None + self.consecutive_errors = 0 + self.last_values = values return values - except (subprocess.TimeoutExpired, OSError): + except subprocess.TimeoutExpired: + self.last_error = "SSH timed out (10s)" + self.consecutive_errors += 1 + return self.last_values + except OSError as e: + self.last_error = f"SSH error: {e}" + self.consecutive_errors += 1 return self.last_values @@ -908,6 +1029,9 @@ def collect_all_pages(self): batch_results.append(results) host_times.append(collect_time) + if batch_collector.last_error and batch_collector.consecutive_errors <= 3: + print(f"[{host_label}] {batch_collector.last_error}", file=sys.stderr) + if self.log_stats: log_lines.append(f" Got {len(results)} metrics\n") @@ -1173,9 +1297,19 @@ def render_page(self, fig: plt.Figure = None) -> plt.Figure: title += f" ({self.host_names[0]})" fig.suptitle(title, color=self.TEXT_COLOR, fontsize=18, fontweight='bold') - # Status bar - status = f"[←/→ or h/l pages, q quit] | {self.interval}s 
refresh | {self.history_seconds}s history" - fig.text(0.5, 0.01, status, ha='center', fontsize=13, color='#808080') + # Status bar - show errors if any hosts are failing + error_parts = [] + for host_idx, collector in enumerate(self.combined_collectors): + if collector.last_error: + host_label = self.host_names[host_idx] + error_parts.append(f"{host_label}: {collector.last_error}") + + if error_parts: + error_text = " | ".join(error_parts) + fig.text(0.5, 0.01, f"ERROR: {error_text}", ha='center', fontsize=13, color='#FF4040', fontweight='bold') + else: + status = f"[←/→ or h/l pages, q quit] | {self.interval}s refresh | {self.history_seconds}s history" + fig.text(0.5, 0.01, status, ha='center', fontsize=13, color='#808080') plt.tight_layout() plt.subplots_adjust(top=0.92, bottom=0.06, left=0.06, right=0.98, hspace=0.35, wspace=0.22) @@ -1421,8 +1555,29 @@ def main(): print(f"Comparing: {' vs '.join(grapher.host_names)}") else: print(f"Monitoring: {grapher.host_names[0]}") - print("Starting in 2 seconds... (press Ctrl+C to cancel)") - time.sleep(2) + + # Test connectivity to all hosts before starting + print("\nTesting connections...") + all_ok = True + for i, collector in enumerate(grapher.combined_collectors): + host_label = grapher.host_names[i] + mode = "local socket" if collector.is_local else "SSH" + print(f" {host_label} ({mode}): ", end="", flush=True) + ok, msg = collector.test_connection() + if ok: + print(f"OK") + else: + print(f"FAILED - {msg}") + all_ok = False + + if not all_ok: + print("\nWARNING: Some hosts failed connectivity test.") + print("The grapher will start anyway - errors will be shown on the dashboard.") + print("Press Ctrl+C to cancel, or wait 3 seconds to continue...") + time.sleep(3) + else: + print("\nAll hosts connected. 
Starting in 1 second...") + time.sleep(1) # Clear scrollback buffer at startup to free any previous accumulated images clear_scrollback() From 9cc7aa34d93dff9434bf433e0fe902da7b1d0fdd Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 14:57:16 -0700 Subject: [PATCH 05/15] traffic_grapher: Fix SyntaxWarning for regex pattern on Python 3.12+ Use raw string prefix for the regex pattern in the JSONRPC script template so backslash-dot escape sequences do not trigger SyntaxWarning/SyntaxError on remote hosts running Python 3.12+. --- tools/traffic_grapher/traffic_grapher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 444b8393aa2..b16e4f6dccb 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -547,7 +547,7 @@ def get_key(self) -> Optional[str]: request = {{ "jsonrpc": "2.0", "method": "admin_lookup_records", - "params": [{{"record_name_regex": "{pattern}"}}], + "params": [{{"record_name_regex": r"{pattern}"}}], "id": 1 }} sock.sendall(json.dumps(request).encode() + b"\\n") From ec6d723ae1c4d230ae5acc25ebaa43ed01d688b3 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 15:46:45 -0700 Subject: [PATCH 06/15] traffic_grapher: Auto-discover JSONRPC socket path on target host Instead of requiring the user to know where the JSONRPC socket is, auto-discover it by finding traffic_ctl (via PATH or common install prefixes like /usr/local, /opt/ats, /opt/trafficserver) and querying the runtime directory. Falls back to checking common socket locations. Discovery runs once at startup per host. Users can still override with --socket or TRAFFICSERVER_JSONRPC_SOCKET env var. 
--- tools/traffic_grapher/traffic_grapher.py | 127 +++++++++++++++++++++-- 1 file changed, 116 insertions(+), 11 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index b16e4f6dccb..649e6daf02e 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -564,14 +564,94 @@ def get_key(self) -> Optional[str]: print(f"{{name}}={{value}}={{dtype}}") ''' -# Default JSONRPC socket path (configurable via --socket-path or TRAFFICSERVER_JSONRPC_SOCKET env var) -DEFAULT_JSONRPC_SOCKET_PATH = os.environ.get( - "TRAFFICSERVER_JSONRPC_SOCKET", - "/usr/local/var/trafficserver/jsonrpc20.sock", -) +# Default JSONRPC socket path (configurable via --socket or TRAFFICSERVER_JSONRPC_SOCKET env var) +DEFAULT_JSONRPC_SOCKET_PATH = os.environ.get("TRAFFICSERVER_JSONRPC_SOCKET", None) LOCAL_HOSTNAMES = {'localhost', '127.0.0.1', 'local', '::1'} +# Discovery script: finds the JSONRPC socket on the target host by locating +# traffic_ctl (via PATH or common prefixes) and querying the runtime directory. 
# Discovery script: finds the JSONRPC socket on the target host by locating
# traffic_ctl (via PATH or common prefixes) and querying the local state
# directory.  Shipped to remote hosts base64-encoded and run with python3,
# so it must stay self-contained and stdlib-only.
DISCOVER_SOCKET_SCRIPT = r'''
import subprocess, os, sys

COMMON_PREFIXES = [
    "/usr/local",
    "/opt/ats",
    "/opt/trafficserver",
    "/usr",
    "/opt/ts",
]

def find_traffic_ctl():
    """Find traffic_ctl binary."""
    # Check PATH first
    for d in os.environ.get("PATH", "").split(":"):
        candidate = os.path.join(d, "traffic_ctl")
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    # Check common install prefixes
    for prefix in COMMON_PREFIXES:
        candidate = os.path.join(prefix, "bin", "traffic_ctl")
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None

def get_socket_via_traffic_ctl(tc):
    """Ask traffic_ctl for the state dir and build the socket path."""
    try:
        r = subprocess.run([tc, "config", "get", "proxy.config.local_state_dir"],
                           capture_output=True, text=True, timeout=5)
        if r.returncode == 0:
            parts = r.stdout.strip().split()
            # BUGFIX: empty output would raise IndexError on parts[0]
            if not parts:
                return False
            state_dir = parts[1] if len(parts) >= 2 else parts[0]
            # Could be relative to the install prefix -- check if absolute
            if not os.path.isabs(state_dir):
                prefix = os.path.dirname(os.path.dirname(tc))
                state_dir = os.path.join(prefix, state_dir)
            sock = os.path.join(state_dir, "jsonrpc20.sock")
            if os.path.exists(sock):
                print(sock)
                return True
    except Exception:
        # Best-effort probe: any failure just falls through to the
        # common-path scan below.
        pass
    return False

def check_common_paths():
    """Check common socket locations."""
    for prefix in COMMON_PREFIXES:
        sock = os.path.join(prefix, "var", "trafficserver", "jsonrpc20.sock")
        if os.path.exists(sock):
            print(sock)
            return True
    return False

tc = find_traffic_ctl()
if tc:
    if get_socket_via_traffic_ctl(tc):
        sys.exit(0)
if check_common_paths():
    sys.exit(0)
# Not found
sys.exit(1)
'''


def discover_socket_path(hostname: str, is_local: bool) -> Optional[str]:
    """Auto-discover the JSONRPC socket path on the target host.

    Runs DISCOVER_SOCKET_SCRIPT with the local python3, or pipes it over
    SSH for remote hosts (base64-encoded to sidestep shell quoting).

    Returns the discovered absolute socket path, or None when discovery
    fails, times out, or the script exits non-zero.
    """
    try:
        if is_local:
            result = subprocess.run(["python3", "-c", DISCOVER_SOCKET_SCRIPT],
                                    capture_output=True, text=True, timeout=10)
        else:
            script_b64 = base64.b64encode(DISCOVER_SOCKET_SCRIPT.encode()).decode()
            remote_cmd = f"echo '{script_b64}' | base64 -d | python3"
            result = subprocess.run(["ssh", hostname, remote_cmd],
                                    capture_output=True, text=True, timeout=10)

        if result.returncode == 0 and result.stdout.strip():
            # BUGFIX: take only the last non-empty line.  SSH banners or
            # login noise can precede the path on stdout; the original
            # returned all of stdout, yielding a bogus multi-line "path".
            return result.stdout.strip().splitlines()[-1].strip()
    except (subprocess.TimeoutExpired, OSError):
        pass
    return None
+791,10 @@ def _collect_local(self) -> dict: def _collect_remote(self) -> dict: """Collect via SSH to remote host.""" + if self.socket_path is None: + self.last_error = "No socket path (use --socket or set TRAFFICSERVER_JSONRPC_SOCKET)" + self.consecutive_errors += 1 + return self.last_values script = JSONRPC_SCRIPT.format(socket_path=self.socket_path, pattern=self.pattern) script_b64 = base64.b64encode(script.encode()).decode() @@ -1501,9 +1591,7 @@ def main(): # ATS paths parser.add_argument( - '--socket', - default=DEFAULT_JSONRPC_SOCKET_PATH, - help='Path to JSONRPC Unix socket (default: $TRAFFICSERVER_JSONRPC_SOCKET)') + '--socket', default=DEFAULT_JSONRPC_SOCKET_PATH, help='Path to JSONRPC Unix socket (auto-discovered if not set)') # Debug options parser.add_argument( @@ -1556,16 +1644,33 @@ def main(): else: print(f"Monitoring: {grapher.host_names[0]}") + # Auto-discover socket path for hosts that don't have one explicitly set + for i, collector in enumerate(grapher.combined_collectors): + if collector.socket_path is not None: + continue + host_label = grapher.host_names[i] + print(f"\nDiscovering JSONRPC socket on {host_label}...", end=" ", flush=True) + path = discover_socket_path(collector.hostname, collector.is_local) + if path: + print(f"found: {path}") + collector.socket_path = path + else: + print("not found (use --socket to specify)") + # Test connectivity to all hosts before starting print("\nTesting connections...") all_ok = True for i, collector in enumerate(grapher.combined_collectors): host_label = grapher.host_names[i] mode = "local socket" if collector.is_local else "SSH" + if collector.socket_path is None: + print(f" {host_label} ({mode}): SKIPPED - no socket path") + all_ok = False + continue print(f" {host_label} ({mode}): ", end="", flush=True) ok, msg = collector.test_connection() if ok: - print(f"OK") + print(f"OK ({collector.socket_path})") else: print(f"FAILED - {msg}") all_ok = False From 7b42c274a800541d9b202340f1f0e1d13e92bdeb Mon 
Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 15:54:32 -0700 Subject: [PATCH 07/15] traffic_grapher: Fix GUI mode window sizing In GUI mode, start with a large default size (16x10 inches) and maximize the window on startup. On subsequent frames, don't override the figure size so user resizes are preserved. The terminal-based size calculation is only used for imgcat mode. --- tools/traffic_grapher/traffic_grapher.py | 29 +++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 649e6daf02e..5a163775b57 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1249,16 +1249,18 @@ def render_page(self, fig: plt.Figure = None) -> plt.Figure: page = self.pages[self.current_page] current_elapsed = time.time() - self.start_time - # Set up dark theme - dynamic figure size to fill terminal plt.style.use('dark_background') - fig_width, fig_height = get_figure_size_for_terminal() if fig is None: - # Create new figure - fig, axes = plt.subplots(2, 2, figsize=(fig_width, fig_height)) + if self.gui_mode: + fig, axes = plt.subplots(2, 2, figsize=(16, 10)) + else: + fig_width, fig_height = get_figure_size_for_terminal() + fig, axes = plt.subplots(2, 2, figsize=(fig_width, fig_height)) else: - # Reuse existing figure, create new axes - fig.set_size_inches(fig_width, fig_height) + if not self.gui_mode: + fig_width, fig_height = get_figure_size_for_terminal() + fig.set_size_inches(fig_width, fig_height) axes = fig.subplots(2, 2) fig.patch.set_facecolor(self.FIG_BG_COLOR) # Pure black outside graphs @@ -1501,12 +1503,23 @@ def run_gui(self): """Run the grapher in GUI mode with matplotlib window.""" from matplotlib.animation import FuncAnimation - # Collect initial data for all pages self.collect_all_pages() - # Create initial figure fig = self.render_page() + # Maximize the window on startup + try: + 
manager = plt.get_current_fig_manager() + backend = matplotlib.get_backend().lower() + if 'macosx' in backend: + manager.full_screen_toggle() + elif 'tk' in backend: + manager.window.state('zoomed') + elif 'qt' in backend: + manager.window.showMaximized() + except Exception: + pass + # Calculate number of frames if run_for specified if self.run_for: num_frames = int(self.run_for / self.interval) From 3cf6b482a4e8f731eff6ff914d3e46b8b34163cf Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:08:48 -0700 Subject: [PATCH 08/15] traffic_grapher: Default to localhost when no hosts specified Running 'traffic_grapher.py' with no arguments now monitors the local ATS instance instead of requiring 'localhost' to be passed explicitly. --- tools/traffic_grapher/traffic_grapher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 5a163775b57..515b4140f4f 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1585,6 +1585,7 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: + %(prog)s # monitor localhost %(prog)s ats-server1.example.com %(prog)s ats-server{1..3}.example.com # bash expansion %(prog)s --interval 2 --history 120 ats-server{1..4}.example.com @@ -1592,7 +1593,7 @@ def main(): """) parser.add_argument( - 'hosts', nargs='+', metavar='HOSTNAME', help='Hostnames to monitor (1-4 hosts, e.g., ats-server1.example.com)') + 'hosts', nargs='*', default=['localhost'], metavar='HOSTNAME', help='Hostnames to monitor (default: localhost, max 4)') parser.add_argument('--interval', type=float, default=1.0, help='Refresh interval in seconds (default: 1.0)') parser.add_argument('--history', type=int, default=60, help='History window in seconds (default: 60)') parser.add_argument('--gui', action='store_true', help='Use matplotlib GUI window instead of imgcat') From 
c3571e98d9d8a6fd38606a0cdc1b29a56417c756 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:13:17 -0700 Subject: [PATCH 09/15] traffic_grapher: Don't maximize GUI window on startup The 16x10 default size is reasonable and the user can resize freely. --- tools/traffic_grapher/traffic_grapher.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 515b4140f4f..9456bc0e4f8 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1507,19 +1507,6 @@ def run_gui(self): fig = self.render_page() - # Maximize the window on startup - try: - manager = plt.get_current_fig_manager() - backend = matplotlib.get_backend().lower() - if 'macosx' in backend: - manager.full_screen_toggle() - elif 'tk' in backend: - manager.window.state('zoomed') - elif 'qt' in backend: - manager.window.showMaximized() - except Exception: - pass - # Calculate number of frames if run_for specified if self.run_for: num_frames = int(self.run_for / self.interval) From 4e59a7257516d232e496ba311857d6444d1a6a44 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:13:56 -0700 Subject: [PATCH 10/15] traffic_grapher: Use 14x8 default GUI size to fit within screen --- tools/traffic_grapher/traffic_grapher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 9456bc0e4f8..4c284b86735 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1253,7 +1253,7 @@ def render_page(self, fig: plt.Figure = None) -> plt.Figure: if fig is None: if self.gui_mode: - fig, axes = plt.subplots(2, 2, figsize=(16, 10)) + fig, axes = plt.subplots(2, 2, figsize=(14, 8)) else: fig_width, fig_height = get_figure_size_for_terminal() fig, axes = plt.subplots(2, 2, figsize=(fig_width, fig_height)) From 
e6e59ffd4c87209b906f13ba4ceda540eb2503d4 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:16:23 -0700 Subject: [PATCH 11/15] traffic_grapher: Add keyboard navigation and hide toolbar in GUI mode Hook into matplotlib key_press_event so h/l and arrow keys switch pages in GUI mode. Hide the default navigation toolbar since pan/zoom controls are not useful for a live dashboard. --- tools/traffic_grapher/traffic_grapher.py | 26 ++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 4c284b86735..09cf6ecd9ad 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1507,18 +1507,36 @@ def run_gui(self): fig = self.render_page() - # Calculate number of frames if run_for specified + # Hide the default matplotlib toolbar + try: + fig.canvas.manager.toolbar.pack_forget() + except Exception: + try: + fig.canvas.toolbar.setVisible(False) + except Exception: + pass + + # Keyboard navigation for page switching + def on_key(event): + if event.key in ('left', 'h'): + self.current_page = (self.current_page - 1) % len(self.pages) + elif event.key in ('right', 'l'): + self.current_page = (self.current_page + 1) % len(self.pages) + elif event.key == 'q': + plt.close(fig) + + fig.canvas.mpl_connect('key_press_event', on_key) + if self.run_for: num_frames = int(self.run_for / self.interval) repeat = False else: - num_frames = None # Infinite + num_frames = None repeat = True def update(frame): self.collect_all_pages() - fig.clf() # Clear the existing figure, don't create new one - # Re-render onto the same figure + fig.clf() self.render_page(fig=fig) fig.canvas.draw_idle() return [] From 1afdc66f5447e6e1c26ff1afcb02140d6fb4c737 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:17:38 -0700 Subject: [PATCH 12/15] traffic_grapher: Fix toolbar hiding on MacOSX backend Set rcParams toolbar 
to None before figure creation instead of trying to remove it after. --- tools/traffic_grapher/traffic_grapher.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 09cf6ecd9ad..c8d9943da5e 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1505,17 +1505,9 @@ def run_gui(self): self.collect_all_pages() + matplotlib.rcParams['toolbar'] = 'None' fig = self.render_page() - # Hide the default matplotlib toolbar - try: - fig.canvas.manager.toolbar.pack_forget() - except Exception: - try: - fig.canvas.toolbar.setVisible(False) - except Exception: - pass - # Keyboard navigation for page switching def on_key(event): if event.key in ('left', 'h'): From 3fedbc04af5aea88e72c17840e021d1c6383282a Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:18:43 -0700 Subject: [PATCH 13/15] traffic_grapher: Revert toolbar hiding, MacOSX backend ignores it --- tools/traffic_grapher/traffic_grapher.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index c8d9943da5e..d6a80ee5b4a 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -60,7 +60,6 @@ import matplotlib # Check for --gui in sys.argv early to set backend before importing pyplot if '--gui' in sys.argv: - # Use MacOSX on macOS, fall back to TkAgg on other platforms import platform if platform.system() == 'Darwin': matplotlib.use('MacOSX') @@ -1505,7 +1504,6 @@ def run_gui(self): self.collect_all_pages() - matplotlib.rcParams['toolbar'] = 'None' fig = self.render_page() # Keyboard navigation for page switching From 27edb7bbbe8e73b4fda8a7c643d6704d51275f82 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:19:22 -0700 Subject: [PATCH 14/15] traffic_grapher: Hide toolbar in GUI mode via rcParams Set 
toolbar to None before pyplot import so the matplotlib window does not show the default navigation toolbar. --- tools/traffic_grapher/traffic_grapher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index d6a80ee5b4a..3d4ab54cacb 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -65,6 +65,7 @@ matplotlib.use('MacOSX') else: matplotlib.use('TkAgg') + matplotlib.rcParams['toolbar'] = 'None' else: matplotlib.use('Agg') import matplotlib.pyplot as plt From 1624504a47324a8b550b91863e4f92f489ca7690 Mon Sep 17 00:00:00 2001 From: Bryan Call Date: Wed, 11 Mar 2026 16:22:08 -0700 Subject: [PATCH 15/15] traffic_grapher: Suppress terminal output in GUI mode In GUI mode, skip all startup prints and stderr collection errors since the dashboard already shows errors visually. This allows running in the background without output spilling into the terminal. --- tools/traffic_grapher/traffic_grapher.py | 44 +++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/tools/traffic_grapher/traffic_grapher.py b/tools/traffic_grapher/traffic_grapher.py index 3d4ab54cacb..769fdfb4a4c 100755 --- a/tools/traffic_grapher/traffic_grapher.py +++ b/tools/traffic_grapher/traffic_grapher.py @@ -1119,7 +1119,7 @@ def collect_all_pages(self): batch_results.append(results) host_times.append(collect_time) - if batch_collector.last_error and batch_collector.consecutive_errors <= 3: + if batch_collector.last_error and batch_collector.consecutive_errors <= 3 and not self.gui_mode: print(f"[{host_label}] {batch_collector.last_error}", file=sys.stderr) if self.log_stats: @@ -1648,49 +1648,61 @@ def main(): tz_name=tz_name, socket_path=args.socket) - print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") - if len(args.hosts) > 1: - print(f"Comparing: {' vs '.join(grapher.host_names)}") - else: - 
print(f"Monitoring: {grapher.host_names[0]}") + # In GUI mode, suppress terminal output -- errors show on the dashboard + verbose = not args.gui + + if verbose: + print(f"Traffic Grapher - {len(pages)} pages, {args.interval}s refresh, {history}s history") + if len(args.hosts) > 1: + print(f"Comparing: {' vs '.join(grapher.host_names)}") + else: + print(f"Monitoring: {grapher.host_names[0]}") # Auto-discover socket path for hosts that don't have one explicitly set for i, collector in enumerate(grapher.combined_collectors): if collector.socket_path is not None: continue host_label = grapher.host_names[i] - print(f"\nDiscovering JSONRPC socket on {host_label}...", end=" ", flush=True) + if verbose: + print(f"\nDiscovering JSONRPC socket on {host_label}...", end=" ", flush=True) path = discover_socket_path(collector.hostname, collector.is_local) if path: - print(f"found: {path}") + if verbose: + print(f"found: {path}") collector.socket_path = path else: - print("not found (use --socket to specify)") + if verbose: + print("not found (use --socket to specify)") # Test connectivity to all hosts before starting - print("\nTesting connections...") + if verbose: + print("\nTesting connections...") all_ok = True for i, collector in enumerate(grapher.combined_collectors): host_label = grapher.host_names[i] mode = "local socket" if collector.is_local else "SSH" if collector.socket_path is None: - print(f" {host_label} ({mode}): SKIPPED - no socket path") + if verbose: + print(f" {host_label} ({mode}): SKIPPED - no socket path") all_ok = False continue - print(f" {host_label} ({mode}): ", end="", flush=True) + if verbose: + print(f" {host_label} ({mode}): ", end="", flush=True) ok, msg = collector.test_connection() if ok: - print(f"OK ({collector.socket_path})") + if verbose: + print(f"OK ({collector.socket_path})") else: - print(f"FAILED - {msg}") + if verbose: + print(f"FAILED - {msg}") all_ok = False - if not all_ok: + if not all_ok and verbose: print("\nWARNING: Some hosts 
failed connectivity test.") print("The grapher will start anyway - errors will be shown on the dashboard.") print("Press Ctrl+C to cancel, or wait 3 seconds to continue...") time.sleep(3) - else: + elif verbose: print("\nAll hosts connected. Starting in 1 second...") time.sleep(1)