Skip to content

Commit 2749a44

Browse files
committed
first stab
1 parent b5e4c46 commit 2749a44

File tree

4 files changed

+250
-5
lines changed

4 files changed

+250
-5
lines changed

Lib/profiling/sampling/__init__.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@
99
from .stack_collector import CollapsedStackCollector
1010
from .heatmap_collector import HeatmapCollector
1111
from .gecko_collector import GeckoCollector
12+
from .ndjson_collector import NdjsonCollector
1213
from .string_table import StringTable
1314

14-
__all__ = ("Collector", "PstatsCollector", "CollapsedStackCollector", "HeatmapCollector", "GeckoCollector", "StringTable")
15+
__all__ = (
16+
"Collector",
17+
"PstatsCollector",
18+
"CollapsedStackCollector",
19+
"HeatmapCollector",
20+
"GeckoCollector",
21+
"NdjsonCollector",
22+
"StringTable",
23+
)

Lib/profiling/sampling/binary_reader.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from .gecko_collector import GeckoCollector
66
from .stack_collector import FlamegraphCollector, CollapsedStackCollector
7+
from .ndjson_collector import NdjsonCollector
78
from .pstats_collector import PstatsCollector
89

910

@@ -117,6 +118,8 @@ def convert_binary_to_format(input_file, output_file, output_format,
117118
collector = PstatsCollector(interval)
118119
elif output_format == 'gecko':
119120
collector = GeckoCollector(interval)
121+
elif output_format == 'ndjson':
122+
collector = NdjsonCollector(interval)
120123
else:
121124
raise ValueError(f"Unknown output format: {output_format}")
122125

Lib/profiling/sampling/cli.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from .stack_collector import CollapsedStackCollector, FlamegraphCollector
2020
from .heatmap_collector import HeatmapCollector
2121
from .gecko_collector import GeckoCollector
22+
from .ndjson_collector import NdjsonCollector
2223
from .binary_collector import BinaryCollector
2324
from .binary_reader import BinaryReader
2425
from .constants import (
@@ -87,6 +88,7 @@ class CustomFormatter(
8788
"flamegraph": "html",
8889
"gecko": "json",
8990
"heatmap": "html",
91+
"ndjson": "ndjson",
9092
"binary": "bin",
9193
}
9294

@@ -96,6 +98,7 @@ class CustomFormatter(
9698
"flamegraph": FlamegraphCollector,
9799
"gecko": GeckoCollector,
98100
"heatmap": HeatmapCollector,
101+
"ndjson": NdjsonCollector,
99102
"binary": BinaryCollector,
100103
}
101104

@@ -467,6 +470,13 @@ def _add_format_options(parser, include_compression=True, include_binary=True):
467470
dest="format",
468471
help="Generate interactive HTML heatmap visualization with line-level sample counts",
469472
)
473+
format_group.add_argument(
474+
"--ndjson",
475+
action="store_const",
476+
const="ndjson",
477+
dest="format",
478+
help="Generate NDJSON snapshot output for external consumers",
479+
)
470480
if include_binary:
471481
format_group.add_argument(
472482
"--binary",
@@ -545,15 +555,17 @@ def _sort_to_mode(sort_choice):
545555
return sort_map.get(sort_choice, SORT_MODE_NSAMPLES)
546556

547557
def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=False,
548-
output_file=None, compression='auto'):
558+
mode=None, output_file=None, compression='auto'):
549559
"""Create the appropriate collector based on format type.
550560
551561
Args:
552-
format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary')
562+
format_type: The output format ('pstats', 'collapsed', 'flamegraph',
563+
'gecko', 'heatmap', 'ndjson', 'binary')
553564
sample_interval_usec: Sampling interval in microseconds
554565
skip_idle: Whether to skip idle samples
555566
opcodes: Whether to collect opcode information (only used by gecko format
556567
for creating interval markers in Firefox Profiler)
568+
mode: Profiling mode for collectors that expose it in metadata
557569
output_file: Output file path (required for binary format)
558570
compression: Compression type for binary format ('auto', 'zstd', 'none')
559571
@@ -577,6 +589,11 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals
577589
skip_idle = False
578590
return collector_class(sample_interval_usec, skip_idle=skip_idle, opcodes=opcodes)
579591

592+
if format_type == "ndjson":
593+
return collector_class(
594+
sample_interval_usec, skip_idle=skip_idle, mode=mode
595+
)
596+
580597
return collector_class(sample_interval_usec, skip_idle=skip_idle)
581598

582599

@@ -951,7 +968,7 @@ def _handle_attach(args):
951968

952969
# Create the appropriate collector
953970
collector = _create_collector(
954-
args.format, args.sample_interval_usec, skip_idle, args.opcodes,
971+
args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode,
955972
output_file=output_file,
956973
compression=getattr(args, 'compression', 'auto')
957974
)
@@ -1029,7 +1046,7 @@ def _handle_run(args):
10291046

10301047
# Create the appropriate collector
10311048
collector = _create_collector(
1032-
args.format, args.sample_interval_usec, skip_idle, args.opcodes,
1049+
args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode,
10331050
output_file=output_file,
10341051
compression=getattr(args, 'compression', 'auto')
10351052
)
Lib/profiling/sampling/ndjson_collector.py

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
"""NDJSON collector."""
2+
3+
import json
4+
import uuid
5+
from itertools import batched
6+
7+
from .constants import (
8+
PROFILING_MODE_ALL,
9+
PROFILING_MODE_CPU,
10+
PROFILING_MODE_EXCEPTION,
11+
PROFILING_MODE_GIL,
12+
PROFILING_MODE_WALL,
13+
)
14+
from .stack_collector import StackTraceCollector
15+
16+
17+
# Maximum number of def/agg entries packed into a single NDJSON record
# (used as the batch size for itertools.batched when writing output).
_CHUNK_SIZE = 1000

# Maps the numeric PROFILING_MODE_* constants to the human-readable mode
# names emitted in the NDJSON "meta" record.
_MODE_NAMES = {
    PROFILING_MODE_WALL: "wall",
    PROFILING_MODE_CPU: "cpu",
    PROFILING_MODE_GIL: "gil",
    PROFILING_MODE_ALL: "all",
    PROFILING_MODE_EXCEPTION: "exception",
}
26+
27+
28+
class NdjsonCollector(StackTraceCollector):
    """Collector that exports finalized profiling data as NDJSON.

    The output is one JSON object per line, in order: a ``meta`` header
    record, chunked ``str_def`` string-interning records, chunked
    ``frame_def`` frame-interning records, chunked per-frame ``agg``
    records carrying self/cumulative sample counts, and a trailing
    ``end`` record with the total sample count.
    """

    def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None):
        super().__init__(sample_interval_usec, skip_idle=skip_idle)
        # Unique id tying together every record belonging to this run.
        self.run_id = uuid.uuid4().hex

        # String interning: value -> 1-based id, plus ordered def records.
        self._string_to_id = {}
        self._strings = []

        # Frame interning: frame key tuple -> 1-based id, plus def records.
        self._frame_to_id = {}
        self._frames = []

        # Per-frame sample weights: frame_id -> accumulated count.
        self._frame_self = {}
        self._frame_cumulative = {}
        self._samples_total = 0

        # Numeric profiling mode (translated via _MODE_NAMES for metadata).
        self._mode = mode

    def process_frames(self, frames, _thread_id, weight=1):
        """Accumulate one sampled stack.

        Args:
            frames: leaf-first sequence of
                (filename, location, funcname, opcode) tuples; opcode is
                ignored here. An empty stack is a no-op.
            _thread_id: unused; NDJSON aggregation is process-wide.
            weight: sample weight added to the counters.
        """
        if not frames:
            return

        self._samples_total += weight

        frame_ids = [
            self._get_or_create_frame_id(filename, location, funcname)
            for filename, location, funcname, _opcode in frames
        ]
        # frames is leaf-first, so index 0 is the executing frame: it alone
        # receives self time.
        leaf_frame_id = frame_ids[0]
        self._frame_self[leaf_frame_id] = (
            self._frame_self.get(leaf_frame_id, 0) + weight
        )

        # Deduplicate so recursive frames are not double-counted in
        # cumulative time.
        for frame_id in set(frame_ids):
            self._frame_cumulative[frame_id] = (
                self._frame_cumulative.get(frame_id, 0) + weight
            )

    def export(self, filename):
        """Write the collected profile to *filename* as NDJSON."""
        with open(filename, "w", encoding="utf-8") as output:
            self._write_message(output, self._build_meta_record())
            self._write_chunked_defs(output, "str_def", self._strings)
            self._write_chunked_defs(output, "frame_def", self._frames)
            self._write_chunked_agg(output, self._iter_agg_entries())
            self._write_message(
                output,
                {
                    "type": "end",
                    "v": 1,
                    "run_id": self.run_id,
                    "samples_total": self._samples_total,
                },
            )

        # Bug fix: the completion message previously never interpolated the
        # destination path ("written to (unknown)"); report the real file.
        print(f"NDJSON profile written to {filename}")

    def _build_meta_record(self):
        """Build the leading ``meta`` record describing this run."""
        record = {
            "type": "meta",
            "v": 1,
            "run_id": self.run_id,
            "sample_interval_usec": self.sample_interval_usec,
        }

        if self._mode is not None:
            # Unknown mode constants fall back to their raw string form.
            record["mode"] = _MODE_NAMES.get(self._mode, str(self._mode))

        return record

    def _get_or_create_frame_id(self, filename, location, funcname):
        """Intern a frame and return its stable 1-based id.

        Frames with ``location is None`` are flagged ``synthetic`` in the
        emitted frame_def record.
        """
        synthetic = location is None
        location_fields = self._normalize_export_location(location)
        func_str_id = self._intern_string(funcname)
        path_str_id = self._intern_string(filename)

        # Key on interned ids + normalized location so identical frames
        # dedupe regardless of the original location representation.
        frame_key = (
            path_str_id,
            func_str_id,
            location_fields["line"],
            location_fields.get("end_line"),
            location_fields.get("col"),
            location_fields.get("end_col"),
            synthetic,
        )

        if (frame_id := self._frame_to_id.get(frame_key)) is not None:
            return frame_id

        frame_id = len(self._frames) + 1
        frame_record = {
            "frame_id": frame_id,
            "path_str_id": path_str_id,
            "func_str_id": func_str_id,
            **location_fields,
        }
        if synthetic:
            frame_record["synthetic"] = True

        self._frame_to_id[frame_key] = frame_id
        self._frames.append(frame_record)
        return frame_id

    def _intern_string(self, value):
        """Intern *value* (coerced to ``str``) and return its 1-based id."""
        value = str(value)

        if (string_id := self._string_to_id.get(value)) is not None:
            return string_id

        string_id = len(self._strings) + 1
        self._string_to_id[value] = string_id
        self._strings.append({"str_id": string_id, "value": value})
        return string_id

    @staticmethod
    def _normalize_export_location(location):
        """Normalize *location* into the line/col fields of a frame_def.

        Accepts ``None``, a bare line number, a 4-tuple
        ``(lineno, end_lineno, col_offset, end_col_offset)``, or any object
        exposing those attributes. Always returns a dict with at least
        ``{"line": N}`` (``line`` is 0 when unknown/invalid); the optional
        ``end_line``/``col``/``end_col`` keys appear only when valid.
        """
        if location is None:
            return {"line": 0}

        if isinstance(location, int):
            return {"line": max(location, 0)}

        if not isinstance(location, tuple):
            # Duck-typed location objects (code-position-style attributes).
            lineno = getattr(location, "lineno", 0)
            location = (
                lineno,
                getattr(location, "end_lineno", lineno),
                getattr(location, "col_offset", -1),
                getattr(location, "end_col_offset", -1),
            )

        lineno, end_lineno, col_offset, end_col_offset = location
        if not isinstance(lineno, int) or lineno <= 0:
            return {"line": 0}

        normalized = {"line": lineno}
        if isinstance(end_lineno, int) and end_lineno > 0:
            normalized["end_line"] = end_lineno
        if isinstance(col_offset, int) and col_offset >= 0:
            normalized["col"] = col_offset
        if isinstance(end_col_offset, int) and end_col_offset >= 0:
            normalized["end_col"] = end_col_offset
        return normalized

    def _iter_agg_entries(self):
        """Yield one aggregate entry per interned frame, in frame-id order.

        Generator instead of a materialized list: the sole consumer is
        ``_write_chunked_agg``, which batches lazily.
        """
        for frame_record in self._frames:
            frame_id = frame_record["frame_id"]
            yield {
                "frame_id": frame_id,
                "self": self._frame_self.get(frame_id, 0),
                "cumulative": self._frame_cumulative.get(frame_id, 0),
            }

    def _write_chunked_defs(self, output, record_type, entries):
        """Write *entries* as ``record_type`` records, _CHUNK_SIZE per line."""
        for chunk in batched(entries, _CHUNK_SIZE):
            self._write_message(
                output,
                {
                    "type": record_type,
                    "v": 1,
                    "run_id": self.run_id,
                    "defs": chunk,
                },
            )

    def _write_chunked_agg(self, output, entries):
        """Write aggregate *entries* as ``agg`` records, _CHUNK_SIZE per line."""
        for chunk in batched(entries, _CHUNK_SIZE):
            self._write_message(
                output,
                {
                    "type": "agg",
                    "v": 1,
                    "run_id": self.run_id,
                    "kind": "frame",
                    "scope": "final",
                    "samples_total": self._samples_total,
                    "entries": chunk,
                },
            )

    @staticmethod
    def _write_message(output, record):
        # Compact separators keep each NDJSON line as small as possible.
        output.write(json.dumps(record, separators=(",", ":")))
        output.write("\n")

0 commit comments

Comments
 (0)