diff --git a/testsuite/perftest.py b/testsuite/perftest.py
new file mode 100755
index 000000000..d5ac6a2b5
--- /dev/null
+++ b/testsuite/perftest.py
@@ -0,0 +1,506 @@
+#!/usr/bin/env python3
+"""Compare the transfer performance of two rsync binaries (local <-> local).
+
+This is a standalone dev tool (run it directly, not via runtests.py) for
+spotting performance regressions between rsync releases.  Given two rsync
+binaries it builds one test tree, then runs the two binaries ALTERNATELY for a
+number of loops, timing each transfer, and reports the mean and standard
+deviation of the transfer time for each binary.
+
+Two transfers are timed each loop (see --mode):
+  * full  -- a fresh copy into an emptied destination (end-to-end read+write).
+  * noop  -- a re-run against an already-synced destination (rsync's own
+             scan / file-list / stat overhead, where many regressions hide).
+
+The first measured run of each binary is dropped (see --warmup) because it
+cold-loads the source into the page cache and is an outlier.
+
+The test tree's shape (heavy-tailed file sizes, a directory spine, symlinks,
+hard links and a spread of permission modes) follows the gentestdata.py
+generator; it is deterministic for a given --seed.
+
+Examples:
+    # Quick smoke run, same binary twice (means should match, no regression).
+    ./perftest.py --files 200 --total-size 5M -n 3 ./rsync ./rsync
+
+    # Compare a released binary against a fresh build over 8 loops.
+    ./perftest.py -n 8 ../old_versions/rsync_3.4.0 ./rsync
+
+    # Heavier tree, no-op (scan-overhead) timing only.
+    ./perftest.py --files 50000 --total-size 2G --mode noop OLD/rsync NEW/rsync
+"""
+
+import argparse
+import dataclasses
+import math
+import os
+import random
+import shlex
+import shutil
+import statistics
+import struct
+import subprocess
+import sys
+import tempfile
+import time
+
+# ---------------------------------------------------------------------------
+# Test-tree generation (ported from gentestdata.py, kept self-contained).
+# ---------------------------------------------------------------------------
+
+# Marker file written at a generated tree's root; safe_rmtree does NOT check it.
+MARKER = ".perftest"
+
+# Modes picked at random per regular file; 0o644 is listed twice to weight it.
+FILE_MODES = [0o644, 0o644, 0o600, 0o640, 0o664, 0o444, 0o755, 0o750, 0o700]
+# Directory modes; every entry keeps owner r-x so the tree stays traversable.
+DIR_MODES = [0o755, 0o755, 0o775, 0o750, 0o700, 0o555]
+
+SIZE_SIGMA = 1.8          # sigma of the underlying lognormal size distribution
+BASE_BUF_SIZE = 1 << 20   # 1 MiB shared random buffer for file content
+
+
+def parse_size(s):
+    """Parse a human size like 500M, 1.5GiB, 200KB, or a bare byte count.
+
+    K/KiB are 1024-based, KB 1000-based (likewise M/G/T); case is ignored.
+    Raises argparse.ArgumentTypeError on a bad, negative or non-finite size.
+    """
+    s = s.strip()
+    units = {"": 1, "B": 1}
+    for power, letter in enumerate("KMGT", 1):
+        units[letter] = units[letter + "IB"] = 1024**power
+        units[letter + "B"] = 1000**power
+    num_end = max((i + 1 for i, c in enumerate(s) if c.isdigit() or c == "."), default=0)
+    suffix = s[num_end:].upper()
+    if suffix not in units:
+        raise argparse.ArgumentTypeError(f"unknown size suffix in {s!r}")
+    try:
+        size = float(s[:num_end]) * units[suffix]
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"invalid size {s!r}") from None
+    if not 0 <= size < float("inf"):   # rejects negatives and overflow (1e999)
+        raise argparse.ArgumentTypeError(f"size out of range: {s!r}")
+    return int(size)
+
+
+def human(n):
+    """Render a byte count with a binary unit: B, KiB, MiB, GiB, capped at TiB."""
+    units = ["B", "KiB", "MiB", "GiB", "TiB"]
+    while abs(n) >= 1024 and len(units) > 1:
+        n, units = n / 1024, units[1:]
+    return f"{n}B" if units[0] == "B" else f"{n:.1f}{units[0]}"
+
+
+def gen_sizes(n, total, rng):
+    """Draw n heavy-tailed (lognormal) byte sizes that sum exactly to `total`."""
+    if n == 0:
+        return []
+    raw = [math.exp(rng.gauss(0.0, SIZE_SIGMA)) for _ in range(n)]
+    scale = sum(raw)
+    sizes = [int(w / scale * total) for w in raw]
+    leftover = total - sum(sizes)
+    # Truncation loses a few bytes; hand them to the (first) largest file.
+    if sizes and leftover:
+        sizes[sizes.index(max(sizes))] += leftover
+    return sizes
+
+
+def build_dirs(root, num_dirs, max_depth, rng):
+    """Create the directory skeleton under `root`; return all dirs (root first).
+
+    A d1/d2/.../d<max_depth> spine is always made so the full depth exists;
+    random dirN children are then added until `num_dirs` dirs exist (root
+    included).  With max_depth 0 only root itself is created.
+    """
+    os.makedirs(root)
+    made, depth = [root], {root: 0}
+    parents = [root] if max_depth > 0 else []   # dirs that may take children
+
+    def add(path, level):
+        os.mkdir(path)
+        made.append(path)
+        depth[path] = level
+        if level < max_depth:
+            parents.append(path)
+
+    spine = root
+    for level in range(1, max_depth + 1):
+        spine = os.path.join(spine, f"d{level}")
+        add(spine, level)
+
+    serial = 0
+    while len(made) < num_dirs and parents:
+        parent = rng.choice(parents)
+        serial += 1
+        add(os.path.join(parent, f"dir{serial}"), depth[parent] + 1)
+    return made
+
+
+def write_file(path, size, index, base):
+    """Create `path` with exactly `size` bytes: `base` repeated/truncated, and
+    (for sizes >= 16) a little-endian uint64 (index, size) header up front.
+    """
+    header = struct.pack("<QQ", index, size) if size >= 16 else b""
+    left = size - len(header)
+    with open(path, "wb") as out:
+        out.write(header)
+        while left > 0:
+            piece = base[:left]
+            out.write(piece)
+            left -= len(piece)
+
+
+def rel_symlink(target, link_path):
+    """Symlink link_path -> target, stored relative to link_path's directory."""
+    link_dir = os.path.dirname(link_path)
+    os.symlink(os.path.relpath(target, link_dir), link_path)
+
+
+def safe_rmtree(path):
+    """Delete `path` recursively, first giving each directory mode 0o700."""
+    for walked in os.walk(path):
+        try:
+            os.chmod(walked[0], 0o700)
+        except OSError:
+            pass   # best effort; rmtree below reports anything still stuck
+    shutil.rmtree(path)
+
+
+def generate_tree(root, args):
+    """Build the deterministic source tree at `root` (must not exist); return a summary string."""
+    n = args.files
+    num_dirs = args.dirs if args.dirs is not None else max(args.depth, n // 20, 1)
+    n_sym = args.symlinks if args.symlinks is not None else (max(1, n // 20) if n else 0)
+    n_hard = args.hardlinks if args.hardlinks is not None else (max(1, n // 20) if n else 0)
+
+    rng = random.Random(args.seed)  # every random choice below draws from this one stream
+    base = rng.randbytes(BASE_BUF_SIZE)  # content buffer reused for every file
+
+    dirs = build_dirs(root, num_dirs, args.depth, rng)
+    with open(os.path.join(root, MARKER), "w") as f:
+        f.write(f"generated by perftest.py seed={args.seed} files={n} "
+                f"total={args.total_size}\n")
+
+    sizes = gen_sizes(n, args.total_size, rng)
+    files = []
+    for i in range(n):
+        path = os.path.join(rng.choice(dirs), f"file{i}.dat")
+        write_file(path, sizes[i], i, base)
+        files.append(path)
+
+    hard_made = 0  # counts only the links that were actually created
+    if files:
+        for i in range(n_hard):
+            tgt = rng.choice(files)
+            link = os.path.join(rng.choice(dirs), f"hlink{i}_{os.path.basename(tgt)}")
+            try:
+                os.link(tgt, link)
+                hard_made += 1
+            except OSError:
+                pass  # best effort: a link the OS refuses is simply skipped
+
+    sym_made = 0  # counts only the symlinks that were actually created
+    for i in range(n_sym):
+        link = os.path.join(rng.choice(dirs), f"sym{i}")
+        roll = rng.random()
+        try:
+            if roll < 0.15 or not files:  # ~15% (or no files at all): link meant to dangle
+                os.symlink(f"../broken-target-{i}", link)
+            elif roll < 0.30:  # next ~15%: link to a directory
+                rel_symlink(rng.choice(dirs), link)
+            else:  # remainder: link to a regular file
+                rel_symlink(rng.choice(files), link)
+            sym_made += 1
+        except OSError:
+            pass  # best effort, same as for hard links
+
+    for path in files:  # modes are applied last, once all content and links exist
+        os.chmod(path, rng.choice(FILE_MODES))
+    for path in sorted((d for d in dirs if d != root),
+                       key=lambda p: p.count(os.sep), reverse=True):  # deepest first; root keeps its mode
+        os.chmod(path, rng.choice(DIR_MODES))
+
+    return (f"files={n} dirs={len(dirs)} symlinks={sym_made} hardlinks={hard_made} "
+            f"total={human(sum(sizes))} biggest={human(max(sizes) if sizes else 0)} "
+            f"seed={args.seed}")
+
+
+# ---------------------------------------------------------------------------
+# Benchmark.
+# ---------------------------------------------------------------------------
+
+@dataclasses.dataclass
+class Binary:
+    label: str          # "A" / "B"; also the key into the samples dict
+    path: str           # absolute path to the rsync binary
+    version: str        # first line of `rsync --version`, or a placeholder
+
+
+def rsync_version(path):
+    """Return the first output line of `<path> --version` (placeholder on failure)."""
+    try:
+        proc = subprocess.run([path, "--version"], capture_output=True, text=True, timeout=15)
+    except (OSError, subprocess.TimeoutExpired) as e:
+        return f"(version unavailable: {e})"
+    lines = (proc.stdout or proc.stderr or "").splitlines()
+    return lines[0].strip() if lines else "(no --version output)"
+
+
+def drop_caches():
+    """Flush dirty pages, then drop the page/dentry/inode caches (best effort).
+
+    Writing /proc/sys/vm/drop_caches needs root; returns True iff it worked.
+    """
+    subprocess.run(["sync"], check=False)
+    try:
+        with open("/proc/sys/vm/drop_caches", "w") as ctl:
+            ctl.write("3\n")
+    except OSError:
+        return False
+    return True
+
+
+def time_transfer(binary, rsync_args, src, dest, timeout):
+    """Run one `rsync <args> src/ dest/` and return its wall-clock seconds.
+
+    Raises RuntimeError if rsync can't start, times out or exits non-zero.
+    """
+    argv = [binary.path, *rsync_args, src + "/", dest + "/"]
+    t0 = time.monotonic()
+    try:
+        r = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
+    except (OSError, subprocess.TimeoutExpired) as e:   # report, don't traceback
+        raise RuntimeError(f"{binary.label} ({binary.path}) rsync failed: {e}") from e
+    elapsed = time.monotonic() - t0
+    if r.returncode != 0:
+        raise RuntimeError(f"{binary.label} ({binary.path}) rsync exited {r.returncode}:\n"
+                           f"  cmd: {shlex.join(argv)}\n  {(r.stderr or r.stdout).strip()}")
+    return elapsed
+
+
+def run_benchmark(binaries, args, src, dest_full, dest_noop):
+    """Run the alternating loops; return {label: {mode: [all samples]}}.
+
+    Each of the args.warmup + args.runs loops times every binary in the
+    modes chosen by args.mode.  Warm-up samples are included in the result
+    (the caller drops them).  A failed transfer propagates as RuntimeError.
+    """
+    modes = [m for m in ("full", "noop") if args.mode in ("both", m)]
+    # Pre-populate the shared no-op destination so every timed no-op run finds
+    # nothing to do.  Use binary A; its content is identical for B.
+    if "noop" in modes:
+        time_transfer(binaries[0], args.rsync_args, src, dest_noop, args.timeout)
+
+    samples = {b.label: {m: [] for m in ("full", "noop")} for b in binaries}
+    warned = False
+    for loop in range(args.warmup + args.runs):
+        tag = "warmup" if loop < args.warmup else f"run {loop - args.warmup + 1}/{args.runs}"
+        # Alternate which binary goes first to cancel first-mover/thermal drift.
+        for b in (binaries if loop % 2 == 0 else list(reversed(binaries))):
+            for mode in modes:
+                dest = dest_full if mode == "full" else dest_noop
+                if mode == "full":   # every full copy starts from an empty dest
+                    if os.path.exists(dest_full):
+                        safe_rmtree(dest_full)
+                    os.mkdir(dest_full)
+                if args.drop_caches and not drop_caches() and not warned:
+                    warned = True    # say it once, not before every run
+                    print("warning: --drop-caches had no effect (needs root); "
+                          "timings are warm-cache", file=sys.stderr)
+                t = time_transfer(b, args.rsync_args, src, dest, args.timeout)
+                samples[b.label][mode].append(t)
+                _progress(b, mode, tag, t)
+    return samples
+
+
+def _progress(binary, mode, tag, t):
+    suffix = {"warmup": " (warmup, excluded)"}.get(tag, "")   # mark dropped runs
+    print(f"  [{tag:>10}] {binary.label} {mode:<4} {t:8.3f}s{suffix}")
+
+
+# ---------------------------------------------------------------------------
+# Reporting.
+# ---------------------------------------------------------------------------
+
+def _stats(times):
+    """Return (n, mean, sample stddev, min, median); all zero when empty."""
+    if not times:
+        return (0, 0.0, 0.0, 0.0, 0.0)
+    mean, median = statistics.mean(times), statistics.median(times)
+    # stdev() needs at least two samples; report 0.0 for a lone one.
+    spread = statistics.stdev(times) if len(times) > 1 else 0.0
+    return (len(times), mean, spread, min(times), median)
+
+
+def report(binaries, samples, args):
+    """Print the per-binary tables and the A-vs-B comparison; return exit code.
+
+    The first args.warmup samples of every series are excluded from the
+    statistics.  Always returns 0, even when a regression is flagged.
+    """
+    print("\n" + "=" * 72)
+    for b in binaries:
+        print(f"{b.label}: {b.path}\n   {b.version}")
+    print(f"rsync args: {shlex.join(args.rsync_args)}   "
+          f"(note: a full copy is not fsync'd unless you add --fsync)")
+    print("=" * 72)
+
+    modes = [m for m in ("full", "noop") if any(samples[b.label][m] for b in binaries)]
+    hdr = f"{'binary':<7}{'mode':<6}{'runs':>5}{'mean':>11}{'stddev':>11}{'min':>11}{'median':>11}"
+    a, c = binaries[0].label, binaries[1].label
+    for mode in modes:
+        print(f"\n{hdr}\n{'-' * len(hdr)}")
+        st = {}
+        for b in binaries:
+            st[b.label] = _stats(samples[b.label][mode][args.warmup:])
+            n, mean, sd, mn, md = st[b.label]
+            print(f"{b.label:<7}{mode:<6}{n:>5}{mean:>10.3f}s{sd:>10.3f}s"
+                  f"{mn:>10.3f}s{md:>10.3f}s")
+        (na, ma, sda, *_), (nc, mc, sdc, *_) = st[a], st[c]
+        if na and nc and ma > 0:
+            delta = mc - ma
+            pct = delta / ma * 100.0
+            noise = max(sda, sdc)
+            # The gap must beat both the run-to-run noise and --threshold to count.
+            if delta > noise and pct > args.threshold:
+                verdict = f"REGRESSION (slower): {c} is {pct:+.1f}% vs {a}"
+            elif delta < -noise and -pct > args.threshold:
+                verdict = f"faster: {c} is {pct:+.1f}% vs {a}"
+            else:
+                verdict = f"no significant change: {pct:+.1f}% (within noise)"
+            print(f"  {mode}: {a} {ma:.3f}s  vs  {c} {mc:.3f}s  ->  {verdict}")
+
+    if args.csv:
+        _write_csv(args.csv, binaries, samples, args.warmup)
+        print(f"\nraw per-run timings written to {args.csv}")
+    return 0
+
+
+def _write_csv(path, binaries, samples, warmup=1):
+    """Dump every sample as CSV; the first `warmup` rows per series get warmup=1."""
+    with open(path, "w") as f:
+        f.write("binary,path,mode,run,warmup,seconds\n")
+        for b in binaries:
+            for mode in ("full", "noop"):
+                for i, t in enumerate(samples[b.label][mode]):
+                    f.write(f"{b.label},{b.path},{mode},{i},{int(i < warmup)},{t:.6f}\n")
+
+
+# ---------------------------------------------------------------------------
+# Main.
+# ---------------------------------------------------------------------------
+
+def main():
+    """Parse the command line, prepare the trees, run the benchmark, report.
+
+    Exits with 0 on success, 2 if the benchmark aborts, 130 on Ctrl-C.
+    """
+    ap = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("rsync_a", help="path to the first rsync binary (labelled A)")
+    ap.add_argument("rsync_b", help="path to the second rsync binary (labelled B)")
+    ap.add_argument("-n", "--runs", type=int, default=10,
+                    help="measured loops per binary (default: 10)")
+    ap.add_argument("--warmup", type=int, default=1,
+                    help="leading runs per binary dropped from the stats to "
+                         "reduce cache impact (default: 1)")
+    ap.add_argument("--mode", choices=("both", "full", "noop"), default="both",
+                    help="full=clean-dest copy, noop=re-sync scan overhead, "
+                         "both (default)")
+    ap.add_argument("--rsync-args", default="-aH",
+                    help="rsync flags for the timed transfer (default: -aH)")
+    ap.add_argument("--threshold", type=float, default=2.0,
+                    help="percent slowdown above run-to-run noise before a "
+                         "regression is flagged (default: 2.0)")
+    # Tree-generation knobs (mirror gentestdata.py).
+    ap.add_argument("--src", default=None,
+                    help="benchmark this existing tree instead of generating one")
+    ap.add_argument("-f", "--files", type=int, default=10000,
+                    help="number of regular files to generate (default: 10000)")
+    ap.add_argument("-s", "--total-size", type=parse_size, default="500M",
+                    help="total size of all regular files (default: 500M)")
+    ap.add_argument("-d", "--depth", type=int, default=10,
+                    help="maximum directory tree depth (default: 10)")
+    ap.add_argument("--dirs", type=int, default=None,
+                    help="number of directories (default: max(depth, files/20))")
+    ap.add_argument("--symlinks", type=int, default=None,
+                    help="number of symlinks (default: files/20)")
+    ap.add_argument("--hardlinks", type=int, default=None,
+                    help="number of hard links (default: files/20)")
+    ap.add_argument("--seed", type=int, default=1,
+                    help="PRNG seed for a reproducible tree (default: 1)")
+    ap.add_argument("--workdir", default=None,
+                    help="scratch root for src/dest dirs (default: a tempdir)")
+    ap.add_argument("--drop-caches", action="store_true",
+                    help="sync + drop page/dentry/inode caches before each timed "
+                         "run (needs root; cold-cache measurement)")
+    ap.add_argument("--timeout", type=float, default=3600.0,
+                    help="seconds before a single rsync run is abandoned "
+                         "(default: 3600)")
+    ap.add_argument("--keep", action="store_true",
+                    help="keep the scratch tree on exit (default: remove it)")
+    ap.add_argument("--csv", default=None,
+                    help="write raw per-run timings to this CSV file")
+    args = ap.parse_args()
+
+    if args.runs < 2:
+        ap.error("--runs must be >= 2 (need >=2 samples for a stddev)")
+    if args.warmup < 0:
+        ap.error("--warmup must be >= 0")
+    if args.src and not os.path.isdir(args.src):   # checked before any dir is made
+        ap.error(f"--src is not a directory: {args.src}")
+    args.rsync_args = shlex.split(args.rsync_args)
+
+    binaries = []
+    for label, p in (("A", args.rsync_a), ("B", args.rsync_b)):
+        path = os.path.abspath(p)
+        if not (os.path.isfile(path) and os.access(path, os.X_OK)):
+            ap.error(f"rsync {label} is not an executable file: {p}")
+        binaries.append(Binary(label, path, rsync_version(path)))
+
+    if args.workdir:
+        os.makedirs(args.workdir, exist_ok=True)   # mkdtemp will not create it
+    if args.keep and args.workdir:   # a kept tree in a chosen workdir gets a fixed name
+        workdir = os.path.join(args.workdir, "rsync-perftest")
+    else:
+        workdir = tempfile.mkdtemp(prefix="rsync-perftest-", dir=args.workdir)
+    dest_full = os.path.join(workdir, "dest_full")
+    dest_noop = os.path.join(workdir, "dest_noop")
+    os.makedirs(dest_noop, exist_ok=True)   # also creates workdir when it is new
+
+    rc = 1
+    try:
+        if args.src:
+            src = os.path.abspath(args.src)
+            print(f"using existing source tree {src}")
+        else:
+            src = os.path.join(workdir, "src")
+            if os.path.exists(os.path.join(src, MARKER)):
+                safe_rmtree(src)   # stale tree left by an earlier --keep run
+            print(f"generating source tree in {src} ...")
+            t0 = time.monotonic()
+            summary = generate_tree(src, args)
+            print(f"  {summary}  ({time.monotonic() - t0:.1f}s)")
+        print(f"\nbenchmarking: warmup={args.warmup} runs={args.runs} mode={args.mode} "
+              f"drop_caches={args.drop_caches}\n")
+        samples = run_benchmark(binaries, args, src, dest_full, dest_noop)
+        rc = report(binaries, samples, args)
+    except (RuntimeError, OSError, subprocess.TimeoutExpired) as e:
+        print(f"\nbenchmark aborted: {e}", file=sys.stderr)
+        rc = 2
+    except KeyboardInterrupt:
+        print("\ninterrupted", file=sys.stderr)
+        rc = 130
+    finally:
+        if args.keep:
+            print(f"\nkept scratch tree: {workdir}")
+        else:
+            safe_rmtree(workdir)   # without --keep this is always our own mkdtemp dir
+    sys.exit(rc)
+
+
+if __name__ == "__main__":
+    main()
+
+# vim: sw=4 et ft=python