diff --git a/R/table_columns.R b/R/table_columns.R
index 80cadef..0c853b5 100644
--- a/R/table_columns.R
+++ b/R/table_columns.R
@@ -96,14 +96,16 @@ compute_col_widths <- function(resolved_cols, data, content_width_in,
                                tbl, pg_width, pg_height, margins,
                                overflow_action   = c("error", "warn"),
                                validate_overflow = TRUE,
-                               floor_overrides   = NULL) {
+                               floor_overrides   = NULL,
+                               cache             = NULL) {
   overflow_action <- match.arg(overflow_action)
   strategy <- tbl$col_split_strategy %||% "balanced"
 
   # Shared setup: compute natural widths, resolve relative weights,
   # auto-detect wrap eligibility, and measure col_cont_label_half_w.
   setup <- .resolve_natural_widths(
-    resolved_cols, data, content_width_in, tbl, pg_width, pg_height, margins
+    resolved_cols, data, content_width_in, tbl, pg_width, pg_height, margins,
+    cache = cache
   )
 
   # Dispatch.  Each strategy returns list(resolved_cols, col_groups).
@@ -162,7 +164,8 @@ compute_col_widths <- function(resolved_cols, data, content_width_in,
 # The scratch device used for text measurement is opened, used, and
 # closed inside this function so neither strategy has to manage it.
 .resolve_natural_widths <- function(resolved_cols, data, content_width_in,
-                                     tbl, pg_width, pg_height, margins) {
+                                     tbl, pg_width, pg_height, margins,
+                                     cache = NULL) {
   n_cols    <- length(resolved_cols)
   n_grp     <- length(tbl$group_vars)
   min_in    <- .width_in(tbl$min_col_width)
@@ -171,6 +174,7 @@ compute_col_widths <- function(resolved_cols, data, content_width_in,
                .width_in(cell_pad[["left"]])
   na_str    <- tbl$na_string
   max_rows  <- tbl$max_measure_rows
+  hdr_gp_key  <- paste0("header_row_lh", tbl$line_height)
 
   scratch_file <- tempfile(fileext = ".pdf")
   grDevices::pdf(scratch_file, width = pg_width, height = pg_height)
@@ -198,8 +202,12 @@ compute_col_widths <- function(resolved_cols, data, content_width_in,
         .resolve_table_gp(tbl$gp, "header_row"), tbl$line_height
       )
       parts  <- .split_col_strings(data[[cs$col]], cs$label, na_str, max_rows)
-      w_data <- .measure_max_string_width(parts$data, cell_gp)
-      w_hdr  <- .measure_max_string_width(parts$header, hdr_gp)
+      cell_key <- paste0(if (cs$is_group_col) "group_col" else "data_row",
+                         "_lh", tbl$line_height)
+      w_data <- .measure_max_string_width(parts$data,   cell_gp,
+                                          gp_key = cell_key, cache = cache)
+      w_hdr  <- .measure_max_string_width(parts$header, hdr_gp,
+                                          gp_key = hdr_gp_key, cache = cache)
       max(min_in, max(w_data, w_hdr) + h_pad_in)
     }
   }, numeric(1L))
diff --git a/R/table_draw.R b/R/table_draw.R
index 197c78a..21936d5 100644
--- a/R/table_draw.R
+++ b/R/table_draw.R
@@ -66,7 +66,8 @@ build_table_grob <- function(row_page, col_group_idx, n_group_cols,
                              cell_heights_in_mat = NULL,
                              cont_row_h_in       = NULL,
                              is_first_col_page   = TRUE,
-                             is_last_col_page    = TRUE) {
+                             is_last_col_page    = TRUE,
+                             clip_width_caches   = NULL) {
   # Subset to display columns for this page
   page_cols <- resolved_cols[col_group_idx]
 
@@ -81,6 +82,11 @@ build_table_grob <- function(row_page, col_group_idx, n_group_cols,
     cont_row_h_in       = cont_row_h_in,        # cached from paginate phase
     is_first_col_page   = is_first_col_page,    # FALSE when prior col pages exist
     is_last_col_page    = is_last_col_page,     # FALSE when more col pages follow
+    clip_width_caches   = clip_width_caches,    # shared across all pages of
+                                                # this tfl_table; NULL means
+                                                # drawDetails will create
+                                                # per-page envs (e.g. grobs
+                                                # assembled outside pipeline)
     cl                  = "tfl_table_grob"
   )
 }
@@ -300,12 +306,22 @@ drawDetails.tfl_table_grob <- function(x, recording) {
   })
 
   # Per-column clip-width memo: many cells in a column share identical text
-  # (numeric formats like "5.1", category labels), and the clip-width
-  # computation otherwise re-measures each one.  Cache scoped to this
-  # drawDetails call and to one column, so a single (text -> width) key is
-  # enough.
-  clip_width_cache_by_col <- lapply(page_cols, function(cs) {
-    new.env(hash = TRUE, parent = emptyenv())
+  # (numeric formats like "5.1", category labels).  When the parent
+  # tfl_table_to_pagelist() built a shared cache list, reuse those envs --
+  # they persist across every row-page and col-group of this table, so the
+  # cache hits accumulate over all pages.  Otherwise fall back to per-page
+  # envs (e.g. for grobs built outside the normal pipeline).
+  clip_width_cache_by_col <- if (!is.null(x$clip_width_caches)) {
+    lapply(x$col_group_idx, function(k) x$clip_width_caches[[k]])
+  } else {
+    lapply(page_cols, function(cs) new.env(hash = TRUE, parent = emptyenv()))
+  }
+
+  # Pre-extract and pre-format each column's cell strings for the rows on
+  # this page.  Replaces a per-cell `.fmt_cell(data[[cs$col]][i], na_str)`
+  # with a single vectorised `.fmt_cell_vec()` per column.
+  cell_strs_by_col <- lapply(page_cols, function(cs) {
+    .fmt_cell_vec(data[[cs$col]][rows], na_str)
   })
 
   # Hoist row/group-rule gpars too -- they don't change between rows.  Only
@@ -358,9 +374,8 @@ drawDetails.tfl_table_grob <- function(x, recording) {
 
     # Draw data row
     for (j in seq_len(n_disp_cols)) {
-      cs      <- page_cols[[j]]
-      raw_val <- data[[cs$col]][i]
-      cell_str <- .fmt_cell(raw_val, na_str)
+      cs       <- page_cols[[j]]
+      cell_str <- cell_strs_by_col[[j]][[ri]]
 
       # Group repeat suppression and span detection
       clip_h <- row_h
@@ -590,10 +605,38 @@ drawDetails.tfl_table_grob <- function(x, recording) {
   # call; the optional per-column cache lets the caller deduplicate.
   text_w <- .measure_text_width_in(text, gp, width_cache)
   needed <- text_w + h_lft_in + h_rgt_in
-  bleed_tol_in <- 0.05
-  clip_w <- min(col_width_in + bleed_tol_in, max(col_width_in, needed))
 
-  # Clip to column width by using a clipping viewport
+  # X position in parent-viewport inches.
+  x_in <- if (identical(align, "left")) {
+    x_left + h_lft_in
+  } else if (identical(align, "right")) {
+    x_right - h_rgt_in
+  } else {
+    (x_left + x_right) / 2
+  }
+  y_in <- y_top_in + v_top_in
+
+  if (needed <= col_width_in) {
+    # Fast path: the text fits inside its column, so the column width
+    # already clips by construction.  Draw directly into the parent
+    # viewport and skip the per-cell clip viewport push/pop entirely --
+    # for tables of all-fits cells (numeric columns, short categoricals)
+    # this saves a viewport per cell.
+    grid::grid.text(
+      label = text,
+      x     = grid::unit(x_in / vp_w, "npc"),
+      y     = grid::unit(1 - y_in / vp_h, "npc"),
+      just  = just,
+      gp    = gp
+    )
+    return(invisible(NULL))
+  }
+
+  # Slow path: text exceeds the column.  Push a clipping viewport so the
+  # overflow is clipped at the column edge plus a small tolerance,
+  # instead of bleeding into the neighbour.
+  bleed_tol_in <- 0.05
+  clip_w <- min(col_width_in + bleed_tol_in, needed)
   vp_clip <- grid::viewport(
     x      = grid::unit(x_left / vp_w, "npc"),
     y      = grid::unit(1 - (y_top_in + row_h) / vp_h, "npc"),
@@ -604,18 +647,17 @@ drawDetails.tfl_table_grob <- function(x, recording) {
   )
   grid::pushViewport(vp_clip)
 
-  # Re-express x, y relative to clip viewport
-  vp_w2 <- .width_in(grid::unit(1, "npc"))
-  vp_h2 <- .height_in(grid::unit(1, "npc"))
-
+  # vp_clip was just constructed with width = clip_w inches and
+  # height = row_h inches, so npc=1 inside it is exactly that many inches --
+  # no need to convertUnit() to recover those numbers.
   x_local_in <- if (identical(align, "left")) {
     h_lft_in
   } else if (identical(align, "right")) {
-    vp_w2 - h_rgt_in
+    clip_w - h_rgt_in
   } else {
-    vp_w2 / 2
+    clip_w / 2
   }
-  y_local_in <- vp_h2 - v_top_in
+  y_local_in <- row_h - v_top_in
 
   grid::grid.text(
     label = text,
diff --git a/R/table_pagelist.R b/R/table_pagelist.R
index bb2583f..2a40fc4 100644
--- a/R/table_pagelist.R
+++ b/R/table_pagelist.R
@@ -150,6 +150,15 @@ tfl_table_to_pagelist <- function(tbl, pg_width, pg_height, dots,
   wrap_extra_pad_in <- if (!is.null(tbl$wrap_extra_padding)) {
     .height_in(tbl$wrap_extra_padding)
   } else 0
+
+  # Per-pagination text-dimension cache.  Spans the natural-width pass,
+  # the row-height pass, and the header-height pass.  Each (gp_key, string)
+  # is constructed via grid::textGrob exactly once; both dimensions are read
+  # from that grob and cached together.  Different passes typically need
+  # only one dimension, so the cache amortises the other for free.  All
+  # those passes open PDF scratch devices with the same pg_width/pg_height
+  # so font metrics are stable across the cache's lifetime.
+  text_dim_cache <- new.env(hash = TRUE, parent = emptyenv())
   strategy        <- tbl$col_split_strategy %||% "balanced"
   max_retries     <- as.integer(tbl$row_overflow_max_retries %||% 5L)
   use_retry_loop  <- identical(strategy, "balanced") && max_retries > 0L
@@ -178,14 +187,16 @@ tfl_table_to_pagelist <- function(tbl, pg_width, pg_height, dots,
     header_row_h <- if (tbl$show_col_names) {
       .measure_header_row_height(resolved_cols, tbl$gp, tbl$cell_padding,
                                  tbl$line_height, breaks = breaks,
-                                 wrap_extra_pad_in = wrap_extra_pad_in)
+                                 wrap_extra_pad_in = wrap_extra_pad_in,
+                                 cache = text_dim_cache)
     } else 0
 
     cell_h_mat <- measure_row_heights_tbl(
       tbl$data, resolved_cols, tbl$gp, tbl$cell_padding,
       tbl$na_string, tbl$line_height, tbl$max_measure_rows,
       breaks = breaks,
-      wrap_extra_pad_in = wrap_extra_pad_in
+      wrap_extra_pad_in = wrap_extra_pad_in,
+      cache = text_dim_cache
     )
 
     cont_row_h <- max(
@@ -217,7 +228,8 @@ tfl_table_to_pagelist <- function(tbl, pg_width, pg_height, dots,
     col_result <- compute_col_widths(
       resolved_cols_0, tbl$data, cw, tbl, pg_width, pg_height, margins,
       overflow_action = overflow_action,
-      floor_overrides = floor_overrides
+      floor_overrides = floor_overrides,
+      cache           = text_dim_cache
     )
     resolved_cols <- col_result$resolved_cols
     col_groups    <- col_result$col_groups
@@ -239,7 +251,8 @@ tfl_table_to_pagelist <- function(tbl, pg_width, pg_height, dots,
         resolved_cols_0, tbl$data, cw_adj, tbl, pg_width, pg_height, margins,
         overflow_action   = overflow_action,
         validate_overflow = FALSE,
-        floor_overrides   = floor_overrides
+        floor_overrides   = floor_overrides,
+        cache             = text_dim_cache
       )
       resolved_cols <- col_result$resolved_cols
       col_groups    <- col_result$col_groups
@@ -287,6 +300,15 @@ tfl_table_to_pagelist <- function(tbl, pg_width, pg_height, dots,
   pages <- vector("list", n_rp * n_cg)
   idx   <- 1L
 
+  # Shared per-resolved_cols clip-width cache.  Every page-grob built below
+  # holds a reference to the SAME list of envs, so .draw_cell_text() can
+  # reuse measurements across pages of the same tfl_table (one entry per
+  # unique cell text per column).  Envs are reference-typed, so memory is
+  # not duplicated per grob.
+  clip_width_caches <- lapply(seq_along(resolved_cols), function(k) {
+    new.env(hash = TRUE, parent = emptyenv())
+  })
+
   for (rp in seq_len(n_rp)) {
     for (cg in seq_len(n_cg)) {
       grob <- build_table_grob(
@@ -298,7 +320,8 @@ tfl_table_to_pagelist <- function(tbl, pg_width, pg_height, dots,
         cell_heights_in_mat  = cell_h_mat,
         cont_row_h_in        = cont_row_h,
         is_first_col_page    = (cg == 1L),
-        is_last_col_page     = (cg == n_cg)
+        is_last_col_page     = (cg == n_cg),
+        clip_width_caches    = clip_width_caches
       )
       page_spec <- list(content = grob)
       pages[[idx]] <- page_spec
diff --git a/R/table_rows.R b/R/table_rows.R
index 6af93e1..7c71d23 100644
--- a/R/table_rows.R
+++ b/R/table_rows.R
@@ -26,7 +26,8 @@
 #' @keywords internal
 measure_row_heights_tbl <- function(data, resolved_cols, gp_tbl, cell_padding,
                                     na_string, line_height, max_measure_rows,
-                                    breaks = NULL, wrap_extra_pad_in = 0) {
+                                    breaks = NULL, wrap_extra_pad_in = 0,
+                                    cache = NULL) {
   n_rows   <- nrow(data)
   n_cols   <- length(resolved_cols)
   v_pad_in <- .height_in(cell_padding[["top"]]) +
@@ -34,17 +35,12 @@ measure_row_heights_tbl <- function(data, resolved_cols, gp_tbl, cell_padding,
   h_lft_in <- .width_in(cell_padding[["left"]])
   h_rgt_in <- .width_in(cell_padding[["right"]])
 
-  # Memoised per-cell-text-height function: (string, gp_key) -> height_in
-  memo <- new.env(hash = TRUE, parent = emptyenv())
-  .memo_str_height <- function(s, gp_key, gp) {
-    key <- paste0(gp_key, "\x01", s)
-    if (!exists(key, envir = memo, inherits = FALSE)) {
-      grob <- grid::textGrob(s, gp = gp)
-      h    <- .height_in(grid::grobHeight(grob))
-      assign(key, h, envir = memo)
-    }
-    get(key, envir = memo, inherits = FALSE)
-  }
+  # When the caller supplied a shared cache, reuse it across columns so that
+  # any (string, gp) measured during the natural-width pass earlier in
+  # pagination is served from the cache here.  Otherwise create a local
+  # height-only memo (preserves the pre-cache behaviour for the few callers
+  # that exercise this function directly).
+  if (is.null(cache)) cache <- new.env(hash = TRUE, parent = emptyenv())
 
   # Limit rows sampled for height estimation
   sample_rows <- if (is.finite(max_measure_rows) && n_rows > max_measure_rows) {
@@ -83,7 +79,7 @@ measure_row_heights_tbl <- function(data, resolved_cols, gp_tbl, cell_padding,
         cell_str
       }
       nlines  <- max(1L, length(strsplit(display_str, "\n", fixed = TRUE)[[1L]]))
-      h_grob  <- .memo_str_height(display_str, gp_key, cell_gp)
+      h_grob  <- .measure_text_dims_in(display_str, cell_gp, gp_key, cache)$h
       h_line  <- nlines * .height_in(grid::stringHeight("M"))
       extra   <- if (nlines > 1L) wrap_extra_pad_in else 0
       cell_h_mat[i, j] <- max(h_grob, h_line) + v_pad_in + extra
diff --git a/R/table_utils.R b/R/table_utils.R
index b81b17c..379f478 100644
--- a/R/table_utils.R
+++ b/R/table_utils.R
@@ -52,13 +52,15 @@
 # between the header row and the first data row is more obvious.
 .measure_header_row_height <- function(resolved_cols, gp_tbl, cell_padding,
                                        line_height, breaks = NULL,
-                                       wrap_extra_pad_in = 0) {
+                                       wrap_extra_pad_in = 0,
+                                       cache = NULL) {
   v_pad_in <- .height_in(cell_padding[["top"]]) +
               .height_in(cell_padding[["bottom"]])
   h_lft_in <- .width_in(cell_padding[["left"]])
   h_rgt_in <- .width_in(cell_padding[["right"]])
   hdr_gp   <- .gp_with_lineheight(.resolve_table_gp(gp_tbl, "header_row"),
                                    line_height)
+  gp_key   <- paste0("header_row_lh", line_height)
 
   max(vapply(resolved_cols, function(cs) {
     label <- cs$label
@@ -67,8 +69,7 @@
                                       h_lft_in + h_rgt_in, hdr_gp, breaks)
     }
     nlines <- max(1L, length(strsplit(label, "\n", fixed = TRUE)[[1L]]))
-    grob   <- grid::textGrob(label, gp = hdr_gp)
-    h_grob <- .height_in(grid::grobHeight(grob))
+    h_grob <- .measure_text_dims_in(label, hdr_gp, gp_key, cache)$h
     h_line <- nlines * .height_in(grid::stringHeight("M"))
     extra  <- if (nlines > 1L) wrap_extra_pad_in else 0
     max(h_grob, h_line) + extra
@@ -255,15 +256,53 @@
 # Text measurement helpers
 # ---------------------------------------------------------------------------
 
+# Look up or measure (width, height) for `s` under `gp`, caching both when
+# an environment is supplied.  `gp_key` is a stable structural key for `gp`
+# (e.g. paste0("data_row_lh", line_height)) so callers using the same gp
+# share entries without hashing the gpar field-by-field per lookup.
+#
+# When the cache misses we build the textGrob once and read BOTH dimensions
+# from it -- consolidating what would otherwise be two separate textGrob
+# constructions if a later caller needs the other dimension of the same
+# (gp, string).  Each construction re-runs grid's gpar validation, which
+# is the dominant cost; avoiding the duplicate is the point.
+.measure_text_dims_in <- function(s, gp, gp_key, cache = NULL) {
+  if (!nzchar(s)) return(list(w = 0, h = 0))
+  if (!is.null(cache)) {
+    key <- paste0(gp_key, "\x01", s)
+    if (exists(key, envir = cache, inherits = FALSE)) {
+      return(get(key, envir = cache, inherits = FALSE))
+    }
+  }
+  g <- grid::textGrob(s, gp = gp)
+  out <- list(w = .width_in(grid::grobWidth(g)),
+              h = .height_in(grid::grobHeight(g)))
+  if (!is.null(cache)) assign(key, out, envir = cache)
+  out
+}
+
 # Measure the maximum rendered text width (in inches) for a vector of strings.
 # Uses textGrob rather than stringWidth() because stringWidth() does not
 # accept a gp argument in all grid versions.
-.measure_max_string_width <- function(strings, gp) {
+#
+# When a `cache` env and `gp_key` are supplied, lookups go through the
+# consolidated (string -> (w, h)) cache from `.measure_text_dims_in()` so a
+# later height query for the same (string, gp) reuses the textGrob.
+.measure_max_string_width <- function(strings, gp, gp_key = NULL,
+                                      cache = NULL) {
   if (length(strings) == 0L) return(0)
   # Dedupe up front: real-world callers pass cell-string vectors where the
   # same value typically appears in many rows (e.g. category labels, NA
   # strings), so this saves grid round-trips with no behaviour change.
   uniq <- unique(strings)
+  if (!is.null(cache) && !is.null(gp_key)) {
+    return(max(vapply(uniq, function(s) {
+      lines <- strsplit(s, "\n", fixed = TRUE)[[1L]]
+      max(vapply(lines,
+                 function(ln) .measure_text_dims_in(ln, gp, gp_key, cache)$w,
+                 numeric(1L)))
+    }, numeric(1L))))
+  }
   max(vapply(uniq, function(s) {
     lines <- strsplit(s, "\n", fixed = TRUE)[[1L]]
     max(vapply(lines, function(ln) {
diff --git a/R/wrap.R b/R/wrap.R
index 7fd9acf..289ebf0 100644
--- a/R/wrap.R
+++ b/R/wrap.R
@@ -112,21 +112,25 @@ wrap_breaks_default <- function() {
   n     <- length(chars)
   if (n == 0L) return(list())
 
-  tokens   <- vector("list", n)        # over-allocate; trim at end
-  k        <- 0L
-  cur_buf  <- character(n)
-  cur_n    <- 0L
-  pending  <- ""                       # lead for the next emitted token
+  # Track each token by its [start, end] position in `s` rather than
+  # accumulating characters and pasting on flush.  `substr(s, st, ed)` is
+  # one C call per token; the previous `paste(cur_buf[seq_len(cur_n)],
+  # collapse = "")` allocated and joined cur_n elements per token.
+  tokens    <- vector("list", n)       # over-allocate; trim at end
+  k         <- 0L
+  cur_start <- 0L                       # 0 means "no token in progress"
+  cur_end   <- 0L
+  pending   <- ""
 
   flush <- function() {
-    if (cur_n > 0L) {
+    if (cur_start > 0L) {
       k <<- k + 1L
       tokens[[k]] <<- list(
-        text = paste(cur_buf[seq_len(cur_n)], collapse = ""),
+        text = substr(s, cur_start, cur_end),
         lead = pending
       )
-      cur_n   <<- 0L
-      pending <<- ""
+      cur_start <<- 0L
+      pending   <<- ""
     }
   }
 
@@ -136,13 +140,13 @@ wrap_breaks_default <- function() {
       flush()
       pending <- ch
     } else if (length(keep_chars) > 0L && ch %in% keep_chars) {
-      cur_n          <- cur_n + 1L
-      cur_buf[cur_n] <- ch
+      if (cur_start == 0L) cur_start <- i
+      cur_end <- i
       flush()
       pending <- ""
     } else {
-      cur_n          <- cur_n + 1L
-      cur_buf[cur_n] <- ch
+      if (cur_start == 0L) cur_start <- i
+      cur_end <- i
     }
   }
   flush()
diff --git a/design/DECISIONS.md b/design/DECISIONS.md
index f9260e4..b6c6f8a 100644
--- a/design/DECISIONS.md
+++ b/design/DECISIONS.md
@@ -1483,3 +1483,272 @@ never touches `drawDetails.tfl_table_grob`.
 **Verification:**
 - Full `devtools::test()` green before and after.
 - `examples/bench_compare.R` reproduces the table above.
+
+---
+
+## D-45: Fast-path cell drawing, position-based tokenizer, and per-page formatting
+
+**Decision:** Three independent, profile-driven changes layered on top of
+D-43/D-44:
+
+1. **Fast path in `.draw_cell_text()`** — when the measured text width
+   plus horizontal padding is no greater than the column width, draw
+   directly in the parent viewport instead of pushing a per-cell
+   clipping viewport.  The clip viewport (which created+pushed+popped
+   per cell) is still used in the slow path, where text might bleed.
+2. **Drop redundant `convertUnit` calls in `.draw_cell_text()`'s clip
+   path** — the clip viewport is constructed with explicit
+   `unit(clip_w, "inches")` / `unit(row_h, "inches")` dimensions, so
+   the post-push `vp_w2 <- .width_in(unit(1, "npc"))` /
+   `vp_h2 <- .height_in(unit(1, "npc"))` calls were re-measuring known
+   values.  Replaced with the literals.
+3. **Position-based tokenizer in `.tokenize_for_wrap()`** — replaced the
+   per-token `paste(cur_buf[seq_len(cur_n)], collapse = "")` with
+   `substr(s, cur_start, cur_end)`.  One C call per token instead of
+   `n` element accumulations + `paste`.
+4. **Per-page formatting hoist in `drawDetails`** — replaced per-cell
+   `.fmt_cell(data[[cs$col]][i], na_str)` with a single
+   `.fmt_cell_vec(data[[cs$col]][rows], na_str)` per column built before
+   the row loop.
+
+**Context (profiling, round 3):**  Post-D-44, the `core_paginate`
+Rprof showed `table_draw.R:403` (the `.draw_cell_text()` call site)
+accumulating **73% of total time**.  Drilling into `.draw_cell_text()`:
+
+| Line | Self self.pct | Total total.pct | What |
+|------|--------------|-----------------|------|
+| 597 (measure) | 0.73% | 13.76% | text-width measurement (cached, but still per cell) |
+| 603 (vp_clip) | 1.17% | 13.03% | viewport creation (4 `grid::unit()` + `viewport()`) |
+| 611 (pushViewport) | — | — | pushing the clip vp |
+| 625 (grid.text)   | 0.29% | 29.14% | the actual text draw |
+| 633 (popViewport) | 0.15% | 6.44% | popping |
+
+The viewport push/pop pair (~20% of total) is only meaningful when text
+overflows the column.  For all-numeric tables (iris) and post-wrap
+cells, the text always fits and the clip is redundant.
+
+**Measured improvement** (medians, 15 iterations per core scenario, 3
+for `wrap_demos`, 5 independent runs averaged; baseline = main + D-43 +
+D-44 at `7c9484c`):
+
+| Scenario        | Before    | After     | Δ                       |
+|-----------------|-----------|-----------|-------------------------|
+| `core_small`    | 146 ms    | 111 ms    | **~24% ↓**              |
+| `core_wrap`     | 206 ms    | 209 ms    | within run-to-run noise |
+| `core_paginate` | 397 ms    | 270 ms    | **~32% ↓**              |
+| `figure_multi`  | 318 ms    | 330 ms    | within noise            |
+| `wrap_demos`    | 2.97 s    | 2.80 s    | **~6% ↓**               |
+
+`core_paginate` (the iris-heavy draw scenario) and `core_small`
+(short table going through the same draw loop) get the biggest gains.
+`wrap_demos` had high run-to-run variance with only n=3 iterations;
+averaged across 5 independent runs it shows a steady ~6% reduction.
+`core_wrap` is unchanged because its bottleneck is height-balance
+measurement, not cell drawing.  `figure_multi` doesn't touch
+`drawDetails.tfl_table_grob`.
+
+**Alternatives considered and rejected:**
+- *Regex-based vectorized tokenizer* — `regmatches`/`strsplit`-driven
+  reimplementation would shave more time but the algorithm has subtle
+  drop-vs-keep_before semantics (multiple consecutive drops collapse to
+  one lead, keep_before chars stay with the preceding token).  The
+  position-based change keeps the existing algorithm verbatim and
+  preserves the comment-level explanation.  Faster vectorization was
+  not worth the risk-of-regression.
+- *Pre-computed parent-viewport coordinates passed into
+  `.draw_cell_text()`* — would push more work into the drawDetails
+  body.  The current shape (compute once inside `.draw_cell_text()`) is
+  shorter and the per-call overhead is now tiny.
+- *Skipping the text-width measurement entirely when `nchar(text)` is
+  small enough* — would require a font-aware upper bound on per-char
+  width; fragile across font sizes and families.  The cached
+  measurement is already cheap on cache hit.
+
+**Files touched:**
+- `R/table_draw.R`:
+  - `drawDetails.tfl_table_grob`: hoisted `cell_strs_by_col` (vectorised
+    formatting), and the existing hoists from D-44 are unchanged.
+  - `.draw_cell_text`: added fast/slow branch on `needed <= col_width_in`,
+    removed the two post-push `convertUnit` calls in the slow path.
+- `R/wrap.R`:
+  - `.tokenize_for_wrap`: track `cur_start`/`cur_end` positions and
+    emit text via `substr()` instead of accumulating + pasting.
+
+**Verification:**
+- Full `devtools::test()` green before and after.
+- `examples/bench_compare.R` reproduces the table above (averaging over
+  multiple runs is needed for `wrap_demos`'s lower-iteration sample).
+
+---
+
+## D-46: Cross-page clip-width cache shared across all pages of one tfl_table
+
+**Decision:** Construct one list of clip-width cache envs (one per column
+in `resolved_cols`) at the top of `tfl_table_to_pagelist()` and pass it
+into every `build_table_grob()` call.  Every page-grob built for the
+same table holds a reference to the same env list, so
+`drawDetails.tfl_table_grob()` can reuse cached measurements across all
+row-page x col-group combinations.
+
+**Context:** D-44 introduced a per-page clip-width cache.  Profiling and
+focused benchmarking showed that for tables spanning many pages, the
+same cell text typically appears on every page (numeric formats,
+visit labels, category codes), so each page was re-measuring the same
+strings.  A single env shared across pages eliminates that
+inter-page duplication while preserving the intra-page hits D-44
+already provided.
+
+**Change:**
+- `R/table_pagelist.R` -- inside `tfl_table_to_pagelist()` (and the
+  table1/flextable equivalents that call `build_table_grob` similarly),
+  `clip_width_caches <- lapply(seq_along(resolved_cols), function(k)
+  new.env(...))` is built once, then threaded to every
+  `build_table_grob()` invocation.
+- `R/table_draw.R` -- `build_table_grob()` gains a `clip_width_caches`
+  argument stored as a grob slot.  `drawDetails.tfl_table_grob()` now
+  prefers `x$clip_width_caches[x$col_group_idx]` over creating fresh
+  per-page envs, falling back to the old behaviour when the slot is
+  absent (e.g. for grobs built outside the normal pipeline).
+
+The cache is keyed by column index into `resolved_cols` (stable across
+col-group splits), not by the local `j` index in `page_cols` (which
+re-numbers per col-group).
+
+**Measured improvement** (medians, 30 iterations; baseline = round-3
+HEAD at `7fcf208` = D-45 shipped):
+
+| Scenario        | Before    | After     | Δ           |
+|-----------------|-----------|-----------|-------------|
+| `iris5p` (150 rows / 5 pages) | 264 ms | 258 ms | ~2% (modest) |
+| `big_df` (500 rows / ~17 pages, 4 cols incl. repeating categoricals) | 1560 ms | 1330 ms | **~15% ↓** |
+
+The gain scales with page count and amount of inter-page duplication.
+Short single-page tables see negligible benefit (the per-page cache
+already caught everything).  Realistic multi-page clinical listings
+hit the design sweet spot.
+
+**Alternatives considered and rejected:**
+- *Cache spanning pagination (scratch device) + drawing (render device)*
+  -- the documented re-measurement in `.draw_cell_text()` exists
+  because font metrics can legitimately differ between the scratch
+  PDF and the render device (e.g. knitr PNG vs PDF for preview mode).
+  A cross-device cache would risk inaccurate placement, which the
+  project owner explicitly excluded.
+- *Consolidated (width, height) cache in pagination* -- would save
+  ~half the gpar-validation overhead in `.measure_max_string_width()`
+  + `.memo_str_height()` by sharing one textGrob's measurements
+  between Pass 1 (widths) and Pass 2 (heights).  Independent of this
+  change; worth pursuing only if benchmarks justify the threading
+  cost.  Deferred.
+- *Package-level env keyed by tbl-object identity* -- zero plumbing,
+  but a global mutable env is harder to reason about (parallelism,
+  leaked entries).  The explicit list-threaded approach is clearer.
+
+**Files touched:**
+- `R/table_pagelist.R` -- one new `clip_width_caches` construction
+  block in `.tfl_table_to_pagelist_default()` and the corresponding
+  pass-through in the `build_table_grob()` call.
+- `R/table_draw.R` -- `build_table_grob()` accepts and stores the
+  cache; `drawDetails.tfl_table_grob()` reads it via
+  `x$clip_width_caches`.
+
+**Verification:**
+- Full `devtools::test()` green.
+- `examples/bench_focused.R` (n=30) reproduces the `iris5p` /
+  `big_df` table above.
+
+---
+
+## D-47: Consolidated width+height text-dimension cache during pagination
+
+**Decision:** Replace the two separate per-call height memos with a single
+`(gp_key, string) -> list(w, h)` cache that lives for one
+`.tfl_table_to_pagelist_default()` call.  Every textGrob built for
+measurement during pagination populates both dimensions; subsequent
+lookups for either dimension reuse the existing entry.
+
+**Context:** Profiling after D-46 showed that pagination still constructed
+two textGrobs for every unique cell string -- one in
+`.measure_max_string_width()` (Pass 1, widths) and one in
+`measure_row_heights_tbl()` (Pass 2, heights).  Each construction re-runs
+grid's `validGP` / `set.gpar` chain, the dominant per-call cost.  Pass 1
+and Pass 2 use the same gpar for matching column categories
+(`data_row`, `group_col`, `header_row`), so the same (gp, string) shows
+up twice in pagination work.
+
+**Change:**
+- `R/table_utils.R` -- new `.measure_text_dims_in(s, gp, gp_key, cache)`
+  helper that builds the textGrob once, reads both dimensions, and
+  caches the pair.  Existing `.measure_max_string_width()` and
+  `.measure_header_row_height()` now accept an optional cache and
+  delegate per-string measurement to the helper when provided.
+- `R/table_rows.R` -- `measure_row_heights_tbl()` accepts a cache and
+  replaces its inline `.memo_str_height()` closure with the same helper.
+- `R/table_columns.R` -- `compute_col_widths()` and `.resolve_natural_widths()`
+  thread a `cache` argument; the natural-width pass builds the
+  appropriate structural `gp_key` for cell-vs-header gpars and passes
+  it down.
+- `R/table_pagelist.R` -- `.tfl_table_to_pagelist_default()` creates one
+  `text_dim_cache <- new.env(hash = TRUE, parent = emptyenv())` at the
+  top and passes it to `compute_col_widths()`,
+  `.measure_header_row_height()`, and `measure_row_heights_tbl()`.
+
+The `gp_key` namespace matches what `measure_row_heights_tbl()` already
+used internally (e.g. `"data_row_lh1.2"`, `"group_col_lh1.2"`,
+`"header_row_lh1.2"`).  All callers that share a category resolve to the
+same key, so width-then-height (Pass 1 -> Pass 2) on the same `(category,
+string)` is a cache hit.
+
+**Why crossing scratch-device boundaries is safe here:** the multiple
+PDF scratch devices opened during pagination
+(`.resolve_natural_widths`, `.run_pagination_iter`) all use identical
+`(pg_width, pg_height)` settings, identical fonts, and identical R
+session state.  PDF device font metrics are deterministic given those
+inputs, so values measured on one scratch device are equal to what
+the next scratch device would produce.  The cache does NOT cross into
+the render-device drawing phase -- that boundary still goes through
+`.draw_cell_text()`'s separate re-measurement (D-44/D-46 territory).
+
+**Measured improvement** (n=30 iterations, baseline = round-3 HEAD at
+`61bd26a` = D-46 shipped; min-of-mins across 3 independent runs to
+suppress system-load noise):
+
+| Scenario | Before (min) | After (min) | Δ          |
+|----------|--------------|-------------|------------|
+| `iris5p` (150 rows / 5 pages) | 329 ms | 255 ms | **~22% ↓** |
+| `big_df` (500 rows / ~17 pages, 4 cols) | 1.45 s | 1.36 s | ~6% ↓ |
+
+`iris5p` is the targeted scenario for pagination-side optimisation:
+relatively small data, many measurement-heavy passes.  `big_df` is
+dominated by drawing rather than measurement (D-44/D-46 territory), so
+the pagination cache helps less in relative terms.
+
+**Alternatives considered and rejected:**
+- *Caching width and height under separate keys* -- requires the cache
+  consumer to ask for the right dimension and stores two entries per
+  unique string.  Consolidated `(w, h)` tuple is one entry, one
+  textGrob construction, and any pass that has either dim gets the
+  other for free.
+- *Hashing the full gpar object as the cache key* -- gpars carry many
+  fields; per-lookup hashing costs more than the structural-key
+  approach the codebase already uses internally.  Callers pre-compute
+  `gp_key` once per gpar.
+- *Threading the cache further down into `.compute_col_min_widths()`
+  and `.height_balance_widths_impl()`* -- those have their own
+  per-call caches keyed differently (per-token, per-(j, width)).
+  Bringing them into the unified cache would require harmonising key
+  schemes and is left for a follow-up if profile data justifies it.
+
+**Files touched:**
+- `R/table_utils.R` -- new `.measure_text_dims_in()` helper; cache-aware
+  `.measure_max_string_width()` and `.measure_header_row_height()`.
+- `R/table_rows.R` -- cache-aware `measure_row_heights_tbl()`.
+- `R/table_columns.R` -- cache-aware `compute_col_widths()` and
+  `.resolve_natural_widths()`.
+- `R/table_pagelist.R` -- cache construction in
+  `.tfl_table_to_pagelist_default()` and threading into the three
+  pagination measurement entry points.
+
+**Verification:**
+- Full `devtools::test()` green.
+- `examples/bench_focused.R` (n=30) reproduces the table above.
diff --git a/examples/bench_focused.R b/examples/bench_focused.R
new file mode 100644
index 0000000..1ab6602
--- /dev/null
+++ b/examples/bench_focused.R
@@ -0,0 +1,41 @@
+# examples/bench_focused.R
+#
+# Focused benchmark on the scenarios most likely to show cross-page cache
+# benefit: tfl_table(iris) (5 pages) and a 500-row synthetic table.
+
+suppressPackageStartupMessages({
+  devtools::load_all(quiet = TRUE)
+})
+
+big_df <- data.frame(
+  subject = sprintf("Subject %03d", 1:500),
+  visit   = rep(c("Baseline", "Week 1", "Week 4", "Week 12"), length.out = 500),
+  result  = round(rnorm(500, 100, 15), 1),
+  flag    = sample(c("Y", "N", "?"), 500, replace = TRUE)
+)
+
+scenarios <- list(
+  iris5p = function() {
+    out <- tempfile(fileext = ".pdf")
+    export_tfl(tfl_table(iris), file = out)
+    unlink(out)
+  },
+  big_df = function() {
+    out <- tempfile(fileext = ".pdf")
+    export_tfl(tfl_table(big_df), file = out)
+    unlink(out)
+  }
+)
+
+for (name in names(scenarios)) {
+  fn <- scenarios[[name]]
+  invisible(fn())  # warmup
+  bm <- bench::mark(fn(), iterations = 30L, check = FALSE,
+                    filter_gc = FALSE, memory = FALSE)
+  cat(sprintf("%-10s  min=%-9s  median=%-9s  iqr=%s  n=%d\n",
+              name,
+              format(bm$min),
+              format(bm$median),
+              format(bm$median - bm$min),
+              bm$n_itr))
+}