From 632c4baba1db394d337dad5eb4f1bdf6db0687ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?= <jan@jakes.pro>
Date: Sat, 6 Jun 2026 12:59:20 +0200
Subject: [PATCH 1/9] Make the lexer benchmark robust and JIT-aware
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

run-lexer-benchmark.php timed a single pass, which is too noisy to compare a
change against. Rework it into a reliable throughput benchmark that the lexer
optimisations in this branch can be measured against:

- Load through src/load.php (parity with run-parser-benchmark.php) so a loaded
  native extension is benchmarked via the same public WP_MySQL_Lexer wrapper.
- Warm up with discarded passes (heating opcache, the tracing JIT, and CPU
  caches), then run N timed passes over the whole corpus.
- Headline the best pass: lexing is deterministic and CPU-bound, so outside
  interference can only slow a pass down, making the fastest pass the most
  reproducible estimate of intrinsic cost and the most stable basis for a
  before/after comparison. Median and best-vs-worst spread are reported too so
  a noisy machine is obvious.
- Detect and report the active config (opcache / tracing JIT) and the
  implementation (php / native-extension), and warn when opcache.jit is set but
  the JIT did not actually activate.
- Add --iterations / --warmup; keep --json (headline kept as "qps").

Add a `bench-lexer` script to the mysql-on-sqlite package's composer.json that
runs the benchmark twice — without and with the tracing JIT — so both
configurations are measured with one `composer run bench-lexer` (JIT is a
start-up setting that cannot be toggled mid-process).
---
 packages/mysql-on-sqlite/composer.json        |   6 +-
 .../tests/tools/run-lexer-benchmark.php       | 154 ++++++++++++++----
 2 files changed, 126 insertions(+), 34 deletions(-)

diff --git a/packages/mysql-on-sqlite/composer.json b/packages/mysql-on-sqlite/composer.json
index 9d2b148fa..c7ef2b417 100644
--- a/packages/mysql-on-sqlite/composer.json
+++ b/packages/mysql-on-sqlite/composer.json
@@ -2,7 +2,11 @@
     "name": "wordpress/mysql-on-sqlite",
     "type": "library",
 	"scripts": {
-		"test": "phpunit"
+		"test": "phpunit",
+		"bench-lexer": [
+			"@php tests/tools/run-lexer-benchmark.php",
+			"@php -d opcache.enable_cli=1 -d opcache.jit_buffer_size=64M -d opcache.jit=tracing tests/tools/run-lexer-benchmark.php"
+		]
 	},
     "require-dev": {
         "phpunit/phpunit": "^8.5"
diff --git a/packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php b/packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php
index 87f1ec798..23ecd1b43 100644
--- a/packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php
+++ b/packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php
@@ -1,73 +1,161 @@
 <?php
 
 /**
- * This script runs the MySQL lexer on queries from the MySQL server suite.
- * It ensures the lexer tokenizes all queries and measures lexing performance.
+ * Benchmark the MySQL lexer over the checked-in MySQL server test corpus and
+ * report its tokenization throughput (queries lexed per second).
+ *
+ * Mirrors run-parser-benchmark.php: it loads through src/load.php, so when the
+ * native wp_mysql_parser extension is loaded the benchmark runs the native
+ * lexer through the same public WP_MySQL_Lexer wrapper that the driver uses.
+ *
+ * JIT / opcache are start-up ini settings, so this script does not toggle them;
+ * it reports the active configuration so every run is self-describing. Run it
+ * twice to compare without and with the tracing JIT (the lexer behaves very
+ * differently under each):
+ *
+ *     php packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php
+ *     php -d opcache.enable_cli=1 -d opcache.jit_buffer_size=64M -d opcache.jit=tracing \
+ *         packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php
+ *
+ * To check a change, run it on the base commit and again on your branch and
+ * compare the "best" numbers (ideally for both JIT configs).
+ *
+ * Methodology: a few warmup passes (discarded — they heat opcache, the tracing
+ * JIT, and the CPU caches) followed by N timed passes over the whole corpus.
+ * The headline is the BEST pass: lexing is deterministic and CPU-bound, so
+ * outside interference can only make a pass slower, never faster, which makes
+ * the fastest pass the most reproducible estimate of the code's intrinsic cost
+ * and the most stable basis for a before/after comparison. The median and the
+ * best-vs-worst spread are reported too, so a noisy machine is obvious.
  *
  * Options:
- *   --json       Print machine-readable benchmark output.
- *   --limit=N    Only benchmark the first N queries.
+ *   --json            Print machine-readable output.
+ *   --limit=N         Only benchmark the first N queries.
+ *   --iterations=N    Number of timed passes (default 10).
+ *   --warmup=N        Number of discarded warmup passes (default 3).
  */
 
-// Throw exception if anything fails.
+// Throw an exception if anything fails.
 set_error_handler(
 	function ( $severity, $message, $file, $line ) {
 		throw new ErrorException( $message, 0, $severity, $file, $line );
 	}
 );
 
-$json  = in_array( '--json', $argv, true );
-$limit = null;
+$json       = in_array( '--json', $argv, true );
+$limit      = null;
+$iterations = 10;
+$warmup     = 3;
 foreach ( $argv as $arg ) {
 	if ( 0 === strpos( $arg, '--limit=' ) ) {
 		$limit = max( 1, (int) substr( $arg, strlen( '--limit=' ) ) );
+	} elseif ( 0 === strpos( $arg, '--iterations=' ) ) {
+		$iterations = max( 1, (int) substr( $arg, strlen( '--iterations=' ) ) );
+	} elseif ( 0 === strpos( $arg, '--warmup=' ) ) {
+		$warmup = max( 0, (int) substr( $arg, strlen( '--warmup=' ) ) );
 	}
 }
 
-require_once __DIR__ . '/../../src/parser/class-wp-parser-token.php';
-require_once __DIR__ . '/../../src/mysql/class-wp-mysql-token.php';
-require_once __DIR__ . '/../../src/mysql/class-wp-mysql-lexer.php';
+// Use the integration loader so an already-loaded native extension selects
+// the same public lexer class that runtime code uses.
+require_once __DIR__ . '/../../src/load.php';
 
 // Load the bounded checked-in corpus before timing so file IO is excluded
 // from the benchmark.
 $handle  = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' );
-$records = array();
+$queries = array();
 while ( ( $record = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) {
-	$records[] = $record;
-	if ( null !== $limit && count( $records ) >= $limit ) {
+	$query = $record[0] ?? null;
+	if ( null === $query || '' === $query ) {
+		continue;
+	}
+	$queries[] = $query;
+	if ( null !== $limit && count( $queries ) >= $limit ) {
 		break;
 	}
 }
+$query_count = count( $queries );
 
-// Run the lexer.
-$processed = 0;
-$start     = microtime( true );
-for ( $i = 0; $i < count( $records ); $i += 1 ) {
-	$query  = $records[ $i ][0];
-	$lexer  = new WP_MySQL_Lexer( $query );
-	$tokens = $lexer->remaining_tokens();
-	if ( count( $tokens ) === 0 ) {
-		throw new Exception( 'Failed to tokenize query: ' . $query );
+// Lex the whole corpus once. Calling native_token_stream() vs remaining_tokens()
+// mirrors how the driver consumes the chosen lexer.
+$native     = class_exists( 'WP_MySQL_Native_Lexer', false );
+$lex_corpus = function () use ( $queries, $native ) {
+	foreach ( $queries as $query ) {
+		$lexer  = new WP_MySQL_Lexer( $query );
+		$tokens = $native && $lexer instanceof WP_MySQL_Native_Lexer
+			? $lexer->native_token_stream()
+			: $lexer->remaining_tokens();
+		$count  = is_array( $tokens ) ? count( $tokens ) : $tokens->count();
+		if ( 0 === $count ) {
+			throw new Exception( 'Failed to tokenize query: ' . $query );
+		}
 	}
-	$processed += 1;
+};
+
+// Warmup passes are discarded.
+for ( $i = 0; $i < $warmup; $i++ ) {
+	$lex_corpus();
 }
-$duration = microtime( true ) - $start;
-$qps      = $processed / $duration;
+
+// Timed passes: one QPS sample per pass.
+$samples = array();
+for ( $i = 0; $i < $iterations; $i++ ) {
+	$start = microtime( true );
+	$lex_corpus();
+	$samples[] = $query_count / ( microtime( true ) - $start );
+}
+sort( $samples );
+
+$best   = $samples[ count( $samples ) - 1 ];
+$worst  = $samples[0];
+$mean   = array_sum( $samples ) / count( $samples );
+$mid    = intdiv( count( $samples ), 2 );
+$median = 0 === count( $samples ) % 2
+	? ( $samples[ $mid - 1 ] + $samples[ $mid ] ) / 2
+	: $samples[ $mid ];
+$spread = $best > 0 ? ( $best - $worst ) / $best : 0.0;
+
+// Detect the active runtime configuration so the run is self-describing.
+// opcache_get_status() returns false (no warning) when opcache is disabled.
+$opcache_status = function_exists( 'opcache_get_status' ) ? opcache_get_status( false ) : false;
+$opcache_on     = is_array( $opcache_status );
+$jit_on         = $opcache_on && ! empty( $opcache_status['jit']['on'] );
+$implementation = ( extension_loaded( 'wp_mysql_parser' ) && $native ) ? 'native-extension' : 'php';
 
 if ( $json ) {
 	echo json_encode(
 		array(
-			'benchmark'      => 'mysql-lexer',
-			'implementation' => 'php',
-			'queries'        => $processed,
-			'duration'       => $duration,
-			'qps'            => $qps,
-			'php_version'    => PHP_VERSION,
+			'benchmark'        => 'mysql-lexer',
+			'implementation'   => $implementation,
+			'extension_loaded' => extension_loaded( 'wp_mysql_parser' ),
+			'opcache'          => $opcache_on,
+			'jit'              => $jit_on,
+			'queries'          => $query_count,
+			'warmup'           => $warmup,
+			'iterations'       => $iterations,
+			'qps'              => $best, // Headline (best pass); kept as "qps" for compatibility.
+			'qps_best'         => $best,
+			'qps_median'       => $median,
+			'qps_mean'         => $mean,
+			'qps_worst'        => $worst,
+			'spread'           => $spread,
+			'php_version'      => PHP_VERSION,
 		),
 		JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES
 	), "\n";
 	exit;
 }
 
-// Print the results.
-printf( "\nTokenized %d queries in %.5fs @ %d QPS.\n", $processed, $duration, $qps );
+$config = $jit_on ? 'opcache + tracing JIT' : ( $opcache_on ? 'opcache, no JIT' : 'no opcache' );
+printf( "MySQL lexer (%s implementation) — %s\n", $implementation, $config );
+$jit_requested = ! in_array( strtolower( (string) ini_get( 'opcache.jit' ) ), array( '', '0', 'off', 'disable' ), true );
+if ( $jit_requested && ! $jit_on ) {
+	printf( "  warning: opcache.jit is set but the JIT is NOT active here — check that opcache is enabled and jit_buffer_size > 0.\n" );
+}
+printf( "%s queries, %d warmup + %d timed passes\n", number_format( $query_count ), $warmup, $iterations );
+printf( "  best:   %s QPS\n", number_format( $best ) );
+printf( "  median: %s QPS\n", number_format( $median ) );
+printf( "  spread: %.1f%% (best vs worst)\n", $spread * 100 );
+if ( $spread > 0.10 ) {
+	printf( "  note: >10%% spread — the machine is noisy; close other apps for a steadier number.\n" );
+}

From bf5467891a45dd1c3bfd2b34598b6db53ea97ea3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?= <jan@jakes.pro>
Date: Sat, 6 Jun 2026 13:14:09 +0200
Subject: [PATCH 2/9] Add a CI job that benchmarks the lexer and comments on
 the PR

On pull requests that touch the lexer (or the benchmark tool), run
run-lexer-benchmark.php for both the base commit and the PR head on the same
runner, without and with the tracing JIT, and post the before/after numbers as
a single comment that updates in place on every push.

The job is informational, not gating: hosted CI runners are too noisy for
absolute-throughput thresholds. Measuring base and head back-to-back on the
same runner cancels the runner's absolute speed, so the same-runner speedup
ratio is the meaningful signal. Only the source tree is swapped to the base
commit; the PR's benchmark tool is reused for both sides so they are timed
identically.
---
 .github/workflows/lexer-benchmark.yml | 116 ++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 .github/workflows/lexer-benchmark.yml

diff --git a/.github/workflows/lexer-benchmark.yml b/.github/workflows/lexer-benchmark.yml
new file mode 100644
index 000000000..9c41005f0
--- /dev/null
+++ b/.github/workflows/lexer-benchmark.yml
@@ -0,0 +1,116 @@
+name: Lexer benchmark
+
+on:
+  pull_request:
+    paths:
+      - 'packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php'
+      - 'packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php'
+      - 'packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php'
+      - 'packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php'
+      - '.github/workflows/lexer-benchmark.yml'
+
+# A new push supersedes the previous run; the result comment is updated in place.
+concurrency:
+  group: lexer-benchmark-${{ github.ref }}
+  cancel-in-progress: true
+
+# Disable permissions for all available scopes by default.
+permissions: {}
+
+jobs:
+  benchmark:
+    name: Lexer throughput (base vs PR)
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    permissions:
+      contents: read # Required to clone the repo.
+      pull-requests: write # Required to post/update the result comment.
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Need the base commit to benchmark the "before" state.
+
+      - name: Set up PHP
+        uses: shivammathur/setup-php@v2
+        with:
+          php-version: '8.4'
+          coverage: none
+
+      - name: Benchmark base vs PR
+        env:
+          BASE_SHA: ${{ github.event.pull_request.base.sha }}
+        run: |
+          BENCH=packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php
+
+          # Best-pass QPS for a given PHP flag set.
+          best() {
+            php -d memory_limit=512M "$@" "$BENCH" --json \
+              | php -r '$j = json_decode( stream_get_contents( STDIN ), true ); echo (int) $j["qps_best"];'
+          }
+          jit_flags="-d opcache.enable_cli=1 -d opcache.jit_buffer_size=64M -d opcache.jit=tracing"
+
+          # PR (head) is the current checkout.
+          head_nojit=$( best )
+          head_jit=$( best $jit_flags )
+
+          # Swap only the source tree to the base commit and re-measure with the
+          # same (PR) benchmark tool, so both sides are timed identically. The
+          # benchmark tool itself (tests/tools/) is left at the PR version.
+          git checkout "$BASE_SHA" -- packages/mysql-on-sqlite/src
+          base_nojit=$( best )
+          base_jit=$( best $jit_flags )
+          git checkout HEAD -- packages/mysql-on-sqlite/src
+
+          fmt() { php -r 'echo number_format( (int) $argv[1] );' "$1"; }
+          ratio() { php -r 'printf( "%.2f", $argv[1] / max( 1, (int) $argv[2] ) );' "$1" "$2"; }
+
+          {
+            echo "<!-- lexer-benchmark -->"
+            echo "### 🤖 Lexer benchmark"
+            echo "Changes to lexer-related files were detected and triggered a benchmark:"
+            echo
+            echo "| Config | Base (QPS) | This PR (QPS) | Speedup |"
+            echo "| --- | ---: | ---: | ---: |"
+            echo "| **no JIT** | $( fmt "$base_nojit" ) | $( fmt "$head_nojit" ) | **$( ratio "$head_nojit" "$base_nojit" )×** |"
+            echo "| **tracing JIT** | $( fmt "$base_jit" ) | $( fmt "$head_jit" ) | **$( ratio "$head_jit" "$base_jit" )×** |"
+            echo
+            echo "**Note:** Hosted runners are noisy, and absolute numbers vary. Treat the results with caution and verify them locally."
+            echo
+            echo "To reproduce locally:"
+            echo '```'
+            echo "cd packages/mysql-on-sqlite && composer run bench-lexer"
+            echo '```'
+          } > "$RUNNER_TEMP/comment.md"
+          echo "COMMENT_FILE=$RUNNER_TEMP/comment.md" >> "$GITHUB_ENV"
+
+      - name: Post or update the PR comment
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require( 'fs' );
+            const body = fs.readFileSync( process.env.COMMENT_FILE, 'utf8' );
+            const marker = '<!-- lexer-benchmark -->';
+            const { data: comments } = await github.rest.issues.listComments( {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              per_page: 100,
+            } );
+            const existing = comments.find( ( c ) => c.body && c.body.includes( marker ) );
+            if ( existing ) {
+              await github.rest.issues.updateComment( {
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              } );
+            } else {
+              await github.rest.issues.createComment( {
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body,
+              } );
+            }

From 5c38b20822e258425f9c6bae73ccdc79397f2bb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?= <jan@jakes.pro>
Date: Tue, 28 Apr 2026 09:36:59 +0200
Subject: [PATCH 3/9] Speed up the lexer with a cached length and inlined token
 loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apply the structural lexer optimisations from PR #375:

- Cache strlen($sql) once in $sql_length instead of recomputing it on each
  EOF/bounds check.
- Use strpos($sql, '*/', $pos) instead of a manual scan loop in
  read_comment_content().
- In read_quoted_text(), use strpos() to find the next quote, dropping the
  separate end-of-input check that followed the strcspn() scan.
- Inline next_token() + get_token() in remaining_tokens() so the hot loop
  builds tokens directly.

The #375 strspn()->byte-comparison swaps are intentionally not included: once
the dispatch chain is reordered by later commits those checks are off the hot
path and strspn() is marginally faster than the inline comparisons, so the
swaps were net-neutral-to-negative while adding code.

Co-authored-by: Adam Zieliński <adam@adamziel.com>
Adapted from https://github.com/WordPress/sqlite-database-integration/pull/375
---
 .../src/mysql/class-wp-mysql-lexer.php        | 93 ++++++++++++++-----
 1 file changed, 69 insertions(+), 24 deletions(-)

diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
index 10ecd90ad..4a5859f1e 100644
--- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
+++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
@@ -2111,6 +2111,13 @@ class WP_MySQL_Lexer {
 	 */
 	private $sql;
 
+	/**
+	 * Byte length of the SQL payload.
+	 *
+	 * @var int
+	 */
+	private $sql_length;
+
 	/**
 	 * The version of the MySQL server that the SQL payload is intended for.
 	 *
@@ -2189,6 +2196,7 @@ public function __construct(
 		array $sql_modes = array()
 	) {
 		$this->sql           = $sql;
+		$this->sql_length    = strlen( $sql );
 		$this->mysql_version = $mysql_version;
 
 		foreach ( $sql_modes as $sql_mode ) {
@@ -2284,10 +2292,46 @@ public function get_token(): ?WP_MySQL_Token {
 	 * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens.
 	 */
 	public function remaining_tokens(): array {
-		$tokens = array();
-		while ( true === $this->next_token() ) {
-			$token    = $this->get_token();
-			$tokens[] = $token;
+		$tokens                            = array();
+		$no_backslash_escapes_sql_mode_set = $this->is_sql_mode_active(
+			self::SQL_MODE_NO_BACKSLASH_ESCAPES
+		);
+
+		while ( true ) {
+			if (
+				self::EOF === $this->token_type
+				|| ( null === $this->token_type && $this->bytes_already_read > 0 )
+			) {
+				$this->token_type = null;
+				break;
+			}
+
+			do {
+				$this->token_starts_at = $this->bytes_already_read;
+				$this->token_type      = $this->read_next_token();
+			} while (
+				self::WHITESPACE === $this->token_type
+				|| self::COMMENT === $this->token_type
+				|| self::MYSQL_COMMENT_START === $this->token_type
+				|| self::MYSQL_COMMENT_END === $this->token_type
+			);
+
+			if ( null === $this->token_type ) {
+				break;
+			}
+
+			$tokens[] = new WP_MySQL_Token(
+				$this->token_type,
+				$this->token_starts_at,
+				$this->bytes_already_read - $this->token_starts_at,
+				$this->sql,
+				$no_backslash_escapes_sql_mode_set
+			);
+
+			if ( self::EOF === $this->token_type ) {
+				$this->token_type = null;
+				break;
+			}
 		}
 		return $tokens;
 	}
@@ -2420,7 +2464,7 @@ private function read_next_token(): ?int {
 		} elseif ( '-' === $byte ) {
 			if (
 				'-' === $next_byte
-				&& $this->bytes_already_read + 2 < strlen( $this->sql )
+				&& $this->bytes_already_read + 2 < $this->sql_length
 				&& strspn( $this->sql[ $this->bytes_already_read + 2 ], self::WHITESPACE_MASK ) > 0
 			) {
 				$type = $this->read_line_comment();
@@ -2685,7 +2729,7 @@ private function read_number(): ?int {
 			$this->bytes_already_read += strspn( $this->sql, self::HEX_DIGIT_MASK, $this->bytes_already_read );
 			if ( $is_quoted ) {
 				if (
-					$this->bytes_already_read >= strlen( $this->sql )
+					$this->bytes_already_read >= $this->sql_length
 					|| "'" !== $this->sql[ $this->bytes_already_read ]
 				) {
 					return null; // Invalid input.
@@ -2708,7 +2752,7 @@ private function read_number(): ?int {
 			$this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read );
 			if ( $is_quoted ) {
 				if (
-					$this->bytes_already_read >= strlen( $this->sql )
+					$this->bytes_already_read >= $this->sql_length
 					|| "'" !== $this->sql[ $this->bytes_already_read ]
 				) {
 					return null; // Invalid input.
@@ -2740,7 +2784,7 @@ private function read_number(): ?int {
 					strspn( $next_byte, self::DIGIT_MASK ) > 0
 					|| (
 						( '+' === $next_byte || '-' === $next_byte )
-						&& $this->bytes_already_read + 2 < strlen( $this->sql )
+						&& $this->bytes_already_read + 2 < $this->sql_length
 						&& strspn( $this->sql[ $this->bytes_already_read + 2 ], self::DIGIT_MASK ) > 0
 					)
 				);
@@ -2838,12 +2882,11 @@ private function read_quoted_text(): ?int {
 		// in which case the escape sequence is consumed and the loop continues.
 		$at = $this->bytes_already_read;
 		while ( true ) {
-			$at += strcspn( $this->sql, $quote, $at );
-
-			// Unclosed string - unexpected EOF.
-			if ( ( $this->sql[ $at ] ?? null ) !== $quote ) {
+			$quote_at = strpos( $this->sql, $quote, $at );
+			if ( false === $quote_at ) {
 				return null; // Invalid input.
 			}
+			$at = $quote_at;
 
 			/*
 			 * By default, quotes can be escaped with a "\".
@@ -2853,9 +2896,17 @@ private function read_quoted_text(): ?int {
 			 * The quote is escaped only when the number of preceding backslashes
 			 * is odd - "\" is an escape sequence, "\\" is an escaped backslash,
 			 * "\\\" is an escaped backslash and an escape sequence, and so on.
+			 *
+			 * The `($at - $i - 1) >= 0` guard prevents PHP's negative-string-
+			 * offset wraparound (PHP 7.1+) when the closing-quote candidate
+			 * sits at the very start of the input. The `?? null` covers
+			 * positive out-of-range indexes belt-and-suspenders.
 			 */
 			if ( ! $no_backslash_escapes ) {
-				for ( $i = 0; ( $at - $i - 1 ) >= 0 && '\\' === $this->sql[ $at - $i - 1 ]; $i += 1 );
+				$i = 0;
+				while ( ( $at - $i - 1 ) >= 0 && '\\' === ( $this->sql[ $at - $i - 1 ] ?? null ) ) {
+					$i += 1;
+				}
 				if ( 1 === $i % 2 ) {
 					$at += 1;
 					continue;
@@ -2920,17 +2971,11 @@ private function read_mysql_comment(): int {
 	}
 
 	private function read_comment_content(): void {
-		while ( true ) {
-			$this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read );
-			$this->bytes_already_read += 1; // Consume the '*'.
-			$byte                      = $this->sql[ $this->bytes_already_read ] ?? null;
-			if ( null === $byte ) {
-				break;
-			}
-			if ( '/' === $byte ) {
-				$this->bytes_already_read += 1; // Consume the '/'.
-				break;
-			}
+		$comment_end = strpos( $this->sql, '*/', $this->bytes_already_read );
+		if ( false === $comment_end ) {
+			$this->bytes_already_read = $this->sql_length;
+		} else {
+			$this->bytes_already_read = $comment_end + 2;
 		}
 	}
 

From 2d5d198ef524600353a2986a77f7927baf4d992c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?= <jan@jakes.pro>
Date: Wed, 29 Apr 2026 10:53:29 +0200
Subject: [PATCH 4/9] Inline leading-whitespace skip in lexer's token loops

Both next_token() and remaining_tokens() previously paid a
read_next_token() function call per whitespace run only to
recognise and skip the resulting WHITESPACE token. A single
unguarded strspn() placed just before each token-reading do/while
loop absorbs the leading run inline, saving the call overhead for
~one whitespace run per real token across millions of tokens.

The strspn() call is unguarded because an unconditional strspn()
(which returns 0 in a single C-side call when nothing matches)
is faster than gating it on a five-arm '$byte === ...' precheck.
---
 packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
index 4a5859f1e..f80d7d420 100644
--- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
+++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
@@ -2235,6 +2235,9 @@ public function next_token(): bool {
 			return false;
 		}
 
+		// Skip leading whitespace inline for optimal performance.
+		$this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read );
+
 		do {
 			$this->token_starts_at = $this->bytes_already_read;
 			$this->token_type      = $this->read_next_token();
@@ -2306,6 +2309,9 @@ public function remaining_tokens(): array {
 				break;
 			}
 
+			// Skip leading whitespace inline for optimal performance.
+			$this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read );
+
 			do {
 				$this->token_starts_at = $this->bytes_already_read;
 				$this->token_type      = $this->read_next_token();

From a1e267f3200cde7d19e8581eeeb991db8a33ff8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?= <jan@jakes.pro>
Date: Wed, 29 Apr 2026 10:54:33 +0200
Subject: [PATCH 5/9] Catch identifier and keyword tokens at the top of the
 chain

ASCII letters and UTF-8 multibyte start bytes account for most
token-start bytes on the MySQL corpus. They previously fell into
the catch-all `else` at the bottom of read_next_token() after
walking every operator arm in between. The new branch sits at
the top of the elseif chain and dispatches them directly.

The `next_byte !== "'"` guard keeps the x'..', n'..' and similar
specials on their dedicated branches. `_` and `$` starters stay
on the catch-all so the UNDERSCORE_CHARSET lookup still fires.
---
 .../src/mysql/class-wp-mysql-lexer.php        | 22 ++++++++++++-
 .../tests/mysql/WP_MySQL_Lexer_Tests.php      | 31 +++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
index f80d7d420..0b6c6b2a6 100644
--- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
+++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
@@ -2404,7 +2404,27 @@ private function read_next_token(): ?int {
 		$byte      = $this->sql[ $this->bytes_already_read ] ?? null;
 		$next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null;
 
-		if ( "'" === $byte || '"' === $byte || '`' === $byte ) {
+		// Fast path for keywords and identifiers.
+		// These are the most common token types in MySQL payloads.
+		if (
+			(
+				( $byte >= 'a' && $byte <= 'z' )
+				|| ( $byte >= 'A' && $byte <= 'Z' )
+				|| $byte > "\x7F"
+			)
+			&& "'" !== $next_byte
+		) {
+			$started_at = $this->bytes_already_read;
+			$type       = $this->read_identifier();
+			if ( self::IDENTIFIER === $type ) {
+				// When preceded by a dot, it is always an identifier.
+				if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) {
+					$type = self::IDENTIFIER;
+				} else {
+					$type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() );
+				}
+			}
+		} elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) {
 			$type = $this->read_quoted_text();
 		} elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) {
 			$type = $this->read_number();
diff --git a/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php b/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php
index 8f18cf170..383b03f57 100644
--- a/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php
+++ b/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php
@@ -367,6 +367,37 @@ function ( $severity, $message, $file, $line ) {
 		$this->assertNull( $lexer->get_token() );
 	}
 
+	/**
+	 * A charset-introducer-like name used as a qualified member (after a dot)
+	 * must lex as an identifier. A real charset introducer only appears before
+	 * a string literal, never as the member of a qualified reference.
+	 *
+	 * @dataProvider data_underscore_charset_after_dot
+	 */
+	public function test_underscore_charset_name_after_dot_is_identifier( string $sql, int $token_index, int $expected_id ): void {
+		$tokens = ( new WP_MySQL_Lexer( $sql ) )->remaining_tokens();
+		$this->assertSame(
+			WP_MySQL_Lexer::get_token_name( $expected_id ),
+			$tokens[ $token_index ]->get_name(),
+			$sql
+		);
+	}
+
+	/**
+	 * @return array<string,array{0:string,1:int,2:int}>
+	 */
+	public function data_underscore_charset_after_dot(): array {
+		return array(
+			// `t . _utf8` - the member name must be an identifier, not a charset.
+			'charset name after dot is identifier'  => array( 't._utf8', 2, WP_MySQL_Lexer::IDENTIFIER ),
+			'other charset name after dot'          => array( 'a._binary', 2, WP_MySQL_Lexer::IDENTIFIER ),
+			// A genuine charset introducer (before a string) stays a charset.
+			'charset introducer before string'      => array( "_utf8'x'", 0, WP_MySQL_Lexer::UNDERSCORE_CHARSET ),
+			// A non-charset underscore name after a dot stays an identifier.
+			'non-charset underscore name after dot' => array( 't._foo', 2, WP_MySQL_Lexer::IDENTIFIER ),
+		);
+	}
+
 	private function get_token_names( array $token_types ): array {
 		return array_map(
 			function ( $token_type ) {

From f3f11eb4b7f09d27afe3b6cb3b4f0bc52344d48c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?= <jan@jakes.pro>
Date: Wed, 29 Apr 2026 10:55:33 +0200
Subject: [PATCH 6/9] Add a single-byte operator dispatch table

The ASCII bytes (, ), ',', ;, +, ~, %, ^, ?, {, }, and = each map to a
unique single-byte token type with no lookahead. A static array + isset()
arm dispatches them in one lookup, ahead of the per-byte elseif chain, and
the now-shadowed individual arms further down the chain are removed so the
table is the single source of truth for these tokens.

'*' and '|' are deliberately excluded because their token type depends on
context (in_mysql_comment for '*/', SQL_MODE_PIPES_AS_CONCAT for '||').
---
 .../src/mysql/class-wp-mysql-lexer.php        | 56 +++++++------------
 1 file changed, 20 insertions(+), 36 deletions(-)

diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
index 0b6c6b2a6..30ceb3d37 100644
--- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
+++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
@@ -2404,6 +2404,22 @@ private function read_next_token(): ?int {
 		$byte      = $this->sql[ $this->bytes_already_read ] ?? null;
 		$next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null;
 
+		// A map for a single-byte symbol fast path.
+		static $single_byte_ops = array(
+			'(' => self::OPEN_PAR_SYMBOL,
+			')' => self::CLOSE_PAR_SYMBOL,
+			',' => self::COMMA_SYMBOL,
+			';' => self::SEMICOLON_SYMBOL,
+			'+' => self::PLUS_OPERATOR,
+			'~' => self::BITWISE_NOT_OPERATOR,
+			'%' => self::MOD_OPERATOR,
+			'^' => self::BITWISE_XOR_OPERATOR,
+			'?' => self::PARAM_MARKER,
+			'{' => self::OPEN_CURLY_SYMBOL,
+			'}' => self::CLOSE_CURLY_SYMBOL,
+			'=' => self::EQUAL_OPERATOR,
+		);
+
 		// Fast path for keywords and identifiers.
 		// These are the most common token types in MySQL payloads.
 		if (
@@ -2424,6 +2440,10 @@ private function read_next_token(): ?int {
 					$type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() );
 				}
 			}
+		} elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) {
+			// Fast path for single-byte symbols.
+			$this->bytes_already_read += 1;
+			$type                      = $single_byte_ops[ $byte ];
 		} elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) {
 			$type = $this->read_quoted_text();
 		} elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) {
@@ -2435,9 +2455,6 @@ private function read_next_token(): ?int {
 				$this->bytes_already_read += 1;
 				$type                      = self::DOT_SYMBOL;
 			}
-		} elseif ( '=' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::EQUAL_OPERATOR;
 		} elseif ( ':' === $byte ) {
 			$this->bytes_already_read += 1; // Consume the ':'.
 			if ( '=' === $next_byte ) {
@@ -2484,9 +2501,6 @@ private function read_next_token(): ?int {
 			} else {
 				$type = self::LOGICAL_NOT_OPERATOR;
 			}
-		} elseif ( '+' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::PLUS_OPERATOR;
 		} elseif ( '-' === $byte ) {
 			if (
 				'-' === $next_byte
@@ -2536,9 +2550,6 @@ private function read_next_token(): ?int {
 				$this->bytes_already_read += 1;
 				$type                      = self::DIV_OPERATOR;
 			}
-		} elseif ( '%' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::MOD_OPERATOR;
 		} elseif ( '&' === $byte ) {
 			$this->bytes_already_read += 1; // Consume the '&'.
 			if ( '&' === $next_byte ) {
@@ -2547,9 +2558,6 @@ private function read_next_token(): ?int {
 			} else {
 				$type = self::BITWISE_AND_OPERATOR;
 			}
-		} elseif ( '^' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::BITWISE_XOR_OPERATOR;
 		} elseif ( '|' === $byte ) {
 			$this->bytes_already_read += 1; // Consume the '|'.
 			if ( '|' === $next_byte ) {
@@ -2560,27 +2568,6 @@ private function read_next_token(): ?int {
 			} else {
 				$type = self::BITWISE_OR_OPERATOR;
 			}
-		} elseif ( '~' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::BITWISE_NOT_OPERATOR;
-		} elseif ( ',' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::COMMA_SYMBOL;
-		} elseif ( ';' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::SEMICOLON_SYMBOL;
-		} elseif ( '(' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::OPEN_PAR_SYMBOL;
-		} elseif ( ')' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::CLOSE_PAR_SYMBOL;
-		} elseif ( '{' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::OPEN_CURLY_SYMBOL;
-		} elseif ( '}' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::CLOSE_CURLY_SYMBOL;
 		} elseif ( '@' === $byte ) {
 			$this->bytes_already_read += 1; // Consume the '@'.
 
@@ -2604,9 +2591,6 @@ private function read_next_token(): ?int {
 					$type = self::AT_SIGN_SYMBOL;
 				}
 			}
-		} elseif ( '?' === $byte ) {
-			$this->bytes_already_read += 1;
-			$type                      = self::PARAM_MARKER;
 		} elseif ( '\\' === $byte ) {
 			$this->bytes_already_read += 1; // Consume the '\'.
 			if ( 'N' === $next_byte ) {

From 528557995d4ecdfdc2704431e21a934a4535a6a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?= <jan@jakes.pro>
Date: Mon, 4 May 2026 15:40:23 +0200
Subject: [PATCH 7/9] Document non-obvious lexer dispatch conditions

Three review-noted spots that were terse in the code:

- The remaining_tokens() loop guard now spells out why both EOF
  and `null === token_type && bytes_already_read > 0` are needed
  (EOF on clean end-of-input vs invalid byte mid-stream, with
  the `> 0` guard letting the very first iteration through).

- The identifier/keyword fast path now explains `$byte > "\x7F"`
  (UTF-8 multi-byte starter; MySQL identifiers allow U+0080-U+FFFF)
  and `next_byte !== "'"` (only single quotes form the special
  hex/bin/n-char literal starters; `"` never does, regardless of
  SQL mode).

No behavior change.
---
 .../mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php    | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
index 30ceb3d37..721af9223 100644
--- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
+++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
@@ -2301,6 +2301,8 @@ public function remaining_tokens(): array {
 		);
 
 		while ( true ) {
+			// Bail on EOF, or on a null token type once at least one byte has
+			// been consumed (read_next_token() hit invalid input mid-stream).
 			if (
 				self::EOF === $this->token_type
 				|| ( null === $this->token_type && $this->bytes_already_read > 0 )
@@ -2421,7 +2423,11 @@ private function read_next_token(): ?int {
 		);
 
 		// Fast path for keywords and identifiers.
-		// These are the most common token types in MySQL payloads.
+		// `$byte > "\x7F"` catches any non-ASCII byte (0x80-0xFF); read_identifier()
+		// restricts the accepted identifier codepoints to U+0080-U+FFFF.
+		// `"'" !== $next_byte` defers x'..', n'..' and similar special
+		// literals to their dedicated branches below; only single quotes
+		// form those, regardless of SQL mode.
 		if (
 			(
 				( $byte >= 'a' && $byte <= 'z' )

From ee8664395f762ba0fcd458eb432adef3a468f6b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?= <jan@jakes.pro>
Date: Tue, 28 Apr 2026 09:37:05 +0200
Subject: [PATCH 8/9] Skip parent constructor in WP_MySQL_Token
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Token construction is on the lexer hot path; bypassing the
`WP_Parser_Token::__construct()` indirection and assigning the four
properties directly removes one method call per token.

Requires `$input` on `WP_Parser_Token` to be `protected` instead of
`private` so the subclass can write to it.

Co-authored-by: Adam Zieliński <adam@adamziel.com>

Adapted from https://github.com/WordPress/sqlite-database-integration/pull/375
---
 packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php | 6 +++++-
 .../mysql-on-sqlite/src/parser/class-wp-parser-token.php    | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php
index 1fb25ab42..0840bc2f2 100644
--- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php
+++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php
@@ -30,7 +30,11 @@ public function __construct(
 		string $input,
 		bool $sql_mode_no_backslash_escapes_enabled
 	) {
-		parent::__construct( $id, $start, $length, $input );
+		$this->id     = $id;
+		$this->start  = $start;
+		$this->length = $length;
+		$this->input  = $input;
+
 		$this->sql_mode_no_backslash_escapes_enabled = $sql_mode_no_backslash_escapes_enabled;
 	}
 
diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php
index b77261896..4132ba382 100644
--- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php
+++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php
@@ -35,7 +35,7 @@ class WP_Parser_Token {
 	 *
 	 * @var string
 	 */
-	private $input;
+	protected $input;
 
 	/**
 	 * Constructor.

From 12a78da2068d59e6a1269cc14f12f9dfc8b5a855 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?= <jan@jakes.pro>
Date: Fri, 5 Jun 2026 21:05:20 +0200
Subject: [PATCH 9/9] Inline the keyword lookup on the hot identifier path

The identifier/keyword branch handles the single largest share of tokens
(~17% identifiers plus all keywords, ~35-45% of the corpus). It called two
methods per token: get_current_token_bytes() to extract the token string and
determine_identifier_or_keyword_type() to classify it.

Inline that fast path into read_next_token(): extract the bytes and do the
strtoupper + TOKENS lookup directly, returning the identifier without any call
when it is not a keyword (the common case). The post-lookup keyword logic
(version gating, function-call lookahead, high-NOT precedence, synonyms) moves
to a new resolve_keyword_type() that is reached only for actual keywords;
determine_identifier_or_keyword_type() now delegates to it for its other caller.

Lex-only throughput on the MySQL server corpus: +5-6% under tracing JIT,
+3.4% without (best-of-seven, ABAB-confirmed). The keyword path's measured
ceiling was ~10% (JIT), most of which is the irreducible substr/strtoupper/
hash-lookup work that remains.
---
 .../src/mysql/class-wp-mysql-lexer.php        | 32 +++++++++++++------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
index 721af9223..d6ee9970e 100644
--- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
+++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
@@ -2438,12 +2438,19 @@ private function read_next_token(): ?int {
 		) {
 			$started_at = $this->bytes_already_read;
 			$type       = $this->read_identifier();
-			if ( self::IDENTIFIER === $type ) {
+			if (
+				self::IDENTIFIER === $type
 				// When preceded by a dot, it is always an identifier.
-				if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) {
-					$type = self::IDENTIFIER;
-				} else {
-					$type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() );
+				&& ! ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] )
+			) {
+				// Inline the keyword lookup on the hot identifier path: most
+				// identifiers are not keywords, so this avoids two method calls
+				// (token-bytes extraction + keyword determination) per token.
+				$keyword = self::TOKENS[ strtoupper(
+					substr( $this->sql, $started_at, $this->bytes_already_read - $started_at )
+				) ] ?? self::IDENTIFIER;
+				if ( self::IDENTIFIER !== $keyword ) {
+					$type = $this->resolve_keyword_type( $keyword );
 				}
 			}
 		} elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) {
@@ -2996,13 +3003,20 @@ private function read_comment_content(): void {
 	}
 
 	private function determine_identifier_or_keyword_type( string $value ): int {
-		$value = strtoupper( $value );
-
-		// Lookup the string in the token table.
-		$type = self::TOKENS[ $value ] ?? self::IDENTIFIER;
+		$type = self::TOKENS[ strtoupper( $value ) ] ?? self::IDENTIFIER;
 		if ( self::IDENTIFIER === $type ) {
 			return self::IDENTIFIER;
 		}
+		return $this->resolve_keyword_type( $type );
+	}
+
+	/**
+	 * Resolve a keyword token id matched in self::TOKENS, applying version gating,
+	 * function-call lookahead, the SQL_MODE_HIGH_NOT_PRECEDENCE rule, and synonyms.
+	 *
+	 * @param int $type A token id already matched in self::TOKENS (never IDENTIFIER).
+	 */
+	private function resolve_keyword_type( int $type ): int {
 
 		// Apply MySQL version specifics (positive number: >= <version>, negative number: < <version>).
 		if ( isset( self::VERSIONS[ $type ] ) ) {