From 632c4baba1db394d337dad5eb4f1bdf6db0687ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 12:59:20 +0200 Subject: [PATCH 1/9] Make the lexer benchmark robust and JIT-aware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run-lexer-benchmark.php timed a single pass, which is too noisy to compare a change against. Rework it into a reliable throughput benchmark that the lexer optimisations in this branch can be measured against: - Load through src/load.php (parity with run-parser-benchmark.php) so a loaded native extension is benchmarked via the same public WP_MySQL_Lexer wrapper. - Warm up with discarded passes (heating opcache, the tracing JIT, and CPU caches), then run N timed passes over the whole corpus. - Headline the best pass: lexing is deterministic and CPU-bound, so outside interference can only slow a pass down, making the fastest pass the most reproducible estimate of intrinsic cost and the most stable basis for a before/after comparison. Median and best-vs-worst spread are reported too so a noisy machine is obvious. - Detect and report the active config (opcache / tracing JIT) and the implementation (php / native-extension), and warn when opcache.jit is set but the JIT did not actually activate. - Add --iterations / --warmup; keep --json (headline kept as "qps"). Add a `bench-lexer` script to the mysql-on-sqlite package's composer.json that runs the benchmark twice — without and with the tracing JIT — so both configurations are measured with one `composer run bench-lexer` (JIT is a start-up setting that cannot be toggled mid-process). 
--- packages/mysql-on-sqlite/composer.json | 6 +- .../tests/tools/run-lexer-benchmark.php | 154 ++++++++++++++---- 2 files changed, 126 insertions(+), 34 deletions(-) diff --git a/packages/mysql-on-sqlite/composer.json b/packages/mysql-on-sqlite/composer.json index 9d2b148fa..c7ef2b417 100644 --- a/packages/mysql-on-sqlite/composer.json +++ b/packages/mysql-on-sqlite/composer.json @@ -2,7 +2,11 @@ "name": "wordpress/mysql-on-sqlite", "type": "library", "scripts": { - "test": "phpunit" + "test": "phpunit", + "bench-lexer": [ + "@php tests/tools/run-lexer-benchmark.php", + "@php -d opcache.enable_cli=1 -d opcache.jit_buffer_size=64M -d opcache.jit=tracing tests/tools/run-lexer-benchmark.php" + ] }, "require-dev": { "phpunit/phpunit": "^8.5" diff --git a/packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php b/packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php index 87f1ec798..23ecd1b43 100644 --- a/packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php +++ b/packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php @@ -1,73 +1,161 @@ = $limit ) { + $query = $record[0] ?? null; + if ( null === $query || '' === $query ) { + continue; + } + $queries[] = $query; + if ( null !== $limit && count( $queries ) >= $limit ) { break; } } +$query_count = count( $queries ); -// Run the lexer. -$processed = 0; -$start = microtime( true ); -for ( $i = 0; $i < count( $records ); $i += 1 ) { - $query = $records[ $i ][0]; - $lexer = new WP_MySQL_Lexer( $query ); - $tokens = $lexer->remaining_tokens(); - if ( count( $tokens ) === 0 ) { - throw new Exception( 'Failed to tokenize query: ' . $query ); +// Lex the whole corpus once. Calling native_token_stream() vs remaining_tokens() +// mirrors how the driver consumes the chosen lexer. 
+$native = class_exists( 'WP_MySQL_Native_Lexer', false ); +$lex_corpus = function () use ( $queries, $native ) { + foreach ( $queries as $query ) { + $lexer = new WP_MySQL_Lexer( $query ); + $tokens = $native && $lexer instanceof WP_MySQL_Native_Lexer + ? $lexer->native_token_stream() + : $lexer->remaining_tokens(); + $count = is_array( $tokens ) ? count( $tokens ) : $tokens->count(); + if ( 0 === $count ) { + throw new Exception( 'Failed to tokenize query: ' . $query ); + } } - $processed += 1; +}; + +// Warmup passes are discarded. +for ( $i = 0; $i < $warmup; $i++ ) { + $lex_corpus(); } -$duration = microtime( true ) - $start; -$qps = $processed / $duration; + +// Timed passes: one QPS sample per pass. +$samples = array(); +for ( $i = 0; $i < $iterations; $i++ ) { + $start = microtime( true ); + $lex_corpus(); + $samples[] = $query_count / ( microtime( true ) - $start ); +} +sort( $samples ); + +$best = $samples[ count( $samples ) - 1 ]; +$worst = $samples[0]; +$mean = array_sum( $samples ) / count( $samples ); +$mid = intdiv( count( $samples ), 2 ); +$median = 0 === count( $samples ) % 2 + ? ( $samples[ $mid - 1 ] + $samples[ $mid ] ) / 2 + : $samples[ $mid ]; +$spread = $best > 0 ? ( $best - $worst ) / $best : 0.0; + +// Detect the active runtime configuration so the run is self-describing. +// opcache_get_status() returns false (no warning) when opcache is disabled. +$opcache_status = function_exists( 'opcache_get_status' ) ? opcache_get_status( false ) : false; +$opcache_on = is_array( $opcache_status ); +$jit_on = $opcache_on && ! empty( $opcache_status['jit']['on'] ); +$implementation = ( extension_loaded( 'wp_mysql_parser' ) && $native ) ? 
'native-extension' : 'php'; if ( $json ) { echo json_encode( array( - 'benchmark' => 'mysql-lexer', - 'implementation' => 'php', - 'queries' => $processed, - 'duration' => $duration, - 'qps' => $qps, - 'php_version' => PHP_VERSION, + 'benchmark' => 'mysql-lexer', + 'implementation' => $implementation, + 'extension_loaded' => extension_loaded( 'wp_mysql_parser' ), + 'opcache' => $opcache_on, + 'jit' => $jit_on, + 'queries' => $query_count, + 'warmup' => $warmup, + 'iterations' => $iterations, + 'qps' => $best, // Headline (best pass); kept as "qps" for compatibility. + 'qps_best' => $best, + 'qps_median' => $median, + 'qps_mean' => $mean, + 'qps_worst' => $worst, + 'spread' => $spread, + 'php_version' => PHP_VERSION, ), JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES ), "\n"; exit; } -// Print the results. -printf( "\nTokenized %d queries in %.5fs @ %d QPS.\n", $processed, $duration, $qps ); +$config = $jit_on ? 'opcache + tracing JIT' : ( $opcache_on ? 'opcache, no JIT' : 'no opcache' ); +printf( "MySQL lexer (%s implementation) — %s\n", $implementation, $config ); +$jit_requested = ! in_array( strtolower( (string) ini_get( 'opcache.jit' ) ), array( '', '0', 'off', 'disable' ), true ); +if ( $jit_requested && ! 
$jit_on ) { + printf( " warning: opcache.jit is set but the JIT is NOT active here — check that opcache is enabled and jit_buffer_size > 0.\n" ); +} +printf( "%s queries, %d warmup + %d timed passes\n", number_format( $query_count ), $warmup, $iterations ); +printf( " best: %s QPS\n", number_format( $best ) ); +printf( " median: %s QPS\n", number_format( $median ) ); +printf( " spread: %.1f%% (best vs worst)\n", $spread * 100 ); +if ( $spread > 0.10 ) { + printf( " note: >10%% spread — the machine is noisy; close other apps for a steadier number.\n" ); +} From bf5467891a45dd1c3bfd2b34598b6db53ea97ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 13:14:09 +0200 Subject: [PATCH 2/9] Add a CI job that benchmarks the lexer and comments on the PR On pull requests that touch the lexer (or the benchmark tool), run run-lexer-benchmark.php for both the base commit and the PR head on the same runner, without and with the tracing JIT, and post the before/after numbers as a single comment that updates in place on every push. The job is informational, not gating: hosted CI runners are too noisy for absolute-throughput thresholds. Measuring base and head back-to-back on the same runner cancels the runner's absolute speed, so the same-runner speedup ratio is the meaningful signal. Only the source tree is swapped to the base commit; the PR's benchmark tool is reused for both sides so they are timed identically. 
--- .github/workflows/lexer-benchmark.yml | 116 ++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 .github/workflows/lexer-benchmark.yml diff --git a/.github/workflows/lexer-benchmark.yml b/.github/workflows/lexer-benchmark.yml new file mode 100644 index 000000000..9c41005f0 --- /dev/null +++ b/.github/workflows/lexer-benchmark.yml @@ -0,0 +1,116 @@ +name: Lexer benchmark + +on: + pull_request: + paths: + - 'packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php' + - 'packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php' + - 'packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php' + - 'packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php' + - '.github/workflows/lexer-benchmark.yml' + +# A new push supersedes the previous run; the result comment is updated in place. +concurrency: + group: lexer-benchmark-${{ github.ref }} + cancel-in-progress: true + +# Disable permissions for all available scopes by default. +permissions: {} + +jobs: + benchmark: + name: Lexer throughput (base vs PR) + runs-on: ubuntu-latest + timeout-minutes: 15 + permissions: + contents: read # Required to clone the repo. + pull-requests: write # Required to post/update the result comment. + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Need the base commit to benchmark the "before" state. + + - name: Set up PHP + uses: shivammathur/setup-php@v2 + with: + php-version: '8.4' + coverage: none + + - name: Benchmark base vs PR + env: + BASE_SHA: ${{ github.event.pull_request.base.sha }} + run: | + BENCH=packages/mysql-on-sqlite/tests/tools/run-lexer-benchmark.php + + # Best-pass QPS for a given PHP flag set. + best() { + php -d memory_limit=512M "$@" "$BENCH" --json \ + | php -r '$j = json_decode( stream_get_contents( STDIN ), true ); echo (int) $j["qps_best"];' + } + jit_flags="-d opcache.enable_cli=1 -d opcache.jit_buffer_size=64M -d opcache.jit=tracing" + + # PR (head) is the current checkout. 
+ head_nojit=$( best ) + head_jit=$( best $jit_flags ) + + # Swap only the source tree to the base commit and re-measure with the + # same (PR) benchmark tool, so both sides are timed identically. The + # benchmark tool itself (tests/tools/) is left at the PR version. + git checkout "$BASE_SHA" -- packages/mysql-on-sqlite/src + base_nojit=$( best ) + base_jit=$( best $jit_flags ) + git checkout HEAD -- packages/mysql-on-sqlite/src + + fmt() { php -r 'echo number_format( (int) $argv[1] );' "$1"; } + ratio() { php -r 'printf( "%.2f", $argv[1] / max( 1, (int) $argv[2] ) );' "$1" "$2"; } + + { + echo "<!-- lexer-benchmark -->" + echo "### 🤖 Lexer benchmark" + echo "Changes to lexer-related files were detected and triggered a benchmark:" + echo + echo "| Config | Base (QPS) | This PR (QPS) | Speedup |" + echo "| --- | ---: | ---: | ---: |" + echo "| **no JIT** | $( fmt "$base_nojit" ) | $( fmt "$head_nojit" ) | **$( ratio "$head_nojit" "$base_nojit" )×** |" + echo "| **tracing JIT** | $( fmt "$base_jit" ) | $( fmt "$head_jit" ) | **$( ratio "$head_jit" "$base_jit" )×** |" + echo + echo "**Note:** Hosted runners are noisy, and absolute numbers vary. Treat the results with caution and verify them locally." 
+ echo + echo "To reproduce locally:" + echo '```' + echo "cd packages/mysql-on-sqlite && composer run bench-lexer" + echo '```' + } > "$RUNNER_TEMP/comment.md" + echo "COMMENT_FILE=$RUNNER_TEMP/comment.md" >> "$GITHUB_ENV" + + - name: Post or update the PR comment + uses: actions/github-script@v7 + with: + script: | + const fs = require( 'fs' ); + const body = fs.readFileSync( process.env.COMMENT_FILE, 'utf8' ); + const marker = '<!-- lexer-benchmark -->'; + const { data: comments } = await github.rest.issues.listComments( { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + per_page: 100, + } ); + const existing = comments.find( ( c ) => c.body && c.body.includes( marker ) ); + if ( existing ) { + await github.rest.issues.updateComment( { + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + } ); + } else { + await github.rest.issues.createComment( { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + } ); + } From 5c38b20822e258425f9c6bae73ccdc79397f2bb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 28 Apr 2026 09:36:59 +0200 Subject: [PATCH 3/9] Speed up the lexer with a cached length and inlined token loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the structural lexer optimisations from PR #375: - Cache strlen($sql) once in $sql_length instead of recomputing it on each EOF/bounds check. - Use strpos($sql, '*/', $pos) instead of a manual scan loop in read_comment_content(). - In read_quoted_text(), use strpos() to find the next quote, dropping the separate end-of-input check that followed the strcspn() scan. - Inline next_token() + get_token() in remaining_tokens() so the hot loop builds tokens directly. 
The #375 strspn()->byte-comparison swaps are intentionally not included: once the dispatch chain is reordered by later commits those checks are off the hot path and strspn() is marginally faster than the inline comparisons, so the swaps were net-neutral-to-negative while adding code. Co-authored-by: Adam Zieliński Adapted from https://github.com/WordPress/sqlite-database-integration/pull/375 --- .../src/mysql/class-wp-mysql-lexer.php | 93 ++++++++++++++----- 1 file changed, 69 insertions(+), 24 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 10ecd90ad..4a5859f1e 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2111,6 +2111,13 @@ class WP_MySQL_Lexer { */ private $sql; + /** + * Byte length of the SQL payload. + * + * @var int + */ + private $sql_length; + /** * The version of the MySQL server that the SQL payload is intended for. * @@ -2189,6 +2196,7 @@ public function __construct( array $sql_modes = array() ) { $this->sql = $sql; + $this->sql_length = strlen( $sql ); $this->mysql_version = $mysql_version; foreach ( $sql_modes as $sql_mode ) { @@ -2284,10 +2292,46 @@ public function get_token(): ?WP_MySQL_Token { * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens. 
*/ public function remaining_tokens(): array { - $tokens = array(); - while ( true === $this->next_token() ) { - $token = $this->get_token(); - $tokens[] = $token; + $tokens = array(); + $no_backslash_escapes_sql_mode_set = $this->is_sql_mode_active( + self::SQL_MODE_NO_BACKSLASH_ESCAPES + ); + + while ( true ) { + if ( + self::EOF === $this->token_type + || ( null === $this->token_type && $this->bytes_already_read > 0 ) + ) { + $this->token_type = null; + break; + } + + do { + $this->token_starts_at = $this->bytes_already_read; + $this->token_type = $this->read_next_token(); + } while ( + self::WHITESPACE === $this->token_type + || self::COMMENT === $this->token_type + || self::MYSQL_COMMENT_START === $this->token_type + || self::MYSQL_COMMENT_END === $this->token_type + ); + + if ( null === $this->token_type ) { + break; + } + + $tokens[] = new WP_MySQL_Token( + $this->token_type, + $this->token_starts_at, + $this->bytes_already_read - $this->token_starts_at, + $this->sql, + $no_backslash_escapes_sql_mode_set + ); + + if ( self::EOF === $this->token_type ) { + $this->token_type = null; + break; + } } return $tokens; } @@ -2420,7 +2464,7 @@ private function read_next_token(): ?int { } elseif ( '-' === $byte ) { if ( '-' === $next_byte - && $this->bytes_already_read + 2 < strlen( $this->sql ) + && $this->bytes_already_read + 2 < $this->sql_length && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::WHITESPACE_MASK ) > 0 ) { $type = $this->read_line_comment(); @@ -2685,7 +2729,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, self::HEX_DIGIT_MASK, $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. 
@@ -2708,7 +2752,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. @@ -2740,7 +2784,7 @@ private function read_number(): ?int { strspn( $next_byte, self::DIGIT_MASK ) > 0 || ( ( '+' === $next_byte || '-' === $next_byte ) - && $this->bytes_already_read + 2 < strlen( $this->sql ) + && $this->bytes_already_read + 2 < $this->sql_length && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::DIGIT_MASK ) > 0 ) ); @@ -2838,12 +2882,11 @@ private function read_quoted_text(): ?int { // in which case the escape sequence is consumed and the loop continues. $at = $this->bytes_already_read; while ( true ) { - $at += strcspn( $this->sql, $quote, $at ); - - // Unclosed string - unexpected EOF. - if ( ( $this->sql[ $at ] ?? null ) !== $quote ) { + $quote_at = strpos( $this->sql, $quote, $at ); + if ( false === $quote_at ) { return null; // Invalid input. } + $at = $quote_at; /* * By default, quotes can be escaped with a "\". @@ -2853,9 +2896,17 @@ private function read_quoted_text(): ?int { * The quote is escaped only when the number of preceding backslashes * is odd - "\" is an escape sequence, "\\" is an escaped backslash, * "\\\" is an escaped backslash and an escape sequence, and so on. + * + * The `($at - $i - 1) >= 0` guard prevents PHP's negative-string- + * offset wraparound (PHP 7.1+) when the closing-quote candidate + * sits at the very start of the input. The `?? null` covers + * positive out-of-range indexes belt-and-suspenders. */ if ( ! $no_backslash_escapes ) { - for ( $i = 0; ( $at - $i - 1 ) >= 0 && '\\' === $this->sql[ $at - $i - 1 ]; $i += 1 ); + $i = 0; + while ( ( $at - $i - 1 ) >= 0 && '\\' === ( $this->sql[ $at - $i - 1 ] ?? 
null ) ) { + $i += 1; + } if ( 1 === $i % 2 ) { $at += 1; continue; @@ -2920,17 +2971,11 @@ private function read_mysql_comment(): int { } private function read_comment_content(): void { - while ( true ) { - $this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read ); - $this->bytes_already_read += 1; // Consume the '*'. - $byte = $this->sql[ $this->bytes_already_read ] ?? null; - if ( null === $byte ) { - break; - } - if ( '/' === $byte ) { - $this->bytes_already_read += 1; // Consume the '/'. - break; - } + $comment_end = strpos( $this->sql, '*/', $this->bytes_already_read ); + if ( false === $comment_end ) { + $this->bytes_already_read = $this->sql_length; + } else { + $this->bytes_already_read = $comment_end + 2; } } From 2d5d198ef524600353a2986a77f7927baf4d992c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 29 Apr 2026 10:53:29 +0200 Subject: [PATCH 4/9] Inline leading-whitespace skip in lexer's token loops Both next_token() and remaining_tokens() previously paid a read_next_token() function call per whitespace run only to recognise and skip the resulting WHITESPACE token. A single unguarded strspn() at the top of each loop iteration absorbs the run inline, saving the call overhead for ~one whitespace run per real token across millions of tokens. The strspn() call is unguarded because an unconditional strspn() (which returns 0 in a single C-side call when nothing matches) is faster than gating it on a five-arm '$byte === ...' precheck. 
--- packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 4a5859f1e..f80d7d420 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2235,6 +2235,9 @@ public function next_token(): bool { return false; } + // Skip leading whitespace inline for optimal performance. + $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + do { $this->token_starts_at = $this->bytes_already_read; $this->token_type = $this->read_next_token(); @@ -2306,6 +2309,9 @@ public function remaining_tokens(): array { break; } + // Skip leading whitespace inline for optimal performance. + $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + do { $this->token_starts_at = $this->bytes_already_read; $this->token_type = $this->read_next_token(); From a1e267f3200cde7d19e8581eeeb991db8a33ff8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 29 Apr 2026 10:54:33 +0200 Subject: [PATCH 5/9] Catch identifier and keyword tokens at the top of the chain ASCII letters and UTF-8 multibyte start bytes account for most token-start bytes on the MySQL corpus. They previously fell into the catch-all `else` at the bottom of read_next_token() after walking every operator arm in between. The new branch sits at the top of the elseif chain and dispatches them directly. The `next_byte !== "'"` guard keeps the x'..', n'..' and similar specials on their dedicated branches. `_` and `$` starters stay on the catch-all so the UNDERSCORE_CHARSET lookup still fires. 
--- .../src/mysql/class-wp-mysql-lexer.php | 22 ++++++++++++- .../tests/mysql/WP_MySQL_Lexer_Tests.php | 31 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index f80d7d420..0b6c6b2a6 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2404,7 +2404,27 @@ private function read_next_token(): ?int { $byte = $this->sql[ $this->bytes_already_read ] ?? null; $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; - if ( "'" === $byte || '"' === $byte || '`' === $byte ) { + // Fast path for keywords and identifiers. + // These are the most common token types in MySQL payloads. + if ( + ( + ( $byte >= 'a' && $byte <= 'z' ) + || ( $byte >= 'A' && $byte <= 'Z' ) + || $byte > "\x7F" + ) + && "'" !== $next_byte + ) { + $started_at = $this->bytes_already_read; + $type = $this->read_identifier(); + if ( self::IDENTIFIER === $type ) { + // When preceded by a dot, it is always an identifier. + if ( $started_at > 0 && '.' 
=== $this->sql[ $started_at - 1 ] ) { + $type = self::IDENTIFIER; + } else { + $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); + } + } + } elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); } elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) { $type = $this->read_number(); diff --git a/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php b/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php index 8f18cf170..383b03f57 100644 --- a/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php +++ b/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php @@ -367,6 +367,37 @@ function ( $severity, $message, $file, $line ) { $this->assertNull( $lexer->get_token() ); } + /** + * A charset-introducer-like name used as a qualified member (after a dot) + * must lex as an identifier. A real charset introducer only appears before + * a string literal, never as the member of a qualified reference. + * + * @dataProvider data_underscore_charset_after_dot + */ + public function test_underscore_charset_name_after_dot_is_identifier( string $sql, int $token_index, int $expected_id ): void { + $tokens = ( new WP_MySQL_Lexer( $sql ) )->remaining_tokens(); + $this->assertSame( + WP_MySQL_Lexer::get_token_name( $expected_id ), + $tokens[ $token_index ]->get_name(), + $sql + ); + } + + /** + * @return array + */ + public function data_underscore_charset_after_dot(): array { + return array( + // `t . _utf8` - the member name must be an identifier, not a charset. + 'charset name after dot is identifier' => array( 't._utf8', 2, WP_MySQL_Lexer::IDENTIFIER ), + 'other charset name after dot' => array( 'a._binary', 2, WP_MySQL_Lexer::IDENTIFIER ), + // A genuine charset introducer (before a string) stays a charset. 
+ 'charset introducer before string' => array( "_utf8'x'", 0, WP_MySQL_Lexer::UNDERSCORE_CHARSET ), + // A non-charset underscore name after a dot stays an identifier. + 'non-charset underscore name after dot' => array( 't._foo', 2, WP_MySQL_Lexer::IDENTIFIER ), + ); + } + private function get_token_names( array $token_types ): array { return array_map( function ( $token_type ) { From f3f11eb4b7f09d27afe3b6cb3b4f0bc52344d48c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 29 Apr 2026 10:55:33 +0200 Subject: [PATCH 6/9] Add a single-byte operator dispatch table The ASCII bytes (, ), ',', ;, +, ~, %, ^, ?, {, }, and = each map to a unique single-byte token type with no lookahead. A static array + isset() arm dispatches them in one lookup, ahead of the per-byte elseif chain, and the now-shadowed individual arms further down the chain are removed so the table is the single source of truth for these tokens. '*' and '|' are deliberately excluded because their token type depends on context (in_mysql_comment for '*/', SQL_MODE_PIPES_AS_CONCAT for '||'). --- .../src/mysql/class-wp-mysql-lexer.php | 56 +++++++------------ 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 0b6c6b2a6..30ceb3d37 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2404,6 +2404,22 @@ private function read_next_token(): ?int { $byte = $this->sql[ $this->bytes_already_read ] ?? null; $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; + // A map for a single-byte symbol fast path. 
+ static $single_byte_ops = array( + '(' => self::OPEN_PAR_SYMBOL, + ')' => self::CLOSE_PAR_SYMBOL, + ',' => self::COMMA_SYMBOL, + ';' => self::SEMICOLON_SYMBOL, + '+' => self::PLUS_OPERATOR, + '~' => self::BITWISE_NOT_OPERATOR, + '%' => self::MOD_OPERATOR, + '^' => self::BITWISE_XOR_OPERATOR, + '?' => self::PARAM_MARKER, + '{' => self::OPEN_CURLY_SYMBOL, + '}' => self::CLOSE_CURLY_SYMBOL, + '=' => self::EQUAL_OPERATOR, + ); + // Fast path for keywords and identifiers. // These are the most common token types in MySQL payloads. if ( @@ -2424,6 +2440,10 @@ private function read_next_token(): ?int { $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); } } + } elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) { + // Fast path for single-byte symbols. + $this->bytes_already_read += 1; + $type = $single_byte_ops[ $byte ]; } elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); } elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) { @@ -2435,9 +2455,6 @@ private function read_next_token(): ?int { $this->bytes_already_read += 1; $type = self::DOT_SYMBOL; } - } elseif ( '=' === $byte ) { - $this->bytes_already_read += 1; - $type = self::EQUAL_OPERATOR; } elseif ( ':' === $byte ) { $this->bytes_already_read += 1; // Consume the ':'. if ( '=' === $next_byte ) { @@ -2484,9 +2501,6 @@ private function read_next_token(): ?int { } else { $type = self::LOGICAL_NOT_OPERATOR; } - } elseif ( '+' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PLUS_OPERATOR; } elseif ( '-' === $byte ) { if ( '-' === $next_byte @@ -2536,9 +2550,6 @@ private function read_next_token(): ?int { $this->bytes_already_read += 1; $type = self::DIV_OPERATOR; } - } elseif ( '%' === $byte ) { - $this->bytes_already_read += 1; - $type = self::MOD_OPERATOR; } elseif ( '&' === $byte ) { $this->bytes_already_read += 1; // Consume the '&'. 
if ( '&' === $next_byte ) { @@ -2547,9 +2558,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_AND_OPERATOR; } - } elseif ( '^' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_XOR_OPERATOR; } elseif ( '|' === $byte ) { $this->bytes_already_read += 1; // Consume the '|'. if ( '|' === $next_byte ) { @@ -2560,27 +2568,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_OR_OPERATOR; } - } elseif ( '~' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_NOT_OPERATOR; - } elseif ( ',' === $byte ) { - $this->bytes_already_read += 1; - $type = self::COMMA_SYMBOL; - } elseif ( ';' === $byte ) { - $this->bytes_already_read += 1; - $type = self::SEMICOLON_SYMBOL; - } elseif ( '(' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_PAR_SYMBOL; - } elseif ( ')' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_PAR_SYMBOL; - } elseif ( '{' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_CURLY_SYMBOL; - } elseif ( '}' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_CURLY_SYMBOL; } elseif ( '@' === $byte ) { $this->bytes_already_read += 1; // Consume the '@'. @@ -2604,9 +2591,6 @@ private function read_next_token(): ?int { $type = self::AT_SIGN_SYMBOL; } } - } elseif ( '?' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PARAM_MARKER; } elseif ( '\\' === $byte ) { $this->bytes_already_read += 1; // Consume the '\'. 
if ( 'N' === $next_byte ) { From 528557995d4ecdfdc2704431e21a934a4535a6a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 15:40:23 +0200 Subject: [PATCH 7/9] Document non-obvious lexer dispatch conditions Three review-noted spots that were terse in the code: - The remaining_tokens() loop guard now spells out why both EOF and `null === token_type && bytes_already_read > 0` are needed (EOF on clean end-of-input vs invalid byte mid-stream, with the `> 0` guard letting the very first iteration through). - The identifier/keyword fast path now explains `$byte > "\x7F"` (UTF-8 multi-byte starter; MySQL identifiers allow U+0080-U+FFFF) and `next_byte !== "'"` (only single quotes form the special hex/bin/n-char literal starters; `"` never does, regardless of SQL mode). No behavior change. --- .../mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 30ceb3d37..721af9223 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2301,6 +2301,8 @@ public function remaining_tokens(): array { ); while ( true ) { + // Bail on EOF, or on a null token type once at least one byte has + // been consumed (read_next_token() hit invalid input mid-stream). if ( self::EOF === $this->token_type || ( null === $this->token_type && $this->bytes_already_read > 0 ) @@ -2421,7 +2423,11 @@ private function read_next_token(): ?int { ); // Fast path for keywords and identifiers. - // These are the most common token types in MySQL payloads. + // `$byte > "\x7F"` catches any non-ASCII byte (0x80-0xFF); read_identifier() + // restricts the accepted identifier codepoints to U+0080-U+FFFF. + // `"'" !== $next_byte` defers x'..', n'..' 
and similar special + // literals to their dedicated branches below; only single quotes + // form those, regardless of SQL mode. if ( ( ( $byte >= 'a' && $byte <= 'z' ) From ee8664395f762ba0fcd458eb432adef3a468f6b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 28 Apr 2026 09:37:05 +0200 Subject: [PATCH 8/9] Skip parent constructor in WP_MySQL_Token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Token construction is on the lexer hot path; bypassing the `WP_Parser_Token::__construct()` indirection and assigning the four properties directly removes one method call per token. Requires `$input` on `WP_Parser_Token` to be `protected` instead of `private` so the subclass can write to it. Co-authored-by: Adam Zieliński Adapted from https://github.com/WordPress/sqlite-database-integration/pull/375 --- packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php | 6 +++++- .../mysql-on-sqlite/src/parser/class-wp-parser-token.php | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php index 1fb25ab42..0840bc2f2 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php @@ -30,7 +30,11 @@ public function __construct( string $input, bool $sql_mode_no_backslash_escapes_enabled ) { - parent::__construct( $id, $start, $length, $input ); + $this->id = $id; + $this->start = $start; + $this->length = $length; + $this->input = $input; + $this->sql_mode_no_backslash_escapes_enabled = $sql_mode_no_backslash_escapes_enabled; } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php index b77261896..4132ba382 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php +++ 
b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php @@ -35,7 +35,7 @@ class WP_Parser_Token { * * @var string */ - private $input; + protected $input; /** * Constructor. From 12a78da2068d59e6a1269cc14f12f9dfc8b5a855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 5 Jun 2026 21:05:20 +0200 Subject: [PATCH 9/9] Inline the keyword lookup on the hot identifier path The identifier/keyword branch handles the single largest share of tokens (~17% identifiers plus all keywords, ~35-45% of the corpus). It called two methods per token: get_current_token_bytes() to extract the token string and determine_identifier_or_keyword_type() to classify it. Inline that fast path into read_next_token(): extract the bytes and do the strtoupper + TOKENS lookup directly, returning the identifier without any call when it is not a keyword (the common case). The post-lookup keyword logic (version gating, function-call lookahead, high-NOT precedence, synonyms) moves to a new resolve_keyword_type() that is reached only for actual keywords; determine_identifier_or_keyword_type() now delegates to it for its other caller. Lex-only throughput on the MySQL server corpus: +5-6% under tracing JIT, +3.4% without (best-of-seven, ABAB-confirmed). The keyword path's measured ceiling was ~10% (JIT), most of which is the irreducible substr/strtoupper/ hash-lookup work that remains. 
--- .../src/mysql/class-wp-mysql-lexer.php | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 721af9223..d6ee9970e 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2438,12 +2438,19 @@ private function read_next_token(): ?int { ) { $started_at = $this->bytes_already_read; $type = $this->read_identifier(); - if ( self::IDENTIFIER === $type ) { + if ( + self::IDENTIFIER === $type // When preceded by a dot, it is always an identifier. - if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) { - $type = self::IDENTIFIER; - } else { - $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); + && ! ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) + ) { + // Inline the keyword lookup on the hot identifier path: most + // identifiers are not keywords, so this avoids two method calls + // (token-bytes extraction + keyword determination) per token. + $keyword = self::TOKENS[ strtoupper( + substr( $this->sql, $started_at, $this->bytes_already_read - $started_at ) + ) ] ?? self::IDENTIFIER; + if ( self::IDENTIFIER !== $keyword ) { + $type = $this->resolve_keyword_type( $keyword ); } } } elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) { @@ -2996,13 +3003,20 @@ private function read_comment_content(): void { } private function determine_identifier_or_keyword_type( string $value ): int { - $value = strtoupper( $value ); - - // Lookup the string in the token table. - $type = self::TOKENS[ $value ] ?? self::IDENTIFIER; + $type = self::TOKENS[ strtoupper( $value ) ] ?? 
self::IDENTIFIER; if ( self::IDENTIFIER === $type ) { return self::IDENTIFIER; } + return $this->resolve_keyword_type( $type ); + } + + /** + * Resolve a keyword token id matched in self::TOKENS, applying version gating, + * function-call lookahead, the SQL_MODE_HIGH_NOT_PRECEDENCE rule, and synonyms. + * + * @param int $type A token id already matched in self::TOKENS (never IDENTIFIER). + */ + private function resolve_keyword_type( int $type ): int { // Apply MySQL version specifics (positive number: >= , negative number: < ). if ( isset( self::VERSIONS[ $type ] ) ) {