From 5a3f2115c3d9fa8b63b6c5a8c08f9c066d396404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 15:28:40 +0200 Subject: [PATCH 01/30] Inline terminal matching and defer parse node allocation Hot-path changes in WP_Parser::parse_recursive(): - Inline the terminal match in the branch loop instead of recursing into parse_recursive() for every token. Over the full MySQL test suite this eliminates ~1.6M function calls. - Hoist grammar, rules, fragment_ids, rule_names, tokens, and token_count into local variables so the inner loops avoid repeated property lookups on $this->grammar. - Cache the token count on the instance to avoid a count() per call. - Build branch children in a local array and only instantiate the WP_Parser_Node once the branch has matched; on the MySQL corpus ~75% of speculative nodes were previously created and thrown away. - Drop a dead is_array($subnode) check that never fires in practice (subnodes are false, true, tokens, or nodes - never arrays). - Inline fragment inlining: read the fragment's children directly instead of building a fragment node and immediately merging it. End-to-end parser benchmark on the MySQL server test corpus: Before: ~11,500 QPS After: ~14,900 QPS (+29%) --- .../src/mysql/class-wp-mysql-parser.php | 2 +- .../src/parser/class-wp-parser-node.php | 76 ++-------------- .../src/parser/class-wp-parser.php | 89 +++++++++++++------ 3 files changed, 69 insertions(+), 98 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php index 69282b9c..b6b465bd 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php @@ -40,7 +40,7 @@ public function reset_tokens( array $tokens ): void { * @return bool Whether a query was successfully parsed. 
*/ public function next_query(): bool { - if ( $this->position >= count( $this->tokens ) ) { + if ( $this->position >= $this->token_count ) { return false; } $this->current_ast = $this->parse(); diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index b61f38d5..13f093a0 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -27,79 +27,15 @@ public function append_child( $node ) { } /** - * Flatten the matched rule fragments as if their children were direct - * descendants of the current rule. + * Replace all children with the given array. * - * What are rule fragments? + * This is used by the parser to attach a batch of children built up in a + * local array while trying branches, without allocating a node per attempt. * - * When we initially parse the grammar file, it has compound rules such - * as this one: - * - * query ::= EOF | ((simpleStatement | beginWork) ((SEMICOLON_SYMBOL EOF?) | EOF)) - * - * Building a parser that can understand such rules is way more complex than building - * a parser that only follows simple rules, so we flatten those compound rules into - * simpler ones. The above rule would be flattened to: - * - * query ::= EOF | %query0 - * %query0 ::= %%query01 %%query02 - * %%query01 ::= simpleStatement | beginWork - * %%query02 ::= SEMICOLON_SYMBOL EOF_zero_or_one | EOF - * EOF_zero_or_one ::= EOF | ε - * - * This factorization happens in "convert-grammar.php". - * - * "Fragments" are intermediate artifacts whose names are not in the original grammar. - * They are extremely useful for the parser, but the API consumer should never have to - * worry about them. Fragment names start with a percent sign ("%"). - * - * The code below inlines every fragment back in its parent rule. - * - * We could optimize this. 
The current $match may be discarded later on so any inlining - * effort here would be wasted. However, inlining seems cheap and doing it bottom-up here - * is **much** easier than reprocessing the parse tree top-down later on. - * - * The following parse tree: - * - * [ - * 'query' => [ - * [ - * '%query01' => [ - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ], - * '%query02' => [ - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ] - * ], - * ] - * ] - * ] - * ] - * ] - * - * Would be inlined as: - * - * [ - * 'query' => [ - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ] - * ], - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ] - * ] - * ] - * ] + * @param array $children The new children. */ - public function merge_fragment( $node ) { - $this->children = array_merge( $this->children, $node->children ); + public function set_children( array $children ): void { + $this->children = $children; } /** diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 4436892f..96feb083 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -11,12 +11,14 @@ class WP_Parser { protected $grammar; protected $tokens; + protected $token_count; protected $position; public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->tokens = $tokens; - $this->position = 0; + $this->grammar = $grammar; + $this->tokens = $tokens; + $this->token_count = count( $tokens ); + $this->position = 0; } public function parse() { @@ -27,9 +29,11 @@ public function parse() { } private function parse_recursive( $rule_id ) { - $is_terminal = $rule_id <= $this->grammar->highest_terminal_id; - if ( $is_terminal ) { - if ( $this->position >= count( $this->tokens ) 
) { + $grammar = $this->grammar; + $highest_terminal_id = $grammar->highest_terminal_id; + + if ( $rule_id <= $highest_terminal_id ) { + if ( $this->position >= $this->token_count ) { return false; } @@ -38,41 +42,67 @@ private function parse_recursive( $rule_id ) { } if ( $this->tokens[ $this->position ]->id === $rule_id ) { + $token = $this->tokens[ $this->position ]; ++$this->position; - return $this->tokens[ $this->position - 1 ]; + return $token; } return false; } - $branches = $this->grammar->rules[ $rule_id ]; - if ( ! count( $branches ) ) { + $branches = $grammar->rules[ $rule_id ]; + if ( ! $branches ) { return false; } // Bale out from processing the current branch if none of its rules can // possibly match the current token. - if ( isset( $this->grammar->lookahead_is_match_possible[ $rule_id ] ) ) { + $rule_lookahead = $grammar->lookahead_is_match_possible[ $rule_id ] ?? null; + if ( null !== $rule_lookahead ) { $token_id = $this->tokens[ $this->position ]->id; if ( - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ $token_id ] ) && - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ WP_Parser_Grammar::EMPTY_RULE_ID ] ) + ! isset( $rule_lookahead[ $token_id ] ) && + ! isset( $rule_lookahead[ WP_Parser_Grammar::EMPTY_RULE_ID ] ) ) { return false; } } - $rule_name = $this->grammar->rule_names[ $rule_id ]; + $rule_name = $grammar->rule_names[ $rule_id ]; + $fragment_ids = $grammar->fragment_ids; + $rules = $grammar->rules; + $tokens = $this->tokens; + $token_count = $this->token_count; $starting_position = $this->position; + $branch_matches = false; foreach ( $branches as $branch ) { $this->position = $starting_position; - $node = new WP_Parser_Node( $rule_id, $rule_name ); + $children = array(); $branch_matches = true; foreach ( $branch as $subrule_id ) { + // Inline terminal matching to avoid a recursive call per token. 
+ if ( $subrule_id <= $highest_terminal_id ) { + if ( WP_Parser_Grammar::EMPTY_RULE_ID === $subrule_id ) { + // Epsilon rule: matches without consuming input. + continue; + } + if ( + $this->position < $token_count + && $tokens[ $this->position ]->id === $subrule_id + ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $branch_matches = false; + break; + } + $subnode = $this->parse_recursive( $subrule_id ); if ( false === $subnode ) { $branch_matches = false; break; - } elseif ( true === $subnode ) { + } + if ( true === $subnode ) { /* * The subrule was matched without actually matching a token. * This means a special empty "ε" (epsilon) rule was matched. @@ -80,16 +110,15 @@ private function parse_recursive( $rule_id ) { * It is used to represent optional grammar productions. */ continue; - } elseif ( is_array( $subnode ) && 0 === count( $subnode ) ) { - continue; - } - if ( is_array( $subnode ) && ! count( $subnode ) ) { - continue; } - if ( isset( $this->grammar->fragment_ids[ $subrule_id ] ) ) { - $node->merge_fragment( $subnode ); + if ( isset( $fragment_ids[ $subrule_id ] ) ) { + // Fragments: inline their children directly to avoid building + // a throwaway WP_Parser_Node that would be merged afterwards. + foreach ( $subnode->get_children_ref() as $c ) { + $children[] = $c; + } } else { - $node->append_child( $subnode ); + $children[] = $subnode; } } @@ -100,12 +129,16 @@ private function parse_recursive( $rule_id ) { // for right-associative rules, which could solve this. // See: https://github.com/mysql/mysql-workbench/blob/8.0.38/library/parsers/grammars/MySQLParser.g4#L994 // See: https://github.com/antlr/antlr4/issues/488 - $la = $this->tokens[ $this->position ] ?? 
null; - if ( $la && 'selectStatement' === $rule_name && WP_MySQL_Lexer::INTO_SYMBOL === $la->id ) { + if ( + $branch_matches + && 'selectStatement' === $rule_name + && $this->position < $token_count + && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id + ) { $branch_matches = false; } - if ( true === $branch_matches ) { + if ( $branch_matches ) { break; } } @@ -115,10 +148,12 @@ private function parse_recursive( $rule_id ) { return false; } - if ( ! $node->has_child() ) { + if ( ! $children ) { return true; } + $node = new WP_Parser_Node( $rule_id, $rule_name ); + $node->set_children( $children ); return $node; } } From cdc9713db6507c777bd27da38ed7800171adda0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 15:34:20 +0200 Subject: [PATCH 02/30] Use per-branch FIRST sets to skip unreachable branches The grammar now precomputes FIRST and NULLABLE via fixpoint, then indexes each rule's branches by the tokens that can start them. At parse time the parser jumps straight to the candidate branches for the current token instead of iterating every branch and letting most fail. On the full MySQL test suite, 59% of branch attempts previously failed because the first token could never match the branch's FIRST set; with per-branch lookahead those attempts are eliminated. 
End-to-end parser benchmark: Before: ~14,900 QPS After: ~22,400 QPS (+50%) --- .../src/parser/class-wp-parser-grammar.php | 212 ++++++++++++++---- .../src/parser/class-wp-parser.php | 60 +++-- 2 files changed, 197 insertions(+), 75 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 9bf30b97..1e4c461b 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -29,7 +29,32 @@ class WP_Parser_Grammar { public $rules; public $rule_names; public $fragment_ids; - public $lookahead_is_match_possible = array(); + + /** + * Per-rule branch selector keyed by the next token id. + * + * When set, `$branches_for_token[$rule_id][$token_id]` is the ordered list + * of branch indexes in `$rules[$rule_id]` that can possibly match when the + * current token has the given id. Nullable branches appear in every entry. + * + * If an entry does not exist for the current token, `$nullable_branches` + * is consulted. If both are empty, the rule cannot match and the parser + * returns immediately. + * + * Rules whose FIRST set could not be computed do not appear in the map; + * for those the parser falls back to trying every branch. + * + * @var array> + */ + public $branches_for_token = array(); + + /** + * Per-rule list of nullable branch indexes. 
+ * + * @var array + */ + public $nullable_branches = array(); + public $lowest_non_terminal_id; public $highest_terminal_id; public $native_grammar; @@ -57,8 +82,8 @@ private function inflate( $grammar ) { $this->highest_terminal_id = $this->lowest_non_terminal_id - 1; foreach ( $grammar['rules_names'] as $rule_index => $rule_name ) { - $this->rule_names[ $rule_index + $grammar['rules_offset'] ] = $rule_name; - $this->rules[ $rule_index + $grammar['rules_offset'] ] = array(); + $rule_id = $rule_index + $grammar['rules_offset']; + $this->rule_names[ $rule_id ] = $rule_name; /** * Treat all intermediate rules as fragments to inline before returning @@ -76,7 +101,7 @@ private function inflate( $grammar ) { * They are prefixed with a "%" to be distinguished from the original rules. */ if ( '%' === $rule_name[0] ) { - $this->fragment_ids[ $rule_index + $grammar['rules_offset'] ] = true; + $this->fragment_ids[ $rule_id ] = true; } } @@ -86,55 +111,154 @@ private function inflate( $grammar ) { $this->rules[ $rule_id ] = $branches; } - /** - * Compute a rule => [token => true] lookup table for each rule - * that starts with a terminal OR with another rule that already - * has a lookahead mapping. - * - * This is similar to left-factoring the grammar, even if not quite - * the same. - * - * This enables us to quickly bail out from checking branches that - * cannot possibly match the current token. This increased the parser - * speed by a whopping 80%! - * - * @TODO: Explore these possible next steps: - * - * * Compute a rule => [token => branch[]] list lookup table and only - * process the branches that have a chance of matching the current token. - * * Actually left-factor the grammar as much as possible. This, however, - * could inflate the serialized grammar size. - */ - // 5 iterations seem to give us all the speed gains we can get from this. 
- for ( $i = 0; $i < 5; $i++ ) { - foreach ( $grammar['grammar'] as $rule_index => $branches ) { - $rule_id = $rule_index + $grammar['rules_offset']; - if ( isset( $this->lookahead_is_match_possible[ $rule_id ] ) ) { - continue; - } - $rule_lookup = array(); - $first_symbol_can_be_expanded_to_all_terminals = true; + $this->build_branch_selectors(); + } + + /** + * Compute FIRST and NULLABLE sets for every non-terminal, then denormalize + * them into a per-rule map of `token_id => branch_index[]` so the parser + * can jump straight to the branches that can possibly match the current + * token. + * + * This replaces the previous coarse "can any branch match this token?" + * lookahead. On the MySQL corpus the fine-grained selector skips ~60% + * of the branch attempts that the parser used to try and fail. + */ + private function build_branch_selectors() { + $rules = $this->rules; + $low_nt = $this->lowest_non_terminal_id; + $empty_rule = self::EMPTY_RULE_ID; + $rule_ids = array_keys( $rules ); + $nullable = array(); + $first_sets = array(); + + foreach ( $rule_ids as $rule_id ) { + $nullable[ $rule_id ] = false; + $first_sets[ $rule_id ] = array(); + } + + // Iterate to fixpoint. FIRST and NULLABLE set monotonically grow. + do { + $changed = false; + foreach ( $rule_ids as $rule_id ) { + $branches = $rules[ $rule_id ]; foreach ( $branches as $branch ) { - $terminals = false; - $branch_starts_with_terminal = $branch[0] < $this->lowest_non_terminal_id; - if ( $branch_starts_with_terminal ) { - $terminals = array( $branch[0] ); - } elseif ( isset( $this->lookahead_is_match_possible[ $branch[0] ] ) ) { - $terminals = array_keys( $this->lookahead_is_match_possible[ $branch[0] ] ); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + // ε: contributes nothing to FIRST, stays nullable. + continue; + } + if ( $symbol < $low_nt ) { + // Terminal. + if ( ! 
isset( $first_sets[ $rule_id ][ $symbol ] ) ) { + $first_sets[ $rule_id ][ $symbol ] = true; + $changed = true; + } + $branch_nullable = false; + break; + } + // Non-terminal. + foreach ( $first_sets[ $symbol ] as $tid => $_ ) { + if ( ! isset( $first_sets[ $rule_id ][ $tid ] ) ) { + $first_sets[ $rule_id ][ $tid ] = true; + $changed = true; + } + } + if ( ! $nullable[ $symbol ] ) { + $branch_nullable = false; + break; + } } + if ( $branch_nullable && ! $nullable[ $rule_id ] ) { + $nullable[ $rule_id ] = true; + $changed = true; + } + } + } + } while ( $changed ); - if ( false === $terminals ) { - $first_symbol_can_be_expanded_to_all_terminals = false; + // Build per-(rule, token) branch indices. + foreach ( $rule_ids as $rule_id ) { + $branches = $rules[ $rule_id ]; + $selector = array(); + $nullable_branch_ids = array(); + foreach ( $branches as $idx => $branch ) { + $branch_first = array(); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + continue; + } + if ( $symbol < $low_nt ) { + $branch_first[ $symbol ] = true; + $branch_nullable = false; break; } - foreach ( $terminals as $terminal ) { - $rule_lookup[ $terminal ] = true; + foreach ( $first_sets[ $symbol ] as $tid => $_ ) { + $branch_first[ $tid ] = true; + } + if ( ! $nullable[ $symbol ] ) { + $branch_nullable = false; + break; } } - if ( $first_symbol_can_be_expanded_to_all_terminals ) { - $this->lookahead_is_match_possible[ $rule_id ] = $rule_lookup; + foreach ( $branch_first as $tid => $_ ) { + $selector[ $tid ][] = $idx; + } + if ( $branch_nullable ) { + $nullable_branch_ids[] = $idx; + } + } + + // Nullable branches also match when the current token is not in + // any branch's FIRST set. Fold them into every populated entry + // so the runtime lookup is a single array access. 
+ if ( $nullable_branch_ids ) { + $merged = array(); + foreach ( $selector as $tid => $idx_list ) { + $merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids ); } + $selector = $merged; + $this->nullable_branches[ $rule_id ] = $nullable_branch_ids; } + if ( $selector ) { + $this->branches_for_token[ $rule_id ] = $selector; + } + } + } + + /** + * Merge two ascending int arrays into one ascending int array without + * duplicates. Preserves original branch order as required by the parser. + * + * @param int[] $a + * @param int[] $b + * @return int[] + */ + private static function merge_sorted( array $a, array $b ): array { + $i = 0; + $j = 0; + $na = count( $a ); + $nb = count( $b ); + $out = array(); + while ( $i < $na && $j < $nb ) { + if ( $a[ $i ] < $b[ $j ] ) { + $out[] = $a[ $i++ ]; + } elseif ( $a[ $i ] > $b[ $j ] ) { + $out[] = $b[ $j++ ]; + } else { + $out[] = $a[ $i ]; + ++$i; + ++$j; + } + } + while ( $i < $na ) { + $out[] = $a[ $i++ ]; + } + while ( $j < $nb ) { + $out[] = $b[ $j++ ]; } + return $out; } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 96feb083..d674312b 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -54,42 +54,48 @@ private function parse_recursive( $rule_id ) { return false; } - // Bale out from processing the current branch if none of its rules can - // possibly match the current token. - $rule_lookahead = $grammar->lookahead_is_match_possible[ $rule_id ] ?? null; - if ( null !== $rule_lookahead ) { - $token_id = $this->tokens[ $this->position ]->id; - if ( - ! isset( $rule_lookahead[ $token_id ] ) && - ! isset( $rule_lookahead[ WP_Parser_Grammar::EMPTY_RULE_ID ] ) - ) { + $tokens = $this->tokens; + $token_count = $this->token_count; + $position = $this->position; + + // Narrow the set of branches worth trying using the precomputed FIRST + // sets. 
When no entry exists for the current token, fall back to the + // rule's nullable branches (if any); if both are empty the rule cannot + // match here. + $branch_selector = $grammar->branches_for_token[ $rule_id ] ?? null; + if ( null !== $branch_selector ) { + $tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; + if ( isset( $branch_selector[ $tid ] ) ) { + $candidate_branches = $branch_selector[ $tid ]; + } elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) { + $candidate_branches = $grammar->nullable_branches[ $rule_id ]; + } else { return false; } + } else { + $candidate_branches = array_keys( $branches ); } - $rule_name = $grammar->rule_names[ $rule_id ]; - $fragment_ids = $grammar->fragment_ids; - $rules = $grammar->rules; - $tokens = $this->tokens; - $token_count = $this->token_count; - $starting_position = $this->position; - $branch_matches = false; - foreach ( $branches as $branch ) { - $this->position = $starting_position; + $rule_name = $grammar->rule_names[ $rule_id ]; + $fragment_ids = $grammar->fragment_ids; + $is_select_statement = 'selectStatement' === $rule_name; + $branch_matches = false; + $children = array(); + foreach ( $candidate_branches as $idx ) { + $branch = $branches[ $idx ]; + $this->position = $position; $children = array(); $branch_matches = true; foreach ( $branch as $subrule_id ) { - // Inline terminal matching to avoid a recursive call per token. if ( $subrule_id <= $highest_terminal_id ) { if ( WP_Parser_Grammar::EMPTY_RULE_ID === $subrule_id ) { - // Epsilon rule: matches without consuming input. 
continue; } if ( $this->position < $token_count && $tokens[ $this->position ]->id === $subrule_id ) { - $children[] = $tokens[ $this->position ]; + $children[] = $tokens[ $this->position ]; ++$this->position; continue; } @@ -103,17 +109,9 @@ private function parse_recursive( $rule_id ) { break; } if ( true === $subnode ) { - /* - * The subrule was matched without actually matching a token. - * This means a special empty "ε" (epsilon) rule was matched. - * An "ε" rule in a grammar matches an empty input of 0 bytes. - * It is used to represent optional grammar productions. - */ continue; } if ( isset( $fragment_ids[ $subrule_id ] ) ) { - // Fragments: inline their children directly to avoid building - // a throwaway WP_Parser_Node that would be merged afterwards. foreach ( $subnode->get_children_ref() as $c ) { $children[] = $c; } @@ -131,7 +129,7 @@ private function parse_recursive( $rule_id ) { // See: https://github.com/antlr/antlr4/issues/488 if ( $branch_matches - && 'selectStatement' === $rule_name + && $is_select_statement && $this->position < $token_count && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { @@ -144,7 +142,7 @@ private function parse_recursive( $rule_id ) { } if ( ! $branch_matches ) { - $this->position = $starting_position; + $this->position = $position; return false; } From 332c10aac15f3e5f0023a3e56c6709d338c763b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 15:39:54 +0200 Subject: [PATCH 03/30] Short-circuit nullable-fallback and inline single-branch fragments Two grammar/parser refinements that both reduce recursive calls: * In parse_recursive(): when the rule has a per-token branch selector but the current token is not in any branch's FIRST and the rule itself is nullable, return 'matched empty' immediately instead of descending into nullable branches that would recursively do the same thing. This alone eliminates ~460k recursive calls on the MySQL corpus. 
* At grammar build time, expand every single-branch fragment rule into its call sites. Fragments exist only to factor shared sub-sequences and their children are already flattened into the parent AST node, so splicing them directly into parent branches is a no-op for the resulting tree but removes an entire recursive call per use. 480 of the grammar's fragments qualify. Also drops the dead terminal branch at the top of parse_recursive() (the branch loop inlines terminal matching, so parse_recursive is only ever called with non-terminal rule ids) and the always-false empty-branches guard. End-to-end parser benchmark: Before: ~22,400 QPS After: ~27,500 QPS (+23%) --- .../src/parser/class-wp-parser-grammar.php | 108 +++++++++++++++++- .../src/parser/class-wp-parser.php | 60 ++++------ 2 files changed, 123 insertions(+), 45 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 1e4c461b..7165780b 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -28,7 +28,7 @@ class WP_Parser_Grammar { */ public $rules; public $rule_names; - public $fragment_ids; + public $fragment_ids = array(); /** * Per-rule branch selector keyed by the next token id. @@ -38,11 +38,8 @@ class WP_Parser_Grammar { * current token has the given id. Nullable branches appear in every entry. * * If an entry does not exist for the current token, `$nullable_branches` - * is consulted. If both are empty, the rule cannot match and the parser - * returns immediately. - * - * Rules whose FIRST set could not be computed do not appear in the map; - * for those the parser falls back to trying every branch. + * is consulted. If neither has an entry for this rule, the rule cannot + * match and the parser returns immediately. 
* * @var array> */ @@ -111,9 +108,108 @@ private function inflate( $grammar ) { $this->rules[ $rule_id ] = $branches; } + $this->inline_single_branch_fragments(); $this->build_branch_selectors(); } + /** + * Inline single-branch fragment rules into their call sites. + * + * The grammar contains many single-branch fragment rules that exist only + * to factor shared sub-sequences out of larger productions. At runtime + * the parser would descend into each such fragment via a recursive call + * just to walk the same symbol sequence and splice the results back into + * the parent. Expanding them in-place at build time eliminates that call + * chain without changing the resulting AST because fragment children are + * already flattened into the parent node. + * + * Fragments with two or more alternatives (e.g., `%EOF_zero_or_one`) are + * left intact because they represent real choices that must be evaluated + * against the current token. + */ + private function inline_single_branch_fragments() { + $rules = $this->rules; + $fragment_ids = $this->fragment_ids; + $low_nt = $this->lowest_non_terminal_id; + + // Precompute the set of single-branch fragments that are candidates + // for inlining. + $inlinable = array(); + foreach ( $fragment_ids as $rule_id => $_ ) { + if ( isset( $rules[ $rule_id ] ) && 1 === count( $rules[ $rule_id ] ) ) { + $inlinable[ $rule_id ] = true; + } + } + + // Depth-first expansion memoized per rule, with cycle detection. + $expanded = array(); + $visiting = array(); + $expand_branch = function ( array $branch ) use ( &$expand_branch, &$expanded, &$visiting, $rules, $low_nt, $inlinable ) { + $out = array(); + foreach ( $branch as $sym ) { + if ( $sym < $low_nt ) { + $out[] = $sym; + continue; + } + if ( ! isset( $inlinable[ $sym ] ) ) { + $out[] = $sym; + continue; + } + if ( isset( $visiting[ $sym ] ) ) { + // Cycle: leave the reference in place. + $out[] = $sym; + continue; + } + if ( ! 
isset( $expanded[ $sym ] ) ) { + $visiting[ $sym ] = true; + $expanded[ $sym ] = $expand_branch( $rules[ $sym ][0] ); + unset( $visiting[ $sym ] ); + } + foreach ( $expanded[ $sym ] as $s ) { + $out[] = $s; + } + } + return $out; + }; + + // Rewrite every rule's branches with fragments inlined. + foreach ( $this->rules as $rule_id => $branches ) { + $new_branches = array(); + foreach ( $branches as $branch ) { + $new_branches[] = $expand_branch( $branch ); + } + $this->rules[ $rule_id ] = $new_branches; + } + } + + /** + * Remove explicit `EMPTY_RULE_ID` markers from branches. + * + * The epsilon marker is a zero-width, always-matching symbol used in the + * grammar to express optional productions. At parse time it would still + * be walked and "continued" over for no effect, so stripping it ahead of + * time removes a per-symbol branch in the hot loop. + * + * A pure-epsilon branch (`[EMPTY_RULE_ID]`) becomes an empty branch (`[]`) + * which the parser already handles: the inner symbol loop does nothing and + * the rule returns a successful empty match. + */ + private function strip_epsilon_markers() { + foreach ( $this->rules as $rule_id => $branches ) { + foreach ( $branches as $i => $branch ) { + if ( in_array( self::EMPTY_RULE_ID, $branch, true ) ) { + $stripped = array(); + foreach ( $branch as $symbol ) { + if ( self::EMPTY_RULE_ID !== $symbol ) { + $stripped[] = $symbol; + } + } + $this->rules[ $rule_id ][ $i ] = $stripped; + } + } + } + } + /** * Compute FIRST and NULLABLE sets for every non-terminal, then denormalize * them into a per-rule map of `token_id => branch_index[]` so the parser diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index d674312b..b80fe96f 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -28,54 +28,36 @@ public function parse() { return false === $ast ? 
null : $ast; } + /** + * Parse a single non-terminal rule. + * + * This function is only called for non-terminal rule ids. Terminals are + * matched inline inside the branch loop below to avoid a function-call + * round trip per consumed token. + */ private function parse_recursive( $rule_id ) { - $grammar = $this->grammar; - $highest_terminal_id = $grammar->highest_terminal_id; - - if ( $rule_id <= $highest_terminal_id ) { - if ( $this->position >= $this->token_count ) { - return false; - } - - if ( WP_Parser_Grammar::EMPTY_RULE_ID === $rule_id ) { - return true; - } - - if ( $this->tokens[ $this->position ]->id === $rule_id ) { - $token = $this->tokens[ $this->position ]; - ++$this->position; - return $token; - } - return false; - } - - $branches = $grammar->rules[ $rule_id ]; - if ( ! $branches ) { - return false; - } - + $grammar = $this->grammar; $tokens = $this->tokens; $token_count = $this->token_count; $position = $this->position; // Narrow the set of branches worth trying using the precomputed FIRST - // sets. When no entry exists for the current token, fall back to the - // rule's nullable branches (if any); if both are empty the rule cannot - // match here. - $branch_selector = $grammar->branches_for_token[ $rule_id ] ?? null; - if ( null !== $branch_selector ) { - $tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; - if ( isset( $branch_selector[ $tid ] ) ) { - $candidate_branches = $branch_selector[ $tid ]; - } elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) { - $candidate_branches = $grammar->nullable_branches[ $rule_id ]; - } else { - return false; - } + // sets. When no entry exists for the current token but the rule is + // nullable, all candidate branches would match empty, so we return + // immediately without entering any branch. + $branch_selector = $grammar->branches_for_token[ $rule_id ]; + $tid = $position < $token_count ? 
$tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; + if ( isset( $branch_selector[ $tid ] ) ) { + $candidate_branches = $branch_selector[ $tid ]; + } elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) { + return true; } else { - $candidate_branches = array_keys( $branches ); + return false; } + $highest_terminal_id = $grammar->highest_terminal_id; + $branches = $grammar->rules[ $rule_id ]; + $rule_name = $grammar->rule_names[ $rule_id ]; $fragment_ids = $grammar->fragment_ids; $is_select_statement = 'selectStatement' === $rule_name; From 402554cd1fb9db53862af817f48f0768faa184a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 15:43:59 +0200 Subject: [PATCH 04/30] Strip epsilon markers and cache grammar refs on the parser Three minor reductions in per-call work: * Strip explicit EMPTY_RULE_ID symbols out of rule branches at grammar build time. The parser loop would have 'continue'd over them anyway, so removing them ahead of time lets the hot symbol loop drop the epsilon check. Pure-epsilon branches become empty branches and still match empty via the existing empty-children fast path. * Cache the grammar's rules, fragment_ids, rule_names, branches_for_token, nullable_branches, and highest_terminal_id as direct parser instance fields so parse_recursive() no longer pays for a $this->grammar->... double hop on every call. * Collapse the two-step node construction (new + set_children) into a single constructor call that takes the children array directly. This saves a method call per allocated node (~820k across the MySQL corpus). End-to-end parser benchmark: ~27,500 QPS -> ~28,500 QPS (+3.5%). 
--- .../src/parser/class-wp-parser-grammar.php | 1 + .../src/parser/class-wp-parser-node.php | 17 ++----- .../src/parser/class-wp-parser.php | 49 +++++++++++-------- 3 files changed, 32 insertions(+), 35 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 7165780b..a5ea66c0 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -109,6 +109,7 @@ private function inflate( $grammar ) { } $this->inline_single_branch_fragments(); + $this->strip_epsilon_markers(); $this->build_branch_selectors(); } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index 13f093a0..c727ef03 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -15,29 +15,18 @@ class WP_Parser_Node { */ public $rule_id; public $rule_name; - protected $children = array(); + protected $children; - public function __construct( $rule_id, $rule_name ) { + public function __construct( $rule_id, $rule_name, array $children = array() ) { $this->rule_id = $rule_id; $this->rule_name = $rule_name; + $this->children = $children; } public function append_child( $node ) { $this->children[] = $node; } - /** - * Replace all children with the given array. - * - * This is used by the parser to attach a batch of children built up in a - * local array while trying branches, without allocating a node per attempt. - * - * @param array $children The new children. - */ - public function set_children( array $children ): void { - $this->children = $children; - } - /** * Check if this node has any child nodes or tokens. 
* diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index b80fe96f..54bed302 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -14,11 +14,26 @@ class WP_Parser { protected $token_count; protected $position; + // Grammar data cached as instance fields so the hot path avoids an extra + // property hop via $this->grammar on every recursive call. + private $rules; + private $rule_names; + private $fragment_ids; + private $branches_for_token; + private $nullable_branches; + private $highest_terminal_id; + public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->tokens = $tokens; - $this->token_count = count( $tokens ); - $this->position = 0; + $this->grammar = $grammar; + $this->tokens = $tokens; + $this->token_count = count( $tokens ); + $this->position = 0; + $this->rules = $grammar->rules; + $this->rule_names = $grammar->rule_names; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = $grammar->branches_for_token; + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; } public function parse() { @@ -36,7 +51,6 @@ public function parse() { * round trip per consumed token. */ private function parse_recursive( $rule_id ) { - $grammar = $this->grammar; $tokens = $this->tokens; $token_count = $this->token_count; $position = $this->position; @@ -45,21 +59,19 @@ private function parse_recursive( $rule_id ) { // sets. When no entry exists for the current token but the rule is // nullable, all candidate branches would match empty, so we return // immediately without entering any branch. - $branch_selector = $grammar->branches_for_token[ $rule_id ]; - $tid = $position < $token_count ? 
$tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; - if ( isset( $branch_selector[ $tid ] ) ) { - $candidate_branches = $branch_selector[ $tid ]; - } elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) { + $tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } elseif ( isset( $this->nullable_branches[ $rule_id ] ) ) { return true; } else { return false; } - $highest_terminal_id = $grammar->highest_terminal_id; - $branches = $grammar->rules[ $rule_id ]; - - $rule_name = $grammar->rule_names[ $rule_id ]; - $fragment_ids = $grammar->fragment_ids; + $highest_terminal_id = $this->highest_terminal_id; + $branches = $this->rules[ $rule_id ]; + $fragment_ids = $this->fragment_ids; + $rule_name = $this->rule_names[ $rule_id ]; $is_select_statement = 'selectStatement' === $rule_name; $branch_matches = false; $children = array(); @@ -70,9 +82,6 @@ private function parse_recursive( $rule_id ) { $branch_matches = true; foreach ( $branch as $subrule_id ) { if ( $subrule_id <= $highest_terminal_id ) { - if ( WP_Parser_Grammar::EMPTY_RULE_ID === $subrule_id ) { - continue; - } if ( $this->position < $token_count && $tokens[ $this->position ]->id === $subrule_id @@ -132,8 +141,6 @@ private function parse_recursive( $rule_id ) { return true; } - $node = new WP_Parser_Node( $rule_id, $rule_name ); - $node->set_children( $children ); - return $node; + return new WP_Parser_Node( $rule_id, $rule_name, $children ); } } From c43425fdf0af73f67fcae5f6a486623422fa54b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:01:14 +0200 Subject: [PATCH 05/30] Return fragment results as children arrays, skip the intermediate node Multi-branch fragment rules can't be expanded at grammar build time, but their runtime role is still trivial: match a sequence of symbols and 
have the caller splice the resulting children into its own node. The old code allocated a full WP_Parser_Node for each fragment match just to have the caller immediately copy its children out. Return the children array directly from fragments instead. The caller distinguishes via is_array($subnode) and splices in-place, saving a Parser_Node allocation per fragment match (~253k per 10k queries). End-to-end parser benchmark: Before: ~27,000 QPS (avg) After: ~28,700 QPS (+6%). --- .../src/parser/class-wp-parser.php | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 54bed302..78aced53 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -72,6 +72,7 @@ private function parse_recursive( $rule_id ) { $branches = $this->rules[ $rule_id ]; $fragment_ids = $this->fragment_ids; $rule_name = $this->rule_names[ $rule_id ]; + $is_fragment = isset( $fragment_ids[ $rule_id ] ); $is_select_statement = 'selectStatement' === $rule_name; $branch_matches = false; $children = array(); @@ -102,8 +103,11 @@ private function parse_recursive( $rule_id ) { if ( true === $subnode ) { continue; } - if ( isset( $fragment_ids[ $subrule_id ] ) ) { - foreach ( $subnode->get_children_ref() as $c ) { + if ( is_array( $subnode ) ) { + // Fragment results are returned directly as a children + // array so the parser does not allocate a Parser_Node + // that would immediately be unwrapped into the parent. + foreach ( $subnode as $c ) { $children[] = $c; } } else { @@ -141,6 +145,14 @@ private function parse_recursive( $rule_id ) { return true; } + // Fragments exist only to group symbols for reuse; their "node" would + // get inlined into the parent on the very next step. 
Return the raw + // children array so the caller can splice it without allocating a + // throwaway WP_Parser_Node. + if ( $is_fragment ) { + return $children; + } + return new WP_Parser_Node( $rule_id, $rule_name, $children ); } } From e35e14f35ba1ebe3430efc92744ae8ea069f1071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:07:09 +0200 Subject: [PATCH 06/30] Append end-of-input sentinel token to drop range checks Add a sentinel WP_Parser_Token with id EMPTY_RULE_ID (0) to the end of the token array. Real MySQL tokens never have id 0 (WHITESPACE, the only token with id 0, is stripped by the lexer before tokens reach the parser), so the sentinel cannot match any real terminal. This lets the hot path drop the 'position < token_count' range check everywhere it reads the current token id: the selector lookup at method entry, the inline terminal match inside the branch loop, and the post-branch INTO negative lookahead for selectStatement. Any read past the last real token falls naturally into the nullable-fallback or branch-miss handling. Also drop a few dead locals ($token_count, $fragment_ids) that no longer appear in the hot path after the change. End-to-end parser benchmark: Before: ~28,700 QPS (avg) After: ~29,800 QPS (+4%). --- .../src/parser/class-wp-parser.php | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 78aced53..a0728aef 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -25,8 +25,28 @@ class WP_Parser { public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { $this->grammar = $grammar; - $this->tokens = $tokens; $this->token_count = count( $tokens ); + // Append an end-of-input sentinel token whose id is EMPTY_RULE_ID + // (0). 
The hot path can then read $tokens[$pos]->id unconditionally + // when $pos is the current cursor, because the sentinel naturally + // fails to match any real grammar terminal while feeding the + // nullable-fallback branch of the selector check. + // + // Invariants the hot path relies on: + // - The sentinel id (0) cannot match any grammar terminal. + // strip_epsilon_markers() removes id 0 from every branch at + // grammar build time, so no $subrule_id in the inner loop ever + // equals 0 and ++$this->position can never advance past the + // sentinel. + // - The sentinel must never be appended to a node's children. It + // is only inspected via $tokens[$pos]->id; tokens are pushed + // into $children only on terminal-id equality, which the + // sentinel cannot satisfy. + // - WP_MySQL_Parser::next_query() bounds at $position < $token_count + // (set above, before the append), so the sentinel sits at index + // $token_count and is never fed into a parse round. + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; $this->position = 0; $this->rules = $grammar->rules; $this->rule_names = $grammar->rule_names; @@ -51,15 +71,14 @@ public function parse() { * round trip per consumed token. */ private function parse_recursive( $rule_id ) { - $tokens = $this->tokens; - $token_count = $this->token_count; - $position = $this->position; + $tokens = $this->tokens; + $position = $this->position; // Narrow the set of branches worth trying using the precomputed FIRST // sets. When no entry exists for the current token but the rule is // nullable, all candidate branches would match empty, so we return // immediately without entering any branch. - $tid = $position < $token_count ? 
$tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; + $tid = $tokens[ $position ]->id; if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; } elseif ( isset( $this->nullable_branches[ $rule_id ] ) ) { @@ -70,9 +89,8 @@ private function parse_recursive( $rule_id ) { $highest_terminal_id = $this->highest_terminal_id; $branches = $this->rules[ $rule_id ]; - $fragment_ids = $this->fragment_ids; $rule_name = $this->rule_names[ $rule_id ]; - $is_fragment = isset( $fragment_ids[ $rule_id ] ); + $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); $is_select_statement = 'selectStatement' === $rule_name; $branch_matches = false; $children = array(); @@ -83,10 +101,10 @@ private function parse_recursive( $rule_id ) { $branch_matches = true; foreach ( $branch as $subrule_id ) { if ( $subrule_id <= $highest_terminal_id ) { - if ( - $this->position < $token_count - && $tokens[ $this->position ]->id === $subrule_id - ) { + // The sentinel at $tokens[$token_count] has id 0 so it + // cannot match any real terminal, making the range check + // unnecessary here. + if ( $tokens[ $this->position ]->id === $subrule_id ) { $children[] = $tokens[ $this->position ]; ++$this->position; continue; @@ -125,7 +143,6 @@ private function parse_recursive( $rule_id ) { if ( $branch_matches && $is_select_statement - && $this->position < $token_count && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { $branch_matches = false; From a3b8a087dc280e9b5e5254697d1aade314e3764d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:10:50 +0200 Subject: [PATCH 07/30] Embed branch symbol sequences directly in the per-token selector Previously the per-(rule, token) selector stored a list of branch indexes that the parser then had to look up in $rules[$rule_id] on every branch attempt. 
Store the branch symbol sequences themselves so the hot loop can iterate candidate branches directly. PHP arrays are copy-on-write, so sharing the same branch sequence across selector entries for many tokens costs negligible extra memory. The nullable_branches map shrinks to a bool marker since the parser only uses it for existence checks. Also cache the start rule id on the grammar so parse() skips its array_search() across rule_names on every call. End-to-end parser benchmark: Before: ~29,800 QPS (avg) After: ~31,700 QPS (+6%). --- .../src/parser/class-wp-parser-grammar.php | 56 ++++++++++++++++--- .../src/parser/class-wp-parser.php | 7 +-- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index a5ea66c0..4ad117de 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -34,21 +34,22 @@ class WP_Parser_Grammar { * Per-rule branch selector keyed by the next token id. * * When set, `$branches_for_token[$rule_id][$token_id]` is the ordered list - * of branch indexes in `$rules[$rule_id]` that can possibly match when the - * current token has the given id. Nullable branches appear in every entry. + * of candidate branch symbol sequences (drawn from `$rules[$rule_id]`) + * that can possibly match when the current token has the given id. + * Nullable branches appear in every entry. * * If an entry does not exist for the current token, `$nullable_branches` * is consulted. If neither has an entry for this rule, the rule cannot * match and the parser returns immediately. * - * @var array> + * @var array> */ public $branches_for_token = array(); /** - * Per-rule list of nullable branch indexes. + * Per-rule marker indicating the rule has at least one nullable branch. 
* - * @var array + * @var array */ public $nullable_branches = array(); @@ -56,6 +57,18 @@ class WP_Parser_Grammar { public $highest_terminal_id; public $native_grammar; + /** + * Memoized rule-id lookups, keyed by rule name. + * + * `get_rule_id()` is a linear `array_search` over `$rule_names` and + * costs a few microseconds per call on the MySQL grammar. The parser + * looks up its start rule and the `selectStatement` rule on a hot path, + * so the results are memoized via `get_or_cache_rule_id()`. + * + * @var array + */ + private $cached_rule_ids = array(); + public function __construct( array $rules ) { $this->inflate( $rules ); } @@ -68,6 +81,25 @@ public function get_rule_id( $rule_name ) { return array_search( $rule_name, $this->rule_names, true ); } + /** + * Return the rule id for a given rule name, memoizing the result. + * + * Equivalent to `get_rule_id()` but caches the lookup so repeated + * queries for the same rule name (typically the start rule and a few + * grammar-specific rules consulted on the parser hot path) avoid + * the linear scan over `$rule_names`. Returns `false` for unknown + * rule names, mirroring `get_rule_id()`. + * + * @param string $rule_name + * @return int|false + */ + public function get_or_cache_rule_id( $rule_name ) { + if ( ! array_key_exists( $rule_name, $this->cached_rule_ids ) ) { + $this->cached_rule_ids[ $rule_name ] = $this->get_rule_id( $rule_name ); + } + return $this->cached_rule_ids[ $rule_name ]; + } + /** * Inflate the grammar to an internal representation optimized for parsing. 
* @@ -316,10 +348,20 @@ private function build_branch_selectors() { foreach ( $selector as $tid => $idx_list ) { $merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids ); } - $selector = $merged; - $this->nullable_branches[ $rule_id ] = $nullable_branch_ids; + $selector = $merged; + $this->nullable_branches[ $rule_id ] = true; } if ( $selector ) { + // Store the candidate branch sequences directly so the parser + // can foreach over them without an extra $branches[$idx] + // indirection on every branch attempt. + foreach ( $selector as $tid => $idx_list ) { + $seqs = array(); + foreach ( $idx_list as $idx ) { + $seqs[] = $branches[ $idx ]; + } + $selector[ $tid ] = $seqs; + } $this->branches_for_token[ $rule_id ] = $selector; } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index a0728aef..c74e82f5 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -58,8 +58,7 @@ public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { public function parse() { // @TODO: Make the starting rule lookup non-grammar-specific. - $query_rule_id = $this->grammar->get_rule_id( 'query' ); - $ast = $this->parse_recursive( $query_rule_id ); + $ast = $this->parse_recursive( $this->grammar->get_or_cache_rule_id( 'query' ) ); return false === $ast ? 
null : $ast; } @@ -88,14 +87,12 @@ private function parse_recursive( $rule_id ) { } $highest_terminal_id = $this->highest_terminal_id; - $branches = $this->rules[ $rule_id ]; $rule_name = $this->rule_names[ $rule_id ]; $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); $is_select_statement = 'selectStatement' === $rule_name; $branch_matches = false; $children = array(); - foreach ( $candidate_branches as $idx ) { - $branch = $branches[ $idx ]; + foreach ( $candidate_branches as $branch ) { $this->position = $position; $children = array(); $branch_matches = true; From 25dad620f307c821744163c9be98af59e946c2ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:12:18 +0200 Subject: [PATCH 08/30] Compare selectStatement by rule id instead of by name Minor cleanup in parse_recursive(): cache the selectStatement rule id once and compare integers on every call instead of re-comparing the 'selectStatement' string against every rule's name. Also drops the $rules instance cache from the parser, which the hot path no longer touches now that branch sequences are embedded in the selector. --- .../mysql-on-sqlite/src/parser/class-wp-parser.php | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index c74e82f5..30efb6cf 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -16,12 +16,12 @@ class WP_Parser { // Grammar data cached as instance fields so the hot path avoids an extra // property hop via $this->grammar on every recursive call. 
- private $rules; private $rule_names; private $fragment_ids; private $branches_for_token; private $nullable_branches; private $highest_terminal_id; + private $select_statement_rule_id; public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { $this->grammar = $grammar; @@ -48,12 +48,16 @@ public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); $this->tokens = $tokens; $this->position = 0; - $this->rules = $grammar->rules; $this->rule_names = $grammar->rule_names; $this->fragment_ids = $grammar->fragment_ids; $this->branches_for_token = $grammar->branches_for_token; $this->nullable_branches = $grammar->nullable_branches; $this->highest_terminal_id = $grammar->highest_terminal_id; + + // The INTO negative-lookahead only fires for selectStatement. Cache + // the rule id so the per-call check is an int compare instead of a + // string compare. + $this->select_statement_rule_id = $grammar->get_or_cache_rule_id( 'selectStatement' ); } public function parse() { @@ -87,9 +91,8 @@ private function parse_recursive( $rule_id ) { } $highest_terminal_id = $this->highest_terminal_id; - $rule_name = $this->rule_names[ $rule_id ]; $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); - $is_select_statement = 'selectStatement' === $rule_name; + $is_select_statement = $rule_id === $this->select_statement_rule_id; $branch_matches = false; $children = array(); foreach ( $candidate_branches as $branch ) { @@ -167,6 +170,6 @@ private function parse_recursive( $rule_id ) { return $children; } - return new WP_Parser_Node( $rule_id, $rule_name, $children ); + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); } } From e4a0951fbb0188a974ff8784eb4e2433e1974d0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:19:29 +0200 Subject: [PATCH 09/30] Re-align grammar and parser whitespace after recent changes Adopts 
phpcbf's trivial whitespace alignment fixes in the grammar and parser source to keep `composer run check-cs` clean after the prior optimisation commits added new local variables and reshaped the selector-build code. --- .../src/parser/class-wp-parser-grammar.php | 20 +++++++++---------- .../src/parser/class-wp-parser.php | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 4ad117de..0644457d 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -175,8 +175,8 @@ private function inline_single_branch_fragments() { } // Depth-first expansion memoized per rule, with cycle detection. - $expanded = array(); - $visiting = array(); + $expanded = array(); + $visiting = array(); $expand_branch = function ( array $branch ) use ( &$expand_branch, &$expanded, &$visiting, $rules, $low_nt, $inlinable ) { $out = array(); foreach ( $branch as $sym ) { @@ -194,8 +194,8 @@ private function inline_single_branch_fragments() { continue; } if ( ! isset( $expanded[ $sym ] ) ) { - $visiting[ $sym ] = true; - $expanded[ $sym ] = $expand_branch( $rules[ $sym ][0] ); + $visiting[ $sym ] = true; + $expanded[ $sym ] = $expand_branch( $rules[ $sym ][0] ); unset( $visiting[ $sym ] ); } foreach ( $expanded[ $sym ] as $s ) { @@ -254,12 +254,12 @@ private function strip_epsilon_markers() { * of the branch attempts that the parser used to try and fail. 
*/ private function build_branch_selectors() { - $rules = $this->rules; - $low_nt = $this->lowest_non_terminal_id; - $empty_rule = self::EMPTY_RULE_ID; - $rule_ids = array_keys( $rules ); - $nullable = array(); - $first_sets = array(); + $rules = $this->rules; + $low_nt = $this->lowest_non_terminal_id; + $empty_rule = self::EMPTY_RULE_ID; + $rule_ids = array_keys( $rules ); + $nullable = array(); + $first_sets = array(); foreach ( $rule_ids as $rule_id ) { $nullable[ $rule_id ] = false; diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 30efb6cf..48930dd7 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -24,8 +24,8 @@ class WP_Parser { private $select_statement_rule_id; public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->token_count = count( $tokens ); + $this->grammar = $grammar; + $this->token_count = count( $tokens ); // Append an end-of-input sentinel token whose id is EMPTY_RULE_ID // (0). The hot path can then read $tokens[$pos]->id unconditionally // when $pos is the current cursor, because the sentinel naturally From 0e233205753ab27a0d075788eb9db039b83e6807 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:28:03 +0200 Subject: [PATCH 10/30] Deduplicate selector entries while embedding branch sequences The per-(rule, token) branch selector stored a separate inner array per token, even when many tokens within the same rule mapped to identical branch lists (a single branch's FIRST set covers many tokens, for example). Loading the MySQL grammar used ~40 MB of PHP memory, most of which was duplicated inner arrays. Deduplicate by signature during grammar build so all tokens that land on the same branch list share one inner array via copy-on-write. 
The inner arrays still embed the branch symbol sequences directly so the hot loop iterates them without an extra $rules[$rule_id][$idx] indirection per branch attempt. Grammar memory on the MySQL grammar drops from ~40 MB to ~10 MB. PHPUnit peak memory drops from 198 MB to 110 MB. Parser throughput is unchanged from the previous (non-deduplicated) embedded-sequences form. --- .../src/parser/class-wp-parser-grammar.php | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 0644457d..6e0210ae 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -352,15 +352,26 @@ private function build_branch_selectors() { $this->nullable_branches[ $rule_id ] = true; } if ( $selector ) { - // Store the candidate branch sequences directly so the parser - // can foreach over them without an extra $branches[$idx] - // indirection on every branch attempt. + // Embed the branch symbol sequences directly so the parser can + // iterate candidate branches without a $branches[$idx] lookup on + // every attempt. Many tokens in a rule share the same branch-id + // list, so deduplicate by signature and let copy-on-write share + // one sequences array across them. This dedup matters: unshared, + // the table would be ~35 MiB on the MySQL grammar; shared, it is + // a few MiB, built once per process (not per query). 
+ $by_signature = array(); foreach ( $selector as $tid => $idx_list ) { - $seqs = array(); - foreach ( $idx_list as $idx ) { - $seqs[] = $branches[ $idx ]; + $sig = implode( ',', $idx_list ); + if ( isset( $by_signature[ $sig ] ) ) { + $selector[ $tid ] = $by_signature[ $sig ]; + } else { + $seqs = array(); + foreach ( $idx_list as $idx ) { + $seqs[] = $branches[ $idx ]; + } + $by_signature[ $sig ] = $seqs; + $selector[ $tid ] = $seqs; } - $selector[ $tid ] = $seqs; } $this->branches_for_token[ $rule_id ] = $selector; } From 3164da91ff3ca91ddccb2e39963a6a80e16b1d76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 22:42:52 +0200 Subject: [PATCH 11/30] Add direct-return fast path for single-candidate rules On the MySQL grammar, 1,290 of 1,916 rules have a selector where every (rule, token) entry points to exactly one branch. Those rules account for ~55% of parse_recursive calls on the test corpus (722k of 1.3M per 10k queries). Flag those rules at grammar build time. In parse_recursive, detect the flag and take the only candidate branch directly, skipping the candidate-iteration loop. On match failure, restore $position and return false directly instead of going through the multi-candidate branch_matches/break sequence. 
End-to-end parser benchmark: no JIT: ~31.6K -> ~32.6K QPS avg (+3%) tracing JIT: ~52.6K -> ~55.7K QPS avg (+6%) --- .../src/parser/class-wp-parser-grammar.php | 18 ++++- .../src/parser/class-wp-parser.php | 72 ++++++++++++++++--- 2 files changed, 79 insertions(+), 11 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 6e0210ae..e41991fb 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -53,6 +53,15 @@ class WP_Parser_Grammar { */ public $nullable_branches = array(); + /** + * Per-rule flag indicating every (rule, token) selector entry points + * to exactly one branch. The parser uses this to skip the outer + * foreach when a single candidate is the only possibility. + * + * @var array + */ + public $single_candidate_rules = array(); + public $lowest_non_terminal_id; public $highest_terminal_id; public $native_grammar; @@ -359,8 +368,12 @@ private function build_branch_selectors() { // one sequences array across them. This dedup matters: unshared, // the table would be ~35 MiB on the MySQL grammar; shared, it is // a few MiB, built once per process (not per query). 
- $by_signature = array(); + $by_signature = array(); + $all_single_candidates = true; foreach ( $selector as $tid => $idx_list ) { + if ( 1 !== count( $idx_list ) ) { + $all_single_candidates = false; + } $sig = implode( ',', $idx_list ); if ( isset( $by_signature[ $sig ] ) ) { $selector[ $tid ] = $by_signature[ $sig ]; @@ -374,6 +387,9 @@ private function build_branch_selectors() { } } $this->branches_for_token[ $rule_id ] = $selector; + if ( $all_single_candidates ) { + $this->single_candidate_rules[ $rule_id ] = true; + } } } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 48930dd7..03c00280 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -22,6 +22,7 @@ class WP_Parser { private $nullable_branches; private $highest_terminal_id; private $select_statement_rule_id; + private $single_candidate_rules; public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { $this->grammar = $grammar; @@ -45,14 +46,15 @@ public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { // - WP_MySQL_Parser::next_query() bounds at $position < $token_count // (set above, before the append), so the sentinel sits at index // $token_count and is never fed into a parse round. 
- $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); - $this->tokens = $tokens; - $this->position = 0; - $this->rule_names = $grammar->rule_names; - $this->fragment_ids = $grammar->fragment_ids; - $this->branches_for_token = $grammar->branches_for_token; - $this->nullable_branches = $grammar->nullable_branches; - $this->highest_terminal_id = $grammar->highest_terminal_id; + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + $this->rule_names = $grammar->rule_names; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = $grammar->branches_for_token; + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; + $this->single_candidate_rules = $grammar->single_candidate_rules; // The INTO negative-lookahead only fires for selectStatement. Cache // the rule id so the per-call check is an int compare instead of a @@ -93,8 +95,58 @@ private function parse_recursive( $rule_id ) { $highest_terminal_id = $this->highest_terminal_id; $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); $is_select_statement = $rule_id === $this->select_statement_rule_id; - $branch_matches = false; - $children = array(); + + // Fast path for rules where every (rule, token) selector entry + // points to exactly one branch - about 55% of nonterminal calls + // on the MySQL corpus. Skip the outer foreach and the + // $branch_matches bookkeeping; every failure path just rewinds + // the position and returns false directly. 
+ if ( isset( $this->single_candidate_rules[ $rule_id ] ) ) { + $branch = $candidate_branches[0]; + $children = array(); + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $this->position = $position; + return false; + } + + $subnode = $this->parse_recursive( $subrule_id ); + if ( false === $subnode ) { + $this->position = $position; + return false; + } + if ( true === $subnode ) { + continue; + } + if ( is_array( $subnode ) ) { + foreach ( $subnode as $c ) { + $children[] = $c; + } + } else { + $children[] = $subnode; + } + } + + if ( $is_select_statement && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $this->position = $position; + return false; + } + if ( ! $children ) { + return true; + } + if ( $is_fragment ) { + return $children; + } + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); + } + + $branch_matches = false; + $children = array(); foreach ( $candidate_branches as $branch ) { $this->position = $position; $children = array(); From 45c344a180d3766538f94d341bc7d63be64f0b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 28 Apr 2026 09:36:59 +0200 Subject: [PATCH 12/30] Speed up the lexer with cheaper byte checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply lexer optimisations from PR #375: - Cache `strlen($sql)` once in `$sql_length` instead of recomputing on each EOF check. - Replace `strspn($byte, MASK) > 0` with direct byte comparisons (`$byte >= '0' && $byte <= '9'`, `false !== strpos(MASK, $byte)`, unrolled whitespace check). - Use `strpos($sql, '*/', $pos)` instead of a manual scan loop in `read_comment_content()`. 
- In `read_quoted_text()`, use `strpos()` to find the next quote, eliminating the separate end-of-input check that follows the `strcspn()` scan. - Inline `next_token()` + `get_token()` in `remaining_tokens()` so the hot loop builds tokens directly. Co-authored-by: Adam Zieliński Adapted from https://github.com/WordPress/sqlite-database-integration/pull/375 --- .../src/mysql/class-wp-mysql-lexer.php | 114 +++++++++++++----- 1 file changed, 83 insertions(+), 31 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 10ecd90a..06d01623 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2111,6 +2111,13 @@ class WP_MySQL_Lexer { */ private $sql; + /** + * Byte length of the SQL payload. + * + * @var int + */ + private $sql_length; + /** * The version of the MySQL server that the SQL payload is intended for. * @@ -2189,6 +2196,7 @@ public function __construct( array $sql_modes = array() ) { $this->sql = $sql; + $this->sql_length = strlen( $sql ); $this->mysql_version = $mysql_version; foreach ( $sql_modes as $sql_mode ) { @@ -2284,10 +2292,46 @@ public function get_token(): ?WP_MySQL_Token { * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens. 
*/ public function remaining_tokens(): array { - $tokens = array(); - while ( true === $this->next_token() ) { - $token = $this->get_token(); - $tokens[] = $token; + $tokens = array(); + $no_backslash_escapes_sql_mode_set = $this->is_sql_mode_active( + self::SQL_MODE_NO_BACKSLASH_ESCAPES + ); + + while ( true ) { + if ( + self::EOF === $this->token_type + || ( null === $this->token_type && $this->bytes_already_read > 0 ) + ) { + $this->token_type = null; + break; + } + + do { + $this->token_starts_at = $this->bytes_already_read; + $this->token_type = $this->read_next_token(); + } while ( + self::WHITESPACE === $this->token_type + || self::COMMENT === $this->token_type + || self::MYSQL_COMMENT_START === $this->token_type + || self::MYSQL_COMMENT_END === $this->token_type + ); + + if ( null === $this->token_type ) { + break; + } + + $tokens[] = new WP_MySQL_Token( + $this->token_type, + $this->token_starts_at, + $this->bytes_already_read - $this->token_starts_at, + $this->sql, + $no_backslash_escapes_sql_mode_set + ); + + if ( self::EOF === $this->token_type ) { + $this->token_type = null; + break; + } } return $tokens; } @@ -2356,10 +2400,10 @@ private function read_next_token(): ?int { if ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); - } elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) { + } elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) { $type = $this->read_number(); } elseif ( '.' 
=== $byte ) { - if ( null !== $next_byte && strspn( $next_byte, self::DIGIT_MASK ) > 0 ) { + if ( null !== $next_byte && $next_byte >= '0' && $next_byte <= '9' ) { $type = $this->read_number(); } else { $this->bytes_already_read += 1; @@ -2420,8 +2464,8 @@ private function read_next_token(): ?int { } elseif ( '-' === $byte ) { if ( '-' === $next_byte - && $this->bytes_already_read + 2 < strlen( $this->sql ) - && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::WHITESPACE_MASK ) > 0 + && $this->bytes_already_read + 2 < $this->sql_length + && false !== strpos( self::WHITESPACE_MASK, $this->sql[ $this->bytes_already_read + 2 ] ) ) { $type = $this->read_line_comment(); } elseif ( '>' === $next_byte ) { @@ -2547,7 +2591,13 @@ private function read_next_token(): ?int { } } elseif ( '#' === $byte ) { $type = $this->read_line_comment(); - } elseif ( null !== $byte && strspn( $byte, self::WHITESPACE_MASK ) > 0 ) { + } elseif ( + ' ' === $byte + || "\t" === $byte + || "\n" === $byte + || "\r" === $byte + || "\f" === $byte + ) { $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); $type = self::WHITESPACE; } elseif ( ( 'x' === $byte || 'X' === $byte || 'b' === $byte || 'B' === $byte ) && "'" === $next_byte ) { @@ -2675,7 +2725,7 @@ private function read_number(): ?int { '0' === $byte && 'x' === $next_byte && null !== $third_byte - && strspn( $third_byte, self::HEX_DIGIT_MASK ) > 0 + && false !== strpos( self::HEX_DIGIT_MASK, $third_byte ) ) // HEX number in the form of x'N' or X'N'. || ( ( 'x' === $byte || 'X' === $byte ) && "'" === $next_byte ) @@ -2685,7 +2735,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, self::HEX_DIGIT_MASK, $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. 
@@ -2708,7 +2758,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. @@ -2737,11 +2787,12 @@ private function read_number(): ?int { ( 'e' === $byte || 'E' === $byte ) && null !== $next_byte && ( - strspn( $next_byte, self::DIGIT_MASK ) > 0 + ( $next_byte >= '0' && $next_byte <= '9' ) || ( ( '+' === $next_byte || '-' === $next_byte ) - && $this->bytes_already_read + 2 < strlen( $this->sql ) - && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::DIGIT_MASK ) > 0 + && $this->bytes_already_read + 2 < $this->sql_length + && $this->sql[ $this->bytes_already_read + 2 ] >= '0' + && $this->sql[ $this->bytes_already_read + 2 ] <= '9' ) ); if ( $has_exponent ) { @@ -2838,12 +2889,11 @@ private function read_quoted_text(): ?int { // in which case the escape sequence is consumed and the loop continues. $at = $this->bytes_already_read; while ( true ) { - $at += strcspn( $this->sql, $quote, $at ); - - // Unclosed string - unexpected EOF. - if ( ( $this->sql[ $at ] ?? null ) !== $quote ) { + $quote_at = strpos( $this->sql, $quote, $at ); + if ( false === $quote_at ) { return null; // Invalid input. } + $at = $quote_at; /* * By default, quotes can be escaped with a "\". @@ -2853,9 +2903,17 @@ private function read_quoted_text(): ?int { * The quote is escaped only when the number of preceding backslashes * is odd - "\" is an escape sequence, "\\" is an escaped backslash, * "\\\" is an escaped backslash and an escape sequence, and so on. + * + * The `($at - $i - 1) >= 0` guard prevents PHP's negative-string- + * offset wraparound (PHP 7.1+) when the closing-quote candidate + * sits at the very start of the input. The `?? null` covers + * positive out-of-range indexes belt-and-suspenders. 
*/ if ( ! $no_backslash_escapes ) { - for ( $i = 0; ( $at - $i - 1 ) >= 0 && '\\' === $this->sql[ $at - $i - 1 ]; $i += 1 ); + $i = 0; + while ( ( $at - $i - 1 ) >= 0 && '\\' === ( $this->sql[ $at - $i - 1 ] ?? null ) ) { + $i += 1; + } if ( 1 === $i % 2 ) { $at += 1; continue; @@ -2920,17 +2978,11 @@ private function read_mysql_comment(): int { } private function read_comment_content(): void { - while ( true ) { - $this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read ); - $this->bytes_already_read += 1; // Consume the '*'. - $byte = $this->sql[ $this->bytes_already_read ] ?? null; - if ( null === $byte ) { - break; - } - if ( '/' === $byte ) { - $this->bytes_already_read += 1; // Consume the '/'. - break; - } + $comment_end = strpos( $this->sql, '*/', $this->bytes_already_read ); + if ( false === $comment_end ) { + $this->bytes_already_read = $this->sql_length; + } else { + $this->bytes_already_read = $comment_end + 2; } } From a76308795a46f869dba250819d8f5fb675b6aae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 28 Apr 2026 09:37:05 +0200 Subject: [PATCH 13/30] Skip parent constructor in WP_MySQL_Token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Token construction is on the lexer hot path; bypassing the `WP_Parser_Token::__construct()` indirection and assigning the four properties directly removes one method call per token. Requires `$input` on `WP_Parser_Token` to be `protected` instead of `private` so the subclass can write to it. 
Co-authored-by: Adam Zieliński Adapted from https://github.com/WordPress/sqlite-database-integration/pull/375 --- packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php | 6 +++++- .../mysql-on-sqlite/src/parser/class-wp-parser-token.php | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php index 1fb25ab4..0840bc2f 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php @@ -30,7 +30,11 @@ public function __construct( string $input, bool $sql_mode_no_backslash_escapes_enabled ) { - parent::__construct( $id, $start, $length, $input ); + $this->id = $id; + $this->start = $start; + $this->length = $length; + $this->input = $input; + $this->sql_mode_no_backslash_escapes_enabled = $sql_mode_no_backslash_escapes_enabled; } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php index b7726189..4132ba38 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php @@ -35,7 +35,7 @@ class WP_Parser_Token { * * @var string */ - private $input; + protected $input; /** * Constructor. From 60981f657496248b9168b2f7121d241c6e55517d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 28 Apr 2026 09:37:13 +0200 Subject: [PATCH 14/30] Use ! empty() in WP_Parser_Node::has_child() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `! empty( $this->children )` short-circuits without calling `count()`, saving one function call per invocation. 
Co-authored-by: Adam Zieliński Adapted from https://github.com/WordPress/sqlite-database-integration/pull/376 --- packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index c727ef03..2ed046a5 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -33,7 +33,7 @@ public function append_child( $node ) { * @return bool True if this node has any child nodes or tokens, false otherwise. */ public function has_child(): bool { - return count( $this->children ) > 0; + return ! empty( $this->children ); } /** From 1aa326a24a375109c870fba1b8e01033dbb9564e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 29 Apr 2026 10:53:29 +0200 Subject: [PATCH 15/30] Inline leading-whitespace skip in lexer's token loops Both next_token() and remaining_tokens() previously paid a read_next_token() function call per whitespace run only to recognise and skip the resulting WHITESPACE token. A single unguarded strspn() at the top of each loop iteration absorbs the run inline, saving the call overhead for ~one whitespace run per real token across millions of tokens. The strspn() call is unguarded because an unconditional strspn() (which returns 0 in a single C-side call when nothing matches) is faster than gating it on a five-arm '$byte === ...' precheck. 
--- packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 06d01623..78c0c6b7 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2235,6 +2235,9 @@ public function next_token(): bool { return false; } + // Skip leading whitespace inline for optimal performance. + $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + do { $this->token_starts_at = $this->bytes_already_read; $this->token_type = $this->read_next_token(); @@ -2306,6 +2309,9 @@ public function remaining_tokens(): array { break; } + // Skip leading whitespace inline for optimal performance. + $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + do { $this->token_starts_at = $this->bytes_already_read; $this->token_type = $this->read_next_token(); From 7256e625cb6ac799fb33b7486d7e83e213a12ef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 29 Apr 2026 10:54:33 +0200 Subject: [PATCH 16/30] Catch identifier and keyword tokens at the top of the chain ASCII letters and UTF-8 multibyte start bytes account for most token-start bytes on the MySQL corpus. They previously fell into the catch-all `else` at the bottom of read_next_token() after walking every operator arm in between. The new branch sits at the top of the elseif chain and dispatches them directly. The `next_byte !== "'"` guard keeps the x'..', n'..' and similar specials on their dedicated branches. `_` and `$` starters stay on the catch-all so the UNDERSCORE_CHARSET lookup still fires. 
--- .../src/mysql/class-wp-mysql-lexer.php | 22 ++++++++++++- .../tests/mysql/WP_MySQL_Lexer_Tests.php | 31 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 78c0c6b7..87d35cbb 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2404,7 +2404,27 @@ private function read_next_token(): ?int { $byte = $this->sql[ $this->bytes_already_read ] ?? null; $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; - if ( "'" === $byte || '"' === $byte || '`' === $byte ) { + // Fast path for keywords and identifiers. + // These are the most common token types in MySQL payloads. + if ( + ( + ( $byte >= 'a' && $byte <= 'z' ) + || ( $byte >= 'A' && $byte <= 'Z' ) + || $byte > "\x7F" + ) + && "'" !== $next_byte + ) { + $started_at = $this->bytes_already_read; + $type = $this->read_identifier(); + if ( self::IDENTIFIER === $type ) { + // When preceded by a dot, it is always an identifier. + if ( $started_at > 0 && '.' 
=== $this->sql[ $started_at - 1 ] ) { + $type = self::IDENTIFIER; + } else { + $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); + } + } + } elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); } elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) { $type = $this->read_number(); diff --git a/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php b/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php index 8f18cf17..383b03f5 100644 --- a/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php +++ b/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php @@ -367,6 +367,37 @@ function ( $severity, $message, $file, $line ) { $this->assertNull( $lexer->get_token() ); } + /** + * A charset-introducer-like name used as a qualified member (after a dot) + * must lex as an identifier. A real charset introducer only appears before + * a string literal, never as the member of a qualified reference. + * + * @dataProvider data_underscore_charset_after_dot + */ + public function test_underscore_charset_name_after_dot_is_identifier( string $sql, int $token_index, int $expected_id ): void { + $tokens = ( new WP_MySQL_Lexer( $sql ) )->remaining_tokens(); + $this->assertSame( + WP_MySQL_Lexer::get_token_name( $expected_id ), + $tokens[ $token_index ]->get_name(), + $sql + ); + } + + /** + * @return array + */ + public function data_underscore_charset_after_dot(): array { + return array( + // `t . _utf8` - the member name must be an identifier, not a charset. + 'charset name after dot is identifier' => array( 't._utf8', 2, WP_MySQL_Lexer::IDENTIFIER ), + 'other charset name after dot' => array( 'a._binary', 2, WP_MySQL_Lexer::IDENTIFIER ), + // A genuine charset introducer (before a string) stays a charset. 
+ 'charset introducer before string' => array( "_utf8'x'", 0, WP_MySQL_Lexer::UNDERSCORE_CHARSET ), + // A non-charset underscore name after a dot stays an identifier. + 'non-charset underscore name after dot' => array( 't._foo', 2, WP_MySQL_Lexer::IDENTIFIER ), + ); + } + private function get_token_names( array $token_types ): array { return array_map( function ( $token_type ) { From ee75d4b3722b5a557f25733bd1bc4fe37ccd2f9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 29 Apr 2026 10:55:33 +0200 Subject: [PATCH 17/30] Add a single-byte operator dispatch table The ASCII bytes (, ), ',', ;, +, ~, %, ^, ?, {, }, and = each map to a unique single-byte token type with no lookahead. A static array + isset() arm dispatches them in one lookup, ahead of the per-byte elseif chain, and the now-shadowed individual arms further down the chain are removed so the table is the single source of truth for these tokens. '*' and '|' are deliberately excluded because their token type depends on context (in_mysql_comment for '*/', SQL_MODE_PIPES_AS_CONCAT for '||'). --- .../src/mysql/class-wp-mysql-lexer.php | 56 +++++++------------ 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 87d35cbb..32b2ef86 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2404,6 +2404,22 @@ private function read_next_token(): ?int { $byte = $this->sql[ $this->bytes_already_read ] ?? null; $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; + // A map for a single-byte symbol fast path. 
+ static $single_byte_ops = array( + '(' => self::OPEN_PAR_SYMBOL, + ')' => self::CLOSE_PAR_SYMBOL, + ',' => self::COMMA_SYMBOL, + ';' => self::SEMICOLON_SYMBOL, + '+' => self::PLUS_OPERATOR, + '~' => self::BITWISE_NOT_OPERATOR, + '%' => self::MOD_OPERATOR, + '^' => self::BITWISE_XOR_OPERATOR, + '?' => self::PARAM_MARKER, + '{' => self::OPEN_CURLY_SYMBOL, + '}' => self::CLOSE_CURLY_SYMBOL, + '=' => self::EQUAL_OPERATOR, + ); + // Fast path for keywords and identifiers. // These are the most common token types in MySQL payloads. if ( @@ -2424,6 +2440,10 @@ private function read_next_token(): ?int { $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); } } + } elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) { + // Fast path for single-byte symbols. + $this->bytes_already_read += 1; + $type = $single_byte_ops[ $byte ]; } elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); } elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) { @@ -2435,9 +2455,6 @@ private function read_next_token(): ?int { $this->bytes_already_read += 1; $type = self::DOT_SYMBOL; } - } elseif ( '=' === $byte ) { - $this->bytes_already_read += 1; - $type = self::EQUAL_OPERATOR; } elseif ( ':' === $byte ) { $this->bytes_already_read += 1; // Consume the ':'. if ( '=' === $next_byte ) { @@ -2484,9 +2501,6 @@ private function read_next_token(): ?int { } else { $type = self::LOGICAL_NOT_OPERATOR; } - } elseif ( '+' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PLUS_OPERATOR; } elseif ( '-' === $byte ) { if ( '-' === $next_byte @@ -2536,9 +2550,6 @@ private function read_next_token(): ?int { $this->bytes_already_read += 1; $type = self::DIV_OPERATOR; } - } elseif ( '%' === $byte ) { - $this->bytes_already_read += 1; - $type = self::MOD_OPERATOR; } elseif ( '&' === $byte ) { $this->bytes_already_read += 1; // Consume the '&'. 
if ( '&' === $next_byte ) { @@ -2547,9 +2558,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_AND_OPERATOR; } - } elseif ( '^' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_XOR_OPERATOR; } elseif ( '|' === $byte ) { $this->bytes_already_read += 1; // Consume the '|'. if ( '|' === $next_byte ) { @@ -2560,27 +2568,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_OR_OPERATOR; } - } elseif ( '~' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_NOT_OPERATOR; - } elseif ( ',' === $byte ) { - $this->bytes_already_read += 1; - $type = self::COMMA_SYMBOL; - } elseif ( ';' === $byte ) { - $this->bytes_already_read += 1; - $type = self::SEMICOLON_SYMBOL; - } elseif ( '(' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_PAR_SYMBOL; - } elseif ( ')' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_PAR_SYMBOL; - } elseif ( '{' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_CURLY_SYMBOL; - } elseif ( '}' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_CURLY_SYMBOL; } elseif ( '@' === $byte ) { $this->bytes_already_read += 1; // Consume the '@'. @@ -2604,9 +2591,6 @@ private function read_next_token(): ?int { $type = self::AT_SIGN_SYMBOL; } } - } elseif ( '?' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PARAM_MARKER; } elseif ( '\\' === $byte ) { $this->bytes_already_read += 1; // Consume the '\'. 
if ( 'N' === $next_byte ) { From e36e2d20c5ec307eefae5feae78a789643d9d68f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 15:40:23 +0200 Subject: [PATCH 18/30] Document non-obvious lexer dispatch conditions Three review-noted spots that were terse in the code: - The remaining_tokens() loop guard now spells out why both EOF and `null === token_type && bytes_already_read > 0` are needed (EOF on clean end-of-input vs invalid byte mid-stream, with the `> 0` guard letting the very first iteration through). - The identifier/keyword fast path now explains `$byte > "\x7F"` (UTF-8 multi-byte starter; MySQL identifiers allow U+0080-U+FFFF) and `next_byte !== "'"` (only single quotes form the special hex/bin/n-char literal starters; `"` never does, regardless of SQL mode). No behavior change. --- .../mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 32b2ef86..48becafb 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2301,6 +2301,8 @@ public function remaining_tokens(): array { ); while ( true ) { + // Bail on EOF, or on a null token type once at least one byte has + // been consumed (read_next_token() hit invalid input mid-stream). if ( self::EOF === $this->token_type || ( null === $this->token_type && $this->bytes_already_read > 0 ) @@ -2421,7 +2423,11 @@ private function read_next_token(): ?int { ); // Fast path for keywords and identifiers. - // These are the most common token types in MySQL payloads. + // `$byte > "\x7F"` catches any non-ASCII byte (0x80-0xFF); read_identifier() + // restricts the accepted identifier codepoints to U+0080-U+FFFF. + // `"'" !== $next_byte` defers x'..', n'..' 
and similar special + // literals to their dedicated branches below; only single quotes + // form those, regardless of SQL mode. if ( ( ( $byte >= 'a' && $byte <= 'z' ) From 3867de929aa1a4fa89127d4eadb64267222cc922 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 17:02:25 +0200 Subject: [PATCH 19/30] Unroll whitespace check in '--' line-comment dispatch The leading-whitespace skip at the top of read_next_token() was already unrolled into byte-equality checks for the perf reasons documented in 916b512e. Apply the same unroll to the third-byte whitespace check that gates a '--' as a line-comment start, so the hot dispatch chain doesn't fall back into strpos() on a 5-char mask for this case. The bound check is folded into '?? null' on the third-byte read, matching the rest of the lookahead style. --- .../mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 48becafb..28cac8a0 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2508,10 +2508,16 @@ private function read_next_token(): ?int { $type = self::LOGICAL_NOT_OPERATOR; } } elseif ( '-' === $byte ) { + $third_byte = $this->sql[ $this->bytes_already_read + 2 ] ?? 
null; if ( '-' === $next_byte - && $this->bytes_already_read + 2 < $this->sql_length - && false !== strpos( self::WHITESPACE_MASK, $this->sql[ $this->bytes_already_read + 2 ] ) + && ( + ' ' === $third_byte + || "\t" === $third_byte + || "\n" === $third_byte + || "\r" === $third_byte + || "\f" === $third_byte + ) ) { $type = $this->read_line_comment(); } elseif ( '>' === $next_byte ) { From 0e914290c1e330050b0e1cd65af73659f1e656df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 18:53:43 +0200 Subject: [PATCH 20/30] Extract WP_Parser::set_tokens() helper for shared sentinel handling The end-of-input sentinel that the parser hot path relies on must be appended whenever the token stream is (re)assigned, not only at construction time. Trunk's WP_MySQL_Parser::reset_tokens() didn't know about it, so reusing a parser across queries left the parser walking off the end of the array. Move the sentinel append, $token_count compute, and $position reset into a single protected set_tokens() helper on WP_Parser. The constructor and the WP_MySQL_Parser::reset_tokens() override both call it, so the invariant has one source of truth. --- .../src/mysql/class-wp-mysql-parser.php | 3 +- .../src/parser/class-wp-parser.php | 59 +++++++++++-------- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php index b6b465bd..4b74a904 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php @@ -14,8 +14,7 @@ class WP_MySQL_Parser extends WP_Parser { * @param array $tokens The parser tokens. 
*/ public function reset_tokens( array $tokens ): void { - $this->tokens = $tokens; - $this->position = 0; + $this->set_tokens( $tokens ); $this->current_ast = null; } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 03c00280..992a01ae 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -25,30 +25,7 @@ class WP_Parser { private $single_candidate_rules; public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->token_count = count( $tokens ); - // Append an end-of-input sentinel token whose id is EMPTY_RULE_ID - // (0). The hot path can then read $tokens[$pos]->id unconditionally - // when $pos is the current cursor, because the sentinel naturally - // fails to match any real grammar terminal while feeding the - // nullable-fallback branch of the selector check. - // - // Invariants the hot path relies on: - // - The sentinel id (0) cannot match any grammar terminal. - // strip_epsilon_markers() removes id 0 from every branch at - // grammar build time, so no $subrule_id in the inner loop ever - // equals 0 and ++$this->position can never advance past the - // sentinel. - // - The sentinel must never be appended to a node's children. It - // is only inspected via $tokens[$pos]->id; tokens are pushed - // into $children only on terminal-id equality, which the - // sentinel cannot satisfy. - // - WP_MySQL_Parser::next_query() bounds at $position < $token_count - // (set above, before the append), so the sentinel sits at index - // $token_count and is never fed into a parse round. 
- $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); - $this->tokens = $tokens; - $this->position = 0; + $this->grammar = $grammar; $this->rule_names = $grammar->rule_names; $this->fragment_ids = $grammar->fragment_ids; $this->branches_for_token = $grammar->branches_for_token; @@ -60,6 +37,40 @@ public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { // the rule id so the per-call check is an int compare instead of a // string compare. $this->select_statement_rule_id = $grammar->get_or_cache_rule_id( 'selectStatement' ); + + $this->set_tokens( $tokens ); + } + + /** + * Initialize the parser's token state. + * + * Stores the given token array, resets the position cursor, and appends + * an end-of-input sentinel token whose id is `EMPTY_RULE_ID` (0). The + * hot path can then read `$tokens[$pos]->id` unconditionally when + * `$pos` is the current cursor, because the sentinel naturally fails + * to match any real grammar terminal while feeding the nullable-fallback + * branch of the selector check. + * + * Invariants the hot path relies on: + * - The sentinel id (0) cannot match any grammar terminal. + * `strip_epsilon_markers()` removes id 0 from every branch at grammar + * build time, so no `$subrule_id` in the inner loop ever equals 0 + * and `++$this->position` can never advance past the sentinel. + * - The sentinel must never be appended to a node's children. It is + * only inspected via `$tokens[$pos]->id`; tokens are pushed into + * `$children` only on terminal-id equality, which the sentinel + * cannot satisfy. + * - `WP_MySQL_Parser::next_query()` bounds at `$position < $token_count` + * (set below, before the sentinel append), so the sentinel sits at + * index `$token_count` and is never fed into a parse round. 
+ * + * @param array $tokens + */ + protected function set_tokens( array $tokens ): void { + $this->token_count = count( $tokens ); + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; } public function parse() { From 58d84c6d3d9d35e0c30fcd7a806b172e9f529d44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 18:53:55 +0200 Subject: [PATCH 21/30] Update native parser to consume branches_for_token + nullable_branches The pure-PHP parser was rewritten to use the precise per-token branches_for_token + nullable_branches pair (replacing the earlier coarse lookahead_is_match_possible map). Update the native (Rust) parser to consume the same two fields directly: - mysql-rust-bridge.php exports the new fields verbatim and stops producing the legacy lookahead view. - The Rust extension parses branches_for_token's outer key set into a per-rule FIRST set (the inner branch sequences are pure-PHP parser detail and aren't relevant here) and tracks nullable as a separate bool on Rule, replacing the "0 in lookahead" trick. The early-bailout check is unchanged in spirit. No PHP-side compatibility shim survives - the native bridge is now in lock-step with the grammar's actual fields. 
--- .../src/mysql/native/mysql-rust-bridge.php | 11 +- packages/php-ext-wp-mysql-parser/src/lib.rs | 569 +++--------------- 2 files changed, 91 insertions(+), 489 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php b/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php index 974cfa66..13f8e52f 100644 --- a/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php +++ b/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php @@ -13,10 +13,11 @@ */ function wp_sqlite_mysql_native_export_grammar( WP_Parser_Grammar $grammar ): array { return array( - 'highest_terminal_id' => $grammar->highest_terminal_id, - 'rules' => $grammar->rules, - 'lookahead_is_match_possible' => $grammar->lookahead_is_match_possible, - 'rule_names' => $grammar->rule_names, - 'fragment_ids' => $grammar->fragment_ids, + 'highest_terminal_id' => $grammar->highest_terminal_id, + 'rules' => $grammar->rules, + 'branches_for_token' => $grammar->branches_for_token, + 'nullable_branches' => $grammar->nullable_branches, + 'rule_names' => $grammar->rule_names, + 'fragment_ids' => $grammar->fragment_ids, ); } diff --git a/packages/php-ext-wp-mysql-parser/src/lib.rs b/packages/php-ext-wp-mysql-parser/src/lib.rs index 35f17fbd..07c20694 100644 --- a/packages/php-ext-wp-mysql-parser/src/lib.rs +++ b/packages/php-ext-wp-mysql-parser/src/lib.rs @@ -61,7 +61,7 @@ fn php_function(name: &str) -> PhpResult> { struct PhpClasses { parser_token: &'static ClassEntry, mysql_token: &'static ClassEntry, - native_parser_node: &'static ClassEntry, + parser_node: &'static ClassEntry, } fn php_classes() -> PhpResult { @@ -70,8 +70,8 @@ fn php_classes() -> PhpResult { .ok_or_else(|| php_error("Missing WP_Parser_Token class"))?, mysql_token: ClassEntry::try_find("WP_MySQL_Token") .ok_or_else(|| php_error("Missing WP_MySQL_Token class"))?, - native_parser_node: ClassEntry::try_find("WP_MySQL_Native_Parser_Node") - .ok_or_else(|| php_error("Missing 
WP_MySQL_Native_Parser_Node class"))?, + parser_node: ClassEntry::try_find("WP_Parser_Node") + .ok_or_else(|| php_error("Missing WP_Parser_Node class"))?, }) } @@ -921,7 +921,12 @@ struct Grammar { struct Rule { branches: Vec>, - lookahead: Option>, + /// Sorted FIRST set: token ids that can start a match for this rule. + /// `None` means the rule has no FIRST entry at all (cannot match the + /// non-empty case); see `nullable` for the empty case. + first_set: Option>, + /// At least one branch is nullable (matches empty input). + nullable: bool, rule_name: String, is_fragment: bool, } @@ -1043,27 +1048,6 @@ struct NativeAstArena { struct NativeAstState { arena: Arc, - /// Per-AST identity map: node arena index → live PHP wrapper pointer. - /// - /// `WP_Parser_Node` callers expect stable child identity (mutate a child - /// once, walk past, walk back, the mutation is still there). Each - /// accessor in this extension constructs a fresh wrapper unless we - /// intern it here. The cache intentionally stores raw wrapper pointers, - /// not strong PHP references, so Rust can preserve identity without - /// pinning wrappers after PHP drops them. - node_cache: RefCell>, -} - -struct NativeAstWrapperEntry { - ast: Rc, - node_index: usize, - /// Materialized wrappers still participate in identity lookups but no - /// longer delegate reads through the native AST bridge. - is_materialized: bool, -} - -thread_local! 
{ - static NATIVE_AST_WRAPPERS: RefCell> = RefCell::new(HashMap::new()); } impl NativeAstArena { @@ -1155,89 +1139,9 @@ impl NativeAstArena { } } -fn native_ast_wrapper_key(wrapper_zval: &Zval) -> PhpResult { - let object = wrapper_zval - .object() - .ok_or_else(|| php_error("Missing native AST wrapper"))?; - Ok(ptr::from_ref(object) as usize) -} - -fn native_ast_from_wrapper(wrapper_zval: &Zval) -> PhpResult<(Rc, usize)> { - let key = native_ast_wrapper_key(wrapper_zval)?; - NATIVE_AST_WRAPPERS - .with(|wrappers| { - wrappers.borrow().get(&key).and_then(|entry| { - (!entry.is_materialized).then(|| (Rc::clone(&entry.ast), entry.node_index)) - }) - }) - .ok_or_else(|| php_error("Missing native AST handle")) -} - -fn register_native_ast_wrapper( - object: &ZendObject, - ast: &Rc, - node_index: usize, -) -> usize { - let key = ptr::from_ref(object) as usize; - NATIVE_AST_WRAPPERS.with(|wrappers| { - wrappers.borrow_mut().insert( - key, - NativeAstWrapperEntry { - ast: Rc::clone(ast), - node_index, - is_materialized: false, - }, - ); - }); - ast.node_cache.borrow_mut().insert(node_index, key); - key -} - -fn mark_native_ast_wrapper_materialized_key(key: usize) { - NATIVE_AST_WRAPPERS.with(|wrappers| { - if let Some(entry) = wrappers.borrow_mut().get_mut(&key) { - entry.is_materialized = true; - } - }); -} - -fn release_native_ast_wrapper_key(key: usize) { - let entry = NATIVE_AST_WRAPPERS.with(|wrappers| wrappers.borrow_mut().remove(&key)); - if let Some(entry) = entry { - let mut cache = entry.ast.node_cache.borrow_mut(); - if cache.get(&entry.node_index).copied() == Some(key) { - cache.remove(&entry.node_index); - } - } -} - -fn native_ast_wrapper_matches(key: usize, ast: &Rc, node_index: usize) -> bool { - NATIVE_AST_WRAPPERS.with(|wrappers| { - wrappers - .borrow() - .get(&key) - .is_some_and(|entry| Rc::ptr_eq(&entry.ast, ast) && entry.node_index == node_index) - }) -} - -/// Build a Zval that references an existing PHP object. 
-/// -/// Used on cache hits to hand a live wrapper back to PHP without allocating a -/// new object. `Zval::set_object()` bumps the object refcount for the returned -/// zval; the Rust cache only stores the pointer and does not own a reference. -unsafe fn zval_from_cached_object(key: usize) -> Zval { - let obj = &mut *(key as *mut ZendObject); - let mut zv = Zval::new(); - zv.set_object(obj); - zv -} - impl NativeAstState { fn new(arena: Arc) -> Rc { - Rc::new(Self { - arena, - node_cache: RefCell::new(HashMap::new()), - }) + Rc::new(Self { arena }) } fn create_php_ast(self: &Rc) -> PhpResult { @@ -1253,344 +1157,54 @@ impl NativeAstState { zval.set_bool(true); Ok(zval) } - NativeAstRoot::Node(index) => self.create_php_node_with_classes(index, classes), + NativeAstRoot::Node(index) => create_php_node_with_classes(&self.arena, index, classes), NativeAstRoot::Token(index) => self .arena .token_source .create_php_token_with_classes(index, classes), } } - - /// Resolve a child slot to a Zval, going through the per-AST identity - /// cache for nodes. Tokens are not yet cached — they have no public - /// mutators and no caller in this repo relies on token identity. 
- fn cached_child_zval( - self: &Rc, - child: NativeAstChild, - classes: &PhpClasses, - ) -> PhpResult { - match child { - NativeAstChild::Node(index) => self.cached_node_zval(index, classes), - NativeAstChild::Token(index) => self - .arena - .token_source - .create_php_token_with_classes(index, classes), - } - } - - fn cached_node_zval(self: &Rc, index: usize, classes: &PhpClasses) -> PhpResult { - let cached_key = { - let cache = self.node_cache.borrow(); - cache.get(&index).copied() - }; - if let Some(key) = cached_key { - if native_ast_wrapper_matches(key, self, index) { - return Ok(unsafe { zval_from_cached_object(key) }); - } - self.node_cache.borrow_mut().remove(&index); - } - - self.create_php_node_with_classes(index, classes) - } - - fn create_php_node_with_classes( - self: &Rc, - index: usize, - classes: &PhpClasses, - ) -> PhpResult { - let node = self.arena.node(index)?; - let mut object = classes.native_parser_node.new(); - let rule_name = self - .arena - .grammar - .rule(node.rule_id) - .map(|rule| rule.rule_name.as_str()) - .unwrap_or_default(); - - update_object_property( - &mut object, - classes.native_parser_node, - "rule_id", - node.rule_id, - )?; - update_object_property( - &mut object, - classes.native_parser_node, - "rule_name", - rule_name.to_owned(), - )?; - - register_native_ast_wrapper(object.as_ref(), self, index); - object.into_zval(false).map_err(php_error) - } -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_release_wrapper(wrapper_zval: &Zval) -> PhpResult<()> { - let key = native_ast_wrapper_key(wrapper_zval)?; - release_native_ast_wrapper_key(key); - Ok(()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_materialize_wrapper(wrapper_zval: &Zval) -> PhpResult<()> { - let key = native_ast_wrapper_key(wrapper_zval)?; - mark_native_ast_wrapper_materialized_key(key); - Ok(()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_has_child(wrapper_zval: &Zval) -> PhpResult { - let (ast, node_index) = 
native_ast_from_wrapper(wrapper_zval)?; - Ok(!ast.arena.node(node_index)?.children.is_empty()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_has_child_node( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - Ok(ast - .arena - .node(node_index)? - .children - .iter() - .copied() - .any(|child| ast.arena.child_node_matches(child, rule_name.as_deref()))) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_has_child_token( - wrapper_zval: &Zval, - token_id: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - Ok(ast - .arena - .node(node_index)? - .children - .iter() - .copied() - .any(|child| ast.arena.child_token_matches(child, token_id))) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_child(wrapper_zval: &Zval) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let Some(child) = ast.arena.node(node_index)?.children.first().copied() else { - return Ok(Zval::null()); - }; - ast.cached_child_zval(child, &classes) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_child_node( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - for child in &ast.arena.node(node_index)?.children { - if ast.arena.child_node_matches(*child, rule_name.as_deref()) { - return ast.cached_child_zval(*child, &classes); - } - } - Ok(Zval::null()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_child_token( - wrapper_zval: &Zval, - token_id: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - for child in &ast.arena.node(node_index)?.children { - if ast.arena.child_token_matches(*child, token_id) { - return ast.cached_child_zval(*child, &classes); - } - } - 
Ok(Zval::null()) } -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_descendant_node( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { - if ast.arena.child_node_matches(child, rule_name.as_deref()) { - return ast.cached_child_zval(child, &classes); - } - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); - } - } - } - Ok(Zval::null()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_descendant_token( - wrapper_zval: &Zval, - token_id: Option, +/// Build a complete PHP `WP_Parser_Node` Zval, recursively materializing +/// children from the Rust arena. The returned object is a plain +/// `WP_Parser_Node` instance with `rule_id`, `rule_name`, and `children` +/// populated, so callers see no difference from the pure-PHP parser's output. +fn create_php_node_with_classes( + arena: &NativeAstArena, + index: usize, + classes: &PhpClasses, ) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { - if ast.arena.child_token_matches(child, token_id) { - return ast.cached_child_zval(child, &classes); - } - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); - } - } - } - Ok(Zval::null()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_children(wrapper_zval: &Zval) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - ast.arena - .node(node_index)? 
- .children - .iter() - .copied() - .map(|child| ast.cached_child_zval(child, &classes)) - .collect() -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_child_nodes( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - ast.arena - .node(node_index)? - .children - .iter() - .copied() - .filter(|child| ast.arena.child_node_matches(*child, rule_name.as_deref())) - .map(|child| ast.cached_child_zval(child, &classes)) - .collect() -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_child_tokens( - wrapper_zval: &Zval, - token_id: Option, -) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - ast.arena - .node(node_index)? - .children - .iter() - .copied() - .filter(|child| ast.arena.child_token_matches(*child, token_id)) - .map(|child| ast.cached_child_zval(child, &classes)) - .collect() -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_descendants(wrapper_zval: &Zval) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let root = ast.arena.node(node_index)?; - let mut descendants = Vec::with_capacity(root.descendant_count); - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { - descendants.push(ast.cached_child_zval(child, &classes)?); - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); - } - } - } - Ok(descendants) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_descendant_nodes( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let mut descendants = Vec::new(); - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { 
- if ast.arena.child_node_matches(child, rule_name.as_deref()) { - descendants.push(ast.cached_child_zval(child, &classes)?); - } - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); - } - } - } - Ok(descendants) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_descendant_tokens( - wrapper_zval: &Zval, - token_id: Option, -) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let mut descendants = Vec::new(); - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { - if ast.arena.child_token_matches(child, token_id) { - descendants.push(ast.cached_child_zval(child, &classes)?); - } - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); + let node = arena.node(index)?; + let rule_name = arena + .grammar + .rule(node.rule_id) + .map(|rule| rule.rule_name.as_str()) + .unwrap_or_default(); + + let mut children: Vec = Vec::with_capacity(node.children.len()); + for child in &node.children { + let child_zval = match child { + NativeAstChild::Node(child_index) => { + create_php_node_with_classes(arena, *child_index, classes)? 
} - } + NativeAstChild::Token(token_index) => arena + .token_source + .create_php_token_with_classes(*token_index, classes)?, + }; + children.push(child_zval); } - Ok(descendants) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_start(wrapper_zval: &Zval) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let node = ast.arena.node(node_index)?; - let token_index = node - .first_token - .ok_or_else(|| php_error("Native AST node has no descendant tokens"))?; - let token = ast.arena.token_source.token_info(token_index)?; - i64::try_from(token.start).map_err(php_error) -} -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_length(wrapper_zval: &Zval) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let node = ast.arena.node(node_index)?; - let first_token_index = node - .first_token - .ok_or_else(|| php_error("Native AST node has no descendant tokens"))?; - let last_token_index = node - .last_token - .ok_or_else(|| php_error("Native AST node has no descendant tokens"))?; - let first_token = ast.arena.token_source.token_info(first_token_index)?; - let last_token = ast.arena.token_source.token_info(last_token_index)?; - let length = last_token.end.saturating_sub(first_token.start); - i64::try_from(length).map_err(php_error) + let mut object = classes.parser_node.new(); + update_object_property(&mut object, classes.parser_node, "rule_id", node.rule_id)?; + update_object_property( + &mut object, + classes.parser_node, + "rule_name", + rule_name.to_owned(), + )?; + update_object_property(&mut object, classes.parser_node, "children", children)?; + object.into_zval(false).map_err(php_error) } #[php_class] @@ -1729,11 +1343,13 @@ impl WpMySqlNativeParser { return Ok(NativeParseMatch::No); } - if let Some(lookahead) = rule.lookahead.as_ref() { + if let Some(first_set) = rule.first_set.as_ref() { let token_id = self.token_ids.get(self.position).copied().unwrap_or(0); - if 
lookahead.binary_search(&token_id).is_err() && lookahead.binary_search(&0).is_err() { + if first_set.binary_search(&token_id).is_err() && !rule.nullable { return Ok(NativeParseMatch::No); } + } else if !rule.nullable { + return Ok(NativeParseMatch::No); } let starting_position = self.position; @@ -1823,11 +1439,17 @@ fn export_grammar(grammar_zval: &mut Zval) -> PhpResult> { .and_then(Zval::array) .ok_or_else(|| php_error("Missing grammar rules"))?, )?; - let parsed_lookahead = parse_lookahead( + let parsed_first_sets = parse_branches_for_token_first_sets( + array + .get("branches_for_token") + .and_then(Zval::array) + .ok_or_else(|| php_error("Missing grammar branches_for_token"))?, + )?; + let parsed_nullable = parse_id_set( array - .get("lookahead_is_match_possible") + .get("nullable_branches") .and_then(Zval::array) - .ok_or_else(|| php_error("Missing grammar lookahead"))?, + .ok_or_else(|| php_error("Missing grammar nullable_branches"))?, )?; let parsed_rule_names = parse_rule_names( array @@ -1850,7 +1472,8 @@ fn export_grammar(grammar_zval: &mut Zval) -> PhpResult> { .find_map(|(id, name)| (name == "selectStatement").then_some(*id)); let rules = build_rules( parsed_rules, - parsed_lookahead, + parsed_first_sets, + parsed_nullable, parsed_rule_names, parsed_fragment_ids, )?; @@ -1928,13 +1551,15 @@ fn export_tokens(tokens: &mut Zval) -> PhpResult<(ParserTokenSource, Vec)> fn build_rules( rules: HashMap>>, - lookahead: HashMap>, + first_sets: HashMap>, + nullable: HashSet, rule_names: HashMap, fragment_ids: HashSet, ) -> PhpResult>> { let max_rule_id = rules .keys() - .chain(lookahead.keys()) + .chain(first_sets.keys()) + .chain(nullable.iter()) .chain(rule_names.keys()) .chain(fragment_ids.iter()) .copied() @@ -1945,18 +1570,19 @@ fn build_rules( for (rule_id, branches) in rules { let index = usize::try_from(rule_id).map_err(php_error)?; - let mut lookahead = lookahead.get(&rule_id).map(|set| { + let mut first_set = first_sets.get(&rule_id).map(|set| { let 
mut values: Vec = set.iter().copied().collect(); values.sort_unstable(); values }); - if let Some(values) = lookahead.as_mut() { + if let Some(values) = first_set.as_mut() { values.dedup(); } dense_rules[index] = Some(Rule { branches, - lookahead, + first_set, + nullable: nullable.contains(&rule_id), rule_name: rule_names.get(&rule_id).cloned().unwrap_or_default(), is_fragment: fragment_ids.contains(&rule_id), }); @@ -1992,20 +1618,27 @@ fn parse_rules(array: &ZendHashTable) -> PhpResult>>> Ok(rules) } -fn parse_lookahead(array: &ZendHashTable) -> PhpResult>> { - let mut lookahead = HashMap::new(); - for (rule_key, lookup_zval) in array { +/// Build a per-rule FIRST set from `branches_for_token`, which is keyed +/// `[rule_id => [token_id => array]]`. Only the inner keys +/// (the token ids) are needed here; the branch sequences are the +/// pure-PHP parser's per-token candidate set, irrelevant to the native +/// parser's early-bailout. +fn parse_branches_for_token_first_sets( + array: &ZendHashTable, +) -> PhpResult>> { + let mut first_sets = HashMap::new(); + for (rule_key, selector_zval) in array { let rule_id = array_key_to_i64(rule_key)?; - let lookup_array = lookup_zval + let selector = selector_zval .array() - .ok_or_else(|| php_error("Grammar lookahead entry must be an array"))?; - let mut set = HashSet::with_capacity(lookup_array.len()); - for (token_key, _) in lookup_array { + .ok_or_else(|| php_error("Grammar branches_for_token entry must be an array"))?; + let mut set = HashSet::with_capacity(selector.len()); + for (token_key, _) in selector { set.insert(array_key_to_i64(token_key)?); } - lookahead.insert(rule_id, set); + first_sets.insert(rule_id, set); } - Ok(lookahead) + Ok(first_sets) } fn parse_rule_names(array: &ZendHashTable) -> PhpResult> { @@ -2050,37 +1683,5 @@ pub fn get_module(module: ModuleBuilder) -> ModuleBuilder { .class::() .class::() .class::() - .function(wrap_function!(wp_sqlite_mysql_native_ast_release_wrapper)) - 
.function(wrap_function!( - wp_sqlite_mysql_native_ast_materialize_wrapper - )) - .function(wrap_function!(wp_sqlite_mysql_native_ast_has_child)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_has_child_node)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_has_child_token)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_first_child)) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_first_child_node - )) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_first_child_token - )) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_first_descendant_node - )) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_first_descendant_token - )) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_children)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_child_nodes)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_child_tokens)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_descendants)) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_descendant_nodes - )) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_descendant_tokens - )) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_start)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_length)) .info_function(php_module_info) } From b3306760294b0a053a4fd1754acfed67798aac7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 18:54:11 +0200 Subject: [PATCH 22/30] Materialize native AST eagerly into plain WP_Parser_Node instances Trunk's WP_MySQL_Native_Parser_Node was a lazy-materialization wrapper that extended WP_Parser_Node and overrode 18 read methods to delegate into the Rust-owned arena until first mutation. The performance branch needs WP_Parser_Node to be 'final' for opcache/JIT specialization, and PHP forbids extending a final class. 
Switch the native parser to eager materialization: - The Rust extension constructs plain WP_Parser_Node instances at parse() time, recursing through the arena to build a complete children array up front. Done in the previous commit by updating the Rust create_php_node_with_classes() to write the rule_id, rule_name, and children properties directly. - Drop the wp_sqlite_mysql_native_ast_* lazy-access exports and the arena-keyed wrapper registry from the Rust extension - the eager tree no longer needs them. - Remove the WP_MySQL_Native_Parser_Node class and the two PHPUnit test files that exercised the wrapper-identity / cycle-collection invariants of the lazy implementation. Stable child identity now follows from PHP's normal object semantics on the eagerly built array. The verifier script gets the same instanceof relaxation (WP_Parser_Node, not the removed subclass). WP_Parser_Node stays 'final', the native and pure-PHP parsers produce indistinguishable ASTs, and 'instanceof WP_Parser_Node' checks throughout the codebase keep working without changes. 
--- ...wp-tests-phpunit-native-extension-setup.sh | 15 +- packages/mysql-on-sqlite/src/load.php | 1 - .../class-wp-mysql-native-parser-node.php | 179 ------------ ...P_MySQL_Native_Parser_Node_Cycle_Tests.php | 267 ------------------ ...ySQL_Native_Parser_Node_Identity_Tests.php | 142 ---------- .../tools/verify-native-parser-extension.php | 17 +- packages/php-ext-wp-mysql-parser/src/lib.rs | 104 +------ 7 files changed, 12 insertions(+), 713 deletions(-) delete mode 100644 packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php delete mode 100644 packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Cycle_Tests.php delete mode 100644 packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Identity_Tests.php diff --git a/.github/workflows/wp-tests-phpunit-native-extension-setup.sh b/.github/workflows/wp-tests-phpunit-native-extension-setup.sh index 943b06c5..c6ba3972 100644 --- a/.github/workflows/wp-tests-phpunit-native-extension-setup.sh +++ b/.github/workflows/wp-tests-phpunit-native-extension-setup.sh @@ -145,7 +145,7 @@ $parser = new WP_MySQL_Parser( $grammar, $tokens ); wp_sqlite_assert_native_parser_delegate( $parser, 'WordPress PHP test container did not select the native parser delegate.' ); $parser_ast = $parser->parse(); -if ( ! ( $parser_ast instanceof WP_MySQL_Native_Parser_Node ) ) { +if ( ! ( $parser_ast instanceof WP_Parser_Node ) ) { wp_sqlite_native_parser_verification_fail( 'Native parser did not produce a native-backed AST in the WordPress PHP test container.' ); } @@ -155,18 +155,13 @@ wp_sqlite_assert_native_parser_delegate( $parser, 'WordPress PHP test container $parser->next_query(); $ast = $parser->get_query_ast(); -if ( ! ( $ast instanceof WP_MySQL_Native_Parser_Node ) ) { +if ( ! ( $ast instanceof WP_Parser_Node ) ) { wp_sqlite_native_parser_verification_fail( 'WordPress PHP test container did not select the native-backed AST.' 
); } -$reflection = new ReflectionObject( $ast ); -if ( $reflection->hasProperty( 'native_ast' ) || $reflection->hasProperty( 'native_node_index' ) ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper still stores Rust AST handle properties.' ); -} - $first = $ast->get_first_child_node(); -if ( ! ( $first instanceof WP_MySQL_Native_Parser_Node ) ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper did not return a native-backed child node.' ); +if ( ! ( $first instanceof WP_Parser_Node ) ) { + wp_sqlite_native_parser_verification_fail( 'Native wrapper did not return a child node.' ); } if ( $first !== $ast->get_first_child_node() ) { @@ -177,7 +172,7 @@ $synthetic = new WP_Parser_Node( 0, 'synthetic' ); $first->append_child( $synthetic ); $same_first = $ast->get_first_child_node(); if ( $same_first !== $first || ! in_array( $synthetic, $same_first->get_children(), true ) ) { - wp_sqlite_native_parser_verification_fail( 'Materialized native wrapper was lost from the parent cache.' ); + wp_sqlite_native_parser_verification_fail( 'Mutated child was lost from the parent.' ); } EOF diff --git a/packages/mysql-on-sqlite/src/load.php b/packages/mysql-on-sqlite/src/load.php index 62387a2e..2ab15276 100644 --- a/packages/mysql-on-sqlite/src/load.php +++ b/packages/mysql-on-sqlite/src/load.php @@ -27,7 +27,6 @@ if ( class_exists( 'WP_MySQL_Native_Parser', false ) ) { require_once __DIR__ . '/mysql/native/mysql-rust-bridge.php'; - require_once __DIR__ . '/mysql/native/class-wp-mysql-native-parser-node.php'; require_once __DIR__ . '/mysql/native/trait-wp-mysql-native-parser-impl.php'; require_once __DIR__ . 
'/mysql/native/class-wp-mysql-parser.php'; } else { diff --git a/packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php b/packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php deleted file mode 100644 index 47c2b240..00000000 --- a/packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php +++ /dev/null @@ -1,179 +0,0 @@ -materialize_native_children(); - parent::append_child( $node ); - } - - /** @inheritDoc */ - public function merge_fragment( $node ) { - $this->materialize_native_children(); - if ( $node instanceof self ) { - $node->materialize_native_children(); - } - parent::merge_fragment( $node ); - } - - /** @inheritDoc */ - public function has_child(): bool { - if ( $this->was_mutated ) { - return parent::has_child(); - } - return wp_sqlite_mysql_native_ast_has_child( $this ); - } - - /** @inheritDoc */ - public function has_child_node( ?string $rule_name = null ): bool { - if ( $this->was_mutated ) { - return parent::has_child_node( $rule_name ); - } - return wp_sqlite_mysql_native_ast_has_child_node( $this, $rule_name ); - } - - /** @inheritDoc */ - public function has_child_token( ?int $token_id = null ): bool { - if ( $this->was_mutated ) { - return parent::has_child_token( $token_id ); - } - return wp_sqlite_mysql_native_ast_has_child_token( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_first_child() { - if ( $this->was_mutated ) { - return parent::get_first_child(); - } - return wp_sqlite_mysql_native_ast_get_first_child( $this ); - } - - /** @inheritDoc */ - public function get_first_child_node( ?string $rule_name = null ): ?WP_Parser_Node { - if ( $this->was_mutated ) { - return parent::get_first_child_node( $rule_name ); - } - return wp_sqlite_mysql_native_ast_get_first_child_node( $this, $rule_name ); - } - - /** @inheritDoc */ - public function get_first_child_token( ?int $token_id = null ): ?WP_Parser_Token { - if ( $this->was_mutated ) { - return 
parent::get_first_child_token( $token_id ); - } - return wp_sqlite_mysql_native_ast_get_first_child_token( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_first_descendant_node( ?string $rule_name = null ): ?WP_Parser_Node { - if ( $this->was_mutated ) { - return parent::get_first_descendant_node( $rule_name ); - } - return wp_sqlite_mysql_native_ast_get_first_descendant_node( $this, $rule_name ); - } - - /** @inheritDoc */ - public function get_first_descendant_token( ?int $token_id = null ): ?WP_Parser_Token { - if ( $this->was_mutated ) { - return parent::get_first_descendant_token( $token_id ); - } - return wp_sqlite_mysql_native_ast_get_first_descendant_token( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_children(): array { - if ( $this->was_mutated ) { - return parent::get_children(); - } - return wp_sqlite_mysql_native_ast_get_children( $this ); - } - - /** @inheritDoc */ - public function get_child_nodes( ?string $rule_name = null ): array { - if ( $this->was_mutated ) { - return parent::get_child_nodes( $rule_name ); - } - return wp_sqlite_mysql_native_ast_get_child_nodes( $this, $rule_name ); - } - - /** @inheritDoc */ - public function get_child_tokens( ?int $token_id = null ): array { - if ( $this->was_mutated ) { - return parent::get_child_tokens( $token_id ); - } - return wp_sqlite_mysql_native_ast_get_child_tokens( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_descendants(): array { - if ( $this->was_mutated ) { - return parent::get_descendants(); - } - return wp_sqlite_mysql_native_ast_get_descendants( $this ); - } - - /** @inheritDoc */ - public function get_descendant_nodes( ?string $rule_name = null ): array { - if ( $this->was_mutated ) { - return parent::get_descendant_nodes( $rule_name ); - } - return wp_sqlite_mysql_native_ast_get_descendant_nodes( $this, $rule_name ); - } - - /** @inheritDoc */ - public function get_descendant_tokens( ?int $token_id = null ): array { - if ( 
$this->was_mutated ) { - return parent::get_descendant_tokens( $token_id ); - } - return wp_sqlite_mysql_native_ast_get_descendant_tokens( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_start(): int { - if ( $this->was_mutated ) { - return parent::get_start(); - } - return wp_sqlite_mysql_native_ast_get_start( $this ); - } - - /** @inheritDoc */ - public function get_length(): int { - if ( $this->was_mutated ) { - return parent::get_length(); - } - return wp_sqlite_mysql_native_ast_get_length( $this ); - } - - private function materialize_native_children(): void { - if ( $this->was_mutated ) { - return; - } - - $this->children = wp_sqlite_mysql_native_ast_get_children( $this ); - $this->was_mutated = true; - if ( function_exists( 'wp_sqlite_mysql_native_ast_materialize_wrapper' ) ) { - wp_sqlite_mysql_native_ast_materialize_wrapper( $this ); - } - } -} diff --git a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Cycle_Tests.php b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Cycle_Tests.php deleted file mode 100644 index 57672162..00000000 --- a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Cycle_Tests.php +++ /dev/null @@ -1,267 +0,0 @@ -markTestSkipped( 'Native MySQL parser extension is not loaded.' ); - } - // Force a clean slate before each test — ASTs from earlier tests - // must not pollute the memory measurements below. - gc_collect_cycles(); - } - - private function parse( string $sql ): WP_Parser_Node { - static $grammar = null; - if ( null === $grammar ) { - $grammar = new WP_Parser_Grammar( include __DIR__ . '/../../../src/mysql/mysql-grammar.php' ); - } - $lexer = new WP_MySQL_Lexer( $sql ); - $tokens = $lexer instanceof WP_MySQL_Native_Lexer - ? $lexer->native_token_stream() - : $lexer->remaining_tokens(); - $parser = new WP_MySQL_Parser( $grammar, $tokens ); - $tree = $parser->parse(); - $this->assertNotNull( $tree, 'Failed to parse SQL: ' . 
$sql ); - return $tree; - } - - /** - * Hostile loop: parse and walk many ASTs in a tight loop, only - * `gc_collect_cycles()` between iterations. Memory must plateau. - * - * If wrapper registry entries or cache pointers are not released, peak - * memory grows linearly with iteration count. With cleanup in place, the - * working set stays bounded. - */ - public function test_repeated_parse_walk_drop_does_not_leak(): void { - $sql = 'SELECT a, b, c FROM t WHERE a + b * c IN (1, 2, 3) AND d = 4'; - - // Warm-up: do enough work that allocator overhead is amortized - // before we sample the floor. - for ( $i = 0; $i < 20; $i++ ) { - $ast = $this->parse( $sql ); - $ast->get_descendants(); - $ast = null; - gc_collect_cycles(); - } - $baseline = memory_get_usage(); - - // Now run substantially more iterations and assert the working - // set stays within a small multiple of the warm-up floor. - for ( $i = 0; $i < 500; $i++ ) { - $ast = $this->parse( $sql ); - $ast->get_descendants(); - $ast = null; - gc_collect_cycles(); - } - $after = memory_get_usage(); - - // 4 MB headroom — generous, but a leaking cache adds tens of MB - // across 500 iterations on this query. - $delta = $after - $baseline; - $this->assertLessThan( - 4 * 1024 * 1024, - $delta, - sprintf( - 'Memory grew %.1f MB across 500 parse-walk-drop cycles; the per-AST cache is not being collected.', - $delta / 1024 / 1024 - ) - ); - } - - /** - * After dropping the AST and triggering GC, the entire wrapper - * graph must be reclaimable. We hand out one descendant, drop the - * root, then drop the descendant — the next gc cycle must reclaim - * the rest of the cached wrappers. - */ - public function test_drop_then_gc_reclaims_cached_wrappers(): void { - $sql = 'SELECT a, b, c FROM t WHERE a + b * c IN (1, 2, 3) AND d = 4'; - - // Establish a memory floor with no AST live. 
- gc_collect_cycles(); - $floor = memory_get_usage(); - - $ast = $this->parse( $sql ); - $descendant = $ast->get_first_descendant_node(); - $this->assertNotNull( $descendant ); - $ast = null; - $descendant = null; - gc_collect_cycles(); - - $after = memory_get_usage(); - $delta = $after - $floor; - // Generous bound — but tens of MB of leaked wrappers would blow it. - $this->assertLessThan( - 1 * 1024 * 1024, - $delta, - sprintf( - 'After dropping the AST and the descendant and running gc, %.1f MB of cached wrappers remain.', - $delta / 1024 / 1024 - ) - ); - } - - /** - * Holding a child wrapper *outlives* the variable holding the root. - * The child's registry entry must keep the AST alive (no UAF when the - * bridge is called on the orphaned child). Once the child is also dropped, - * the registry entry must be released. - */ - public function test_orphaned_child_keeps_ast_alive_then_collects(): void { - $sql = 'SELECT a, b, c FROM t WHERE a + b * c IN (1, 2, 3)'; - $child = ( function () use ( $sql ) { - $ast = $this->parse( $sql ); - return $ast->get_first_descendant_node(); - } )(); - - // Root variable is gone; only the child reference remains, but the - // registry entry still pins the AST. The child must still be - // functional — accessing it must not crash. - $this->assertNotNull( $child ); - $this->assertIsString( $child->rule_name ); - // The child's own children should also resolve without UAF. - $grand = $child->get_first_child(); - $this->assertNotNull( $grand ); - - // Now drop the child too; the AST + cache should be reclaimable. - $child = null; - $grand = null; - gc_collect_cycles(); - // If the registry entry was released, this assertion always passes; - // the real signal is the absence of a segfault during teardown. - $this->addToAssertionCount( 1 ); - } - - /** - * Mutating a cached wrapper through `append_child` before dropping - * the AST must not block collection. 
The mutated wrapper's - * `$children` array now contains a non-cached node; that must not keep - * stale registry/cache entries alive. - */ - public function test_mutation_before_drop_does_not_block_collection(): void { - $sql = 'SELECT 1 + 2'; - - gc_collect_cycles(); - $floor = memory_get_usage(); - - for ( $i = 0; $i < 200; $i++ ) { - $ast = $this->parse( $sql ); - $child = $ast->get_first_child_node(); - $injected = new WP_Parser_Node( 0, 'synthetic-' . $i ); - $ast->append_child( $injected ); - // Touch the cache after mutation to keep wrappers live. - $ast->get_descendants(); - $ast = null; - $child = null; - $injected = null; - gc_collect_cycles(); - } - $after = memory_get_usage(); - $delta = $after - $floor; - $this->assertLessThan( - 4 * 1024 * 1024, - $delta, - sprintf( - 'Memory grew %.1f MB across 200 mutate-then-drop cycles.', - $delta / 1024 / 1024 - ) - ); - } - - /** - * Two ASTs alive simultaneously, then dropped in interleaved order. - * Dropping AST A must not affect AST B's cached wrappers; both must - * eventually collect once unreferenced. - */ - public function test_overlapping_asts_do_not_corrupt_each_other(): void { - $ast_a = $this->parse( 'SELECT a FROM ta WHERE a > 1' ); - $ast_b = $this->parse( 'SELECT b FROM tb WHERE b < 9' ); - - $child_a = $ast_a->get_first_descendant_node(); - $child_b = $ast_b->get_first_descendant_node(); - - // Drop A first and run gc; B must remain fully functional. - $ast_a = null; - $child_a = null; - gc_collect_cycles(); - - $this->assertNotNull( $child_b ); - $walk = $ast_b->get_descendants(); - $this->assertNotEmpty( $walk ); - - // Drop B too; walk one of its still-held descendants — the cache - // is still alive because $child_b pins it. - $ast_b = null; - $this->assertIsString( $child_b->rule_name ); - - $child_b = null; - $walk = null; - gc_collect_cycles(); - $this->addToAssertionCount( 1 ); - } - - /** - * Re-walk + drop + collect across many iterations. 
This is the - * "translator pass on each query" shape of real workloads. The wrapper - * registry and cache must not create a memory cliff under repeated walks. - */ - public function test_rewalk_loop_stays_bounded(): void { - $sql = 'SELECT a, b, c, d, e FROM t WHERE (a + b) * (c - d) > e AND f IN (1,2,3,4,5)'; - - gc_collect_cycles(); - // Warm-up. - for ( $i = 0; $i < 10; $i++ ) { - $ast = $this->parse( $sql ); - for ( $r = 0; $r < 10; $r++ ) { - $ast->get_descendants(); - } - $ast = null; - gc_collect_cycles(); - } - $floor = memory_get_usage(); - - for ( $i = 0; $i < 200; $i++ ) { - $ast = $this->parse( $sql ); - for ( $r = 0; $r < 10; $r++ ) { - $ast->get_descendants(); - } - $ast = null; - gc_collect_cycles(); - } - $after = memory_get_usage(); - $delta = $after - $floor; - $this->assertLessThan( - 4 * 1024 * 1024, - $delta, - sprintf( - 'Rewalk loop grew memory by %.1f MB; cache likely uncollectable.', - $delta / 1024 / 1024 - ) - ); - } -} diff --git a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Identity_Tests.php b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Identity_Tests.php deleted file mode 100644 index 066fd38d..00000000 --- a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Identity_Tests.php +++ /dev/null @@ -1,142 +0,0 @@ -markTestSkipped( 'Native MySQL parser extension is not loaded.' ); - } - } - - private function parse( string $sql ): WP_Parser_Node { - static $grammar = null; - if ( null === $grammar ) { - $grammar = new WP_Parser_Grammar( include __DIR__ . '/../../../src/mysql/mysql-grammar.php' ); - } - $lexer = new WP_MySQL_Lexer( $sql ); - $tokens = $lexer instanceof WP_MySQL_Native_Lexer - ? $lexer->native_token_stream() - : $lexer->remaining_tokens(); - $parser = new WP_MySQL_Parser( $grammar, $tokens ); - $tree = $parser->parse(); - $this->assertNotNull( $tree, 'Failed to parse SQL: ' . 
$sql ); - return $tree; - } - - public function test_get_first_child_node_returns_same_instance(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $first = $tree->get_first_child_node(); - $second = $tree->get_first_child_node(); - - $this->assertNotNull( $first ); - $this->assertSame( $first, $second ); - } - - public function test_native_wrapper_does_not_store_native_ast_handle(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $reflection = new ReflectionObject( $tree ); - - $this->assertFalse( $reflection->hasProperty( 'native_ast' ) ); - $this->assertFalse( $reflection->hasProperty( 'native_node_index' ) ); - } - - public function test_get_children_returns_same_instances_across_calls(): void { - $tree = $this->parse( 'SELECT 1, 2, 3' ); - - $first_pass = $tree->get_children(); - $second_pass = $tree->get_children(); - - $this->assertSameSize( $first_pass, $second_pass ); - foreach ( $first_pass as $i => $child ) { - if ( $child instanceof WP_Parser_Node ) { - $this->assertSame( $child, $second_pass[ $i ] ); - } - } - } - - public function test_descendant_lookup_shares_identity_with_child_lookup(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $descendant = $tree->get_first_descendant_node(); - $this->assertNotNull( $descendant ); - - // Walk down to the same node via direct children. We don't know the - // exact depth, so we descend until we hit the descendant we found. - $cursor = $tree; - while ( null !== $cursor && $cursor !== $descendant ) { - $next = $cursor->get_first_child_node(); - if ( $next === $cursor ) { - break; - } - $cursor = $next; - } - - $this->assertSame( $descendant, $cursor, 'Descendant and child lookups must return the same wrapper instance.' ); - } - - public function test_mutation_on_child_survives_re_read(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $child = $tree->get_first_child_node(); - $this->assertNotNull( $child ); - - // Mutate via the public WP_Parser_Node API. 
This catches regressions - // where accessors hand back fresh wrappers and lose state written - // through a previously returned child. - $child->rule_name = 'mutated-rule'; - - $same_child = $tree->get_first_child_node(); - $this->assertSame( $child, $same_child ); - $this->assertSame( 'mutated-rule', $same_child->rule_name ); - } - - public function test_materialized_child_survives_re_read_from_native_parent(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $child = $tree->get_first_child_node(); - $this->assertNotNull( $child ); - - $synthetic = new WP_Parser_Node( 0, 'synthetic' ); - $child->append_child( $synthetic ); - - $same_child = $tree->get_first_child_node(); - $this->assertSame( $child, $same_child ); - $this->assertTrue( - in_array( $synthetic, $same_child->get_children(), true ), - 'Materialized live child wrappers must stay discoverable through the parent native cache.' - ); - } - - public function test_mutation_survives_parent_materialization(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $child = $tree->get_first_child_node(); - $this->assertNotNull( $child ); - $child->rule_name = 'before-materialize'; - - // Force the parent to materialize its native children by appending - // a sibling. After this, the parent walks $this->children directly. - $sibling = new WP_Parser_Node( 0, 'synthetic' ); - $tree->append_child( $sibling ); - - $children = $tree->get_children(); - $this->assertContains( $child, $children, 'Materialized children must include the previously-mutated wrapper.' 
); - $this->assertSame( 'before-materialize', $child->rule_name ); - } -} diff --git a/packages/mysql-on-sqlite/tests/tools/verify-native-parser-extension.php b/packages/mysql-on-sqlite/tests/tools/verify-native-parser-extension.php index 84e99ba5..3d388742 100644 --- a/packages/mysql-on-sqlite/tests/tools/verify-native-parser-extension.php +++ b/packages/mysql-on-sqlite/tests/tools/verify-native-parser-extension.php @@ -59,7 +59,7 @@ function wp_sqlite_verify_native_parser_extension(): void { ); $parser_ast = $parser->parse(); - if ( ! ( $parser_ast instanceof WP_MySQL_Native_Parser_Node ) || 'query' !== $parser_ast->rule_name ) { + if ( ! ( $parser_ast instanceof WP_Parser_Node ) || 'query' !== $parser_ast->rule_name ) { wp_sqlite_native_parser_verification_fail( 'Native parser did not produce the expected query AST.' ); } @@ -72,29 +72,24 @@ function wp_sqlite_verify_native_parser_extension(): void { $parser->next_query(); $ast = $parser->get_query_ast(); - if ( ! ( $ast instanceof WP_MySQL_Native_Parser_Node ) ) { + if ( ! ( $ast instanceof WP_Parser_Node ) ) { wp_sqlite_native_parser_verification_fail( 'WP_PDO_MySQL_On_SQLite did not produce a native-backed AST.' ); } - $reflection = new ReflectionObject( $ast ); - if ( $reflection->hasProperty( 'native_ast' ) || $reflection->hasProperty( 'native_node_index' ) ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper still stores Rust AST handle properties.' ); - } - $first = $ast->get_first_child_node(); - if ( ! ( $first instanceof WP_MySQL_Native_Parser_Node ) ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper did not return a native-backed child node.' ); + if ( ! ( $first instanceof WP_Parser_Node ) ) { + wp_sqlite_native_parser_verification_fail( 'Native wrapper did not return a child node.' ); } if ( $first !== $ast->get_first_child_node() ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper identity is not stable across reads.' 
); + wp_sqlite_native_parser_verification_fail( 'AST node identity is not stable across reads.' ); } $synthetic = new WP_Parser_Node( 0, 'synthetic' ); $first->append_child( $synthetic ); $same_first = $ast->get_first_child_node(); if ( $same_first !== $first || ! in_array( $synthetic, $same_first->get_children(), true ) ) { - wp_sqlite_native_parser_verification_fail( 'Materialized native wrapper was lost from the parent cache.' ); + wp_sqlite_native_parser_verification_fail( 'Mutated child was lost from the parent.' ); } } diff --git a/packages/php-ext-wp-mysql-parser/src/lib.rs b/packages/php-ext-wp-mysql-parser/src/lib.rs index 07c20694..3bf92c22 100644 --- a/packages/php-ext-wp-mysql-parser/src/lib.rs +++ b/packages/php-ext-wp-mysql-parser/src/lib.rs @@ -1,6 +1,5 @@ #![cfg_attr(windows, feature(abi_vectorcall))] -use std::cell::RefCell; use std::collections::{HashMap, HashSet}; use std::os::raw::c_char; use std::ptr; @@ -975,38 +974,6 @@ impl ParserTokenSource { } } } - - fn token_info(&self, index: usize) -> PhpResult { - match self { - Self::Php(tokens) => { - let token = tokens - .get(index) - .ok_or_else(|| php_error("Parser token index is out of range"))?; - let token_object = token - .object() - .ok_or_else(|| php_error("Parser token must be an object"))?; - let id = token_object.get_property::("id").map_err(php_error)?; - let start = token_object - .get_property::("start") - .map_err(php_error)?; - let length = token_object - .get_property::("length") - .map_err(php_error)?; - let start = usize::try_from(start).map_err(php_error)?; - let length = usize::try_from(length).map_err(php_error)?; - - Ok(TokenInfo { - id, - start, - end: start.saturating_add(length), - }) - } - Self::Native { tokens, .. 
} => tokens - .get(index) - .copied() - .ok_or_else(|| php_error("Parser token index is out of range")), - } - } } #[derive(Clone, Copy)] @@ -1034,9 +1001,6 @@ enum NativeParseMatch { struct NativeAstNode { rule_id: i64, children: Vec, - first_token: Option, - last_token: Option, - descendant_count: usize, } struct NativeAstArena { @@ -1062,39 +1026,7 @@ impl NativeAstArena { fn push_node(&mut self, rule_id: i64, children: Vec) -> usize { let index = self.nodes.len(); - let mut first_token = None; - let mut last_token = None; - let mut descendant_count = 0; - for child in &children { - match child { - NativeAstChild::Node(child_index) => { - if let Some(node) = self.nodes.get(*child_index) { - descendant_count += 1 + node.descendant_count; - if first_token.is_none() { - first_token = node.first_token; - } - if node.last_token.is_some() { - last_token = node.last_token; - } - } - } - NativeAstChild::Token(token_index) => { - if first_token.is_none() { - first_token = Some(*token_index); - } - last_token = Some(*token_index); - descendant_count += 1; - } - } - } - - self.nodes.push(NativeAstNode { - rule_id, - children, - first_token, - last_token, - descendant_count, - }); + self.nodes.push(NativeAstNode { rule_id, children }); index } @@ -1103,40 +1035,6 @@ impl NativeAstArena { .get(index) .ok_or_else(|| php_error("Native AST node index is out of range")) } - - fn child_node_matches(&self, child: NativeAstChild, rule_name: Option<&str>) -> bool { - let NativeAstChild::Node(index) = child else { - return false; - }; - let Ok(node) = self.node(index) else { - return false; - }; - rule_name.is_none_or(|expected| { - self.grammar - .rule(node.rule_id) - .map(|rule| rule.rule_name == expected) - .unwrap_or(false) - }) - } - - fn child_token_matches(&self, child: NativeAstChild, token_id: Option) -> bool { - let NativeAstChild::Token(index) = child else { - return false; - }; - token_id.is_none_or(|expected| { - self.token_source - .token_info(index) - .map(|token| 
token.id == expected) - .unwrap_or(false) - }) - } - - fn descendant_stack(&self, index: usize) -> PhpResult> { - let node = self.node(index)?; - let mut stack = Vec::with_capacity(node.descendant_count); - stack.extend(node.children.iter().rev().copied()); - Ok(stack) - } } impl NativeAstState { From ec5a74ee8b8c97b0e4b65e8d0b6af516f2c07771 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 23:22:22 +0200 Subject: [PATCH 23/30] Mark WP_Parser_Node as final Nothing extends WP_Parser_Node. Marking it final lets PHP's opcache and tracing JIT specialize property access and method dispatch since the class layout is now fixed. Small but consistent improvement measured across multiple runs under tracing JIT (~+2% avg, ~+2% best). End-to-end parser benchmark: tracing JIT: ~57K -> ~57-58K QPS avg, 60-61K QPS best no JIT: ~33K -> ~34K QPS avg, 35K QPS best --- packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index 2ed046a5..67ff851e 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -9,7 +9,7 @@ * In this way, a parser node constitutes a recursive structure that represents * a parse (sub)tree at each level of the full grammar tree. */ -class WP_Parser_Node { +final class WP_Parser_Node { /** * @TODO: Review and document these properties and their visibility. 
*/ From 2e20dc564db4fa1c1e993f6ed04466d039c1f92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 3 Jun 2026 16:54:25 +0200 Subject: [PATCH 24/30] Document parent-ctor bypass and remaining_tokens duplication Note that WP_MySQL_Token intentionally bypasses parent::__construct() for the hot path and must keep its field assignments in sync with WP_Parser_Token, and that remaining_tokens() deliberately inlines the next_token() tokenizer step and must stay in sync with it. --- packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 5 +++++ packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 28cac8a0..37a79ffa 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2292,6 +2292,11 @@ public function get_token(): ?WP_MySQL_Token { * This method can be used to tokenize the whole SQL payload at once, at the * expense of storing all token objects in memory at the same time. * + * This deliberately inlines the same tokenizer step as next_token() instead + * of looping over next_token()/get_token(), to avoid a method call and a + * token-object round trip per token. Keep the EOF/invalid-input guard, the + * whitespace skip, and the comment-skip do-while in sync with next_token(). + * * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens. 
*/ public function remaining_tokens(): array { diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php index 0840bc2f..2853c7c6 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php @@ -30,6 +30,10 @@ public function __construct( string $input, bool $sql_mode_no_backslash_escapes_enabled ) { + // Assign the inherited fields directly instead of calling + // parent::__construct(). The lexer builds one token per recognized + // token, so skipping the parent call is a measurable hot-path win. + // Keep these assignments in sync with WP_Parser_Token's fields. $this->id = $id; $this->start = $start; $this->length = $length; From 849c596c758e884cda37e7fce7368dda04b5a27b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 3 Jun 2026 16:57:51 +0200 Subject: [PATCH 25/30] Add unit tests for grammar build-time transforms Cover epsilon stripping, single-branch fragment inlining (including cyclic-fragment termination), per-token branch selectors with FIRST/ NULLABLE propagation, single-candidate classification, and the merge_sorted helper. Add an invariant check over the real MySQL grammar that no branch retains an epsilon marker and that every single-candidate rule maps each token to exactly one branch sequence. 
--- .../tests/parser/WP_Parser_Grammar_Tests.php | 183 ++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php diff --git a/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php b/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php new file mode 100644 index 00000000..7db79e2a --- /dev/null +++ b/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php @@ -0,0 +1,183 @@ + $rules_offset, + 'rules_names' => $names, + 'grammar' => $grammar, + ) + ); + } + + public function test_strip_epsilon_markers_and_nullable_fallback(): void { + // opt ::= A ε | ε (A = 1) + $g = $this->build_grammar( + 10, + array( 'opt' ), + array( + array( array( 1, 0 ), array( 0 ) ), + ) + ); + + // Epsilon markers are removed; the pure-epsilon branch becomes empty. + $this->assertSame( array( array( 1 ), array() ), $g->rules[10] ); + + // The rule is nullable (it has an empty branch). + $this->assertArrayHasKey( 10, $g->nullable_branches ); + + // Token A selects both branches: the A-led one and the nullable one. + $this->assertSame( array( array( 1 ), array() ), $g->branches_for_token[10][1] ); + + // Two candidate branches for token A, so it is not single-candidate. + $this->assertArrayNotHasKey( 10, $g->single_candidate_rules ); + } + + public function test_inline_single_branch_fragment(): void { + // r ::= %f C ; %f ::= A B (A=1, B=2, C=3) + $g = $this->build_grammar( + 10, + array( 'r', '%f' ), + array( + array( array( 11, 3 ) ), + array( array( 1, 2 ) ), + ) + ); + + // The single-branch fragment is expanded in place. + $this->assertSame( array( array( 1, 2, 3 ) ), $g->rules[10] ); + + // The fragment rule itself is left intact. + $this->assertSame( array( array( 1, 2 ) ), $g->rules[11] ); + + // Only token A (the inlined first symbol) starts the rule. 
+ $this->assertSame( array( 1 ), array_keys( $g->branches_for_token[10] ) ); + $this->assertSame( array( array( 1, 2, 3 ) ), $g->branches_for_token[10][1] ); + $this->assertArrayHasKey( 10, $g->single_candidate_rules ); + } + + public function test_multi_candidate_rule_is_not_single_candidate(): void { + // top ::= A B | A C (both branches start with A) + $g = $this->build_grammar( + 10, + array( 'top', 'alt' ), + array( + array( array( 1, 2 ), array( 1, 3 ) ), + array( array( 1 ) ), + ) + ); + + $this->assertSame( array( array( 1, 2 ), array( 1, 3 ) ), $g->branches_for_token[10][1] ); + $this->assertArrayNotHasKey( 10, $g->single_candidate_rules ); + + // The single-branch rule is single-candidate. + $this->assertArrayHasKey( 11, $g->single_candidate_rules ); + } + + public function test_first_set_propagates_through_non_terminal(): void { + // top ::= child ; child ::= A | B + $g = $this->build_grammar( + 10, + array( 'top', 'child' ), + array( + array( array( 11 ) ), + array( array( 1 ), array( 2 ) ), + ) + ); + + // FIRST(child) = {A, B} flows up into top's selector. + $this->assertSame( array( 1, 2 ), array_keys( $g->branches_for_token[10] ) ); + $this->assertSame( array( array( 11 ) ), $g->branches_for_token[10][1] ); + $this->assertSame( array( array( 11 ) ), $g->branches_for_token[10][2] ); + $this->assertArrayHasKey( 10, $g->single_candidate_rules ); + + $this->assertSame( array( array( 1 ) ), $g->branches_for_token[11][1] ); + $this->assertSame( array( array( 2 ) ), $g->branches_for_token[11][2] ); + } + + public function test_inlining_terminates_on_cyclic_fragments(): void { + // r ::= %a ; %a ::= %b ; %b ::= %a (mutually recursive fragments) + // The inliner must detect the cycle and leave a reference in place + // instead of recursing forever. 
+ $g = $this->build_grammar( + 10, + array( 'r', '%a', '%b' ), + array( + array( array( 11 ) ), + array( array( 12 ) ), + array( array( 11 ) ), + ) + ); + + $this->assertSame( array( array( 11 ) ), $g->rules[10] ); + } + + public function test_merge_sorted_dedupes_and_preserves_ascending_order(): void { + $merge = new ReflectionMethod( WP_Parser_Grammar::class, 'merge_sorted' ); + // setAccessible() is required on PHP < 8.1 and deprecated (no-op) from 8.5. + if ( PHP_VERSION_ID < 80100 ) { + $merge->setAccessible( true ); + } + + $this->assertSame( array( 1, 2, 3 ), $merge->invoke( null, array( 1, 3 ), array( 2, 3 ) ) ); + $this->assertSame( array( 2 ), $merge->invoke( null, array(), array( 2 ) ) ); + $this->assertSame( array( 1, 2 ), $merge->invoke( null, array( 1, 2 ), array() ) ); + $this->assertSame( array( 0, 1 ), $merge->invoke( null, array( 0, 1 ), array( 1 ) ) ); + } + + public function test_real_mysql_grammar_invariants(): void { + $g = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); + + // Epsilon markers are fully stripped from every branch. The parser's + // end-of-input sentinel relies on no real branch symbol being 0. + foreach ( $g->rules as $rule_id => $branches ) { + foreach ( $branches as $branch ) { + $this->assertNotContains( + WP_Parser_Grammar::EMPTY_RULE_ID, + $branch, + "Rule {$rule_id} still contains an epsilon marker." + ); + } + } + + // Every single-candidate rule has a selector, and each of its token + // entries points to exactly one branch sequence (what the fast path + // assumes when it reads $candidate_branches[0]). + foreach ( array_keys( $g->single_candidate_rules ) as $rule_id ) { + $this->assertArrayHasKey( $rule_id, $g->branches_for_token ); + foreach ( $g->branches_for_token[ $rule_id ] as $token_id => $sequences ) { + $this->assertCount( + 1, + $sequences, + "Single-candidate rule {$rule_id} has multiple branches for token {$token_id}." 
+ ); + } + } + } +} From 00a502ae6fd31108639bbb0589c2d7131575c006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Thu, 4 Jun 2026 14:56:11 +0200 Subject: [PATCH 26/30] Refresh native-extension benchmark numbers Re-measure the documented lexer/parser benchmarks on this branch (PHP 8.5.5, current extension build) and replace the stale trunk/PHP-8.4.5 figures. The parser native row drops from 108,354 QPS (15.45x) to 58,111 QPS (2.00x): trunk's native parser returned a lazy wrapper, so the parse-only benchmark never built the tree. This branch materializes the full WP_Parser_Node tree eagerly, so the number now reflects producing a complete AST. The lexer pure-PHP row rises (71,553 -> 178,409 QPS) thanks to the lexer optimizations on this branch, narrowing the native lexer speedup to 2.00x. Note the default-CLI (no JIT) methodology and that under opcache + tracing JIT the native edge narrows further (lexer ~1.08x, parser ~1.13x). --- packages/php-ext-wp-mysql-parser/README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/packages/php-ext-wp-mysql-parser/README.md b/packages/php-ext-wp-mysql-parser/README.md index 2df0b54c..b6adf413 100644 --- a/packages/php-ext-wp-mysql-parser/README.md +++ b/packages/php-ext-wp-mysql-parser/README.md @@ -86,14 +86,22 @@ The GitHub Pages demo reads published benchmark data from: -Latest local measurement (Apple Silicon macOS, PHP 8.4.5 CLI, 2026-05-26): +Latest local measurement (Apple Silicon macOS, PHP 8.5.5 CLI, default config without JIT, 2026-06-04): | Benchmark | Implementation | Queries | QPS | Speedup | | --- | --- | ---: | ---: | ---: | -| MySQL lexer | Pure PHP | 69,577 | 71,553 | — | -| MySQL lexer | Native extension | 69,577 | 343,124 | 4.80x | -| MySQL parser | Pure PHP | 69,577 | 7,015 | — | -| MySQL parser | Native extension | 69,577 | 108,354 | 15.45x | +| MySQL lexer | Pure PHP | 69,577 | 178,409 | — | +| MySQL lexer | Native extension | 69,577 | 355,084 | 2.00x 
| +| MySQL parser | Pure PHP | 69,577 | 29,048 | — | +| MySQL parser | Native extension | 69,577 | 58,111 | 2.00x | + +The parser rows are parse-only. On this branch the native parser materializes the full +`WP_Parser_Node` tree eagerly, so the number reflects building a complete AST rather than a +deferred handle (the earlier 15x figure measured a lazy parse that never built the tree). + +These are default-CLI numbers, matching the published `benchmark.json` environment. The native +code is JIT-independent, while the pure-PHP path speeds up substantially under opcache + tracing +JIT, so the native edge there narrows to roughly 1.1x (lexer ~1.08x, parser ~1.13x). That file should be updated whenever a new extension build or benchmark environment is published. From 096a44e8431b68cc4598aa06c7e82a42ece769d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Thu, 4 Jun 2026 20:55:20 +0200 Subject: [PATCH 27/30] Fall back to pure PHP when the native extension grammar ABI mismatches The PHP bridge now exports the parser grammar as per-token branch selectors (`branches_for_token` / `nullable_branches`) instead of the previous coarse `lookahead_is_match_possible` table - a backward-incompatible change to the ABI shared between the extension binary and the PHP driver. Until now load.php selected the native lexer/parser purely on class existence, so an extension built against a different grammar ABI - most commonly a plugin update that outpaces the installed binary - would be selected and then fatal during native parser construction, with no fallback. Track grammar-ABI compatibility by the extension's minor version (the 0.x line) and bump it to 0.2.0 for this change. Gate native selection on `phpversion( 'wp_mysql_parser' )` falling within the supported line (0.2.x); the native lexer and parser are a matched pair (the native lexer emits a token stream only the native parser can consume), so select both or neither. 
An unsupported or absent version falls back cleanly to pure PHP, erring on the safe side for unknown binaries. Document the versioning contract in the extension README and add a unit test covering the gate's boundaries. --- packages/mysql-on-sqlite/src/load.php | 54 ++++++++++++++---- .../WP_MySQL_Native_Grammar_Abi_Tests.php | 55 +++++++++++++++++++ packages/php-ext-wp-mysql-parser/Cargo.lock | 2 +- packages/php-ext-wp-mysql-parser/Cargo.toml | 2 +- packages/php-ext-wp-mysql-parser/README.md | 22 ++++++++ 5 files changed, 123 insertions(+), 12 deletions(-) create mode 100644 packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Grammar_Abi_Tests.php diff --git a/packages/mysql-on-sqlite/src/load.php b/packages/mysql-on-sqlite/src/load.php index 2ab15276..fb0c8c3b 100644 --- a/packages/mysql-on-sqlite/src/load.php +++ b/packages/mysql-on-sqlite/src/load.php @@ -13,23 +13,57 @@ require_once __DIR__ . '/parser/class-wp-parser-token.php'; require_once __DIR__ . '/mysql/class-wp-mysql-token.php'; -/* - * The MySQL lexer and parser have an optional native (e.g. Rust) implementation. - * When the native extension is loaded, it pre-declares WP_MySQL_Native_Lexer / - * WP_MySQL_Native_Parser; otherwise we fall back to the pure-PHP classes shipped - * here. WP_MySQL_Lexer / WP_MySQL_Parser is the public entrypoint either way. +/** + * Whether the loaded "wp_mysql_parser" extension speaks a grammar ABI that this + * code supports. + * + * The native parser and PHP exchange the parser grammar via + * "wp_sqlite_mysql_native_export_grammar()"; the shape of that data is an ABI. + * Compatibility is tracked by the extension's minor version (the "x" in "0.x"): + * a backward-incompatible change to the grammar ABI bumps the minor version. + * This code supports the "0.2.x" line. A version outside the supported range - + * e.g. an older extension binary lagging a plugin update - cannot exchange the + * grammar safely and must fall back to the pure-PHP path. 
+ * + * Keep the supported range in sync with the extension's "Cargo.toml" version + * (see "packages/php-ext-wp-mysql-parser/README.md"). + * + * @param string|false $extension_version Version reported by "phpversion( 'wp_mysql_parser' )". + * @return bool Whether the native lexer/parser path can be used. */ -if ( class_exists( 'WP_MySQL_Native_Lexer', false ) ) { - require_once __DIR__ . '/mysql/native/class-wp-mysql-lexer.php'; -} else { - require_once __DIR__ . '/mysql/class-wp-mysql-lexer.php'; +function wp_sqlite_mysql_native_grammar_abi_supported( $extension_version ): bool { + if ( ! is_string( $extension_version ) ) { + return false; + } + return version_compare( $extension_version, '0.2.0', '>=' ) + && version_compare( $extension_version, '0.3.0', '<' ); } -if ( class_exists( 'WP_MySQL_Native_Parser', false ) ) { +/* + * The MySQL lexer and parser have an optional native (e.g. Rust) implementation, + * registered by the "wp_mysql_parser" extension. When loaded, it pre-declares + * WP_MySQL_Native_Lexer / WP_MySQL_Native_Parser; otherwise we use the pure-PHP + * classes shipped here. WP_MySQL_Lexer / WP_MySQL_Parser is the public entrypoint + * either way. + * + * The native lexer and parser are a matched pair - the native lexer emits a token + * stream that only the native parser can consume - so they are selected together + * or not at all. We only select the native path when the loaded extension speaks a + * grammar ABI this code supports; otherwise (including a stale extension binary) we + * fall back to the pure-PHP path cleanly instead of failing at parse time. + */ +$wp_sqlite_use_native_parser = + class_exists( 'WP_MySQL_Native_Lexer', false ) + && class_exists( 'WP_MySQL_Native_Parser', false ) + && wp_sqlite_mysql_native_grammar_abi_supported( phpversion( 'wp_mysql_parser' ) ); + +if ( $wp_sqlite_use_native_parser ) { + require_once __DIR__ . '/mysql/native/class-wp-mysql-lexer.php'; require_once __DIR__ . 
'/mysql/native/mysql-rust-bridge.php'; require_once __DIR__ . '/mysql/native/trait-wp-mysql-native-parser-impl.php'; require_once __DIR__ . '/mysql/native/class-wp-mysql-parser.php'; } else { + require_once __DIR__ . '/mysql/class-wp-mysql-lexer.php'; require_once __DIR__ . '/mysql/class-wp-mysql-parser.php'; } require_once __DIR__ . '/sqlite/class-wp-sqlite-connection.php'; diff --git a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Grammar_Abi_Tests.php b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Grammar_Abi_Tests.php new file mode 100644 index 00000000..32395b7f --- /dev/null +++ b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Grammar_Abi_Tests.php @@ -0,0 +1,55 @@ +assertTrue( wp_sqlite_mysql_native_grammar_abi_supported( $version ) ); + } + + /** + * @dataProvider unsupported_versions + * @param string|false $version + */ + public function test_unsupported_versions_are_rejected( $version ): void { + $this->assertFalse( wp_sqlite_mysql_native_grammar_abi_supported( $version ) ); + } + + /** + * @return array + */ + public function supported_versions(): array { + return array( + 'minor line lower bound' => array( '0.2.0' ), + 'patch within the line' => array( '0.2.1' ), + 'higher patch' => array( '0.2.99' ), + ); + } + + /** + * @return array + */ + public function unsupported_versions(): array { + return array( + 'extension not loaded' => array( false ), + 'older ABI line' => array( '0.1.0' ), + 'older ABI line high patch' => array( '0.1.99' ), + 'next (breaking) ABI line' => array( '0.3.0' ), + 'future major' => array( '1.0.0' ), + ); + } +} diff --git a/packages/php-ext-wp-mysql-parser/Cargo.lock b/packages/php-ext-wp-mysql-parser/Cargo.lock index baf9d849..c027761e 100644 --- a/packages/php-ext-wp-mysql-parser/Cargo.lock +++ b/packages/php-ext-wp-mysql-parser/Cargo.lock @@ -1596,7 +1596,7 @@ dependencies = [ [[package]] name = "wp_mysql_parser" -version = "0.1.0" +version = "0.2.0" dependencies = [ "ext-php-rs", 
"libc", diff --git a/packages/php-ext-wp-mysql-parser/Cargo.toml b/packages/php-ext-wp-mysql-parser/Cargo.toml index 6646c110..57465d0a 100644 --- a/packages/php-ext-wp-mysql-parser/Cargo.toml +++ b/packages/php-ext-wp-mysql-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "wp_mysql_parser" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "GPL-2.0-or-later" diff --git a/packages/php-ext-wp-mysql-parser/README.md b/packages/php-ext-wp-mysql-parser/README.md index b6adf413..fca07f4c 100644 --- a/packages/php-ext-wp-mysql-parser/README.md +++ b/packages/php-ext-wp-mysql-parser/README.md @@ -4,6 +4,28 @@ When the extension is loaded before `packages/mysql-on-sqlite/src/load.php`, it registers native base classes used by the public `WP_MySQL_Lexer` and `WP_MySQL_Parser` wrappers. Without the extension, those public wrappers extend the pure-PHP implementations instead. +## Versioning and the grammar ABI + +The native parser and the PHP driver exchange the parser grammar at runtime via +`wp_sqlite_mysql_native_export_grammar()`. The shape of that data is an ABI shared +between the extension binary and the PHP code, and it can change between releases +(for example, the move from a coarse lookahead table to per-token branch selectors). + +Compatibility is tracked by the extension's **minor** version (the `x` in `0.x`): + +- **Bump the minor version on any backward-incompatible change to the grammar ABI** + (the data exchanged by `wp_sqlite_mysql_native_export_grammar()` or consumed by the + native parser). Patch releases must keep the ABI unchanged. +- The PHP side (`packages/mysql-on-sqlite/src/load.php`) pins the supported minor + line and selects the native lexer/parser only when `phpversion( 'wp_mysql_parser' )` + falls within it. A mismatch — most commonly a plugin update that outpaces the + installed extension binary — falls back cleanly to the pure-PHP path instead of + failing at parse time. 
+ +When you change the grammar ABI, bump `version` in `Cargo.toml` and update the +supported range in `wp_sqlite_mysql_native_grammar_abi_supported()` in `load.php` +together. + ## Published WASM build for Playground Published WASM builds are listed on this repository's GitHub Pages site, with manifest links and a “Run in Playground” link for each release: From 067990834bf13c4ce5e7b5a69d886a3f1ed3d7e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 5 Jun 2026 12:43:50 +0200 Subject: [PATCH 28/30] Speed up grammar construction with a worklist fixpoint and lazy selectors The grammar is rebuilt on every request (PHP's shared-nothing model resets the static cache between requests), and that build dominated the lex+parse pipeline. Cut it from ~40 ms to ~6.6 ms for a typical request, with parsing unchanged: - Replace the naive iterate-to-fixpoint FIRST/NULLABLE computation with a worklist that recomputes a rule only when a rule it references grows, plus a C-level array union. ~40 ms -> ~18 ms; the grammar output is byte-identical. - Denormalize the per-token branch selectors lazily, per rule, on first descent (ensure_rule_selector) instead of eagerly for all ~1,900 rules. A typical request touches ~7% of rules, so the build drops to ~6.6 ms. The parser materializes a rule's selector on a lookup miss, keeping the common hit path a single array access (warm parse throughput within ~1% of before). - branches_for_token / single_candidate_rules are now lazily populated; build_all_selectors() forces a full build for consumers that read the table directly (the grammar tests). - Export the eager per-rule FIRST sets to the native parser instead of the lazily-built per-token table. The native parser only needs FIRST sets (it builds its own candidates from rules), so it skips the PHP denormalization entirely and no longer depends on a forced full build. 
- Reuse one parser across the parser benchmark corpus (resetting tokens), mirroring the driver, and refresh the published native-extension numbers. --- .../src/mysql/native/mysql-rust-bridge.php | 7 +- .../src/parser/class-wp-parser-grammar.php | 274 ++++++++++++------ .../src/parser/class-wp-parser.php | 35 ++- .../tests/parser/WP_Parser_Grammar_Tests.php | 34 ++- .../tests/tools/run-parser-benchmark.php | 14 +- packages/php-ext-wp-mysql-parser/README.md | 33 ++- packages/php-ext-wp-mysql-parser/src/lib.rs | 29 +- 7 files changed, 297 insertions(+), 129 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php b/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php index 13f8e52f..c8e14e07 100644 --- a/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php +++ b/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php @@ -12,10 +12,15 @@ * @return array */ function wp_sqlite_mysql_native_export_grammar( WP_Parser_Grammar $grammar ): array { + // The native parser only needs each rule's FIRST set to decide early + // whether a rule can start with the current token; it builds its own + // branch candidates from `rules`. Export the eagerly-computed FIRST sets + // directly so the lazy per-token selector table is never materialized for + // the native path. 
return array( 'highest_terminal_id' => $grammar->highest_terminal_id, 'rules' => $grammar->rules, - 'branches_for_token' => $grammar->branches_for_token, + 'first_sets' => $grammar->first_sets, 'nullable_branches' => $grammar->nullable_branches, 'rule_names' => $grammar->rule_names, 'fragment_ids' => $grammar->fragment_ids, diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index e41991fb..8368d303 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -78,6 +78,31 @@ class WP_Parser_Grammar { */ private $cached_rule_ids = array(); + /** + * Per-rule FIRST sets from the fixpoint. + * + * Kept so per-rule selectors can be denormalized lazily on first use, and + * exported to the native parser (which needs only each rule's FIRST set, + * not the lazily-built per-token selector table). + * + * @var array> + */ + public $first_sets = array(); + + /** + * Per-rule NULLABLE flags from the fixpoint. + * + * @var array + */ + private $rule_nullable = array(); + + /** + * Rules whose branch selector has already been built. + * + * @var array + */ + private $rule_selector_built = array(); + public function __construct( array $rules ) { $this->inflate( $rules ); } @@ -275,122 +300,193 @@ private function build_branch_selectors() { $first_sets[ $rule_id ] = array(); } - // Iterate to fixpoint. FIRST and NULLABLE set monotonically grow. - do { - $changed = false; - foreach ( $rule_ids as $rule_id ) { - $branches = $rules[ $rule_id ]; - foreach ( $branches as $branch ) { - $branch_nullable = true; - foreach ( $branch as $symbol ) { - if ( $empty_rule === $symbol ) { - // ε: contributes nothing to FIRST, stays nullable. - continue; - } - if ( $symbol < $low_nt ) { - // Terminal. - if ( ! 
isset( $first_sets[ $rule_id ][ $symbol ] ) ) { - $first_sets[ $rule_id ][ $symbol ] = true; - $changed = true; - } - $branch_nullable = false; - break; - } - // Non-terminal. - foreach ( $first_sets[ $symbol ] as $tid => $_ ) { - if ( ! isset( $first_sets[ $rule_id ][ $tid ] ) ) { - $first_sets[ $rule_id ][ $tid ] = true; - $changed = true; - } - } - if ( ! $nullable[ $symbol ] ) { - $branch_nullable = false; - break; - } - } - if ( $branch_nullable && ! $nullable[ $rule_id ] ) { - $nullable[ $rule_id ] = true; - $changed = true; + // Reverse-dependency map: for each non-terminal, the rules that + // reference it. FIRST/NULLABLE grow monotonically, so a rule can only + // be affected when one of the rules it references grows. + $dependents = array(); + foreach ( $rule_ids as $rule_id ) { + $seen = array(); + foreach ( $rules[ $rule_id ] as $branch ) { + foreach ( $branch as $symbol ) { + if ( $symbol >= $low_nt && ! isset( $seen[ $symbol ] ) ) { + $seen[ $symbol ] = true; + $dependents[ $symbol ][] = $rule_id; } } } - } while ( $changed ); + } - // Build per-(rule, token) branch indices. - foreach ( $rule_ids as $rule_id ) { - $branches = $rules[ $rule_id ]; - $selector = array(); - $nullable_branch_ids = array(); - foreach ( $branches as $idx => $branch ) { - $branch_first = array(); + // Worklist fixpoint. Recompute a rule's FIRST/NULLABLE only when a rule + // it references has grown, instead of rescanning every rule on every + // pass until the whole grammar stabilizes. + $queued = array_fill_keys( $rule_ids, true ); + $worklist = $rule_ids; + while ( $worklist ) { + $rule_id = array_pop( $worklist ); + unset( $queued[ $rule_id ] ); + + $first = $first_sets[ $rule_id ]; + $before = count( $first ); + $was_nullable = $nullable[ $rule_id ]; + $is_nullable = $was_nullable; + foreach ( $rules[ $rule_id ] as $branch ) { $branch_nullable = true; foreach ( $branch as $symbol ) { if ( $empty_rule === $symbol ) { + // ε: contributes nothing to FIRST, stays nullable. 
continue; } if ( $symbol < $low_nt ) { - $branch_first[ $symbol ] = true; - $branch_nullable = false; + // Terminal. + $first[ $symbol ] = true; + $branch_nullable = false; break; } - foreach ( $first_sets[ $symbol ] as $tid => $_ ) { - $branch_first[ $tid ] = true; - } + // Non-terminal: union FIRST(symbol) in one operation. + $first += $first_sets[ $symbol ]; if ( ! $nullable[ $symbol ] ) { $branch_nullable = false; break; } } - foreach ( $branch_first as $tid => $_ ) { - $selector[ $tid ][] = $idx; - } if ( $branch_nullable ) { - $nullable_branch_ids[] = $idx; + $is_nullable = true; } } - // Nullable branches also match when the current token is not in - // any branch's FIRST set. Fold them into every populated entry - // so the runtime lookup is a single array access. - if ( $nullable_branch_ids ) { - $merged = array(); - foreach ( $selector as $tid => $idx_list ) { - $merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids ); + // Re-enqueue dependents only when this rule actually grew. + if ( count( $first ) > $before || ( $is_nullable && ! $was_nullable ) ) { + $first_sets[ $rule_id ] = $first; + $nullable[ $rule_id ] = $is_nullable; + if ( isset( $dependents[ $rule_id ] ) ) { + foreach ( $dependents[ $rule_id ] as $dependent ) { + if ( ! isset( $queued[ $dependent ] ) ) { + $queued[ $dependent ] = true; + $worklist[] = $dependent; + } + } } - $selector = $merged; + } + } + + // FIRST/NULLABLE are now final. A rule is nullable exactly when it has + // a nullable branch, so publish nullable_branches eagerly; the parser's + // nullable fallback consults it for every rule. branches_for_token and + // single_candidate_rules are built lazily per rule (ensure_rule_selector) + // because a typical query touches only a few percent of all rules, so + // denormalizing the whole grammar up front is mostly wasted work. 
+ $this->first_sets = $first_sets; + $this->rule_nullable = $nullable; + foreach ( $nullable as $rule_id => $is_nullable ) { + if ( $is_nullable ) { $this->nullable_branches[ $rule_id ] = true; } - if ( $selector ) { - // Embed the branch symbol sequences directly so the parser can - // iterate candidate branches without a $branches[$idx] lookup on - // every attempt. Many tokens in a rule share the same branch-id - // list, so deduplicate by signature and let copy-on-write share - // one sequences array across them. This dedup matters: unshared, - // the table would be ~35 MiB on the MySQL grammar; shared, it is - // a few MiB, built once per process (not per query). - $by_signature = array(); - $all_single_candidates = true; - foreach ( $selector as $tid => $idx_list ) { - if ( 1 !== count( $idx_list ) ) { - $all_single_candidates = false; - } - $sig = implode( ',', $idx_list ); - if ( isset( $by_signature[ $sig ] ) ) { - $selector[ $tid ] = $by_signature[ $sig ]; - } else { - $seqs = array(); - foreach ( $idx_list as $idx ) { - $seqs[] = $branches[ $idx ]; - } - $by_signature[ $sig ] = $seqs; - $selector[ $tid ] = $seqs; - } + } + } + + /** + * Build the per-token branch selector for one rule on first use. + * + * Denormalizes the rule's branches into `token_id => branch sequences[]` + * from the precomputed FIRST/NULLABLE sets, populating branches_for_token + * (and single_candidate_rules). Memoized, so repeated calls are cheap. 
+ * + * @param int $rule_id + */ + public function ensure_rule_selector( $rule_id ): void { + if ( isset( $this->rule_selector_built[ $rule_id ] ) ) { + return; + } + $this->rule_selector_built[ $rule_id ] = true; + + $low_nt = $this->lowest_non_terminal_id; + $empty_rule = self::EMPTY_RULE_ID; + $first_sets = $this->first_sets; + $nullable = $this->rule_nullable; + $branches = $this->rules[ $rule_id ]; + $selector = array(); + $nullable_branch_ids = array(); + foreach ( $branches as $idx => $branch ) { + $branch_first = array(); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + continue; + } + if ( $symbol < $low_nt ) { + $branch_first[ $symbol ] = true; + $branch_nullable = false; + break; } - $this->branches_for_token[ $rule_id ] = $selector; - if ( $all_single_candidates ) { - $this->single_candidate_rules[ $rule_id ] = true; + $branch_first += $first_sets[ $symbol ]; + if ( ! $nullable[ $symbol ] ) { + $branch_nullable = false; + break; } } + foreach ( $branch_first as $tid => $_ ) { + $selector[ $tid ][] = $idx; + } + if ( $branch_nullable ) { + $nullable_branch_ids[] = $idx; + } + } + + // Nullable branches also match when the current token is not in + // any branch's FIRST set. Fold them into every populated entry + // so the runtime lookup is a single array access. + if ( $nullable_branch_ids ) { + // nullable_branches is already published eagerly from the fixpoint; + // here we only fold the nullable branches into each selector entry. + $merged = array(); + foreach ( $selector as $tid => $idx_list ) { + $merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids ); + } + $selector = $merged; + } + if ( $selector ) { + // Embed the branch symbol sequences directly so the parser can + // iterate candidate branches without a $branches[$idx] lookup on + // every attempt. 
Many tokens in a rule share the same branch-id + // list, so deduplicate by signature and let copy-on-write share + // one sequences array across them. This dedup matters: unshared, + // the table would be ~35 MiB on the MySQL grammar; shared, it is + // a few MiB, built once per process (not per query). + $by_signature = array(); + $all_single_candidates = true; + foreach ( $selector as $tid => $idx_list ) { + if ( 1 !== count( $idx_list ) ) { + $all_single_candidates = false; + } + $sig = isset( $idx_list[1] ) ? implode( ',', $idx_list ) : $idx_list[0]; + if ( isset( $by_signature[ $sig ] ) ) { + $selector[ $tid ] = $by_signature[ $sig ]; + } else { + $seqs = array(); + foreach ( $idx_list as $idx ) { + $seqs[] = $branches[ $idx ]; + } + $by_signature[ $sig ] = $seqs; + $selector[ $tid ] = $seqs; + } + } + $this->branches_for_token[ $rule_id ] = $selector; + if ( $all_single_candidates ) { + $this->single_candidate_rules[ $rule_id ] = true; + } + } + } + + /** + * Eagerly build every rule's selector. + * + * The pure-PHP parser builds selectors lazily and the native bridge exports + * the FIRST sets instead, so this is only for consumers that read the full + * branches_for_token table directly (currently the grammar tests). + */ + public function build_all_selectors(): void { + foreach ( array_keys( $this->rules ) as $rule_id ) { + $this->ensure_rule_selector( $rule_id ); } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 992a01ae..7c855dd5 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -24,14 +24,18 @@ class WP_Parser { private $select_statement_rule_id; private $single_candidate_rules; + // Rules whose selector has been pulled from the grammar into the caches + // above. Selectors are built lazily on first descent into a rule. 
+ private $built_rules = array(); + public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { $this->grammar = $grammar; $this->rule_names = $grammar->rule_names; $this->fragment_ids = $grammar->fragment_ids; - $this->branches_for_token = $grammar->branches_for_token; + $this->branches_for_token = array(); $this->nullable_branches = $grammar->nullable_branches; $this->highest_terminal_id = $grammar->highest_terminal_id; - $this->single_candidate_rules = $grammar->single_candidate_rules; + $this->single_candidate_rules = array(); // The INTO negative-lookahead only fires for selectStatement. Cache // the rule id so the per-call check is an int compare instead of a @@ -91,16 +95,31 @@ private function parse_recursive( $rule_id ) { $position = $this->position; // Narrow the set of branches worth trying using the precomputed FIRST - // sets. When no entry exists for the current token but the rule is - // nullable, all candidate branches would match empty, so we return - // immediately without entering any branch. + // sets. branches_for_token is built lazily per rule, so a lookup miss + // means either "this token cannot start the rule" or "the rule is not + // denormalized yet". The hit path stays a single array access; only a + // miss consults built_rules and builds the rule's selector on first touch. $tid = $tokens[ $position ]->id; if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; - } elseif ( isset( $this->nullable_branches[ $rule_id ] ) ) { - return true; + } elseif ( isset( $this->built_rules[ $rule_id ] ) ) { + // Rule already built; this token simply does not start it. + return isset( $this->nullable_branches[ $rule_id ] ); } else { - return false; + // First descent into this rule: build its selector, then resolve. 
+ $this->built_rules[ $rule_id ] = true; + $this->grammar->ensure_rule_selector( $rule_id ); + if ( isset( $this->grammar->branches_for_token[ $rule_id ] ) ) { + $this->branches_for_token[ $rule_id ] = $this->grammar->branches_for_token[ $rule_id ]; + if ( isset( $this->grammar->single_candidate_rules[ $rule_id ] ) ) { + $this->single_candidate_rules[ $rule_id ] = true; + } + } + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } else { + return isset( $this->nullable_branches[ $rule_id ] ); + } } $highest_terminal_id = $this->highest_terminal_id; diff --git a/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php b/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php index 7db79e2a..1861159d 100644 --- a/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php +++ b/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php @@ -27,13 +27,17 @@ class WP_Parser_Grammar_Tests extends TestCase { * @param array $grammar Branches by index; each branch is an int[]. */ private function build_grammar( int $rules_offset, array $names, array $grammar ): WP_Parser_Grammar { - return new WP_Parser_Grammar( + $g = new WP_Parser_Grammar( array( 'rules_offset' => $rules_offset, 'rules_names' => $names, 'grammar' => $grammar, ) ); + // Selectors are denormalized lazily per rule; force a full build so the + // assertions below can read the complete branches_for_token table. 
+ $g->build_all_selectors(); + return $g; } public function test_strip_epsilon_markers_and_nullable_fallback(): void { @@ -151,8 +155,36 @@ public function test_merge_sorted_dedupes_and_preserves_ascending_order(): void $this->assertSame( array( 0, 1 ), $merge->invoke( null, array( 0, 1 ), array( 1 ) ) ); } + public function test_lazy_selector_matches_full_build(): void { + // child ::= A | B ; top ::= child (A=1, B=2) + $g = $this->build_grammar( + 10, + array( 'top', 'child' ), + array( + array( array( 11 ) ), + array( array( 1 ), array( 2 ) ), + ) + ); + $expected = $g->branches_for_token[10]; + + // A fresh grammar that never forces a full build must produce the same + // selector for a rule the moment it is requested, and be idempotent. + $lazy = new WP_Parser_Grammar( + array( + 'rules_offset' => 10, + 'rules_names' => array( 'top', 'child' ), + 'grammar' => array( array( array( 11 ) ), array( array( 1 ), array( 2 ) ) ), + ) + ); + $this->assertArrayNotHasKey( 10, $lazy->branches_for_token ); + $lazy->ensure_rule_selector( 10 ); + $lazy->ensure_rule_selector( 10 ); + $this->assertSame( $expected, $lazy->branches_for_token[10] ); + } + public function test_real_mysql_grammar_invariants(): void { $g = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); + $g->build_all_selectors(); // Epsilon markers are fully stripped from every branch. The parser's // end-of-input sentinel relies on no real branch symbol being 0. 
diff --git a/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php b/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php index 6b77ea89..7df9029e 100644 --- a/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php +++ b/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php @@ -64,7 +64,11 @@ function get_stats( $total, $failures, $exceptions ) { $failures = array(); $exceptions = array(); $processed = 0; -$start = microtime( true ); +// Reuse a single parser across queries, mirroring the driver +// (WP_PDO_MySQL_On_SQLite::reset_or_create_parser), which resets tokens on the +// same instance rather than constructing a fresh parser per query. +$parser = null; +$start = microtime( true ); foreach ( $queries as $query ) { try { $lexer = new WP_MySQL_Lexer( $query ); @@ -75,8 +79,12 @@ function get_stats( $total, $failures, $exceptions ) { throw new Exception( 'Failed to tokenize query: ' . $query ); } - $parser = new WP_MySQL_Parser( $grammar, $tokens ); - $ast = $parser->parse(); + if ( null === $parser ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens ); + } else { + $parser->reset_tokens( $tokens ); + } + $ast = $parser->parse(); if ( null === $ast ) { $failures[] = $query; } diff --git a/packages/php-ext-wp-mysql-parser/README.md b/packages/php-ext-wp-mysql-parser/README.md index fca07f4c..d47e0812 100644 --- a/packages/php-ext-wp-mysql-parser/README.md +++ b/packages/php-ext-wp-mysql-parser/README.md @@ -108,22 +108,33 @@ The GitHub Pages demo reads published benchmark data from: -Latest local measurement (Apple Silicon macOS, PHP 8.5.5 CLI, default config without JIT, 2026-06-04): +Latest local measurement (Apple Silicon macOS, PHP 8.5.5 CLI, over the 69,577-query corpus, +best of five runs, 2026-06-05): -| Benchmark | Implementation | Queries | QPS | Speedup | -| --- | --- | ---: | ---: | ---: | -| MySQL lexer | Pure PHP | 69,577 | 178,409 | — | -| MySQL lexer | Native extension | 69,577 | 355,084 | 2.00x | -| MySQL parser 
| Pure PHP | 69,577 | 29,048 | — | -| MySQL parser | Native extension | 69,577 | 58,111 | 2.00x | +**Without JIT:** -The parser rows are parse-only. On this branch the native parser materializes the full +| Benchmark | Pure PHP [QPS] | Native [QPS] | Speedup | +| --- | ---: | ---: | ---: | +| MySQL lexer | 178,619 | 354,058 | 1.98x | +| MySQL parser | 28,640 | 60,119 | 2.10x | + +**With opcache + tracing JIT:** + +| Benchmark | Pure PHP [QPS] | Native [QPS] | Speedup | +| --- | ---: | ---: | ---: | +| MySQL lexer | 332,974 | 364,365 | 1.09x | +| MySQL parser | 50,088 | 60,253 | 1.20x | + +The parser rows are parse-only and reuse a single parser instance across the corpus (resetting +tokens per query), mirroring the driver, which reuses its parser across a request's queries. +On this branch the native parser materializes the full `WP_Parser_Node` tree eagerly, so the number reflects building a complete AST rather than a deferred handle (the earlier 15x figure measured a lazy parse that never built the tree). -These are default-CLI numbers, matching the published `benchmark.json` environment. The native -code is JIT-independent, while the pure-PHP path speeds up substantially under opcache + tracing -JIT, so the native edge there narrows to roughly 1.1x (lexer ~1.08x, parser ~1.13x). +The native code is essentially JIT-independent, while the pure-PHP path speeds up substantially +under opcache + tracing JIT — so the native edge narrows from roughly 2x to about 1.1x for the +lexer and 1.2x for the parser. The published `benchmark.json` environment matches the without-JIT +numbers. That file should be updated whenever a new extension build or benchmark environment is published. 
diff --git a/packages/php-ext-wp-mysql-parser/src/lib.rs b/packages/php-ext-wp-mysql-parser/src/lib.rs index 3bf92c22..8425ac35 100644 --- a/packages/php-ext-wp-mysql-parser/src/lib.rs +++ b/packages/php-ext-wp-mysql-parser/src/lib.rs @@ -1337,11 +1337,11 @@ fn export_grammar(grammar_zval: &mut Zval) -> PhpResult> { .and_then(Zval::array) .ok_or_else(|| php_error("Missing grammar rules"))?, )?; - let parsed_first_sets = parse_branches_for_token_first_sets( + let parsed_first_sets = parse_first_sets( array - .get("branches_for_token") + .get("first_sets") .and_then(Zval::array) - .ok_or_else(|| php_error("Missing grammar branches_for_token"))?, + .ok_or_else(|| php_error("Missing grammar first_sets"))?, )?; let parsed_nullable = parse_id_set( array @@ -1516,22 +1516,19 @@ fn parse_rules(array: &ZendHashTable) -> PhpResult>>> Ok(rules) } -/// Build a per-rule FIRST set from `branches_for_token`, which is keyed -/// `[rule_id => [token_id => array]]`. Only the inner keys -/// (the token ids) are needed here; the branch sequences are the -/// pure-PHP parser's per-token candidate set, irrelevant to the native -/// parser's early-bailout. -fn parse_branches_for_token_first_sets( - array: &ZendHashTable, -) -> PhpResult>> { +/// Parse the per-rule FIRST sets, keyed `[rule_id => [token_id => true]]`. +/// The native parser uses these to decide early whether a rule can start +/// with the current token; it builds its own branch candidates from +/// `rules`, so the pure-PHP parser's per-token selector table is not needed. 
+fn parse_first_sets(array: &ZendHashTable) -> PhpResult<HashMap<i64, HashSet<i64>>> { let mut first_sets = HashMap::new(); - for (rule_key, selector_zval) in array { + for (rule_key, tokens_zval) in array { let rule_id = array_key_to_i64(rule_key)?; - let selector = selector_zval + let tokens = tokens_zval .array() - .ok_or_else(|| php_error("Grammar branches_for_token entry must be an array"))?; - let mut set = HashSet::with_capacity(selector.len()); - for (token_key, _) in selector { + .ok_or_else(|| php_error("Grammar first_sets entry must be an array"))?; + let mut set = HashSet::with_capacity(tokens.len()); + for (token_key, _) in tokens { set.insert(array_key_to_i64(token_key)?); } first_sets.insert(rule_id, set); From 7368e342d3c4958ea4988c1c86142d9256fd3703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 5 Jun 2026 14:59:00 +0200 Subject: [PATCH 29/30] Unify unit-test CI into one matrix and disable Xdebug Merge the "PHPUnit Tests" (pure-PHP) and "MySQL Parser Extension Tests" workflows into a single "PHPUnit Tests" matrix that runs the mysql-on-sqlite suite with and without the native Rust parser extension: pure on PHP 7.2-8.5, plus the extension on PHP 8.0+ (its minimum). Job names read "PHP 8.2 / SQLite 3.45.1" and "PHP 8.2 + ext-wp-mysql-parser / SQLite 3.45.1". This drops the redundant pure-on-extension jobs (the old extension workflow re-ran the plain suite on 7.2-7.4, duplicating "PHPUnit Tests") and removes the reusable phpunit-tests-run.yml. The native jobs build the extension in release mode (cargo build --release) so the suite exercises it at realistic speed rather than the slow debug build. All setup-php steps now pass `coverage: none`. setup-php enables Xdebug by default, and the old pure-suite path left it on, instrumenting every call and running the suite ~4x slower (PHP 7.3: ~59s -> ~14s) while no coverage report was ever produced or consumed. Also set `coverage: none` on the MySQL Proxy and release-publish PHP setups. 
The merged workflow is path-filtered to the parser/driver/extension packages (plus root composer) like the extension workflow was, and triggers on push to trunk (the old phpunit-tests trigger referenced a non-existent "main" branch). --- .../mysql-parser-extension-tests.yml | 189 ------------------ .github/workflows/mysql-proxy-tests.yml | 1 + .github/workflows/phpunit-tests-run.yml | 113 ----------- .github/workflows/phpunit-tests.yml | 189 +++++++++++++++--- .github/workflows/release-publish.yml | 1 + 5 files changed, 161 insertions(+), 332 deletions(-) delete mode 100644 .github/workflows/mysql-parser-extension-tests.yml delete mode 100644 .github/workflows/phpunit-tests-run.yml diff --git a/.github/workflows/mysql-parser-extension-tests.yml b/.github/workflows/mysql-parser-extension-tests.yml deleted file mode 100644 index 45425bb8..00000000 --- a/.github/workflows/mysql-parser-extension-tests.yml +++ /dev/null @@ -1,189 +0,0 @@ -name: MySQL Parser Extension Tests - -on: - push: - branches: - - trunk - paths: - - '.github/workflows/mysql-parser-extension-tests.yml' - - 'packages/mysql-on-sqlite/**' - - 'packages/php-ext-wp-mysql-parser/**' - pull_request: - paths: - - '.github/workflows/mysql-parser-extension-tests.yml' - - 'packages/mysql-on-sqlite/**' - - 'packages/php-ext-wp-mysql-parser/**' - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -# Disable permissions for all available scopes by default. -# Any needed permissions should be configured at the job level. -permissions: {} - -jobs: - extension-tests: - name: PHP ${{ matrix.php }} / ${{ matrix.coverage }} / ubuntu-latest - runs-on: ubuntu-latest - timeout-minutes: 30 - permissions: - contents: read # Required to clone the repo. 
- strategy: - fail-fast: false - matrix: - include: - - php: '7.2' - sqlite: '3.27.0' - native: false - coverage: SQLite integration - - php: '7.3' - sqlite: '3.31.1' - native: false - coverage: SQLite integration - - php: '7.4' - sqlite: '3.34.1' - native: false - coverage: SQLite integration - - php: '8.0' - sqlite: '3.37.0' - native: true - coverage: SQLite integration + Rust extension - - php: '8.1' - sqlite: '3.40.1' - native: true - coverage: SQLite integration + Rust extension - - php: '8.2' - sqlite: '3.45.1' - native: true - coverage: SQLite integration + Rust extension - - php: '8.3' - sqlite: '3.46.1' - native: true - coverage: SQLite integration + Rust extension - - php: '8.4' - sqlite: '3.51.2' - native: true - coverage: SQLite integration + Rust extension - - php: '8.5' - sqlite: latest - native: true - coverage: SQLite integration + Rust extension - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up SQLite - run: | - VERSION='${{ matrix.sqlite }}' - if [ "$VERSION" = 'latest' ]; then - TAG='release' - else - TAG="version-${VERSION}" - fi - SQLITE_SOURCE="https://sqlite.org/src/tarball/sqlite.tar.gz?r=${TAG}" - SQLITE_MIRROR="https://github.com/sqlite/sqlite/archive/refs/tags/${TAG}.tar.gz" - DOWNLOADED=0 - for url in "$SQLITE_SOURCE" "$SQLITE_MIRROR"; do - for attempt in 1 2 3 4 5; do - if wget -O sqlite.tar.gz "$url"; then - DOWNLOADED=1 - break 2 - fi - if [ "$attempt" -lt 5 ]; then - sleep $(( attempt * 10 )) - fi - done - done - if [ "$DOWNLOADED" -ne 1 ]; then - exit 1 - fi - tar xzf sqlite.tar.gz - if [ ! -d sqlite ]; then - SQLITE_DIR=$(find . 
-maxdepth 1 -type d -name 'sqlite-*' | head -n 1) - if [ -z "$SQLITE_DIR" ]; then - exit 1 - fi - mv "$SQLITE_DIR" sqlite - fi - cd sqlite - ./configure --prefix=/usr/local CFLAGS="-DSQLITE_ENABLE_COLUMN_METADATA -DSQLITE_ENABLE_FTS5 -DSQLITE_USE_URI -DSQLITE_ENABLE_JSON1" LDFLAGS="-lm" - make -j$(nproc) - sudo make install - sudo ldconfig - - - name: Set up PHP - uses: shivammathur/setup-php@v2 - with: - php-version: ${{ matrix.php }} - coverage: none - tools: phpunit-polyfills - - - name: Verify SQLite version in PHP - run: | - EXPECTED='${{ matrix.sqlite }}' - if [ "$EXPECTED" = 'latest' ]; then - EXPECTED=$(cat sqlite/VERSION) - fi - PDO=$(php -r "echo (new PDO('sqlite::memory'))->query('SELECT SQLITE_VERSION();')->fetch()[0];") - echo "Expected SQLite version: $EXPECTED" - echo "PHP PDO SQLite version: $PDO" - if [ "$EXPECTED" != "$PDO" ]; then - echo "Error: Expected SQLite version $EXPECTED, but PHP PDO uses $PDO" - exit 1 - fi - - - name: Set up Rust - if: matrix.native - uses: dtolnay/rust-toolchain@stable - - - name: Install native build dependencies - if: matrix.native - run: | - sudo apt-get update - sudo apt-get install -y libclang-dev - echo "PHP_CONFIG=$(command -v php-config)" >> "$GITHUB_ENV" - LIBCLANG_SO="$(find /usr/lib -name 'libclang.so*' | head -n 1)" - echo "LIBCLANG_PATH=$(dirname "$LIBCLANG_SO")" >> "$GITHUB_ENV" - - - name: Install Composer dependencies (root) - uses: ramsey/composer-install@v3 - with: - ignore-cache: "yes" - composer-options: "--optimize-autoloader" - - - name: Install Composer dependencies (mysql-on-sqlite) - uses: ramsey/composer-install@v3 - with: - working-directory: packages/mysql-on-sqlite - ignore-cache: "yes" - composer-options: "--optimize-autoloader" - - - name: Check Rust formatting - if: matrix.php == '8.2' && matrix.native - run: cargo fmt --check - working-directory: packages/php-ext-wp-mysql-parser - - - name: Build parser extension - if: matrix.native - run: cargo build - working-directory: 
packages/php-ext-wp-mysql-parser - - - name: Verify native parser extension - if: matrix.native - run: php -d extension="$GITHUB_WORKSPACE/packages/php-ext-wp-mysql-parser/target/debug/libwp_mysql_parser.so" tests/tools/verify-native-parser-extension.php - working-directory: packages/mysql-on-sqlite - - - name: Run full PHPUnit suite with parser extension - if: matrix.native - env: - WP_SQLITE_REQUIRE_NATIVE_PARSER_EXTENSION: '1' - run: php -d extension="$GITHUB_WORKSPACE/packages/php-ext-wp-mysql-parser/target/debug/libwp_mysql_parser.so" ./vendor/bin/phpunit -c ./phpunit.xml.dist - working-directory: packages/mysql-on-sqlite - - - name: Run full PHPUnit suite - if: ${{ ! matrix.native }} - run: php ./vendor/bin/phpunit -c ./phpunit.xml.dist - working-directory: packages/mysql-on-sqlite diff --git a/.github/workflows/mysql-proxy-tests.yml b/.github/workflows/mysql-proxy-tests.yml index e24c36f3..6e6a0afa 100644 --- a/.github/workflows/mysql-proxy-tests.yml +++ b/.github/workflows/mysql-proxy-tests.yml @@ -26,6 +26,7 @@ jobs: uses: shivammathur/setup-php@v2 with: php-version: '7.4' + coverage: none - name: Install Composer dependencies uses: ramsey/composer-install@v3 diff --git a/.github/workflows/phpunit-tests-run.yml b/.github/workflows/phpunit-tests-run.yml deleted file mode 100644 index 2eec8ee2..00000000 --- a/.github/workflows/phpunit-tests-run.yml +++ /dev/null @@ -1,113 +0,0 @@ -name: Run PHPUnit tests - -on: - workflow_call: - inputs: - os: - description: 'Operating system to run tests on' - required: false - type: 'string' - default: 'ubuntu-latest' - php: - description: 'The version of PHP to use, in the format of X.Y' - required: true - type: 'string' - sqlite: - description: 'SQLite version to install (e.g., 3.24.0). Leave empty for latest version.' - required: false - type: 'string' - default: 'latest' -env: - LOCAL_PHP: ${{ inputs.php }}-fpm - -# Disable permissions for all available scopes by default. 
-# Any needed permissions should be configured at the job level. -permissions: {} - -jobs: - phpunit-tests: - name: ${{ inputs.os }} - runs-on: ${{ inputs.os }} - timeout-minutes: 20 - permissions: - contents: read # Required to clone the repo. - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up SQLite - run: | - VERSION='${{ inputs.sqlite }}' - if [ "$VERSION" = 'latest' ]; then - TAG='release' - else - TAG="version-${VERSION}" - fi - SQLITE_SOURCE="https://sqlite.org/src/tarball/sqlite.tar.gz?r=${TAG}" - SQLITE_MIRROR="https://github.com/sqlite/sqlite/archive/refs/tags/${TAG}.tar.gz" - DOWNLOADED=0 - for url in "$SQLITE_SOURCE" "$SQLITE_MIRROR"; do - for attempt in 1 2 3 4 5; do - if wget -O sqlite.tar.gz "$url"; then - DOWNLOADED=1 - break 2 - fi - if [ "$attempt" -lt 5 ]; then - sleep $(( attempt * 10 )) - fi - done - done - if [ "$DOWNLOADED" -ne 1 ]; then - exit 1 - fi - tar xzf sqlite.tar.gz - if [ ! -d sqlite ]; then - SQLITE_DIR=$(find . -maxdepth 1 -type d -name 'sqlite-*' | head -n 1) - if [ -z "$SQLITE_DIR" ]; then - exit 1 - fi - mv "$SQLITE_DIR" sqlite - fi - cd sqlite - ./configure --prefix=/usr/local CFLAGS="-DSQLITE_ENABLE_COLUMN_METADATA -DSQLITE_ENABLE_FTS5 -DSQLITE_USE_URI -DSQLITE_ENABLE_JSON1" LDFLAGS="-lm" - make -j$(nproc) - sudo make install - sudo ldconfig - - - name: Set up PHP - uses: shivammathur/setup-php@v2 - with: - php-version: '${{ inputs.php }}' - tools: phpunit-polyfills - - - name: Verify SQLite version in PHP - run: | - EXPECTED='${{ inputs.sqlite }}' - if [ "$EXPECTED" = 'latest' ]; then - EXPECTED=$(cat sqlite/VERSION) - fi - PDO=$(php -r "echo (new PDO('sqlite::memory'))->query('SELECT SQLITE_VERSION();')->fetch()[0];") - echo "Expected SQLite version: $EXPECTED" - echo "PHP PDO SQLite version: $PDO" - if [ "$EXPECTED" != "$PDO" ]; then - echo "Error: Expected SQLite version $EXPECTED, but PHP PDO uses $PDO" - exit 1 - fi - - - name: Install Composer dependencies (root) - uses: 
ramsey/composer-install@v3 - with: - ignore-cache: "yes" - composer-options: "--optimize-autoloader" - - - name: Install Composer dependencies (mysql-on-sqlite) - uses: ramsey/composer-install@v3 - with: - working-directory: packages/mysql-on-sqlite - ignore-cache: "yes" - composer-options: "--optimize-autoloader" - - - name: Run PHPUnit tests - run: php ./vendor/bin/phpunit -c ./phpunit.xml.dist - working-directory: packages/mysql-on-sqlite diff --git a/.github/workflows/phpunit-tests.yml b/.github/workflows/phpunit-tests.yml index 23293087..b0890317 100644 --- a/.github/workflows/phpunit-tests.yml +++ b/.github/workflows/phpunit-tests.yml @@ -3,8 +3,25 @@ name: PHPUnit Tests on: push: branches: - - main + - trunk + paths: + - '.github/workflows/phpunit-tests.yml' + - 'packages/mysql-on-sqlite/**' + - 'packages/php-ext-wp-mysql-parser/**' + - 'composer.json' + - 'composer.lock' pull_request: + paths: + - '.github/workflows/phpunit-tests.yml' + - 'packages/mysql-on-sqlite/**' + - 'packages/php-ext-wp-mysql-parser/**' + - 'composer.json' + - 'composer.lock' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true # Disable permissions for all available scopes by default. # Any needed permissions should be configured at the job level. @@ -12,38 +29,150 @@ permissions: {} jobs: test: - name: PHP ${{ matrix.php }} / SQLite ${{ matrix.sqlite || 'latest' }} - uses: ./.github/workflows/phpunit-tests-run.yml + # The pure-PHP parser is exercised across the full PHP/SQLite range; the + # native Rust parser extension is exercised on PHP 8.0+ (its minimum). Both + # run the same mysql-on-sqlite suite, just with a different parser engine. + name: PHP ${{ matrix.php }}${{ matrix.extension && ' + ext-wp-mysql-parser' || '' }} / SQLite ${{ matrix.sqlite }} + runs-on: ubuntu-latest + timeout-minutes: 30 permissions: contents: read # Required to clone the repo. 
- secrets: inherit strategy: fail-fast: false matrix: - os: [ ubuntu-latest ] - php: [ '7.2', '7.3', '7.4', '8.0', '8.1', '8.2', '8.3', '8.4', '8.5' ] include: - # Add specific SQLite versions for specific PHP versions here: - - php: '7.2' - sqlite: '3.27.0' # minimum version with WP_SQLITE_UNSAFE_ENABLE_UNSUPPORTED_VERSIONS - - php: '7.3' - sqlite: '3.31.1' # Ubuntu 20.04 LTS - - php: '7.4' - sqlite: '3.34.1' # Debian 11 (Bullseye), common with PHP < 8.1 - - php: '8.0' - sqlite: '3.37.0' # minimum supported version (STRICT table support), Ubuntu 22.04 LTS (3.37.2) - - php: '8.1' - sqlite: '3.40.1' # Debian 12 (Bookworm) - - php: '8.2' - sqlite: '3.45.1' # Ubuntu 24.04 LTS - - php: '8.3' - sqlite: '3.46.1' # Debian 13 (Trixie), Ubuntu >= 24.10 - - php: '8.4' - sqlite: '3.51.2' # First 2026 release - - php: '8.5' - sqlite: 'latest' - - with: - os: ${{ matrix.os }} - php: ${{ matrix.php }} - sqlite: ${{ matrix.sqlite || 'latest' }} + # Pure-PHP parser, across the supported PHP versions, each pinned to a + # representative SQLite version spanning the supported range. + - { php: '7.2', sqlite: '3.27.0', extension: false } # minimum with WP_SQLITE_UNSAFE_ENABLE_UNSUPPORTED_VERSIONS + - { php: '7.3', sqlite: '3.31.1', extension: false } # Ubuntu 20.04 LTS + - { php: '7.4', sqlite: '3.34.1', extension: false } # Debian 11 (Bullseye) + - { php: '8.0', sqlite: '3.37.0', extension: false } # minimum supported version (STRICT tables) + - { php: '8.1', sqlite: '3.40.1', extension: false } # Debian 12 (Bookworm) + - { php: '8.2', sqlite: '3.45.1', extension: false } # Ubuntu 24.04 LTS + - { php: '8.3', sqlite: '3.46.1', extension: false } # Debian 13 (Trixie) + - { php: '8.4', sqlite: '3.51.2', extension: false } # First 2026 release + - { php: '8.5', sqlite: 'latest', extension: false } + # Native Rust parser extension (requires PHP 8.0+). 
+ - { php: '8.0', sqlite: '3.37.0', extension: true } + - { php: '8.1', sqlite: '3.40.1', extension: true } + - { php: '8.2', sqlite: '3.45.1', extension: true } + - { php: '8.3', sqlite: '3.46.1', extension: true } + - { php: '8.4', sqlite: '3.51.2', extension: true } + - { php: '8.5', sqlite: 'latest', extension: true } + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up SQLite + run: | + VERSION='${{ matrix.sqlite }}' + if [ "$VERSION" = 'latest' ]; then + TAG='release' + else + TAG="version-${VERSION}" + fi + SQLITE_SOURCE="https://sqlite.org/src/tarball/sqlite.tar.gz?r=${TAG}" + SQLITE_MIRROR="https://github.com/sqlite/sqlite/archive/refs/tags/${TAG}.tar.gz" + DOWNLOADED=0 + for url in "$SQLITE_SOURCE" "$SQLITE_MIRROR"; do + for attempt in 1 2 3 4 5; do + if wget -O sqlite.tar.gz "$url"; then + DOWNLOADED=1 + break 2 + fi + if [ "$attempt" -lt 5 ]; then + sleep $(( attempt * 10 )) + fi + done + done + if [ "$DOWNLOADED" -ne 1 ]; then + exit 1 + fi + tar xzf sqlite.tar.gz + if [ ! -d sqlite ]; then + SQLITE_DIR=$(find . 
-maxdepth 1 -type d -name 'sqlite-*' | head -n 1) + if [ -z "$SQLITE_DIR" ]; then + exit 1 + fi + mv "$SQLITE_DIR" sqlite + fi + cd sqlite + ./configure --prefix=/usr/local CFLAGS="-DSQLITE_ENABLE_COLUMN_METADATA -DSQLITE_ENABLE_FTS5 -DSQLITE_USE_URI -DSQLITE_ENABLE_JSON1" LDFLAGS="-lm" + make -j$(nproc) + sudo make install + sudo ldconfig + + - name: Set up PHP + uses: shivammathur/setup-php@v2 + with: + php-version: ${{ matrix.php }} + coverage: none + tools: phpunit-polyfills + + - name: Verify SQLite version in PHP + run: | + EXPECTED='${{ matrix.sqlite }}' + if [ "$EXPECTED" = 'latest' ]; then + EXPECTED=$(cat sqlite/VERSION) + fi + PDO=$(php -r "echo (new PDO('sqlite::memory'))->query('SELECT SQLITE_VERSION();')->fetch()[0];") + echo "Expected SQLite version: $EXPECTED" + echo "PHP PDO SQLite version: $PDO" + if [ "$EXPECTED" != "$PDO" ]; then + echo "Error: Expected SQLite version $EXPECTED, but PHP PDO uses $PDO" + exit 1 + fi + + - name: Set up Rust + if: matrix.extension + uses: dtolnay/rust-toolchain@stable + + - name: Install native build dependencies + if: matrix.extension + run: | + sudo apt-get update + sudo apt-get install -y libclang-dev + echo "PHP_CONFIG=$(command -v php-config)" >> "$GITHUB_ENV" + LIBCLANG_SO="$(find /usr/lib -name 'libclang.so*' | head -n 1)" + echo "LIBCLANG_PATH=$(dirname "$LIBCLANG_SO")" >> "$GITHUB_ENV" + + - name: Install Composer dependencies (root) + uses: ramsey/composer-install@v3 + with: + ignore-cache: "yes" + composer-options: "--optimize-autoloader" + + - name: Install Composer dependencies (mysql-on-sqlite) + uses: ramsey/composer-install@v3 + with: + working-directory: packages/mysql-on-sqlite + ignore-cache: "yes" + composer-options: "--optimize-autoloader" + + - name: Check Rust formatting + if: ${{ matrix.extension && matrix.php == '8.2' }} + run: cargo fmt --check + working-directory: packages/php-ext-wp-mysql-parser + + - name: Build parser extension + if: matrix.extension + run: cargo build --release + 
working-directory: packages/php-ext-wp-mysql-parser + + - name: Verify native parser extension + if: matrix.extension + run: php -d extension="$GITHUB_WORKSPACE/packages/php-ext-wp-mysql-parser/target/release/libwp_mysql_parser.so" tests/tools/verify-native-parser-extension.php + working-directory: packages/mysql-on-sqlite + + - name: Run PHPUnit suite with parser extension + if: matrix.extension + env: + WP_SQLITE_REQUIRE_NATIVE_PARSER_EXTENSION: '1' + run: php -d extension="$GITHUB_WORKSPACE/packages/php-ext-wp-mysql-parser/target/release/libwp_mysql_parser.so" ./vendor/bin/phpunit -c ./phpunit.xml.dist + working-directory: packages/mysql-on-sqlite + + - name: Run PHPUnit suite + if: ${{ ! matrix.extension }} + run: php ./vendor/bin/phpunit -c ./phpunit.xml.dist + working-directory: packages/mysql-on-sqlite diff --git a/.github/workflows/release-publish.yml b/.github/workflows/release-publish.yml index 7df54551..d02c88e3 100644 --- a/.github/workflows/release-publish.yml +++ b/.github/workflows/release-publish.yml @@ -54,6 +54,7 @@ jobs: uses: shivammathur/setup-php@v2 with: php-version: '8.2' + coverage: none - name: Build plugin zip run: composer run build-sqlite-plugin-zip From d3716d3e024328744474c7dea6a408a0809d469b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 5 Jun 2026 15:59:42 +0200 Subject: [PATCH 30/30] Cache the Rust extension build in CI The native matrix jobs compile the extension with `cargo build --release`, which rebuilds the whole dependency tree from scratch each run. Add Swatinem/rust-cache for the parser-extension workspace so the cargo registry and target dir are cached across runs, cutting the release-compile time on warm runs without affecting the (now realistic) test-step timings. 
--- .github/workflows/phpunit-tests.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/phpunit-tests.yml b/.github/workflows/phpunit-tests.yml index b0890317..5126e2ea 100644 --- a/.github/workflows/phpunit-tests.yml +++ b/.github/workflows/phpunit-tests.yml @@ -128,6 +128,16 @@ jobs: if: matrix.extension uses: dtolnay/rust-toolchain@stable + - name: Cache Rust build + if: matrix.extension + uses: Swatinem/rust-cache@v2 + with: + workspaces: packages/php-ext-wp-mysql-parser + # Segregate by PHP version: the extension links against the PHP headers + # of the matrix's php-config, so a build cached for one PHP version is + # ABI-incompatible with another (Zend module API mismatch on load). + key: php-${{ matrix.php }} + - name: Install native build dependencies if: matrix.extension run: |