From 8c02c3f002746a86796f95e0be2574b012caa452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sat, 6 Jun 2026 00:46:13 +0200 Subject: [PATCH] Use native-backed AST nodes in Rust parser --- packages/mysql-on-sqlite/src/load.php | 1 + .../class-wp-mysql-native-parser-node.php | 174 +++++ .../src/parser/class-wp-parser-node.php | 2 +- .../WP_MySQL_Parser_Instanceof_Tests.php | 24 + .../tests/tools/run-parser-benchmark.php | 32 +- .../src/lexer_constants.rs | 27 +- packages/php-ext-wp-mysql-parser/src/lib.rs | 733 ++++++++++++++++-- 7 files changed, 911 insertions(+), 82 deletions(-) create mode 100644 packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php diff --git a/packages/mysql-on-sqlite/src/load.php b/packages/mysql-on-sqlite/src/load.php index fb0c8c3b8..37b8b84d0 100644 --- a/packages/mysql-on-sqlite/src/load.php +++ b/packages/mysql-on-sqlite/src/load.php @@ -61,6 +61,7 @@ class_exists( 'WP_MySQL_Native_Lexer', false ) require_once __DIR__ . '/mysql/native/class-wp-mysql-lexer.php'; require_once __DIR__ . '/mysql/native/mysql-rust-bridge.php'; require_once __DIR__ . '/mysql/native/trait-wp-mysql-native-parser-impl.php'; + require_once __DIR__ . '/mysql/native/class-wp-mysql-native-parser-node.php'; require_once __DIR__ . '/mysql/native/class-wp-mysql-parser.php'; } else { require_once __DIR__ . 
'/mysql/class-wp-mysql-lexer.php'; diff --git a/packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php b/packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php new file mode 100644 index 000000000..25d839c50 --- /dev/null +++ b/packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php @@ -0,0 +1,174 @@ +materialize_native_children(); + parent::append_child( $node ); + } + + /** @inheritDoc */ + public function has_child(): bool { + if ( $this->was_mutated() ) { + return parent::has_child(); + } + return wp_sqlite_mysql_native_ast_has_child( $this ); + } + + /** @inheritDoc */ + public function has_child_node( ?string $rule_name = null ): bool { + if ( $this->was_mutated() ) { + return parent::has_child_node( $rule_name ); + } + return wp_sqlite_mysql_native_ast_has_child_node( $this, $rule_name ); + } + + /** @inheritDoc */ + public function has_child_token( ?int $token_id = null ): bool { + if ( $this->was_mutated() ) { + return parent::has_child_token( $token_id ); + } + return wp_sqlite_mysql_native_ast_has_child_token( $this, $token_id ); + } + + /** @inheritDoc */ + public function get_first_child() { + if ( $this->was_mutated() ) { + return parent::get_first_child(); + } + return wp_sqlite_mysql_native_ast_get_first_child( $this ); + } + + /** @inheritDoc */ + public function get_first_child_node( ?string $rule_name = null ): ?WP_Parser_Node { + if ( $this->was_mutated() ) { + return parent::get_first_child_node( $rule_name ); + } + return wp_sqlite_mysql_native_ast_get_first_child_node( $this, $rule_name ); + } + + /** @inheritDoc */ + public function get_first_child_token( ?int $token_id = null ): ?WP_Parser_Token { + if ( $this->was_mutated() ) { + return parent::get_first_child_token( $token_id ); + } + return wp_sqlite_mysql_native_ast_get_first_child_token( $this, $token_id ); + } + + /** @inheritDoc */ + public function get_first_descendant_node( ?string $rule_name = null ): 
?WP_Parser_Node { + if ( $this->was_mutated() ) { + return parent::get_first_descendant_node( $rule_name ); + } + return wp_sqlite_mysql_native_ast_get_first_descendant_node( $this, $rule_name ); + } + + /** @inheritDoc */ + public function get_first_descendant_token( ?int $token_id = null ): ?WP_Parser_Token { + if ( $this->was_mutated() ) { + return parent::get_first_descendant_token( $token_id ); + } + return wp_sqlite_mysql_native_ast_get_first_descendant_token( $this, $token_id ); + } + + /** @inheritDoc */ + public function get_children(): array { + if ( $this->was_mutated() ) { + return parent::get_children(); + } + return wp_sqlite_mysql_native_ast_get_children( $this ); + } + + /** @inheritDoc */ + public function get_child_nodes( ?string $rule_name = null ): array { + if ( $this->was_mutated() ) { + return parent::get_child_nodes( $rule_name ); + } + return wp_sqlite_mysql_native_ast_get_child_nodes( $this, $rule_name ); + } + + /** @inheritDoc */ + public function get_child_tokens( ?int $token_id = null ): array { + if ( $this->was_mutated() ) { + return parent::get_child_tokens( $token_id ); + } + return wp_sqlite_mysql_native_ast_get_child_tokens( $this, $token_id ); + } + + /** @inheritDoc */ + public function get_descendants(): array { + if ( $this->was_mutated() ) { + return parent::get_descendants(); + } + return wp_sqlite_mysql_native_ast_get_descendants( $this ); + } + + /** @inheritDoc */ + public function get_descendant_nodes( ?string $rule_name = null ): array { + if ( $this->was_mutated() ) { + return parent::get_descendant_nodes( $rule_name ); + } + return wp_sqlite_mysql_native_ast_get_descendant_nodes( $this, $rule_name ); + } + + /** @inheritDoc */ + public function get_descendant_tokens( ?int $token_id = null ): array { + if ( $this->was_mutated() ) { + return parent::get_descendant_tokens( $token_id ); + } + return wp_sqlite_mysql_native_ast_get_descendant_tokens( $this, $token_id ); + } + + /** @inheritDoc */ + public function 
get_start(): int { + if ( $this->was_mutated() ) { + return parent::get_start(); + } + return wp_sqlite_mysql_native_ast_get_start( $this ); + } + + /** @inheritDoc */ + public function get_length(): int { + if ( $this->was_mutated() ) { + return parent::get_length(); + } + return wp_sqlite_mysql_native_ast_get_length( $this ); + } + + private function was_mutated(): bool { + return $this->was_mutated; + } + + private function materialize_native_children(): void { + if ( $this->was_mutated ) { + return; + } + + $this->children = wp_sqlite_mysql_native_ast_get_children( $this ); + $this->was_mutated = true; + if ( function_exists( 'wp_sqlite_mysql_native_ast_materialize_wrapper' ) ) { + wp_sqlite_mysql_native_ast_materialize_wrapper( $this ); + } + } +} diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index 67ff851e0..2ed046a59 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -9,7 +9,7 @@ * In this way, a parser node constitutes a recursive structure that represents * a parse (sub)tree at each level of the full grammar tree. */ -final class WP_Parser_Node { +class WP_Parser_Node { /** * @TODO: Review and document these properties and their visibility. */ diff --git a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Parser_Instanceof_Tests.php b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Parser_Instanceof_Tests.php index d308d5585..0af85168e 100644 --- a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Parser_Instanceof_Tests.php +++ b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Parser_Instanceof_Tests.php @@ -36,4 +36,28 @@ public function test_parser_returns_an_ast(): void { $this->assertNotNull( $ast ); $this->assertInstanceOf( WP_Parser_Node::class, $ast ); } + + public function test_native_ast_node_identity_survives_mutation(): void { + if ( ! 
class_exists( 'WP_MySQL_Native_Parser_Node', false ) ) { + $this->markTestSkipped( 'Native parser extension is not active.' ); + } + + $grammar = new WP_Parser_Grammar( include __DIR__ . '/../../../src/mysql/mysql-grammar.php' ); + $lexer = new WP_MySQL_Lexer( 'SELECT 1' ); + $parser = new WP_MySQL_Parser( $grammar, $lexer->native_token_stream() ); + + $ast = $parser->parse(); + $this->assertInstanceOf( WP_MySQL_Native_Parser_Node::class, $ast ); + + $first_child = $ast->get_first_child_node(); + $this->assertInstanceOf( WP_Parser_Node::class, $first_child ); + $this->assertSame( $first_child, $ast->get_first_child_node() ); + + $synthetic = new WP_Parser_Node( 0, 'synthetic' ); + $first_child->append_child( $synthetic ); + + $same_first_child = $ast->get_first_child_node(); + $this->assertSame( $first_child, $same_first_child ); + $this->assertTrue( in_array( $synthetic, $same_first_child->get_children(), true ) ); + } } diff --git a/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php b/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php index 7df9029e8..25303c6a8 100644 --- a/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php +++ b/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php @@ -8,6 +8,10 @@ * Options: * --json Print machine-readable benchmark output. * --limit=N Only benchmark the first N queries. + * --consume=MODE + * How much AST data to consume after parsing: + * none Only require parse() to return an AST (default). + * descendants Walk all descendants with get_descendants(). */ // Throw exception if anything fails. 
@@ -17,12 +21,20 @@ function ( $severity, $message, $file, $line ) { } ); -$json = in_array( '--json', $argv, true ); -$limit = null; +$json = in_array( '--json', $argv, true ); +$limit = null; +$consume = 'none'; foreach ( $argv as $arg ) { if ( 0 === strpos( $arg, '--limit=' ) ) { $limit = max( 1, (int) substr( $arg, strlen( '--limit=' ) ) ); } + if ( 0 === strpos( $arg, '--consume=' ) ) { + $consume = substr( $arg, strlen( '--consume=' ) ); + } +} + +if ( ! in_array( $consume, array( 'none', 'descendants' ), true ) ) { + throw new InvalidArgumentException( sprintf( 'Unsupported --consume mode: %s', $consume ) ); } // Use the integration loader so an already-loaded native extension selects @@ -61,9 +73,10 @@ function get_stats( $total, $failures, $exceptions ) { } // Run the parser. -$failures = array(); -$exceptions = array(); -$processed = 0; +$failures = array(); +$exceptions = array(); +$processed = 0; +$descendants = 0; // Reuse a single parser across queries, mirroring the driver // (WP_PDO_MySQL_On_SQLite::reset_or_create_parser), which resets tokens on the // same instance rather than constructing a fresh parser per query. @@ -87,6 +100,8 @@ function get_stats( $total, $failures, $exceptions ) { $ast = $parser->parse(); if ( null === $ast ) { $failures[] = $query; + } elseif ( 'descendants' === $consume ) { + $descendants += count( $ast->get_descendants() ); } } catch ( Exception $e ) { $exceptions[] = $query; @@ -107,6 +122,8 @@ function get_stats( $total, $failures, $exceptions ) { 'implementation' => class_exists( 'WP_MySQL_Native_Parser', false ) ? 
'native-extension' : 'php', 'extension_loaded' => extension_loaded( 'wp_mysql_parser' ), 'queries' => $processed, + 'consume' => $consume, + 'descendants' => $descendants, 'duration' => $duration, 'qps' => $qps, 'failures' => count( $failures ), @@ -119,6 +136,11 @@ function get_stats( $total, $failures, $exceptions ) { } echo get_stats( $processed, count( $failures ), count( $exceptions ) ), "\n"; +printf( "AST consumption: %s", $consume ); +if ( 'descendants' === $consume ) { + printf( " (%d descendants)", $descendants ); +} +echo "\n"; // Print the results. printf( "\nParsed %d queries in %.5fs @ %d QPS.\n", $processed, $duration, $qps ); diff --git a/packages/php-ext-wp-mysql-parser/src/lexer_constants.rs b/packages/php-ext-wp-mysql-parser/src/lexer_constants.rs index 5a3d3a40b..f2b4d604f 100644 --- a/packages/php-ext-wp-mysql-parser/src/lexer_constants.rs +++ b/packages/php-ext-wp-mysql-parser/src/lexer_constants.rs @@ -1,7 +1,9 @@ #![allow(dead_code)] +use std::collections::HashMap; use std::mem; use std::ptr; +use std::sync::OnceLock; use ext_php_rs::boxed::ZBox; use ext_php_rs::builders::ClassBuilder; @@ -3944,6 +3946,10 @@ pub const TOKEN_SYNONYMS: &[(i64, i64)] = &[ (630i64, 629i64), ]; +static KEYWORD_TOKEN_MAP: OnceLock> = OnceLock::new(); +static VERSION_RULE_MAP: OnceLock> = OnceLock::new(); +static TOKEN_SYNONYM_MAP: OnceLock> = OnceLock::new(); + pub const UNDERSCORE_CHARSET_NAMES: &[&str] = &[ "_armscii8", "_ascii", @@ -4003,15 +4009,17 @@ pub fn token_name(id: i64) -> Option<&'static str> { } pub fn keyword_token(keyword: &str) -> Option { - KEYWORD_TOKENS - .iter() - .find_map(|(candidate, id)| (*candidate == keyword).then_some(*id)) + KEYWORD_TOKEN_MAP + .get_or_init(|| KEYWORD_TOKENS.iter().copied().collect()) + .get(keyword) + .copied() } pub fn version_rule(token_id: i64) -> Option { - VERSION_RULES - .iter() - .find_map(|(candidate, version)| (*candidate == token_id).then_some(*version)) + VERSION_RULE_MAP + .get_or_init(|| 
VERSION_RULES.iter().copied().collect()) + .get(&token_id) + .copied() } pub fn is_function_token(token_id: i64) -> bool { @@ -4019,9 +4027,10 @@ pub fn is_function_token(token_id: i64) -> bool { } pub fn token_synonym(token_id: i64) -> Option { - TOKEN_SYNONYMS - .iter() - .find_map(|(candidate, synonym)| (*candidate == token_id).then_some(*synonym)) + TOKEN_SYNONYM_MAP + .get_or_init(|| TOKEN_SYNONYMS.iter().copied().collect()) + .get(&token_id) + .copied() } pub fn is_underscore_charset(name: &str) -> bool { diff --git a/packages/php-ext-wp-mysql-parser/src/lib.rs b/packages/php-ext-wp-mysql-parser/src/lib.rs index 8425ac35d..2a2e8f360 100644 --- a/packages/php-ext-wp-mysql-parser/src/lib.rs +++ b/packages/php-ext-wp-mysql-parser/src/lib.rs @@ -1,5 +1,6 @@ #![cfg_attr(windows, feature(abi_vectorcall))] +use std::cell::RefCell; use std::collections::{HashMap, HashSet}; use std::os::raw::c_char; use std::ptr; @@ -60,7 +61,7 @@ fn php_function(name: &str) -> PhpResult> { struct PhpClasses { parser_token: &'static ClassEntry, mysql_token: &'static ClassEntry, - parser_node: &'static ClassEntry, + native_parser_node: &'static ClassEntry, } fn php_classes() -> PhpResult { @@ -69,8 +70,8 @@ fn php_classes() -> PhpResult { .ok_or_else(|| php_error("Missing WP_Parser_Token class"))?, mysql_token: ClassEntry::try_find("WP_MySQL_Token") .ok_or_else(|| php_error("Missing WP_MySQL_Token class"))?, - parser_node: ClassEntry::try_find("WP_Parser_Node") - .ok_or_else(|| php_error("Missing WP_Parser_Node class"))?, + native_parser_node: ClassEntry::try_find("WP_MySQL_Native_Parser_Node") + .ok_or_else(|| php_error("Missing WP_MySQL_Native_Parser_Node class"))?, }) } @@ -180,17 +181,19 @@ fn span_until(bytes: &[u8], mut pos: usize, needles: &[u8]) -> usize { } fn bytes_ascii_upper(bytes: &[u8]) -> String { - bytes - .iter() - .map(|byte| byte.to_ascii_uppercase() as char) - .collect() + let mut upper = Vec::with_capacity(bytes.len()); + 
upper.extend(bytes.iter().map(u8::to_ascii_uppercase)); + // The lexer only calls this for identifier slices. ASCII bytes remain + // ASCII and non-ASCII identifier bytes have already passed the UTF-8 + // shape checks in read_identifier(). + unsafe { String::from_utf8_unchecked(upper) } } fn bytes_ascii_lower(bytes: &[u8]) -> String { - bytes - .iter() - .map(|byte| byte.to_ascii_lowercase() as char) - .collect() + let mut lower = Vec::with_capacity(bytes.len()); + lower.extend(bytes.iter().map(u8::to_ascii_lowercase)); + // See bytes_ascii_upper(). + unsafe { String::from_utf8_unchecked(lower) } } #[derive(Clone, Copy)] @@ -498,9 +501,10 @@ impl WpMySqlNativeLexer { { token = Some(lex::UNDERSCORE_CHARSET); } else { - let identifier = - self.sql[self.token_starts_at..self.bytes_already_read].to_vec(); - token = Some(self.determine_identifier_or_keyword_type(&identifier)); + token = Some(self.determine_identifier_or_keyword_type( + self.token_starts_at, + self.bytes_already_read, + )); } } token @@ -863,9 +867,20 @@ impl WpMySqlNativeLexer { } } - fn determine_identifier_or_keyword_type(&mut self, value: &[u8]) -> i64 { - let upper = bytes_ascii_upper(value); - let mut token_type = match lex::keyword_token(&upper) { + fn determine_identifier_or_keyword_type(&mut self, start: usize, end: usize) -> i64 { + let value = &self.sql[start..end]; + let upper; + let keyword = if value.iter().any(u8::is_ascii_lowercase) { + upper = bytes_ascii_upper(value); + upper.as_str() + } else { + match std::str::from_utf8(value) { + Ok(value) => value, + Err(_) => return lex::IDENTIFIER, + } + }; + + let mut token_type = match lex::keyword_token(keyword) { Some(token_type) => token_type, None => return lex::IDENTIFIER, }; @@ -920,16 +935,70 @@ struct Grammar { struct Rule { branches: Vec>, - /// Sorted FIRST set: token ids that can start a match for this rule. + /// FIRST set: token ids that can start a match for this rule. 
/// `None` means the rule has no FIRST entry at all (cannot match the /// non-empty case); see `nullable` for the empty case. - first_set: Option>, + first_set: Option, /// At least one branch is nullable (matches empty input). nullable: bool, rule_name: String, is_fragment: bool, } +enum FirstSet { + One(i64), + Two(i64, i64), + Sorted(Vec), + Bits(Box<[u64]>), +} + +impl FirstSet { + fn from_ids(mut values: Vec) -> Option { + values.sort_unstable(); + values.dedup(); + + match values.len() { + 0 => None, + 1 => Some(Self::One(values[0])), + 2 => Some(Self::Two(values[0], values[1])), + 3..=31 => Some(Self::Sorted(values)), + _ => { + if values.iter().any(|token_id| *token_id < 0) { + return Some(Self::Sorted(values)); + } + let max = values.iter().copied().max().unwrap_or(0); + let Ok(max) = usize::try_from(max) else { + return Some(Self::Sorted(values)); + }; + let mut bits = vec![0; max / 64 + 1]; + for token_id in values { + let Ok(token_id) = usize::try_from(token_id) else { + continue; + }; + bits[token_id / 64] |= 1 << (token_id % 64); + } + Some(Self::Bits(bits.into_boxed_slice())) + } + } + } + + fn contains(&self, token_id: i64) -> bool { + match self { + Self::One(expected) => *expected == token_id, + Self::Two(first, second) => *first == token_id || *second == token_id, + Self::Sorted(values) => values.binary_search(&token_id).is_ok(), + Self::Bits(bits) => { + let Ok(token_id) = usize::try_from(token_id) else { + return false; + }; + bits.get(token_id / 64) + .map(|word| (word & (1 << (token_id % 64))) != 0) + .unwrap_or(false) + } + } + } +} + impl Grammar { fn rule(&self, rule_id: i64) -> Option<&Rule> { usize::try_from(rule_id) @@ -974,6 +1043,38 @@ impl ParserTokenSource { } } } + + fn token_info(&self, index: usize) -> PhpResult { + match self { + Self::Php(tokens) => { + let token = tokens + .get(index) + .ok_or_else(|| php_error("Parser token index is out of range"))?; + let token_object = token + .object() + .ok_or_else(|| php_error("Parser 
token must be an object"))?; + let id = token_object.get_property::("id").map_err(php_error)?; + let start = token_object + .get_property::("start") + .map_err(php_error)?; + let length = token_object + .get_property::("length") + .map_err(php_error)?; + let start = usize::try_from(start).map_err(php_error)?; + let length = usize::try_from(length).map_err(php_error)?; + + Ok(TokenInfo { + id, + start, + end: start.saturating_add(length), + }) + } + Self::Native { tokens, .. } => tokens + .get(index) + .copied() + .ok_or_else(|| php_error("Parser token index is out of range")), + } + } } #[derive(Clone, Copy)] @@ -1001,6 +1102,9 @@ enum NativeParseMatch { struct NativeAstNode { rule_id: i64, children: Vec, + first_token: Option, + last_token: Option, + descendant_count: usize, } struct NativeAstArena { @@ -1012,6 +1116,28 @@ struct NativeAstArena { struct NativeAstState { arena: Arc, + /// Per-AST identity map: node arena index → live PHP wrapper pointer. + /// + /// `WP_Parser_Node` callers expect stable child identity (mutate a child + /// once, walk past, walk back, the mutation is still there). Each + /// accessor in this extension constructs a fresh wrapper unless we + /// intern it here. Arena node indexes are dense, so a vector avoids + /// hashing in hot AST-walk paths. The cache intentionally stores raw + /// wrapper pointers, not strong PHP references, so Rust can preserve + /// identity without pinning wrappers after PHP drops them. + node_cache: RefCell>>, +} + +struct NativeAstWrapperEntry { + ast: Rc, + node_index: usize, + /// Materialized wrappers still participate in identity lookups but no + /// longer delegate reads through the native AST bridge. + is_materialized: bool, +} + +thread_local! 
{ + static NATIVE_AST_WRAPPERS: RefCell> = RefCell::new(HashMap::new()); } impl NativeAstArena { @@ -1026,7 +1152,39 @@ impl NativeAstArena { fn push_node(&mut self, rule_id: i64, children: Vec) -> usize { let index = self.nodes.len(); - self.nodes.push(NativeAstNode { rule_id, children }); + let mut first_token = None; + let mut last_token = None; + let mut descendant_count = 0; + for child in &children { + match child { + NativeAstChild::Node(child_index) => { + if let Some(node) = self.nodes.get(*child_index) { + descendant_count += 1 + node.descendant_count; + if first_token.is_none() { + first_token = node.first_token; + } + if node.last_token.is_some() { + last_token = node.last_token; + } + } + } + NativeAstChild::Token(token_index) => { + if first_token.is_none() { + first_token = Some(*token_index); + } + last_token = Some(*token_index); + descendant_count += 1; + } + } + } + + self.nodes.push(NativeAstNode { + rule_id, + children, + first_token, + last_token, + descendant_count, + }); index } @@ -1035,11 +1193,133 @@ impl NativeAstArena { .get(index) .ok_or_else(|| php_error("Native AST node index is out of range")) } + + fn child_node_matches(&self, child: NativeAstChild, rule_name: Option<&str>) -> bool { + let NativeAstChild::Node(index) = child else { + return false; + }; + let Ok(node) = self.node(index) else { + return false; + }; + match rule_name { + Some(expected) => self + .grammar + .rule(node.rule_id) + .map(|rule| rule.rule_name == expected) + .unwrap_or(false), + None => true, + } + } + + fn child_token_matches(&self, child: NativeAstChild, token_id: Option) -> bool { + let NativeAstChild::Token(index) = child else { + return false; + }; + match token_id { + Some(expected) => self + .token_source + .token_info(index) + .map(|token| token.id == expected) + .unwrap_or(false), + None => true, + } + } + + fn descendant_stack(&self, index: usize) -> PhpResult> { + let node = self.node(index)?; + let mut stack = 
Vec::with_capacity(node.descendant_count); + stack.extend(node.children.iter().rev().copied()); + Ok(stack) + } +} + +fn native_ast_wrapper_key(wrapper_zval: &Zval) -> PhpResult { + let object = wrapper_zval + .object() + .ok_or_else(|| php_error("Missing native AST wrapper"))?; + Ok(ptr::from_ref(object) as usize) +} + +fn native_ast_from_wrapper(wrapper_zval: &Zval) -> PhpResult<(Rc, usize)> { + let key = native_ast_wrapper_key(wrapper_zval)?; + NATIVE_AST_WRAPPERS + .with(|wrappers| { + wrappers.borrow().get(&key).and_then(|entry| { + (!entry.is_materialized).then(|| (Rc::clone(&entry.ast), entry.node_index)) + }) + }) + .ok_or_else(|| php_error("Missing native AST handle")) +} + +fn register_native_ast_wrapper( + object: &ZendObject, + ast: &Rc, + node_index: usize, +) -> usize { + let key = ptr::from_ref(object) as usize; + NATIVE_AST_WRAPPERS.with(|wrappers| { + wrappers.borrow_mut().insert( + key, + NativeAstWrapperEntry { + ast: Rc::clone(ast), + node_index, + is_materialized: false, + }, + ); + }); + if let Some(slot) = ast.node_cache.borrow_mut().get_mut(node_index) { + *slot = Some(key); + } + key +} + +fn mark_native_ast_wrapper_materialized_key(key: usize) { + NATIVE_AST_WRAPPERS.with(|wrappers| { + if let Some(entry) = wrappers.borrow_mut().get_mut(&key) { + entry.is_materialized = true; + } + }); +} + +fn release_native_ast_wrapper_key(key: usize) { + let entry = NATIVE_AST_WRAPPERS.with(|wrappers| wrappers.borrow_mut().remove(&key)); + if let Some(entry) = entry { + let mut cache = entry.ast.node_cache.borrow_mut(); + if let Some(slot) = cache.get_mut(entry.node_index) { + if *slot == Some(key) { + *slot = None; + } + } + } +} + +fn native_ast_wrapper_matches(key: usize, ast: &Rc, node_index: usize) -> bool { + NATIVE_AST_WRAPPERS.with(|wrappers| { + wrappers + .borrow() + .get(&key) + .is_some_and(|entry| Rc::ptr_eq(&entry.ast, ast) && entry.node_index == node_index) + }) +} + +/// Build a Zval that references an existing PHP object. 
+/// +/// Used on cache hits to hand a live wrapper back to PHP without allocating a +/// new object. `Zval::set_object()` bumps the object refcount for the returned +/// zval; the Rust cache only stores the pointer and does not own a reference. +unsafe fn zval_from_cached_object(key: usize) -> Zval { + let obj = &mut *(key as *mut ZendObject); + let mut zv = Zval::new(); + zv.set_object(obj); + zv } impl NativeAstState { fn new(arena: Arc) -> Rc { - Rc::new(Self { arena }) + Rc::new(Self { + node_cache: RefCell::new(vec![None; arena.nodes.len()]), + arena, + }) } fn create_php_ast(self: &Rc) -> PhpResult { @@ -1055,54 +1335,346 @@ impl NativeAstState { zval.set_bool(true); Ok(zval) } - NativeAstRoot::Node(index) => create_php_node_with_classes(&self.arena, index, classes), + NativeAstRoot::Node(index) => self.create_php_node_with_classes(index, classes), NativeAstRoot::Token(index) => self .arena .token_source .create_php_token_with_classes(index, classes), } } -} -/// Build a complete PHP `WP_Parser_Node` Zval, recursively materializing -/// children from the Rust arena. The returned object is a plain -/// `WP_Parser_Node` instance with `rule_id`, `rule_name`, and `children` -/// populated, so callers see no difference from the pure-PHP parser's output. -fn create_php_node_with_classes( - arena: &NativeAstArena, - index: usize, - classes: &PhpClasses, -) -> PhpResult { - let node = arena.node(index)?; - let rule_name = arena - .grammar - .rule(node.rule_id) - .map(|rule| rule.rule_name.as_str()) - .unwrap_or_default(); - - let mut children: Vec = Vec::with_capacity(node.children.len()); - for child in &node.children { - let child_zval = match child { - NativeAstChild::Node(child_index) => { - create_php_node_with_classes(arena, *child_index, classes)? - } - NativeAstChild::Token(token_index) => arena + /// Resolve a child slot to a Zval, going through the per-AST identity + /// cache for nodes. 
Tokens are not yet cached — they have no public + /// mutators and no caller in this repo relies on token identity. + fn cached_child_zval( + self: &Rc, + child: NativeAstChild, + classes: &PhpClasses, + ) -> PhpResult { + match child { + NativeAstChild::Node(index) => self.cached_node_zval(index, classes), + NativeAstChild::Token(index) => self + .arena .token_source - .create_php_token_with_classes(*token_index, classes)?, + .create_php_token_with_classes(index, classes), + } + } + + fn cached_node_zval(self: &Rc, index: usize, classes: &PhpClasses) -> PhpResult { + let cached_key = { + let cache = self.node_cache.borrow(); + cache.get(index).and_then(|entry| *entry) }; - children.push(child_zval); + if let Some(key) = cached_key { + if native_ast_wrapper_matches(key, self, index) { + return Ok(unsafe { zval_from_cached_object(key) }); + } + if let Some(slot) = self.node_cache.borrow_mut().get_mut(index) { + *slot = None; + } + } + + self.create_php_node_with_classes(index, classes) + } + + fn create_php_node_with_classes( + self: &Rc, + index: usize, + classes: &PhpClasses, + ) -> PhpResult { + let node = self.arena.node(index)?; + let mut object = classes.native_parser_node.new(); + let rule_name = self + .arena + .grammar + .rule(node.rule_id) + .map(|rule| rule.rule_name.as_str()) + .unwrap_or_default(); + + update_object_property( + &mut object, + classes.native_parser_node, + "rule_id", + node.rule_id, + )?; + update_object_property( + &mut object, + classes.native_parser_node, + "rule_name", + rule_name.to_owned(), + )?; + + register_native_ast_wrapper(object.as_ref(), self, index); + object.into_zval(false).map_err(php_error) } +} - let mut object = classes.parser_node.new(); - update_object_property(&mut object, classes.parser_node, "rule_id", node.rule_id)?; - update_object_property( - &mut object, - classes.parser_node, - "rule_name", - rule_name.to_owned(), - )?; - update_object_property(&mut object, classes.parser_node, "children", children)?; - 
object.into_zval(false).map_err(php_error) +#[php_function] +pub fn wp_sqlite_mysql_native_ast_release_wrapper(wrapper_zval: &Zval) -> PhpResult<()> { + let key = native_ast_wrapper_key(wrapper_zval)?; + release_native_ast_wrapper_key(key); + Ok(()) +} + +#[php_function] +pub fn wp_sqlite_mysql_native_ast_materialize_wrapper(wrapper_zval: &Zval) -> PhpResult<()> { + let key = native_ast_wrapper_key(wrapper_zval)?; + mark_native_ast_wrapper_materialized_key(key); + Ok(()) +} + +#[php_function] +pub fn wp_sqlite_mysql_native_ast_has_child(wrapper_zval: &Zval) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + Ok(!ast.arena.node(node_index)?.children.is_empty()) +} + +#[php_function] +pub fn wp_sqlite_mysql_native_ast_has_child_node( + wrapper_zval: &Zval, + rule_name: Option, +) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + Ok(ast + .arena + .node(node_index)? + .children + .iter() + .copied() + .any(|child| ast.arena.child_node_matches(child, rule_name.as_deref()))) +} + +#[php_function] +pub fn wp_sqlite_mysql_native_ast_has_child_token( + wrapper_zval: &Zval, + token_id: Option, +) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + Ok(ast + .arena + .node(node_index)? 
+ .children + .iter() + .copied() + .any(|child| ast.arena.child_token_matches(child, token_id))) +} + +/* Returns the PHP value for the first entry in this node's children list, or PHP null when the list is empty. The value itself is produced by cached_child_zval. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_first_child(wrapper_zval: &Zval) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + let Some(child) = ast.arena.node(node_index)?.children.first().copied() else { + return Ok(Zval::null()); + }; + ast.cached_child_zval(child, &classes) +} + +/* Scans the direct children in stored order and returns the first one accepted by child_node_matches for rule_name; returns PHP null when none is accepted. NOTE(review): a None rule_name presumably accepts any node child - confirm in child_node_matches. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_first_child_node( + wrapper_zval: &Zval, + rule_name: Option, +) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + for child in &ast.arena.node(node_index)?.children { + if ast.arena.child_node_matches(*child, rule_name.as_deref()) { + return ast.cached_child_zval(*child, &classes); + } + } + Ok(Zval::null()) +} + +/* Scans the direct children in stored order and returns the first one accepted by child_token_matches for token_id; returns PHP null when none is accepted. NOTE(review): a None token_id presumably accepts any token child - confirm in child_token_matches. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_first_child_token( + wrapper_zval: &Zval, + token_id: Option, +) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + for child in &ast.arena.node(node_index)?.children { + if ast.arena.child_token_matches(*child, token_id) { + return ast.cached_child_zval(*child, &classes); + } + } + Ok(Zval::null()) +} + +/* Searches below this node with an explicit stack seeded by descendant_stack and returns the first popped entry accepted by child_node_matches, or PHP null when the stack runs dry. Children of a Node entry are pushed in reverse so they are popped in their stored order. NOTE(review): this looks like a pre-order walk, assuming descendant_stack seeds the stack the same way - confirm. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_first_descendant_node( + wrapper_zval: &Zval, + rule_name: Option, +) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + let mut stack = ast.arena.descendant_stack(node_index)?; + while let Some(child) = stack.pop() { + if ast.arena.child_node_matches(child, rule_name.as_deref()) { + return ast.cached_child_zval(child, &classes); + } + if let NativeAstChild::Node(index) = child { + for child in ast.arena.node(index)?.children.iter().rev() { + stack.push(*child); + } + } + } + Ok(Zval::null()) +} + +/* Same stack-driven search as the descendant-node lookup, but an entry is accepted by child_token_matches for token_id; returns PHP null when nothing is accepted. Node entries are still expanded so tokens nested deeper are reached. */#[php_function] +pub fn
wp_sqlite_mysql_native_ast_get_first_descendant_token( + wrapper_zval: &Zval, + token_id: Option, +) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + let mut stack = ast.arena.descendant_stack(node_index)?; + while let Some(child) = stack.pop() { + if ast.arena.child_token_matches(child, token_id) { + return ast.cached_child_zval(child, &classes); + } + if let NativeAstChild::Node(index) = child { + for child in ast.arena.node(index)?.children.iter().rev() { + stack.push(*child); + } + } + } + Ok(Zval::null()) +} + +/* Converts every direct child, in stored order, through cached_child_zval and collects the results into the returned list. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_children(wrapper_zval: &Zval) -> PhpResult> { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + ast.arena + .node(node_index)? + .children + .iter() + .copied() + .map(|child| ast.cached_child_zval(child, &classes)) + .collect() +} + +/* Like the all-children listing, but keeps only the direct children accepted by child_node_matches for rule_name before converting them. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_child_nodes( + wrapper_zval: &Zval, + rule_name: Option, +) -> PhpResult> { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + ast.arena + .node(node_index)? + .children + .iter() + .copied() + .filter(|child| ast.arena.child_node_matches(*child, rule_name.as_deref())) + .map(|child| ast.cached_child_zval(child, &classes)) + .collect() +} + +/* Like the all-children listing, but keeps only the direct children accepted by child_token_matches for token_id before converting them. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_child_tokens( + wrapper_zval: &Zval, + token_id: Option, +) -> PhpResult> { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + ast.arena + .node(node_index)?
+ .children + .iter() + .copied() + .filter(|child| ast.arena.child_token_matches(*child, token_id)) + .map(|child| ast.cached_child_zval(child, &classes)) + .collect() +} + +/* Collects the PHP value of every entry reached by the stack-driven walk below this node. The output vector is pre-sized from the node's descendant_count field; each popped entry is converted first and, if it is a Node, its children are then pushed in reverse so they pop in stored order. Any conversion or lookup error aborts the walk through the ? operator. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_descendants(wrapper_zval: &Zval) -> PhpResult> { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + let root = ast.arena.node(node_index)?; + let mut descendants = Vec::with_capacity(root.descendant_count); + let mut stack = ast.arena.descendant_stack(node_index)?; + while let Some(child) = stack.pop() { + descendants.push(ast.cached_child_zval(child, &classes)?); + if let NativeAstChild::Node(index) = child { + for child in ast.arena.node(index)?.children.iter().rev() { + stack.push(*child); + } + } + } + Ok(descendants) +} + +/* Walks everything below this node with the same explicit stack and collects only the entries accepted by child_node_matches for rule_name. Node entries are expanded whether or not they were accepted, so matches nested inside non-matching nodes are still found. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_descendant_nodes( + wrapper_zval: &Zval, + rule_name: Option, +) -> PhpResult> { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + let mut descendants = Vec::new(); + let mut stack = ast.arena.descendant_stack(node_index)?; + while let Some(child) = stack.pop() { + if ast.arena.child_node_matches(child, rule_name.as_deref()) { + descendants.push(ast.cached_child_zval(child, &classes)?); + } + if let NativeAstChild::Node(index) = child { + for child in ast.arena.node(index)?.children.iter().rev() { + stack.push(*child); + } + } + } + Ok(descendants) +} + +/* Walks everything below this node with the same explicit stack and collects only the entries accepted by child_token_matches for token_id; Node entries are always expanded so nested tokens are visited. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_descendant_tokens( + wrapper_zval: &Zval, + token_id: Option, +) -> PhpResult> { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let classes = php_classes()?; + let mut descendants = Vec::new(); + let mut stack = ast.arena.descendant_stack(node_index)?; + while let Some(child) = stack.pop() { + if ast.arena.child_token_matches(child, token_id) { + descendants.push(ast.cached_child_zval(child, &classes)?); + } + if let
NativeAstChild::Node(index) = child { + for child in ast.arena.node(index)?.children.iter().rev() { + stack.push(*child); + } + } + } + Ok(descendants) +} + +/* Returns, converted to i64, the start field of the token recorded as this node's first_token. Yields a php_error when first_token is None or when the value does not fit in i64. NOTE(review): start is presumably an offset into the SQL text - confirm the unit against token_info. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_start(wrapper_zval: &Zval) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let node = ast.arena.node(node_index)?; + let token_index = node + .first_token + .ok_or_else(|| php_error("Native AST node has no descendant tokens"))?; + let token = ast.arena.token_source.token_info(token_index)?; + i64::try_from(token.start).map_err(php_error) +} + +/* Returns, converted to i64, the span from the first token's start to the last token's end for this node. Yields a php_error when either first_token or last_token is None. The difference uses saturating_sub, so it clamps at the numeric bound instead of overflowing if end were smaller than start. */#[php_function] +pub fn wp_sqlite_mysql_native_ast_get_length(wrapper_zval: &Zval) -> PhpResult { + let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; + let node = ast.arena.node(node_index)?; + let first_token_index = node + .first_token + .ok_or_else(|| php_error("Native AST node has no descendant tokens"))?; + let last_token_index = node + .last_token + .ok_or_else(|| php_error("Native AST node has no descendant tokens"))?; + let first_token = ast.arena.token_source.token_info(first_token_index)?; + let last_token = ast.arena.token_source.token_info(last_token_index)?; + let length = last_token.end.saturating_sub(first_token.start); + i64::try_from(length).map_err(php_error) } #[php_class] @@ -1243,7 +1815,7 @@ impl WpMySqlNativeParser { if let Some(first_set) = rule.first_set.as_ref() { let token_id = self.token_ids.get(self.position).copied().unwrap_or(0); - if first_set.binary_search(&token_id).is_err() && !rule.nullable { + if !first_set.contains(token_id) && !rule.nullable { return Ok(NativeParseMatch::No); } } else if !rule.nullable { @@ -1468,14 +2040,9 @@ fn build_rules( for (rule_id, branches) in rules { let index = usize::try_from(rule_id).map_err(php_error)?; - let mut first_set = first_sets.get(&rule_id).map(|set| { - let mut values: Vec = set.iter().copied().collect(); - values.sort_unstable(); - values - }); - if let Some(values) = first_set.as_mut() {
- values.dedup(); - } + let first_set = first_sets + .get(&rule_id) + .and_then(|set| FirstSet::from_ids(set.iter().copied().collect())); dense_rules[index] = Some(Rule { branches, @@ -1578,5 +2145,37 @@ pub fn get_module(module: ModuleBuilder) -> ModuleBuilder { .class::() .class::() .class::() +/* Each wp_sqlite_mysql_native_ast_* function is added to the module builder through wrap_function! so that it is registered alongside the classes above. */ .function(wrap_function!(wp_sqlite_mysql_native_ast_release_wrapper)) + .function(wrap_function!( + wp_sqlite_mysql_native_ast_materialize_wrapper + )) + .function(wrap_function!(wp_sqlite_mysql_native_ast_has_child)) + .function(wrap_function!(wp_sqlite_mysql_native_ast_has_child_node)) + .function(wrap_function!(wp_sqlite_mysql_native_ast_has_child_token)) + .function(wrap_function!(wp_sqlite_mysql_native_ast_get_first_child)) + .function(wrap_function!( + wp_sqlite_mysql_native_ast_get_first_child_node + )) + .function(wrap_function!( + wp_sqlite_mysql_native_ast_get_first_child_token + )) + .function(wrap_function!( + wp_sqlite_mysql_native_ast_get_first_descendant_node + )) + .function(wrap_function!( + wp_sqlite_mysql_native_ast_get_first_descendant_token + )) + .function(wrap_function!(wp_sqlite_mysql_native_ast_get_children)) + .function(wrap_function!(wp_sqlite_mysql_native_ast_get_child_nodes)) + .function(wrap_function!(wp_sqlite_mysql_native_ast_get_child_tokens)) + .function(wrap_function!(wp_sqlite_mysql_native_ast_get_descendants)) + .function(wrap_function!( + wp_sqlite_mysql_native_ast_get_descendant_nodes + )) + .function(wrap_function!( + wp_sqlite_mysql_native_ast_get_descendant_tokens + )) + .function(wrap_function!(wp_sqlite_mysql_native_ast_get_start)) + .function(wrap_function!(wp_sqlite_mysql_native_ast_get_length)) .info_function(php_module_info) }