diff --git a/.github/workflows/mysql-parser-extension-tests.yml b/.github/workflows/mysql-parser-extension-tests.yml deleted file mode 100644 index 45425bb8..00000000 --- a/.github/workflows/mysql-parser-extension-tests.yml +++ /dev/null @@ -1,189 +0,0 @@ -name: MySQL Parser Extension Tests - -on: - push: - branches: - - trunk - paths: - - '.github/workflows/mysql-parser-extension-tests.yml' - - 'packages/mysql-on-sqlite/**' - - 'packages/php-ext-wp-mysql-parser/**' - pull_request: - paths: - - '.github/workflows/mysql-parser-extension-tests.yml' - - 'packages/mysql-on-sqlite/**' - - 'packages/php-ext-wp-mysql-parser/**' - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -# Disable permissions for all available scopes by default. -# Any needed permissions should be configured at the job level. -permissions: {} - -jobs: - extension-tests: - name: PHP ${{ matrix.php }} / ${{ matrix.coverage }} / ubuntu-latest - runs-on: ubuntu-latest - timeout-minutes: 30 - permissions: - contents: read # Required to clone the repo. 
- strategy: - fail-fast: false - matrix: - include: - - php: '7.2' - sqlite: '3.27.0' - native: false - coverage: SQLite integration - - php: '7.3' - sqlite: '3.31.1' - native: false - coverage: SQLite integration - - php: '7.4' - sqlite: '3.34.1' - native: false - coverage: SQLite integration - - php: '8.0' - sqlite: '3.37.0' - native: true - coverage: SQLite integration + Rust extension - - php: '8.1' - sqlite: '3.40.1' - native: true - coverage: SQLite integration + Rust extension - - php: '8.2' - sqlite: '3.45.1' - native: true - coverage: SQLite integration + Rust extension - - php: '8.3' - sqlite: '3.46.1' - native: true - coverage: SQLite integration + Rust extension - - php: '8.4' - sqlite: '3.51.2' - native: true - coverage: SQLite integration + Rust extension - - php: '8.5' - sqlite: latest - native: true - coverage: SQLite integration + Rust extension - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up SQLite - run: | - VERSION='${{ matrix.sqlite }}' - if [ "$VERSION" = 'latest' ]; then - TAG='release' - else - TAG="version-${VERSION}" - fi - SQLITE_SOURCE="https://sqlite.org/src/tarball/sqlite.tar.gz?r=${TAG}" - SQLITE_MIRROR="https://github.com/sqlite/sqlite/archive/refs/tags/${TAG}.tar.gz" - DOWNLOADED=0 - for url in "$SQLITE_SOURCE" "$SQLITE_MIRROR"; do - for attempt in 1 2 3 4 5; do - if wget -O sqlite.tar.gz "$url"; then - DOWNLOADED=1 - break 2 - fi - if [ "$attempt" -lt 5 ]; then - sleep $(( attempt * 10 )) - fi - done - done - if [ "$DOWNLOADED" -ne 1 ]; then - exit 1 - fi - tar xzf sqlite.tar.gz - if [ ! -d sqlite ]; then - SQLITE_DIR=$(find . 
-maxdepth 1 -type d -name 'sqlite-*' | head -n 1) - if [ -z "$SQLITE_DIR" ]; then - exit 1 - fi - mv "$SQLITE_DIR" sqlite - fi - cd sqlite - ./configure --prefix=/usr/local CFLAGS="-DSQLITE_ENABLE_COLUMN_METADATA -DSQLITE_ENABLE_FTS5 -DSQLITE_USE_URI -DSQLITE_ENABLE_JSON1" LDFLAGS="-lm" - make -j$(nproc) - sudo make install - sudo ldconfig - - - name: Set up PHP - uses: shivammathur/setup-php@v2 - with: - php-version: ${{ matrix.php }} - coverage: none - tools: phpunit-polyfills - - - name: Verify SQLite version in PHP - run: | - EXPECTED='${{ matrix.sqlite }}' - if [ "$EXPECTED" = 'latest' ]; then - EXPECTED=$(cat sqlite/VERSION) - fi - PDO=$(php -r "echo (new PDO('sqlite::memory'))->query('SELECT SQLITE_VERSION();')->fetch()[0];") - echo "Expected SQLite version: $EXPECTED" - echo "PHP PDO SQLite version: $PDO" - if [ "$EXPECTED" != "$PDO" ]; then - echo "Error: Expected SQLite version $EXPECTED, but PHP PDO uses $PDO" - exit 1 - fi - - - name: Set up Rust - if: matrix.native - uses: dtolnay/rust-toolchain@stable - - - name: Install native build dependencies - if: matrix.native - run: | - sudo apt-get update - sudo apt-get install -y libclang-dev - echo "PHP_CONFIG=$(command -v php-config)" >> "$GITHUB_ENV" - LIBCLANG_SO="$(find /usr/lib -name 'libclang.so*' | head -n 1)" - echo "LIBCLANG_PATH=$(dirname "$LIBCLANG_SO")" >> "$GITHUB_ENV" - - - name: Install Composer dependencies (root) - uses: ramsey/composer-install@v3 - with: - ignore-cache: "yes" - composer-options: "--optimize-autoloader" - - - name: Install Composer dependencies (mysql-on-sqlite) - uses: ramsey/composer-install@v3 - with: - working-directory: packages/mysql-on-sqlite - ignore-cache: "yes" - composer-options: "--optimize-autoloader" - - - name: Check Rust formatting - if: matrix.php == '8.2' && matrix.native - run: cargo fmt --check - working-directory: packages/php-ext-wp-mysql-parser - - - name: Build parser extension - if: matrix.native - run: cargo build - working-directory: 
packages/php-ext-wp-mysql-parser - - - name: Verify native parser extension - if: matrix.native - run: php -d extension="$GITHUB_WORKSPACE/packages/php-ext-wp-mysql-parser/target/debug/libwp_mysql_parser.so" tests/tools/verify-native-parser-extension.php - working-directory: packages/mysql-on-sqlite - - - name: Run full PHPUnit suite with parser extension - if: matrix.native - env: - WP_SQLITE_REQUIRE_NATIVE_PARSER_EXTENSION: '1' - run: php -d extension="$GITHUB_WORKSPACE/packages/php-ext-wp-mysql-parser/target/debug/libwp_mysql_parser.so" ./vendor/bin/phpunit -c ./phpunit.xml.dist - working-directory: packages/mysql-on-sqlite - - - name: Run full PHPUnit suite - if: ${{ ! matrix.native }} - run: php ./vendor/bin/phpunit -c ./phpunit.xml.dist - working-directory: packages/mysql-on-sqlite diff --git a/.github/workflows/mysql-proxy-tests.yml b/.github/workflows/mysql-proxy-tests.yml index e24c36f3..6e6a0afa 100644 --- a/.github/workflows/mysql-proxy-tests.yml +++ b/.github/workflows/mysql-proxy-tests.yml @@ -26,6 +26,7 @@ jobs: uses: shivammathur/setup-php@v2 with: php-version: '7.4' + coverage: none - name: Install Composer dependencies uses: ramsey/composer-install@v3 diff --git a/.github/workflows/phpunit-tests-run.yml b/.github/workflows/phpunit-tests-run.yml deleted file mode 100644 index 2eec8ee2..00000000 --- a/.github/workflows/phpunit-tests-run.yml +++ /dev/null @@ -1,113 +0,0 @@ -name: Run PHPUnit tests - -on: - workflow_call: - inputs: - os: - description: 'Operating system to run tests on' - required: false - type: 'string' - default: 'ubuntu-latest' - php: - description: 'The version of PHP to use, in the format of X.Y' - required: true - type: 'string' - sqlite: - description: 'SQLite version to install (e.g., 3.24.0). Leave empty for latest version.' - required: false - type: 'string' - default: 'latest' -env: - LOCAL_PHP: ${{ inputs.php }}-fpm - -# Disable permissions for all available scopes by default. 
-# Any needed permissions should be configured at the job level. -permissions: {} - -jobs: - phpunit-tests: - name: ${{ inputs.os }} - runs-on: ${{ inputs.os }} - timeout-minutes: 20 - permissions: - contents: read # Required to clone the repo. - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up SQLite - run: | - VERSION='${{ inputs.sqlite }}' - if [ "$VERSION" = 'latest' ]; then - TAG='release' - else - TAG="version-${VERSION}" - fi - SQLITE_SOURCE="https://sqlite.org/src/tarball/sqlite.tar.gz?r=${TAG}" - SQLITE_MIRROR="https://github.com/sqlite/sqlite/archive/refs/tags/${TAG}.tar.gz" - DOWNLOADED=0 - for url in "$SQLITE_SOURCE" "$SQLITE_MIRROR"; do - for attempt in 1 2 3 4 5; do - if wget -O sqlite.tar.gz "$url"; then - DOWNLOADED=1 - break 2 - fi - if [ "$attempt" -lt 5 ]; then - sleep $(( attempt * 10 )) - fi - done - done - if [ "$DOWNLOADED" -ne 1 ]; then - exit 1 - fi - tar xzf sqlite.tar.gz - if [ ! -d sqlite ]; then - SQLITE_DIR=$(find . -maxdepth 1 -type d -name 'sqlite-*' | head -n 1) - if [ -z "$SQLITE_DIR" ]; then - exit 1 - fi - mv "$SQLITE_DIR" sqlite - fi - cd sqlite - ./configure --prefix=/usr/local CFLAGS="-DSQLITE_ENABLE_COLUMN_METADATA -DSQLITE_ENABLE_FTS5 -DSQLITE_USE_URI -DSQLITE_ENABLE_JSON1" LDFLAGS="-lm" - make -j$(nproc) - sudo make install - sudo ldconfig - - - name: Set up PHP - uses: shivammathur/setup-php@v2 - with: - php-version: '${{ inputs.php }}' - tools: phpunit-polyfills - - - name: Verify SQLite version in PHP - run: | - EXPECTED='${{ inputs.sqlite }}' - if [ "$EXPECTED" = 'latest' ]; then - EXPECTED=$(cat sqlite/VERSION) - fi - PDO=$(php -r "echo (new PDO('sqlite::memory'))->query('SELECT SQLITE_VERSION();')->fetch()[0];") - echo "Expected SQLite version: $EXPECTED" - echo "PHP PDO SQLite version: $PDO" - if [ "$EXPECTED" != "$PDO" ]; then - echo "Error: Expected SQLite version $EXPECTED, but PHP PDO uses $PDO" - exit 1 - fi - - - name: Install Composer dependencies (root) - uses: 
ramsey/composer-install@v3 - with: - ignore-cache: "yes" - composer-options: "--optimize-autoloader" - - - name: Install Composer dependencies (mysql-on-sqlite) - uses: ramsey/composer-install@v3 - with: - working-directory: packages/mysql-on-sqlite - ignore-cache: "yes" - composer-options: "--optimize-autoloader" - - - name: Run PHPUnit tests - run: php ./vendor/bin/phpunit -c ./phpunit.xml.dist - working-directory: packages/mysql-on-sqlite diff --git a/.github/workflows/phpunit-tests.yml b/.github/workflows/phpunit-tests.yml index 23293087..5126e2ea 100644 --- a/.github/workflows/phpunit-tests.yml +++ b/.github/workflows/phpunit-tests.yml @@ -3,8 +3,25 @@ name: PHPUnit Tests on: push: branches: - - main + - trunk + paths: + - '.github/workflows/phpunit-tests.yml' + - 'packages/mysql-on-sqlite/**' + - 'packages/php-ext-wp-mysql-parser/**' + - 'composer.json' + - 'composer.lock' pull_request: + paths: + - '.github/workflows/phpunit-tests.yml' + - 'packages/mysql-on-sqlite/**' + - 'packages/php-ext-wp-mysql-parser/**' + - 'composer.json' + - 'composer.lock' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true # Disable permissions for all available scopes by default. # Any needed permissions should be configured at the job level. @@ -12,38 +29,160 @@ permissions: {} jobs: test: - name: PHP ${{ matrix.php }} / SQLite ${{ matrix.sqlite || 'latest' }} - uses: ./.github/workflows/phpunit-tests-run.yml + # The pure-PHP parser is exercised across the full PHP/SQLite range; the + # native Rust parser extension is exercised on PHP 8.0+ (its minimum). Both + # run the same mysql-on-sqlite suite, just with a different parser engine. + name: PHP ${{ matrix.php }}${{ matrix.extension && ' + ext-wp-mysql-parser' || '' }} / SQLite ${{ matrix.sqlite }} + runs-on: ubuntu-latest + timeout-minutes: 30 permissions: contents: read # Required to clone the repo. 
- secrets: inherit strategy: fail-fast: false matrix: - os: [ ubuntu-latest ] - php: [ '7.2', '7.3', '7.4', '8.0', '8.1', '8.2', '8.3', '8.4', '8.5' ] include: - # Add specific SQLite versions for specific PHP versions here: - - php: '7.2' - sqlite: '3.27.0' # minimum version with WP_SQLITE_UNSAFE_ENABLE_UNSUPPORTED_VERSIONS - - php: '7.3' - sqlite: '3.31.1' # Ubuntu 20.04 LTS - - php: '7.4' - sqlite: '3.34.1' # Debian 11 (Bullseye), common with PHP < 8.1 - - php: '8.0' - sqlite: '3.37.0' # minimum supported version (STRICT table support), Ubuntu 22.04 LTS (3.37.2) - - php: '8.1' - sqlite: '3.40.1' # Debian 12 (Bookworm) - - php: '8.2' - sqlite: '3.45.1' # Ubuntu 24.04 LTS - - php: '8.3' - sqlite: '3.46.1' # Debian 13 (Trixie), Ubuntu >= 24.10 - - php: '8.4' - sqlite: '3.51.2' # First 2026 release - - php: '8.5' - sqlite: 'latest' - - with: - os: ${{ matrix.os }} - php: ${{ matrix.php }} - sqlite: ${{ matrix.sqlite || 'latest' }} + # Pure-PHP parser, across the supported PHP versions, each pinned to a + # representative SQLite version spanning the supported range. + - { php: '7.2', sqlite: '3.27.0', extension: false } # minimum with WP_SQLITE_UNSAFE_ENABLE_UNSUPPORTED_VERSIONS + - { php: '7.3', sqlite: '3.31.1', extension: false } # Ubuntu 20.04 LTS + - { php: '7.4', sqlite: '3.34.1', extension: false } # Debian 11 (Bullseye) + - { php: '8.0', sqlite: '3.37.0', extension: false } # minimum supported version (STRICT tables) + - { php: '8.1', sqlite: '3.40.1', extension: false } # Debian 12 (Bookworm) + - { php: '8.2', sqlite: '3.45.1', extension: false } # Ubuntu 24.04 LTS + - { php: '8.3', sqlite: '3.46.1', extension: false } # Debian 13 (Trixie) + - { php: '8.4', sqlite: '3.51.2', extension: false } # First 2026 release + - { php: '8.5', sqlite: 'latest', extension: false } + # Native Rust parser extension (requires PHP 8.0+). 
+ - { php: '8.0', sqlite: '3.37.0', extension: true } + - { php: '8.1', sqlite: '3.40.1', extension: true } + - { php: '8.2', sqlite: '3.45.1', extension: true } + - { php: '8.3', sqlite: '3.46.1', extension: true } + - { php: '8.4', sqlite: '3.51.2', extension: true } + - { php: '8.5', sqlite: 'latest', extension: true } + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up SQLite + run: | + VERSION='${{ matrix.sqlite }}' + if [ "$VERSION" = 'latest' ]; then + TAG='release' + else + TAG="version-${VERSION}" + fi + SQLITE_SOURCE="https://sqlite.org/src/tarball/sqlite.tar.gz?r=${TAG}" + SQLITE_MIRROR="https://github.com/sqlite/sqlite/archive/refs/tags/${TAG}.tar.gz" + DOWNLOADED=0 + for url in "$SQLITE_SOURCE" "$SQLITE_MIRROR"; do + for attempt in 1 2 3 4 5; do + if wget -O sqlite.tar.gz "$url"; then + DOWNLOADED=1 + break 2 + fi + if [ "$attempt" -lt 5 ]; then + sleep $(( attempt * 10 )) + fi + done + done + if [ "$DOWNLOADED" -ne 1 ]; then + exit 1 + fi + tar xzf sqlite.tar.gz + if [ ! -d sqlite ]; then + SQLITE_DIR=$(find . 
-maxdepth 1 -type d -name 'sqlite-*' | head -n 1) + if [ -z "$SQLITE_DIR" ]; then + exit 1 + fi + mv "$SQLITE_DIR" sqlite + fi + cd sqlite + ./configure --prefix=/usr/local CFLAGS="-DSQLITE_ENABLE_COLUMN_METADATA -DSQLITE_ENABLE_FTS5 -DSQLITE_USE_URI -DSQLITE_ENABLE_JSON1" LDFLAGS="-lm" + make -j$(nproc) + sudo make install + sudo ldconfig + + - name: Set up PHP + uses: shivammathur/setup-php@v2 + with: + php-version: ${{ matrix.php }} + coverage: none + tools: phpunit-polyfills + + - name: Verify SQLite version in PHP + run: | + EXPECTED='${{ matrix.sqlite }}' + if [ "$EXPECTED" = 'latest' ]; then + EXPECTED=$(cat sqlite/VERSION) + fi + PDO=$(php -r "echo (new PDO('sqlite::memory'))->query('SELECT SQLITE_VERSION();')->fetch()[0];") + echo "Expected SQLite version: $EXPECTED" + echo "PHP PDO SQLite version: $PDO" + if [ "$EXPECTED" != "$PDO" ]; then + echo "Error: Expected SQLite version $EXPECTED, but PHP PDO uses $PDO" + exit 1 + fi + + - name: Set up Rust + if: matrix.extension + uses: dtolnay/rust-toolchain@stable + + - name: Cache Rust build + if: matrix.extension + uses: Swatinem/rust-cache@v2 + with: + workspaces: packages/php-ext-wp-mysql-parser + # Segregate by PHP version: the extension links against the PHP headers + # of the matrix's php-config, so a build cached for one PHP version is + # ABI-incompatible with another (Zend module API mismatch on load). 
+ key: php-${{ matrix.php }} + + - name: Install native build dependencies + if: matrix.extension + run: | + sudo apt-get update + sudo apt-get install -y libclang-dev + echo "PHP_CONFIG=$(command -v php-config)" >> "$GITHUB_ENV" + LIBCLANG_SO="$(find /usr/lib -name 'libclang.so*' | head -n 1)" + echo "LIBCLANG_PATH=$(dirname "$LIBCLANG_SO")" >> "$GITHUB_ENV" + + - name: Install Composer dependencies (root) + uses: ramsey/composer-install@v3 + with: + ignore-cache: "yes" + composer-options: "--optimize-autoloader" + + - name: Install Composer dependencies (mysql-on-sqlite) + uses: ramsey/composer-install@v3 + with: + working-directory: packages/mysql-on-sqlite + ignore-cache: "yes" + composer-options: "--optimize-autoloader" + + - name: Check Rust formatting + if: ${{ matrix.extension && matrix.php == '8.2' }} + run: cargo fmt --check + working-directory: packages/php-ext-wp-mysql-parser + + - name: Build parser extension + if: matrix.extension + run: cargo build --release + working-directory: packages/php-ext-wp-mysql-parser + + - name: Verify native parser extension + if: matrix.extension + run: php -d extension="$GITHUB_WORKSPACE/packages/php-ext-wp-mysql-parser/target/release/libwp_mysql_parser.so" tests/tools/verify-native-parser-extension.php + working-directory: packages/mysql-on-sqlite + + - name: Run PHPUnit suite with parser extension + if: matrix.extension + env: + WP_SQLITE_REQUIRE_NATIVE_PARSER_EXTENSION: '1' + run: php -d extension="$GITHUB_WORKSPACE/packages/php-ext-wp-mysql-parser/target/release/libwp_mysql_parser.so" ./vendor/bin/phpunit -c ./phpunit.xml.dist + working-directory: packages/mysql-on-sqlite + + - name: Run PHPUnit suite + if: ${{ ! 
matrix.extension }} + run: php ./vendor/bin/phpunit -c ./phpunit.xml.dist + working-directory: packages/mysql-on-sqlite diff --git a/.github/workflows/release-publish.yml b/.github/workflows/release-publish.yml index 7df54551..d02c88e3 100644 --- a/.github/workflows/release-publish.yml +++ b/.github/workflows/release-publish.yml @@ -54,6 +54,7 @@ jobs: uses: shivammathur/setup-php@v2 with: php-version: '8.2' + coverage: none - name: Build plugin zip run: composer run build-sqlite-plugin-zip diff --git a/.github/workflows/wp-tests-phpunit-native-extension-setup.sh b/.github/workflows/wp-tests-phpunit-native-extension-setup.sh index 943b06c5..c6ba3972 100644 --- a/.github/workflows/wp-tests-phpunit-native-extension-setup.sh +++ b/.github/workflows/wp-tests-phpunit-native-extension-setup.sh @@ -145,7 +145,7 @@ $parser = new WP_MySQL_Parser( $grammar, $tokens ); wp_sqlite_assert_native_parser_delegate( $parser, 'WordPress PHP test container did not select the native parser delegate.' ); $parser_ast = $parser->parse(); -if ( ! ( $parser_ast instanceof WP_MySQL_Native_Parser_Node ) ) { +if ( ! ( $parser_ast instanceof WP_Parser_Node ) ) { wp_sqlite_native_parser_verification_fail( 'Native parser did not produce a native-backed AST in the WordPress PHP test container.' ); } @@ -155,18 +155,13 @@ wp_sqlite_assert_native_parser_delegate( $parser, 'WordPress PHP test container $parser->next_query(); $ast = $parser->get_query_ast(); -if ( ! ( $ast instanceof WP_MySQL_Native_Parser_Node ) ) { +if ( ! ( $ast instanceof WP_Parser_Node ) ) { wp_sqlite_native_parser_verification_fail( 'WordPress PHP test container did not select the native-backed AST.' ); } -$reflection = new ReflectionObject( $ast ); -if ( $reflection->hasProperty( 'native_ast' ) || $reflection->hasProperty( 'native_node_index' ) ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper still stores Rust AST handle properties.' ); -} - $first = $ast->get_first_child_node(); -if ( ! 
( $first instanceof WP_MySQL_Native_Parser_Node ) ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper did not return a native-backed child node.' ); +if ( ! ( $first instanceof WP_Parser_Node ) ) { + wp_sqlite_native_parser_verification_fail( 'Native wrapper did not return a child node.' ); } if ( $first !== $ast->get_first_child_node() ) { @@ -177,7 +172,7 @@ $synthetic = new WP_Parser_Node( 0, 'synthetic' ); $first->append_child( $synthetic ); $same_first = $ast->get_first_child_node(); if ( $same_first !== $first || ! in_array( $synthetic, $same_first->get_children(), true ) ) { - wp_sqlite_native_parser_verification_fail( 'Materialized native wrapper was lost from the parent cache.' ); + wp_sqlite_native_parser_verification_fail( 'Mutated child was lost from the parent.' ); } EOF diff --git a/packages/mysql-on-sqlite/src/load.php b/packages/mysql-on-sqlite/src/load.php index 62387a2e..fb0c8c3b 100644 --- a/packages/mysql-on-sqlite/src/load.php +++ b/packages/mysql-on-sqlite/src/load.php @@ -13,24 +13,57 @@ require_once __DIR__ . '/parser/class-wp-parser-token.php'; require_once __DIR__ . '/mysql/class-wp-mysql-token.php'; -/* - * The MySQL lexer and parser have an optional native (e.g. Rust) implementation. - * When the native extension is loaded, it pre-declares WP_MySQL_Native_Lexer / - * WP_MySQL_Native_Parser; otherwise we fall back to the pure-PHP classes shipped - * here. WP_MySQL_Lexer / WP_MySQL_Parser is the public entrypoint either way. +/** + * Whether the loaded "wp_mysql_parser" extension speaks a grammar ABI that this + * code supports. + * + * The native parser and PHP exchange the parser grammar via + * "wp_sqlite_mysql_native_export_grammar()"; the shape of that data is an ABI. + * Compatibility is tracked by the extension's minor version (the "x" in "0.x"): + * a backward-incompatible change to the grammar ABI bumps the minor version. + * This code supports the "0.2.x" line. A version outside the supported range - + * e.g. 
an older extension binary lagging a plugin update - cannot exchange the + * grammar safely and must fall back to the pure-PHP path. + * + * Keep the supported range in sync with the extension's "Cargo.toml" version + * (see "packages/php-ext-wp-mysql-parser/README.md"). + * + * @param string|false $extension_version Version reported by "phpversion( 'wp_mysql_parser' )". + * @return bool Whether the native lexer/parser path can be used. */ -if ( class_exists( 'WP_MySQL_Native_Lexer', false ) ) { - require_once __DIR__ . '/mysql/native/class-wp-mysql-lexer.php'; -} else { - require_once __DIR__ . '/mysql/class-wp-mysql-lexer.php'; +function wp_sqlite_mysql_native_grammar_abi_supported( $extension_version ): bool { + if ( ! is_string( $extension_version ) ) { + return false; + } + return version_compare( $extension_version, '0.2.0', '>=' ) + && version_compare( $extension_version, '0.3.0', '<' ); } -if ( class_exists( 'WP_MySQL_Native_Parser', false ) ) { +/* + * The MySQL lexer and parser have an optional native (e.g. Rust) implementation, + * registered by the "wp_mysql_parser" extension. When loaded, it pre-declares + * WP_MySQL_Native_Lexer / WP_MySQL_Native_Parser; otherwise we use the pure-PHP + * classes shipped here. WP_MySQL_Lexer / WP_MySQL_Parser is the public entrypoint + * either way. + * + * The native lexer and parser are a matched pair - the native lexer emits a token + * stream that only the native parser can consume - so they are selected together + * or not at all. We only select the native path when the loaded extension speaks a + * grammar ABI this code supports; otherwise (including a stale extension binary) we + * fall back to the pure-PHP path cleanly instead of failing at parse time. 
+ */ +$wp_sqlite_use_native_parser = + class_exists( 'WP_MySQL_Native_Lexer', false ) + && class_exists( 'WP_MySQL_Native_Parser', false ) + && wp_sqlite_mysql_native_grammar_abi_supported( phpversion( 'wp_mysql_parser' ) ); + +if ( $wp_sqlite_use_native_parser ) { + require_once __DIR__ . '/mysql/native/class-wp-mysql-lexer.php'; require_once __DIR__ . '/mysql/native/mysql-rust-bridge.php'; - require_once __DIR__ . '/mysql/native/class-wp-mysql-native-parser-node.php'; require_once __DIR__ . '/mysql/native/trait-wp-mysql-native-parser-impl.php'; require_once __DIR__ . '/mysql/native/class-wp-mysql-parser.php'; } else { + require_once __DIR__ . '/mysql/class-wp-mysql-lexer.php'; require_once __DIR__ . '/mysql/class-wp-mysql-parser.php'; } require_once __DIR__ . '/sqlite/class-wp-sqlite-connection.php'; diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 10ecd90a..37a79ffa 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2111,6 +2111,13 @@ class WP_MySQL_Lexer { */ private $sql; + /** + * Byte length of the SQL payload. + * + * @var int + */ + private $sql_length; + /** * The version of the MySQL server that the SQL payload is intended for. * @@ -2189,6 +2196,7 @@ public function __construct( array $sql_modes = array() ) { $this->sql = $sql; + $this->sql_length = strlen( $sql ); $this->mysql_version = $mysql_version; foreach ( $sql_modes as $sql_mode ) { @@ -2227,6 +2235,9 @@ public function next_token(): bool { return false; } + // Skip leading whitespace inline for optimal performance. 
+ $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + do { $this->token_starts_at = $this->bytes_already_read; $this->token_type = $this->read_next_token(); @@ -2281,13 +2292,59 @@ public function get_token(): ?WP_MySQL_Token { * This method can be used to tokenize the whole SQL payload at once, at the * expense of storing all token objects in memory at the same time. * + * This deliberately inlines the same tokenizer step as next_token() instead + * of looping over next_token()/get_token(), to avoid a method call and a + * token-object round trip per token. Keep the EOF/invalid-input guard, the + * whitespace skip, and the comment-skip do-while in sync with next_token(). + * * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens. */ public function remaining_tokens(): array { - $tokens = array(); - while ( true === $this->next_token() ) { - $token = $this->get_token(); - $tokens[] = $token; + $tokens = array(); + $no_backslash_escapes_sql_mode_set = $this->is_sql_mode_active( + self::SQL_MODE_NO_BACKSLASH_ESCAPES + ); + + while ( true ) { + // Bail on EOF, or on a null token type once at least one byte has + // been consumed (read_next_token() hit invalid input mid-stream). + if ( + self::EOF === $this->token_type + || ( null === $this->token_type && $this->bytes_already_read > 0 ) + ) { + $this->token_type = null; + break; + } + + // Skip leading whitespace inline for optimal performance. 
+ $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + + do { + $this->token_starts_at = $this->bytes_already_read; + $this->token_type = $this->read_next_token(); + } while ( + self::WHITESPACE === $this->token_type + || self::COMMENT === $this->token_type + || self::MYSQL_COMMENT_START === $this->token_type + || self::MYSQL_COMMENT_END === $this->token_type + ); + + if ( null === $this->token_type ) { + break; + } + + $tokens[] = new WP_MySQL_Token( + $this->token_type, + $this->token_starts_at, + $this->bytes_already_read - $this->token_starts_at, + $this->sql, + $no_backslash_escapes_sql_mode_set + ); + + if ( self::EOF === $this->token_type ) { + $this->token_type = null; + break; + } } return $tokens; } @@ -2354,20 +2411,61 @@ private function read_next_token(): ?int { $byte = $this->sql[ $this->bytes_already_read ] ?? null; $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; - if ( "'" === $byte || '"' === $byte || '`' === $byte ) { + // A map for a single-byte symbol fast path. + static $single_byte_ops = array( + '(' => self::OPEN_PAR_SYMBOL, + ')' => self::CLOSE_PAR_SYMBOL, + ',' => self::COMMA_SYMBOL, + ';' => self::SEMICOLON_SYMBOL, + '+' => self::PLUS_OPERATOR, + '~' => self::BITWISE_NOT_OPERATOR, + '%' => self::MOD_OPERATOR, + '^' => self::BITWISE_XOR_OPERATOR, + '?' => self::PARAM_MARKER, + '{' => self::OPEN_CURLY_SYMBOL, + '}' => self::CLOSE_CURLY_SYMBOL, + '=' => self::EQUAL_OPERATOR, + ); + + // Fast path for keywords and identifiers. + // `$byte > "\x7F"` catches any non-ASCII byte (0x80-0xFF); read_identifier() + // restricts the accepted identifier codepoints to U+0080-U+FFFF. + // `"'" !== $next_byte` defers x'..', n'..' and similar special + // literals to their dedicated branches below; only single quotes + // form those, regardless of SQL mode. 
+ if ( + ( + ( $byte >= 'a' && $byte <= 'z' ) + || ( $byte >= 'A' && $byte <= 'Z' ) + || $byte > "\x7F" + ) + && "'" !== $next_byte + ) { + $started_at = $this->bytes_already_read; + $type = $this->read_identifier(); + if ( self::IDENTIFIER === $type ) { + // When preceded by a dot, it is always an identifier. + if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) { + $type = self::IDENTIFIER; + } else { + $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); + } + } + } elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) { + // Fast path for single-byte symbols. + $this->bytes_already_read += 1; + $type = $single_byte_ops[ $byte ]; + } elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); - } elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) { + } elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) { $type = $this->read_number(); } elseif ( '.' === $byte ) { - if ( null !== $next_byte && strspn( $next_byte, self::DIGIT_MASK ) > 0 ) { + if ( null !== $next_byte && $next_byte >= '0' && $next_byte <= '9' ) { $type = $this->read_number(); } else { $this->bytes_already_read += 1; $type = self::DOT_SYMBOL; } - } elseif ( '=' === $byte ) { - $this->bytes_already_read += 1; - $type = self::EQUAL_OPERATOR; } elseif ( ':' === $byte ) { $this->bytes_already_read += 1; // Consume the ':'. if ( '=' === $next_byte ) { @@ -2414,14 +2512,17 @@ private function read_next_token(): ?int { } else { $type = self::LOGICAL_NOT_OPERATOR; } - } elseif ( '+' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PLUS_OPERATOR; } elseif ( '-' === $byte ) { + $third_byte = $this->sql[ $this->bytes_already_read + 2 ] ?? 
null; if ( '-' === $next_byte - && $this->bytes_already_read + 2 < strlen( $this->sql ) - && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::WHITESPACE_MASK ) > 0 + && ( + ' ' === $third_byte + || "\t" === $third_byte + || "\n" === $third_byte + || "\r" === $third_byte + || "\f" === $third_byte + ) ) { $type = $this->read_line_comment(); } elseif ( '>' === $next_byte ) { @@ -2466,9 +2567,6 @@ private function read_next_token(): ?int { $this->bytes_already_read += 1; $type = self::DIV_OPERATOR; } - } elseif ( '%' === $byte ) { - $this->bytes_already_read += 1; - $type = self::MOD_OPERATOR; } elseif ( '&' === $byte ) { $this->bytes_already_read += 1; // Consume the '&'. if ( '&' === $next_byte ) { @@ -2477,9 +2575,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_AND_OPERATOR; } - } elseif ( '^' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_XOR_OPERATOR; } elseif ( '|' === $byte ) { $this->bytes_already_read += 1; // Consume the '|'. if ( '|' === $next_byte ) { @@ -2490,27 +2585,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_OR_OPERATOR; } - } elseif ( '~' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_NOT_OPERATOR; - } elseif ( ',' === $byte ) { - $this->bytes_already_read += 1; - $type = self::COMMA_SYMBOL; - } elseif ( ';' === $byte ) { - $this->bytes_already_read += 1; - $type = self::SEMICOLON_SYMBOL; - } elseif ( '(' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_PAR_SYMBOL; - } elseif ( ')' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_PAR_SYMBOL; - } elseif ( '{' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_CURLY_SYMBOL; - } elseif ( '}' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_CURLY_SYMBOL; } elseif ( '@' === $byte ) { $this->bytes_already_read += 1; // Consume the '@'. 
@@ -2534,9 +2608,6 @@ private function read_next_token(): ?int { $type = self::AT_SIGN_SYMBOL; } } - } elseif ( '?' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PARAM_MARKER; } elseif ( '\\' === $byte ) { $this->bytes_already_read += 1; // Consume the '\'. if ( 'N' === $next_byte ) { @@ -2547,7 +2618,13 @@ private function read_next_token(): ?int { } } elseif ( '#' === $byte ) { $type = $this->read_line_comment(); - } elseif ( null !== $byte && strspn( $byte, self::WHITESPACE_MASK ) > 0 ) { + } elseif ( + ' ' === $byte + || "\t" === $byte + || "\n" === $byte + || "\r" === $byte + || "\f" === $byte + ) { $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); $type = self::WHITESPACE; } elseif ( ( 'x' === $byte || 'X' === $byte || 'b' === $byte || 'B' === $byte ) && "'" === $next_byte ) { @@ -2675,7 +2752,7 @@ private function read_number(): ?int { '0' === $byte && 'x' === $next_byte && null !== $third_byte - && strspn( $third_byte, self::HEX_DIGIT_MASK ) > 0 + && false !== strpos( self::HEX_DIGIT_MASK, $third_byte ) ) // HEX number in the form of x'N' or X'N'. || ( ( 'x' === $byte || 'X' === $byte ) && "'" === $next_byte ) @@ -2685,7 +2762,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, self::HEX_DIGIT_MASK, $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. @@ -2708,7 +2785,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. 
@@ -2737,11 +2814,12 @@ private function read_number(): ?int { ( 'e' === $byte || 'E' === $byte ) && null !== $next_byte && ( - strspn( $next_byte, self::DIGIT_MASK ) > 0 + ( $next_byte >= '0' && $next_byte <= '9' ) || ( ( '+' === $next_byte || '-' === $next_byte ) - && $this->bytes_already_read + 2 < strlen( $this->sql ) - && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::DIGIT_MASK ) > 0 + && $this->bytes_already_read + 2 < $this->sql_length + && $this->sql[ $this->bytes_already_read + 2 ] >= '0' + && $this->sql[ $this->bytes_already_read + 2 ] <= '9' ) ); if ( $has_exponent ) { @@ -2838,12 +2916,11 @@ private function read_quoted_text(): ?int { // in which case the escape sequence is consumed and the loop continues. $at = $this->bytes_already_read; while ( true ) { - $at += strcspn( $this->sql, $quote, $at ); - - // Unclosed string - unexpected EOF. - if ( ( $this->sql[ $at ] ?? null ) !== $quote ) { + $quote_at = strpos( $this->sql, $quote, $at ); + if ( false === $quote_at ) { return null; // Invalid input. } + $at = $quote_at; /* * By default, quotes can be escaped with a "\". @@ -2853,9 +2930,17 @@ private function read_quoted_text(): ?int { * The quote is escaped only when the number of preceding backslashes * is odd - "\" is an escape sequence, "\\" is an escaped backslash, * "\\\" is an escaped backslash and an escape sequence, and so on. + * + * The `($at - $i - 1) >= 0` guard prevents PHP's negative-string- + * offset wraparound (PHP 7.1+) when the closing-quote candidate + * sits at the very start of the input. The `?? null` covers + * positive out-of-range indexes belt-and-suspenders. */ if ( ! $no_backslash_escapes ) { - for ( $i = 0; ( $at - $i - 1 ) >= 0 && '\\' === $this->sql[ $at - $i - 1 ]; $i += 1 ); + $i = 0; + while ( ( $at - $i - 1 ) >= 0 && '\\' === ( $this->sql[ $at - $i - 1 ] ?? 
null ) ) { + $i += 1; + } if ( 1 === $i % 2 ) { $at += 1; continue; @@ -2920,17 +3005,11 @@ private function read_mysql_comment(): int { } private function read_comment_content(): void { - while ( true ) { - $this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read ); - $this->bytes_already_read += 1; // Consume the '*'. - $byte = $this->sql[ $this->bytes_already_read ] ?? null; - if ( null === $byte ) { - break; - } - if ( '/' === $byte ) { - $this->bytes_already_read += 1; // Consume the '/'. - break; - } + $comment_end = strpos( $this->sql, '*/', $this->bytes_already_read ); + if ( false === $comment_end ) { + $this->bytes_already_read = $this->sql_length; + } else { + $this->bytes_already_read = $comment_end + 2; } } diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php index 69282b9c..4b74a904 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php @@ -14,8 +14,7 @@ class WP_MySQL_Parser extends WP_Parser { * @param array $tokens The parser tokens. */ public function reset_tokens( array $tokens ): void { - $this->tokens = $tokens; - $this->position = 0; + $this->set_tokens( $tokens ); $this->current_ast = null; } @@ -40,7 +39,7 @@ public function reset_tokens( array $tokens ): void { * @return bool Whether a query was successfully parsed. 
*/ public function next_query(): bool { - if ( $this->position >= count( $this->tokens ) ) { + if ( $this->position >= $this->token_count ) { return false; } $this->current_ast = $this->parse(); diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php index 1fb25ab4..2853c7c6 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php @@ -30,7 +30,15 @@ public function __construct( string $input, bool $sql_mode_no_backslash_escapes_enabled ) { - parent::__construct( $id, $start, $length, $input ); + // Assign the inherited fields directly instead of calling + // parent::__construct(). The lexer builds one token per recognized + // token, so skipping the parent call is a measurable hot-path win. + // Keep these assignments in sync with WP_Parser_Token's fields. + $this->id = $id; + $this->start = $start; + $this->length = $length; + $this->input = $input; + $this->sql_mode_no_backslash_escapes_enabled = $sql_mode_no_backslash_escapes_enabled; } diff --git a/packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php b/packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php deleted file mode 100644 index 47c2b240..00000000 --- a/packages/mysql-on-sqlite/src/mysql/native/class-wp-mysql-native-parser-node.php +++ /dev/null @@ -1,179 +0,0 @@ -materialize_native_children(); - parent::append_child( $node ); - } - - /** @inheritDoc */ - public function merge_fragment( $node ) { - $this->materialize_native_children(); - if ( $node instanceof self ) { - $node->materialize_native_children(); - } - parent::merge_fragment( $node ); - } - - /** @inheritDoc */ - public function has_child(): bool { - if ( $this->was_mutated ) { - return parent::has_child(); - } - return wp_sqlite_mysql_native_ast_has_child( $this ); - } - - /** @inheritDoc */ - public function has_child_node( ?string 
$rule_name = null ): bool { - if ( $this->was_mutated ) { - return parent::has_child_node( $rule_name ); - } - return wp_sqlite_mysql_native_ast_has_child_node( $this, $rule_name ); - } - - /** @inheritDoc */ - public function has_child_token( ?int $token_id = null ): bool { - if ( $this->was_mutated ) { - return parent::has_child_token( $token_id ); - } - return wp_sqlite_mysql_native_ast_has_child_token( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_first_child() { - if ( $this->was_mutated ) { - return parent::get_first_child(); - } - return wp_sqlite_mysql_native_ast_get_first_child( $this ); - } - - /** @inheritDoc */ - public function get_first_child_node( ?string $rule_name = null ): ?WP_Parser_Node { - if ( $this->was_mutated ) { - return parent::get_first_child_node( $rule_name ); - } - return wp_sqlite_mysql_native_ast_get_first_child_node( $this, $rule_name ); - } - - /** @inheritDoc */ - public function get_first_child_token( ?int $token_id = null ): ?WP_Parser_Token { - if ( $this->was_mutated ) { - return parent::get_first_child_token( $token_id ); - } - return wp_sqlite_mysql_native_ast_get_first_child_token( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_first_descendant_node( ?string $rule_name = null ): ?WP_Parser_Node { - if ( $this->was_mutated ) { - return parent::get_first_descendant_node( $rule_name ); - } - return wp_sqlite_mysql_native_ast_get_first_descendant_node( $this, $rule_name ); - } - - /** @inheritDoc */ - public function get_first_descendant_token( ?int $token_id = null ): ?WP_Parser_Token { - if ( $this->was_mutated ) { - return parent::get_first_descendant_token( $token_id ); - } - return wp_sqlite_mysql_native_ast_get_first_descendant_token( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_children(): array { - if ( $this->was_mutated ) { - return parent::get_children(); - } - return wp_sqlite_mysql_native_ast_get_children( $this ); - } - - /** @inheritDoc */ - 
public function get_child_nodes( ?string $rule_name = null ): array { - if ( $this->was_mutated ) { - return parent::get_child_nodes( $rule_name ); - } - return wp_sqlite_mysql_native_ast_get_child_nodes( $this, $rule_name ); - } - - /** @inheritDoc */ - public function get_child_tokens( ?int $token_id = null ): array { - if ( $this->was_mutated ) { - return parent::get_child_tokens( $token_id ); - } - return wp_sqlite_mysql_native_ast_get_child_tokens( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_descendants(): array { - if ( $this->was_mutated ) { - return parent::get_descendants(); - } - return wp_sqlite_mysql_native_ast_get_descendants( $this ); - } - - /** @inheritDoc */ - public function get_descendant_nodes( ?string $rule_name = null ): array { - if ( $this->was_mutated ) { - return parent::get_descendant_nodes( $rule_name ); - } - return wp_sqlite_mysql_native_ast_get_descendant_nodes( $this, $rule_name ); - } - - /** @inheritDoc */ - public function get_descendant_tokens( ?int $token_id = null ): array { - if ( $this->was_mutated ) { - return parent::get_descendant_tokens( $token_id ); - } - return wp_sqlite_mysql_native_ast_get_descendant_tokens( $this, $token_id ); - } - - /** @inheritDoc */ - public function get_start(): int { - if ( $this->was_mutated ) { - return parent::get_start(); - } - return wp_sqlite_mysql_native_ast_get_start( $this ); - } - - /** @inheritDoc */ - public function get_length(): int { - if ( $this->was_mutated ) { - return parent::get_length(); - } - return wp_sqlite_mysql_native_ast_get_length( $this ); - } - - private function materialize_native_children(): void { - if ( $this->was_mutated ) { - return; - } - - $this->children = wp_sqlite_mysql_native_ast_get_children( $this ); - $this->was_mutated = true; - if ( function_exists( 'wp_sqlite_mysql_native_ast_materialize_wrapper' ) ) { - wp_sqlite_mysql_native_ast_materialize_wrapper( $this ); - } - } -} diff --git 
a/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php b/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php index 974cfa66..c8e14e07 100644 --- a/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php +++ b/packages/mysql-on-sqlite/src/mysql/native/mysql-rust-bridge.php @@ -12,11 +12,17 @@ * @return array */ function wp_sqlite_mysql_native_export_grammar( WP_Parser_Grammar $grammar ): array { + // The native parser only needs each rule's FIRST set to decide early + // whether a rule can start with the current token; it builds its own + // branch candidates from `rules`. Export the eagerly-computed FIRST sets + // directly so the lazy per-token selector table is never materialized for + // the native path. return array( - 'highest_terminal_id' => $grammar->highest_terminal_id, - 'rules' => $grammar->rules, - 'lookahead_is_match_possible' => $grammar->lookahead_is_match_possible, - 'rule_names' => $grammar->rule_names, - 'fragment_ids' => $grammar->fragment_ids, + 'highest_terminal_id' => $grammar->highest_terminal_id, + 'rules' => $grammar->rules, + 'first_sets' => $grammar->first_sets, + 'nullable_branches' => $grammar->nullable_branches, + 'rule_names' => $grammar->rule_names, + 'fragment_ids' => $grammar->fragment_ids, ); } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 9bf30b97..8368d303 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -28,12 +28,81 @@ class WP_Parser_Grammar { */ public $rules; public $rule_names; - public $fragment_ids; - public $lookahead_is_match_possible = array(); + public $fragment_ids = array(); + + /** + * Per-rule branch selector keyed by the next token id. 
+ * + * When set, `$branches_for_token[$rule_id][$token_id]` is the ordered list + * of candidate branch symbol sequences (drawn from `$rules[$rule_id]`) + * that can possibly match when the current token has the given id. + * Nullable branches appear in every entry. + * + * If an entry does not exist for the current token, `$nullable_branches` + * is consulted. If neither has an entry for this rule, the rule cannot + * match and the parser returns immediately. + * + * @var array> + */ + public $branches_for_token = array(); + + /** + * Per-rule marker indicating the rule has at least one nullable branch. + * + * @var array + */ + public $nullable_branches = array(); + + /** + * Per-rule flag indicating every (rule, token) selector entry points + * to exactly one branch. The parser uses this to skip the outer + * foreach when a single candidate is the only possibility. + * + * @var array + */ + public $single_candidate_rules = array(); + public $lowest_non_terminal_id; public $highest_terminal_id; public $native_grammar; + /** + * Memoized rule-id lookups, keyed by rule name. + * + * `get_rule_id()` is a linear `array_search` over `$rule_names` and + * costs a few microseconds per call on the MySQL grammar. The parser + * looks up its start rule and the `selectStatement` rule on a hot path, + * so the results are memoized via `get_or_cache_rule_id()`. + * + * @var array + */ + private $cached_rule_ids = array(); + + /** + * Per-rule FIRST sets from the fixpoint. + * + * Kept so per-rule selectors can be denormalized lazily on first use, and + * exported to the native parser (which needs only each rule's FIRST set, + * not the lazily-built per-token selector table). + * + * @var array> + */ + public $first_sets = array(); + + /** + * Per-rule NULLABLE flags from the fixpoint. + * + * @var array + */ + private $rule_nullable = array(); + + /** + * Rules whose branch selector has already been built. 
+ * + * @var array + */ + private $rule_selector_built = array(); + public function __construct( array $rules ) { $this->inflate( $rules ); } @@ -46,6 +115,25 @@ public function get_rule_id( $rule_name ) { return array_search( $rule_name, $this->rule_names, true ); } + /** + * Return the rule id for a given rule name, memoizing the result. + * + * Equivalent to `get_rule_id()` but caches the lookup so repeated + * queries for the same rule name (typically the start rule and a few + * grammar-specific rules consulted on the parser hot path) avoid + * the linear scan over `$rule_names`. Returns `false` for unknown + * rule names, mirroring `get_rule_id()`. + * + * @param string $rule_name + * @return int|false + */ + public function get_or_cache_rule_id( $rule_name ) { + if ( ! array_key_exists( $rule_name, $this->cached_rule_ids ) ) { + $this->cached_rule_ids[ $rule_name ] = $this->get_rule_id( $rule_name ); + } + return $this->cached_rule_ids[ $rule_name ]; + } + /** * Inflate the grammar to an internal representation optimized for parsing. * @@ -57,8 +145,8 @@ private function inflate( $grammar ) { $this->highest_terminal_id = $this->lowest_non_terminal_id - 1; foreach ( $grammar['rules_names'] as $rule_index => $rule_name ) { - $this->rule_names[ $rule_index + $grammar['rules_offset'] ] = $rule_name; - $this->rules[ $rule_index + $grammar['rules_offset'] ] = array(); + $rule_id = $rule_index + $grammar['rules_offset']; + $this->rule_names[ $rule_id ] = $rule_name; /** * Treat all intermediate rules as fragments to inline before returning @@ -76,7 +164,7 @@ private function inflate( $grammar ) { * They are prefixed with a "%" to be distinguished from the original rules. 
*/ if ( '%' === $rule_name[0] ) { - $this->fragment_ids[ $rule_index + $grammar['rules_offset'] ] = true; + $this->fragment_ids[ $rule_id ] = true; } } @@ -86,55 +174,353 @@ private function inflate( $grammar ) { $this->rules[ $rule_id ] = $branches; } - /** - * Compute a rule => [token => true] lookup table for each rule - * that starts with a terminal OR with another rule that already - * has a lookahead mapping. - * - * This is similar to left-factoring the grammar, even if not quite - * the same. - * - * This enables us to quickly bail out from checking branches that - * cannot possibly match the current token. This increased the parser - * speed by a whopping 80%! - * - * @TODO: Explore these possible next steps: - * - * * Compute a rule => [token => branch[]] list lookup table and only - * process the branches that have a chance of matching the current token. - * * Actually left-factor the grammar as much as possible. This, however, - * could inflate the serialized grammar size. - */ - // 5 iterations seem to give us all the speed gains we can get from this. - for ( $i = 0; $i < 5; $i++ ) { - foreach ( $grammar['grammar'] as $rule_index => $branches ) { - $rule_id = $rule_index + $grammar['rules_offset']; - if ( isset( $this->lookahead_is_match_possible[ $rule_id ] ) ) { + $this->inline_single_branch_fragments(); + $this->strip_epsilon_markers(); + $this->build_branch_selectors(); + } + + /** + * Inline single-branch fragment rules into their call sites. + * + * The grammar contains many single-branch fragment rules that exist only + * to factor shared sub-sequences out of larger productions. At runtime + * the parser would descend into each such fragment via a recursive call + * just to walk the same symbol sequence and splice the results back into + * the parent. Expanding them in-place at build time eliminates that call + * chain without changing the resulting AST because fragment children are + * already flattened into the parent node. 
+ * + * Fragments with two or more alternatives (e.g., `%EOF_zero_or_one`) are + * left intact because they represent real choices that must be evaluated + * against the current token. + */ + private function inline_single_branch_fragments() { + $rules = $this->rules; + $fragment_ids = $this->fragment_ids; + $low_nt = $this->lowest_non_terminal_id; + + // Precompute the set of single-branch fragments that are candidates + // for inlining. + $inlinable = array(); + foreach ( $fragment_ids as $rule_id => $_ ) { + if ( isset( $rules[ $rule_id ] ) && 1 === count( $rules[ $rule_id ] ) ) { + $inlinable[ $rule_id ] = true; + } + } + + // Depth-first expansion memoized per rule, with cycle detection. + $expanded = array(); + $visiting = array(); + $expand_branch = function ( array $branch ) use ( &$expand_branch, &$expanded, &$visiting, $rules, $low_nt, $inlinable ) { + $out = array(); + foreach ( $branch as $sym ) { + if ( $sym < $low_nt ) { + $out[] = $sym; continue; } - $rule_lookup = array(); - $first_symbol_can_be_expanded_to_all_terminals = true; - foreach ( $branches as $branch ) { - $terminals = false; - $branch_starts_with_terminal = $branch[0] < $this->lowest_non_terminal_id; - if ( $branch_starts_with_terminal ) { - $terminals = array( $branch[0] ); - } elseif ( isset( $this->lookahead_is_match_possible[ $branch[0] ] ) ) { - $terminals = array_keys( $this->lookahead_is_match_possible[ $branch[0] ] ); + if ( ! isset( $inlinable[ $sym ] ) ) { + $out[] = $sym; + continue; + } + if ( isset( $visiting[ $sym ] ) ) { + // Cycle: leave the reference in place. + $out[] = $sym; + continue; + } + if ( ! isset( $expanded[ $sym ] ) ) { + $visiting[ $sym ] = true; + $expanded[ $sym ] = $expand_branch( $rules[ $sym ][0] ); + unset( $visiting[ $sym ] ); + } + foreach ( $expanded[ $sym ] as $s ) { + $out[] = $s; + } + } + return $out; + }; + + // Rewrite every rule's branches with fragments inlined. 
+ foreach ( $this->rules as $rule_id => $branches ) { + $new_branches = array(); + foreach ( $branches as $branch ) { + $new_branches[] = $expand_branch( $branch ); + } + $this->rules[ $rule_id ] = $new_branches; + } + } + + /** + * Remove explicit `EMPTY_RULE_ID` markers from branches. + * + * The epsilon marker is a zero-width, always-matching symbol used in the + * grammar to express optional productions. At parse time it would still + * be walked and "continued" over for no effect, so stripping it ahead of + * time removes a per-symbol branch in the hot loop. + * + * A pure-epsilon branch (`[EMPTY_RULE_ID]`) becomes an empty branch (`[]`) + * which the parser already handles: the inner symbol loop does nothing and + * the rule returns a successful empty match. + */ + private function strip_epsilon_markers() { + foreach ( $this->rules as $rule_id => $branches ) { + foreach ( $branches as $i => $branch ) { + if ( in_array( self::EMPTY_RULE_ID, $branch, true ) ) { + $stripped = array(); + foreach ( $branch as $symbol ) { + if ( self::EMPTY_RULE_ID !== $symbol ) { + $stripped[] = $symbol; + } } + $this->rules[ $rule_id ][ $i ] = $stripped; + } + } + } + } - if ( false === $terminals ) { - $first_symbol_can_be_expanded_to_all_terminals = false; + /** + * Compute FIRST and NULLABLE sets for every non-terminal, then denormalize + * them into a per-rule map of `token_id => branch_index[]` so the parser + * can jump straight to the branches that can possibly match the current + * token. + * + * This replaces the previous coarse "can any branch match this token?" + * lookahead. On the MySQL corpus the fine-grained selector skips ~60% + * of the branch attempts that the parser used to try and fail. 
+ */ + private function build_branch_selectors() { + $rules = $this->rules; + $low_nt = $this->lowest_non_terminal_id; + $empty_rule = self::EMPTY_RULE_ID; + $rule_ids = array_keys( $rules ); + $nullable = array(); + $first_sets = array(); + + foreach ( $rule_ids as $rule_id ) { + $nullable[ $rule_id ] = false; + $first_sets[ $rule_id ] = array(); + } + + // Reverse-dependency map: for each non-terminal, the rules that + // reference it. FIRST/NULLABLE grow monotonically, so a rule can only + // be affected when one of the rules it references grows. + $dependents = array(); + foreach ( $rule_ids as $rule_id ) { + $seen = array(); + foreach ( $rules[ $rule_id ] as $branch ) { + foreach ( $branch as $symbol ) { + if ( $symbol >= $low_nt && ! isset( $seen[ $symbol ] ) ) { + $seen[ $symbol ] = true; + $dependents[ $symbol ][] = $rule_id; + } + } + } + } + + // Worklist fixpoint. Recompute a rule's FIRST/NULLABLE only when a rule + // it references has grown, instead of rescanning every rule on every + // pass until the whole grammar stabilizes. + $queued = array_fill_keys( $rule_ids, true ); + $worklist = $rule_ids; + while ( $worklist ) { + $rule_id = array_pop( $worklist ); + unset( $queued[ $rule_id ] ); + + $first = $first_sets[ $rule_id ]; + $before = count( $first ); + $was_nullable = $nullable[ $rule_id ]; + $is_nullable = $was_nullable; + foreach ( $rules[ $rule_id ] as $branch ) { + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + // ε: contributes nothing to FIRST, stays nullable. + continue; + } + if ( $symbol < $low_nt ) { + // Terminal. + $first[ $symbol ] = true; + $branch_nullable = false; break; } - foreach ( $terminals as $terminal ) { - $rule_lookup[ $terminal ] = true; + // Non-terminal: union FIRST(symbol) in one operation. + $first += $first_sets[ $symbol ]; + if ( ! 
$nullable[ $symbol ] ) { + $branch_nullable = false; + break; } } - if ( $first_symbol_can_be_expanded_to_all_terminals ) { - $this->lookahead_is_match_possible[ $rule_id ] = $rule_lookup; + if ( $branch_nullable ) { + $is_nullable = true; + } + } + + // Re-enqueue dependents only when this rule actually grew. + if ( count( $first ) > $before || ( $is_nullable && ! $was_nullable ) ) { + $first_sets[ $rule_id ] = $first; + $nullable[ $rule_id ] = $is_nullable; + if ( isset( $dependents[ $rule_id ] ) ) { + foreach ( $dependents[ $rule_id ] as $dependent ) { + if ( ! isset( $queued[ $dependent ] ) ) { + $queued[ $dependent ] = true; + $worklist[] = $dependent; + } + } } } } + + // FIRST/NULLABLE are now final. A rule is nullable exactly when it has + // a nullable branch, so publish nullable_branches eagerly; the parser's + // nullable fallback consults it for every rule. branches_for_token and + // single_candidate_rules are built lazily per rule (ensure_rule_selector) + // because a typical query touches only a few percent of all rules, so + // denormalizing the whole grammar up front is mostly wasted work. + $this->first_sets = $first_sets; + $this->rule_nullable = $nullable; + foreach ( $nullable as $rule_id => $is_nullable ) { + if ( $is_nullable ) { + $this->nullable_branches[ $rule_id ] = true; + } + } + } + + /** + * Build the per-token branch selector for one rule on first use. + * + * Denormalizes the rule's branches into `token_id => branch sequences[]` + * from the precomputed FIRST/NULLABLE sets, populating branches_for_token + * (and single_candidate_rules). Memoized, so repeated calls are cheap. 
+ * + * @param int $rule_id + */ + public function ensure_rule_selector( $rule_id ): void { + if ( isset( $this->rule_selector_built[ $rule_id ] ) ) { + return; + } + $this->rule_selector_built[ $rule_id ] = true; + + $low_nt = $this->lowest_non_terminal_id; + $empty_rule = self::EMPTY_RULE_ID; + $first_sets = $this->first_sets; + $nullable = $this->rule_nullable; + $branches = $this->rules[ $rule_id ]; + $selector = array(); + $nullable_branch_ids = array(); + foreach ( $branches as $idx => $branch ) { + $branch_first = array(); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + continue; + } + if ( $symbol < $low_nt ) { + $branch_first[ $symbol ] = true; + $branch_nullable = false; + break; + } + $branch_first += $first_sets[ $symbol ]; + if ( ! $nullable[ $symbol ] ) { + $branch_nullable = false; + break; + } + } + foreach ( $branch_first as $tid => $_ ) { + $selector[ $tid ][] = $idx; + } + if ( $branch_nullable ) { + $nullable_branch_ids[] = $idx; + } + } + + // Nullable branches also match when the current token is not in + // any branch's FIRST set. Fold them into every populated entry + // so the runtime lookup is a single array access. + if ( $nullable_branch_ids ) { + // nullable_branches is already published eagerly from the fixpoint; + // here we only fold the nullable branches into each selector entry. + $merged = array(); + foreach ( $selector as $tid => $idx_list ) { + $merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids ); + } + $selector = $merged; + } + if ( $selector ) { + // Embed the branch symbol sequences directly so the parser can + // iterate candidate branches without a $branches[$idx] lookup on + // every attempt. Many tokens in a rule share the same branch-id + // list, so deduplicate by signature and let copy-on-write share + // one sequences array across them. 
This dedup matters: unshared, + // the table would be ~35 MiB on the MySQL grammar; shared, it is + // a few MiB, built once per process (not per query). + $by_signature = array(); + $all_single_candidates = true; + foreach ( $selector as $tid => $idx_list ) { + if ( 1 !== count( $idx_list ) ) { + $all_single_candidates = false; + } + $sig = isset( $idx_list[1] ) ? implode( ',', $idx_list ) : $idx_list[0]; + if ( isset( $by_signature[ $sig ] ) ) { + $selector[ $tid ] = $by_signature[ $sig ]; + } else { + $seqs = array(); + foreach ( $idx_list as $idx ) { + $seqs[] = $branches[ $idx ]; + } + $by_signature[ $sig ] = $seqs; + $selector[ $tid ] = $seqs; + } + } + $this->branches_for_token[ $rule_id ] = $selector; + if ( $all_single_candidates ) { + $this->single_candidate_rules[ $rule_id ] = true; + } + } + } + + /** + * Eagerly build every rule's selector. + * + * The pure-PHP parser builds selectors lazily and the native bridge exports + * the FIRST sets instead, so this is only for consumers that read the full + * branches_for_token table directly (currently the grammar tests). + */ + public function build_all_selectors(): void { + foreach ( array_keys( $this->rules ) as $rule_id ) { + $this->ensure_rule_selector( $rule_id ); + } + } + + /** + * Merge two ascending int arrays into one ascending int array without + * duplicates. Preserves original branch order as required by the parser. 
+ * + * @param int[] $a + * @param int[] $b + * @return int[] + */ + private static function merge_sorted( array $a, array $b ): array { + $i = 0; + $j = 0; + $na = count( $a ); + $nb = count( $b ); + $out = array(); + while ( $i < $na && $j < $nb ) { + if ( $a[ $i ] < $b[ $j ] ) { + $out[] = $a[ $i++ ]; + } elseif ( $a[ $i ] > $b[ $j ] ) { + $out[] = $b[ $j++ ]; + } else { + $out[] = $a[ $i ]; + ++$i; + ++$j; + } + } + while ( $i < $na ) { + $out[] = $a[ $i++ ]; + } + while ( $j < $nb ) { + $out[] = $b[ $j++ ]; + } + return $out; } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index b61f38d5..67ff851e 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -9,106 +9,31 @@ * In this way, a parser node constitutes a recursive structure that represents * a parse (sub)tree at each level of the full grammar tree. */ -class WP_Parser_Node { +final class WP_Parser_Node { /** * @TODO: Review and document these properties and their visibility. */ public $rule_id; public $rule_name; - protected $children = array(); + protected $children; - public function __construct( $rule_id, $rule_name ) { + public function __construct( $rule_id, $rule_name, array $children = array() ) { $this->rule_id = $rule_id; $this->rule_name = $rule_name; + $this->children = $children; } public function append_child( $node ) { $this->children[] = $node; } - /** - * Flatten the matched rule fragments as if their children were direct - * descendants of the current rule. - * - * What are rule fragments? - * - * When we initially parse the grammar file, it has compound rules such - * as this one: - * - * query ::= EOF | ((simpleStatement | beginWork) ((SEMICOLON_SYMBOL EOF?) 
| EOF)) - * - * Building a parser that can understand such rules is way more complex than building - * a parser that only follows simple rules, so we flatten those compound rules into - * simpler ones. The above rule would be flattened to: - * - * query ::= EOF | %query0 - * %query0 ::= %%query01 %%query02 - * %%query01 ::= simpleStatement | beginWork - * %%query02 ::= SEMICOLON_SYMBOL EOF_zero_or_one | EOF - * EOF_zero_or_one ::= EOF | ε - * - * This factorization happens in "convert-grammar.php". - * - * "Fragments" are intermediate artifacts whose names are not in the original grammar. - * They are extremely useful for the parser, but the API consumer should never have to - * worry about them. Fragment names start with a percent sign ("%"). - * - * The code below inlines every fragment back in its parent rule. - * - * We could optimize this. The current $match may be discarded later on so any inlining - * effort here would be wasted. However, inlining seems cheap and doing it bottom-up here - * is **much** easier than reprocessing the parse tree top-down later on. - * - * The following parse tree: - * - * [ - * 'query' => [ - * [ - * '%query01' => [ - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ], - * '%query02' => [ - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ] - * ], - * ] - * ] - * ] - * ] - * ] - * - * Would be inlined as: - * - * [ - * 'query' => [ - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ] - * ], - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ] - * ] - * ] - * ] - */ - public function merge_fragment( $node ) { - $this->children = array_merge( $this->children, $node->children ); - } - /** * Check if this node has any child nodes or tokens. * * @return bool True if this node has any child nodes or tokens, false otherwise. 
*/ public function has_child(): bool { - return count( $this->children ) > 0; + return ! empty( $this->children ); } /** diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php index b7726189..4132ba38 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php @@ -35,7 +35,7 @@ class WP_Parser_Token { * * @var string */ - private $input; + protected $input; /** * Constructor. diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 4436892f..7c855dd5 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -11,85 +11,207 @@ class WP_Parser { protected $grammar; protected $tokens; + protected $token_count; protected $position; + // Grammar data cached as instance fields so the hot path avoids an extra + // property hop via $this->grammar on every recursive call. + private $rule_names; + private $fragment_ids; + private $branches_for_token; + private $nullable_branches; + private $highest_terminal_id; + private $select_statement_rule_id; + private $single_candidate_rules; + + // Rules whose selector has been pulled from the grammar into the caches + // above. Selectors are built lazily on first descent into a rule. 
+ private $built_rules = array(); + public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->tokens = $tokens; - $this->position = 0; + $this->grammar = $grammar; + $this->rule_names = $grammar->rule_names; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = array(); + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; + $this->single_candidate_rules = array(); + + // The INTO negative-lookahead only fires for selectStatement. Cache + // the rule id so the per-call check is an int compare instead of a + // string compare. + $this->select_statement_rule_id = $grammar->get_or_cache_rule_id( 'selectStatement' ); + + $this->set_tokens( $tokens ); + } + + /** + * Initialize the parser's token state. + * + * Stores the given token array, resets the position cursor, and appends + * an end-of-input sentinel token whose id is `EMPTY_RULE_ID` (0). The + * hot path can then read `$tokens[$pos]->id` unconditionally when + * `$pos` is the current cursor, because the sentinel naturally fails + * to match any real grammar terminal while feeding the nullable-fallback + * branch of the selector check. + * + * Invariants the hot path relies on: + * - The sentinel id (0) cannot match any grammar terminal. + * `strip_epsilon_markers()` removes id 0 from every branch at grammar + * build time, so no `$subrule_id` in the inner loop ever equals 0 + * and `++$this->position` can never advance past the sentinel. + * - The sentinel must never be appended to a node's children. It is + * only inspected via `$tokens[$pos]->id`; tokens are pushed into + * `$children` only on terminal-id equality, which the sentinel + * cannot satisfy. + * - `WP_MySQL_Parser::next_query()` bounds at `$position < $token_count` + * (set below, before the sentinel append), so the sentinel sits at + * index `$token_count` and is never fed into a parse round. 
+ * + * @param array $tokens + */ + protected function set_tokens( array $tokens ): void { + $this->token_count = count( $tokens ); + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; } public function parse() { // @TODO: Make the starting rule lookup non-grammar-specific. - $query_rule_id = $this->grammar->get_rule_id( 'query' ); - $ast = $this->parse_recursive( $query_rule_id ); + $ast = $this->parse_recursive( $this->grammar->get_or_cache_rule_id( 'query' ) ); return false === $ast ? null : $ast; } + /** + * Parse a single non-terminal rule. + * + * This function is only called for non-terminal rule ids. Terminals are + * matched inline inside the branch loop below to avoid a function-call + * round trip per consumed token. + */ private function parse_recursive( $rule_id ) { - $is_terminal = $rule_id <= $this->grammar->highest_terminal_id; - if ( $is_terminal ) { - if ( $this->position >= count( $this->tokens ) ) { - return false; - } + $tokens = $this->tokens; + $position = $this->position; - if ( WP_Parser_Grammar::EMPTY_RULE_ID === $rule_id ) { - return true; + // Narrow the set of branches worth trying using the precomputed FIRST + // sets. branches_for_token is built lazily per rule, so a lookup miss + // means either "this token cannot start the rule" or "the rule is not + // denormalized yet". The hit path stays a single array access; only a + // miss consults built_rules and builds the rule's selector on first touch. + $tid = $tokens[ $position ]->id; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } elseif ( isset( $this->built_rules[ $rule_id ] ) ) { + // Rule already built; this token simply does not start it. + return isset( $this->nullable_branches[ $rule_id ] ); + } else { + // First descent into this rule: build its selector, then resolve. 
+ $this->built_rules[ $rule_id ] = true; + $this->grammar->ensure_rule_selector( $rule_id ); + if ( isset( $this->grammar->branches_for_token[ $rule_id ] ) ) { + $this->branches_for_token[ $rule_id ] = $this->grammar->branches_for_token[ $rule_id ]; + if ( isset( $this->grammar->single_candidate_rules[ $rule_id ] ) ) { + $this->single_candidate_rules[ $rule_id ] = true; + } } - - if ( $this->tokens[ $this->position ]->id === $rule_id ) { - ++$this->position; - return $this->tokens[ $this->position - 1 ]; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } else { + return isset( $this->nullable_branches[ $rule_id ] ); } - return false; } - $branches = $this->grammar->rules[ $rule_id ]; - if ( ! count( $branches ) ) { - return false; - } + $highest_terminal_id = $this->highest_terminal_id; + $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); + $is_select_statement = $rule_id === $this->select_statement_rule_id; - // Bale out from processing the current branch if none of its rules can - // possibly match the current token. - if ( isset( $this->grammar->lookahead_is_match_possible[ $rule_id ] ) ) { - $token_id = $this->tokens[ $this->position ]->id; - if ( - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ $token_id ] ) && - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ WP_Parser_Grammar::EMPTY_RULE_ID ] ) - ) { + // Fast path for rules where every (rule, token) selector entry + // points to exactly one branch - about 55% of nonterminal calls + // on the MySQL corpus. Skip the outer foreach and the + // $branch_matches bookkeeping; every failure path just rewinds + // the position and returns false directly. 
+ if ( isset( $this->single_candidate_rules[ $rule_id ] ) ) { + $branch = $candidate_branches[0]; + $children = array(); + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $this->position = $position; + return false; + } + + $subnode = $this->parse_recursive( $subrule_id ); + if ( false === $subnode ) { + $this->position = $position; + return false; + } + if ( true === $subnode ) { + continue; + } + if ( is_array( $subnode ) ) { + foreach ( $subnode as $c ) { + $children[] = $c; + } + } else { + $children[] = $subnode; + } + } + + if ( $is_select_statement && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $this->position = $position; return false; } + if ( ! $children ) { + return true; + } + if ( $is_fragment ) { + return $children; + } + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); } - $rule_name = $this->grammar->rule_names[ $rule_id ]; - $starting_position = $this->position; - foreach ( $branches as $branch ) { - $this->position = $starting_position; - $node = new WP_Parser_Node( $rule_id, $rule_name ); + $branch_matches = false; + $children = array(); + foreach ( $candidate_branches as $branch ) { + $this->position = $position; + $children = array(); $branch_matches = true; foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + // The sentinel at $tokens[$token_count] has id 0 so it + // cannot match any real terminal, making the range check + // unnecessary here. 
+ if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $branch_matches = false; + break; + } + $subnode = $this->parse_recursive( $subrule_id ); if ( false === $subnode ) { $branch_matches = false; break; - } elseif ( true === $subnode ) { - /* - * The subrule was matched without actually matching a token. - * This means a special empty "ε" (epsilon) rule was matched. - * An "ε" rule in a grammar matches an empty input of 0 bytes. - * It is used to represent optional grammar productions. - */ - continue; - } elseif ( is_array( $subnode ) && 0 === count( $subnode ) ) { - continue; } - if ( is_array( $subnode ) && ! count( $subnode ) ) { + if ( true === $subnode ) { continue; } - if ( isset( $this->grammar->fragment_ids[ $subrule_id ] ) ) { - $node->merge_fragment( $subnode ); + if ( is_array( $subnode ) ) { + // Fragment results are returned directly as a children + // array so the parser does not allocate a Parser_Node + // that would immediately be unwrapped into the parent. + foreach ( $subnode as $c ) { + $children[] = $c; + } } else { - $node->append_child( $subnode ); + $children[] = $subnode; } } @@ -100,25 +222,36 @@ private function parse_recursive( $rule_id ) { // for right-associative rules, which could solve this. // See: https://github.com/mysql/mysql-workbench/blob/8.0.38/library/parsers/grammars/MySQLParser.g4#L994 // See: https://github.com/antlr/antlr4/issues/488 - $la = $this->tokens[ $this->position ] ?? null; - if ( $la && 'selectStatement' === $rule_name && WP_MySQL_Lexer::INTO_SYMBOL === $la->id ) { + if ( + $branch_matches + && $is_select_statement + && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id + ) { $branch_matches = false; } - if ( true === $branch_matches ) { + if ( $branch_matches ) { break; } } if ( ! $branch_matches ) { - $this->position = $starting_position; + $this->position = $position; return false; } - if ( ! 
$node->has_child() ) { + if ( ! $children ) { return true; } - return $node; + // Fragments exist only to group symbols for reuse; their "node" would + // get inlined into the parent on the very next step. Return the raw + // children array so the caller can splice it without allocating a + // throwaway WP_Parser_Node. + if ( $is_fragment ) { + return $children; + } + + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); } } diff --git a/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php b/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php index 8f18cf17..383b03f5 100644 --- a/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php +++ b/packages/mysql-on-sqlite/tests/mysql/WP_MySQL_Lexer_Tests.php @@ -367,6 +367,37 @@ function ( $severity, $message, $file, $line ) { $this->assertNull( $lexer->get_token() ); } + /** + * A charset-introducer-like name used as a qualified member (after a dot) + * must lex as an identifier. A real charset introducer only appears before + * a string literal, never as the member of a qualified reference. + * + * @dataProvider data_underscore_charset_after_dot + */ + public function test_underscore_charset_name_after_dot_is_identifier( string $sql, int $token_index, int $expected_id ): void { + $tokens = ( new WP_MySQL_Lexer( $sql ) )->remaining_tokens(); + $this->assertSame( + WP_MySQL_Lexer::get_token_name( $expected_id ), + $tokens[ $token_index ]->get_name(), + $sql + ); + } + + /** + * @return array + */ + public function data_underscore_charset_after_dot(): array { + return array( + // `t . _utf8` - the member name must be an identifier, not a charset. + 'charset name after dot is identifier' => array( 't._utf8', 2, WP_MySQL_Lexer::IDENTIFIER ), + 'other charset name after dot' => array( 'a._binary', 2, WP_MySQL_Lexer::IDENTIFIER ), + // A genuine charset introducer (before a string) stays a charset. 
+ 'charset introducer before string' => array( "_utf8'x'", 0, WP_MySQL_Lexer::UNDERSCORE_CHARSET ), + // A non-charset underscore name after a dot stays an identifier. + 'non-charset underscore name after dot' => array( 't._foo', 2, WP_MySQL_Lexer::IDENTIFIER ), + ); + } + private function get_token_names( array $token_types ): array { return array_map( function ( $token_type ) { diff --git a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Grammar_Abi_Tests.php b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Grammar_Abi_Tests.php new file mode 100644 index 00000000..32395b7f --- /dev/null +++ b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Grammar_Abi_Tests.php @@ -0,0 +1,55 @@ +assertTrue( wp_sqlite_mysql_native_grammar_abi_supported( $version ) ); + } + + /** + * @dataProvider unsupported_versions + * @param string|false $version + */ + public function test_unsupported_versions_are_rejected( $version ): void { + $this->assertFalse( wp_sqlite_mysql_native_grammar_abi_supported( $version ) ); + } + + /** + * @return array + */ + public function supported_versions(): array { + return array( + 'minor line lower bound' => array( '0.2.0' ), + 'patch within the line' => array( '0.2.1' ), + 'higher patch' => array( '0.2.99' ), + ); + } + + /** + * @return array + */ + public function unsupported_versions(): array { + return array( + 'extension not loaded' => array( false ), + 'older ABI line' => array( '0.1.0' ), + 'older ABI line high patch' => array( '0.1.99' ), + 'next (breaking) ABI line' => array( '0.3.0' ), + 'future major' => array( '1.0.0' ), + ); + } +} diff --git a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Cycle_Tests.php b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Cycle_Tests.php deleted file mode 100644 index 57672162..00000000 --- a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Cycle_Tests.php +++ /dev/null @@ -1,267 +0,0 @@ 
-markTestSkipped( 'Native MySQL parser extension is not loaded.' ); - } - // Force a clean slate before each test — ASTs from earlier tests - // must not pollute the memory measurements below. - gc_collect_cycles(); - } - - private function parse( string $sql ): WP_Parser_Node { - static $grammar = null; - if ( null === $grammar ) { - $grammar = new WP_Parser_Grammar( include __DIR__ . '/../../../src/mysql/mysql-grammar.php' ); - } - $lexer = new WP_MySQL_Lexer( $sql ); - $tokens = $lexer instanceof WP_MySQL_Native_Lexer - ? $lexer->native_token_stream() - : $lexer->remaining_tokens(); - $parser = new WP_MySQL_Parser( $grammar, $tokens ); - $tree = $parser->parse(); - $this->assertNotNull( $tree, 'Failed to parse SQL: ' . $sql ); - return $tree; - } - - /** - * Hostile loop: parse and walk many ASTs in a tight loop, only - * `gc_collect_cycles()` between iterations. Memory must plateau. - * - * If wrapper registry entries or cache pointers are not released, peak - * memory grows linearly with iteration count. With cleanup in place, the - * working set stays bounded. - */ - public function test_repeated_parse_walk_drop_does_not_leak(): void { - $sql = 'SELECT a, b, c FROM t WHERE a + b * c IN (1, 2, 3) AND d = 4'; - - // Warm-up: do enough work that allocator overhead is amortized - // before we sample the floor. - for ( $i = 0; $i < 20; $i++ ) { - $ast = $this->parse( $sql ); - $ast->get_descendants(); - $ast = null; - gc_collect_cycles(); - } - $baseline = memory_get_usage(); - - // Now run substantially more iterations and assert the working - // set stays within a small multiple of the warm-up floor. - for ( $i = 0; $i < 500; $i++ ) { - $ast = $this->parse( $sql ); - $ast->get_descendants(); - $ast = null; - gc_collect_cycles(); - } - $after = memory_get_usage(); - - // 4 MB headroom — generous, but a leaking cache adds tens of MB - // across 500 iterations on this query. 
- $delta = $after - $baseline; - $this->assertLessThan( - 4 * 1024 * 1024, - $delta, - sprintf( - 'Memory grew %.1f MB across 500 parse-walk-drop cycles; the per-AST cache is not being collected.', - $delta / 1024 / 1024 - ) - ); - } - - /** - * After dropping the AST and triggering GC, the entire wrapper - * graph must be reclaimable. We hand out one descendant, drop the - * root, then drop the descendant — the next gc cycle must reclaim - * the rest of the cached wrappers. - */ - public function test_drop_then_gc_reclaims_cached_wrappers(): void { - $sql = 'SELECT a, b, c FROM t WHERE a + b * c IN (1, 2, 3) AND d = 4'; - - // Establish a memory floor with no AST live. - gc_collect_cycles(); - $floor = memory_get_usage(); - - $ast = $this->parse( $sql ); - $descendant = $ast->get_first_descendant_node(); - $this->assertNotNull( $descendant ); - $ast = null; - $descendant = null; - gc_collect_cycles(); - - $after = memory_get_usage(); - $delta = $after - $floor; - // Generous bound — but tens of MB of leaked wrappers would blow it. - $this->assertLessThan( - 1 * 1024 * 1024, - $delta, - sprintf( - 'After dropping the AST and the descendant and running gc, %.1f MB of cached wrappers remain.', - $delta / 1024 / 1024 - ) - ); - } - - /** - * Holding a child wrapper *outlives* the variable holding the root. - * The child's registry entry must keep the AST alive (no UAF when the - * bridge is called on the orphaned child). Once the child is also dropped, - * the registry entry must be released. - */ - public function test_orphaned_child_keeps_ast_alive_then_collects(): void { - $sql = 'SELECT a, b, c FROM t WHERE a + b * c IN (1, 2, 3)'; - $child = ( function () use ( $sql ) { - $ast = $this->parse( $sql ); - return $ast->get_first_descendant_node(); - } )(); - - // Root variable is gone; only the child reference remains, but the - // registry entry still pins the AST. The child must still be - // functional — accessing it must not crash. 
- $this->assertNotNull( $child ); - $this->assertIsString( $child->rule_name ); - // The child's own children should also resolve without UAF. - $grand = $child->get_first_child(); - $this->assertNotNull( $grand ); - - // Now drop the child too; the AST + cache should be reclaimable. - $child = null; - $grand = null; - gc_collect_cycles(); - // If the registry entry was released, this assertion always passes; - // the real signal is the absence of a segfault during teardown. - $this->addToAssertionCount( 1 ); - } - - /** - * Mutating a cached wrapper through `append_child` before dropping - * the AST must not block collection. The mutated wrapper's - * `$children` array now contains a non-cached node; that must not keep - * stale registry/cache entries alive. - */ - public function test_mutation_before_drop_does_not_block_collection(): void { - $sql = 'SELECT 1 + 2'; - - gc_collect_cycles(); - $floor = memory_get_usage(); - - for ( $i = 0; $i < 200; $i++ ) { - $ast = $this->parse( $sql ); - $child = $ast->get_first_child_node(); - $injected = new WP_Parser_Node( 0, 'synthetic-' . $i ); - $ast->append_child( $injected ); - // Touch the cache after mutation to keep wrappers live. - $ast->get_descendants(); - $ast = null; - $child = null; - $injected = null; - gc_collect_cycles(); - } - $after = memory_get_usage(); - $delta = $after - $floor; - $this->assertLessThan( - 4 * 1024 * 1024, - $delta, - sprintf( - 'Memory grew %.1f MB across 200 mutate-then-drop cycles.', - $delta / 1024 / 1024 - ) - ); - } - - /** - * Two ASTs alive simultaneously, then dropped in interleaved order. - * Dropping AST A must not affect AST B's cached wrappers; both must - * eventually collect once unreferenced. 
- */ - public function test_overlapping_asts_do_not_corrupt_each_other(): void { - $ast_a = $this->parse( 'SELECT a FROM ta WHERE a > 1' ); - $ast_b = $this->parse( 'SELECT b FROM tb WHERE b < 9' ); - - $child_a = $ast_a->get_first_descendant_node(); - $child_b = $ast_b->get_first_descendant_node(); - - // Drop A first and run gc; B must remain fully functional. - $ast_a = null; - $child_a = null; - gc_collect_cycles(); - - $this->assertNotNull( $child_b ); - $walk = $ast_b->get_descendants(); - $this->assertNotEmpty( $walk ); - - // Drop B too; walk one of its still-held descendants — the cache - // is still alive because $child_b pins it. - $ast_b = null; - $this->assertIsString( $child_b->rule_name ); - - $child_b = null; - $walk = null; - gc_collect_cycles(); - $this->addToAssertionCount( 1 ); - } - - /** - * Re-walk + drop + collect across many iterations. This is the - * "translator pass on each query" shape of real workloads. The wrapper - * registry and cache must not create a memory cliff under repeated walks. - */ - public function test_rewalk_loop_stays_bounded(): void { - $sql = 'SELECT a, b, c, d, e FROM t WHERE (a + b) * (c - d) > e AND f IN (1,2,3,4,5)'; - - gc_collect_cycles(); - // Warm-up. 
- for ( $i = 0; $i < 10; $i++ ) { - $ast = $this->parse( $sql ); - for ( $r = 0; $r < 10; $r++ ) { - $ast->get_descendants(); - } - $ast = null; - gc_collect_cycles(); - } - $floor = memory_get_usage(); - - for ( $i = 0; $i < 200; $i++ ) { - $ast = $this->parse( $sql ); - for ( $r = 0; $r < 10; $r++ ) { - $ast->get_descendants(); - } - $ast = null; - gc_collect_cycles(); - } - $after = memory_get_usage(); - $delta = $after - $floor; - $this->assertLessThan( - 4 * 1024 * 1024, - $delta, - sprintf( - 'Rewalk loop grew memory by %.1f MB; cache likely uncollectable.', - $delta / 1024 / 1024 - ) - ); - } -} diff --git a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Identity_Tests.php b/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Identity_Tests.php deleted file mode 100644 index 066fd38d..00000000 --- a/packages/mysql-on-sqlite/tests/mysql/native/WP_MySQL_Native_Parser_Node_Identity_Tests.php +++ /dev/null @@ -1,142 +0,0 @@ -markTestSkipped( 'Native MySQL parser extension is not loaded.' ); - } - } - - private function parse( string $sql ): WP_Parser_Node { - static $grammar = null; - if ( null === $grammar ) { - $grammar = new WP_Parser_Grammar( include __DIR__ . '/../../../src/mysql/mysql-grammar.php' ); - } - $lexer = new WP_MySQL_Lexer( $sql ); - $tokens = $lexer instanceof WP_MySQL_Native_Lexer - ? $lexer->native_token_stream() - : $lexer->remaining_tokens(); - $parser = new WP_MySQL_Parser( $grammar, $tokens ); - $tree = $parser->parse(); - $this->assertNotNull( $tree, 'Failed to parse SQL: ' . 
$sql ); - return $tree; - } - - public function test_get_first_child_node_returns_same_instance(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $first = $tree->get_first_child_node(); - $second = $tree->get_first_child_node(); - - $this->assertNotNull( $first ); - $this->assertSame( $first, $second ); - } - - public function test_native_wrapper_does_not_store_native_ast_handle(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $reflection = new ReflectionObject( $tree ); - - $this->assertFalse( $reflection->hasProperty( 'native_ast' ) ); - $this->assertFalse( $reflection->hasProperty( 'native_node_index' ) ); - } - - public function test_get_children_returns_same_instances_across_calls(): void { - $tree = $this->parse( 'SELECT 1, 2, 3' ); - - $first_pass = $tree->get_children(); - $second_pass = $tree->get_children(); - - $this->assertSameSize( $first_pass, $second_pass ); - foreach ( $first_pass as $i => $child ) { - if ( $child instanceof WP_Parser_Node ) { - $this->assertSame( $child, $second_pass[ $i ] ); - } - } - } - - public function test_descendant_lookup_shares_identity_with_child_lookup(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $descendant = $tree->get_first_descendant_node(); - $this->assertNotNull( $descendant ); - - // Walk down to the same node via direct children. We don't know the - // exact depth, so we descend until we hit the descendant we found. - $cursor = $tree; - while ( null !== $cursor && $cursor !== $descendant ) { - $next = $cursor->get_first_child_node(); - if ( $next === $cursor ) { - break; - } - $cursor = $next; - } - - $this->assertSame( $descendant, $cursor, 'Descendant and child lookups must return the same wrapper instance.' ); - } - - public function test_mutation_on_child_survives_re_read(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $child = $tree->get_first_child_node(); - $this->assertNotNull( $child ); - - // Mutate via the public WP_Parser_Node API. 
This catches regressions - // where accessors hand back fresh wrappers and lose state written - // through a previously returned child. - $child->rule_name = 'mutated-rule'; - - $same_child = $tree->get_first_child_node(); - $this->assertSame( $child, $same_child ); - $this->assertSame( 'mutated-rule', $same_child->rule_name ); - } - - public function test_materialized_child_survives_re_read_from_native_parent(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $child = $tree->get_first_child_node(); - $this->assertNotNull( $child ); - - $synthetic = new WP_Parser_Node( 0, 'synthetic' ); - $child->append_child( $synthetic ); - - $same_child = $tree->get_first_child_node(); - $this->assertSame( $child, $same_child ); - $this->assertTrue( - in_array( $synthetic, $same_child->get_children(), true ), - 'Materialized live child wrappers must stay discoverable through the parent native cache.' - ); - } - - public function test_mutation_survives_parent_materialization(): void { - $tree = $this->parse( 'SELECT 1 + 2' ); - - $child = $tree->get_first_child_node(); - $this->assertNotNull( $child ); - $child->rule_name = 'before-materialize'; - - // Force the parent to materialize its native children by appending - // a sibling. After this, the parent walks $this->children directly. - $sibling = new WP_Parser_Node( 0, 'synthetic' ); - $tree->append_child( $sibling ); - - $children = $tree->get_children(); - $this->assertContains( $child, $children, 'Materialized children must include the previously-mutated wrapper.' 
); - $this->assertSame( 'before-materialize', $child->rule_name ); - } -} diff --git a/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php b/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php new file mode 100644 index 00000000..1861159d --- /dev/null +++ b/packages/mysql-on-sqlite/tests/parser/WP_Parser_Grammar_Tests.php @@ -0,0 +1,215 @@ + $rules_offset, + 'rules_names' => $names, + 'grammar' => $grammar, + ) + ); + // Selectors are denormalized lazily per rule; force a full build so the + // assertions below can read the complete branches_for_token table. + $g->build_all_selectors(); + return $g; + } + + public function test_strip_epsilon_markers_and_nullable_fallback(): void { + // opt ::= A ε | ε (A = 1) + $g = $this->build_grammar( + 10, + array( 'opt' ), + array( + array( array( 1, 0 ), array( 0 ) ), + ) + ); + + // Epsilon markers are removed; the pure-epsilon branch becomes empty. + $this->assertSame( array( array( 1 ), array() ), $g->rules[10] ); + + // The rule is nullable (it has an empty branch). + $this->assertArrayHasKey( 10, $g->nullable_branches ); + + // Token A selects both branches: the A-led one and the nullable one. + $this->assertSame( array( array( 1 ), array() ), $g->branches_for_token[10][1] ); + + // Two candidate branches for token A, so it is not single-candidate. + $this->assertArrayNotHasKey( 10, $g->single_candidate_rules ); + } + + public function test_inline_single_branch_fragment(): void { + // r ::= %f C ; %f ::= A B (A=1, B=2, C=3) + $g = $this->build_grammar( + 10, + array( 'r', '%f' ), + array( + array( array( 11, 3 ) ), + array( array( 1, 2 ) ), + ) + ); + + // The single-branch fragment is expanded in place. + $this->assertSame( array( array( 1, 2, 3 ) ), $g->rules[10] ); + + // The fragment rule itself is left intact. + $this->assertSame( array( array( 1, 2 ) ), $g->rules[11] ); + + // Only token A (the inlined first symbol) starts the rule. 
+ $this->assertSame( array( 1 ), array_keys( $g->branches_for_token[10] ) ); + $this->assertSame( array( array( 1, 2, 3 ) ), $g->branches_for_token[10][1] ); + $this->assertArrayHasKey( 10, $g->single_candidate_rules ); + } + + public function test_multi_candidate_rule_is_not_single_candidate(): void { + // top ::= A B | A C (both branches start with A) + $g = $this->build_grammar( + 10, + array( 'top', 'alt' ), + array( + array( array( 1, 2 ), array( 1, 3 ) ), + array( array( 1 ) ), + ) + ); + + $this->assertSame( array( array( 1, 2 ), array( 1, 3 ) ), $g->branches_for_token[10][1] ); + $this->assertArrayNotHasKey( 10, $g->single_candidate_rules ); + + // The single-branch rule is single-candidate. + $this->assertArrayHasKey( 11, $g->single_candidate_rules ); + } + + public function test_first_set_propagates_through_non_terminal(): void { + // top ::= child ; child ::= A | B + $g = $this->build_grammar( + 10, + array( 'top', 'child' ), + array( + array( array( 11 ) ), + array( array( 1 ), array( 2 ) ), + ) + ); + + // FIRST(child) = {A, B} flows up into top's selector. + $this->assertSame( array( 1, 2 ), array_keys( $g->branches_for_token[10] ) ); + $this->assertSame( array( array( 11 ) ), $g->branches_for_token[10][1] ); + $this->assertSame( array( array( 11 ) ), $g->branches_for_token[10][2] ); + $this->assertArrayHasKey( 10, $g->single_candidate_rules ); + + $this->assertSame( array( array( 1 ) ), $g->branches_for_token[11][1] ); + $this->assertSame( array( array( 2 ) ), $g->branches_for_token[11][2] ); + } + + public function test_inlining_terminates_on_cyclic_fragments(): void { + // r ::= %a ; %a ::= %b ; %b ::= %a (mutually recursive fragments) + // The inliner must detect the cycle and leave a reference in place + // instead of recursing forever. 
+ $g = $this->build_grammar( + 10, + array( 'r', '%a', '%b' ), + array( + array( array( 11 ) ), + array( array( 12 ) ), + array( array( 11 ) ), + ) + ); + + $this->assertSame( array( array( 11 ) ), $g->rules[10] ); + } + + public function test_merge_sorted_dedupes_and_preserves_ascending_order(): void { + $merge = new ReflectionMethod( WP_Parser_Grammar::class, 'merge_sorted' ); + // setAccessible() is required on PHP < 8.1 and deprecated (no-op) from 8.5. + if ( PHP_VERSION_ID < 80100 ) { + $merge->setAccessible( true ); + } + + $this->assertSame( array( 1, 2, 3 ), $merge->invoke( null, array( 1, 3 ), array( 2, 3 ) ) ); + $this->assertSame( array( 2 ), $merge->invoke( null, array(), array( 2 ) ) ); + $this->assertSame( array( 1, 2 ), $merge->invoke( null, array( 1, 2 ), array() ) ); + $this->assertSame( array( 0, 1 ), $merge->invoke( null, array( 0, 1 ), array( 1 ) ) ); + } + + public function test_lazy_selector_matches_full_build(): void { + // child ::= A | B ; top ::= child (A=1, B=2) + $g = $this->build_grammar( + 10, + array( 'top', 'child' ), + array( + array( array( 11 ) ), + array( array( 1 ), array( 2 ) ), + ) + ); + $expected = $g->branches_for_token[10]; + + // A fresh grammar that never forces a full build must produce the same + // selector for a rule the moment it is requested, and be idempotent. + $lazy = new WP_Parser_Grammar( + array( + 'rules_offset' => 10, + 'rules_names' => array( 'top', 'child' ), + 'grammar' => array( array( array( 11 ) ), array( array( 1 ), array( 2 ) ) ), + ) + ); + $this->assertArrayNotHasKey( 10, $lazy->branches_for_token ); + $lazy->ensure_rule_selector( 10 ); + $lazy->ensure_rule_selector( 10 ); + $this->assertSame( $expected, $lazy->branches_for_token[10] ); + } + + public function test_real_mysql_grammar_invariants(): void { + $g = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); + $g->build_all_selectors(); + + // Epsilon markers are fully stripped from every branch. 
The parser's + // end-of-input sentinel relies on no real branch symbol being 0. + foreach ( $g->rules as $rule_id => $branches ) { + foreach ( $branches as $branch ) { + $this->assertNotContains( + WP_Parser_Grammar::EMPTY_RULE_ID, + $branch, + "Rule {$rule_id} still contains an epsilon marker." + ); + } + } + + // Every single-candidate rule has a selector, and each of its token + // entries points to exactly one branch sequence (what the fast path + // assumes when it reads $candidate_branches[0]). + foreach ( array_keys( $g->single_candidate_rules ) as $rule_id ) { + $this->assertArrayHasKey( $rule_id, $g->branches_for_token ); + foreach ( $g->branches_for_token[ $rule_id ] as $token_id => $sequences ) { + $this->assertCount( + 1, + $sequences, + "Single-candidate rule {$rule_id} has multiple branches for token {$token_id}." + ); + } + } + } +} diff --git a/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php b/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php index 6b77ea89..7df9029e 100644 --- a/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php +++ b/packages/mysql-on-sqlite/tests/tools/run-parser-benchmark.php @@ -64,7 +64,11 @@ function get_stats( $total, $failures, $exceptions ) { $failures = array(); $exceptions = array(); $processed = 0; -$start = microtime( true ); +// Reuse a single parser across queries, mirroring the driver +// (WP_PDO_MySQL_On_SQLite::reset_or_create_parser), which resets tokens on the +// same instance rather than constructing a fresh parser per query. +$parser = null; +$start = microtime( true ); foreach ( $queries as $query ) { try { $lexer = new WP_MySQL_Lexer( $query ); @@ -75,8 +79,12 @@ function get_stats( $total, $failures, $exceptions ) { throw new Exception( 'Failed to tokenize query: ' . 
$query ); } - $parser = new WP_MySQL_Parser( $grammar, $tokens ); - $ast = $parser->parse(); + if ( null === $parser ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens ); + } else { + $parser->reset_tokens( $tokens ); + } + $ast = $parser->parse(); if ( null === $ast ) { $failures[] = $query; } diff --git a/packages/mysql-on-sqlite/tests/tools/verify-native-parser-extension.php b/packages/mysql-on-sqlite/tests/tools/verify-native-parser-extension.php index 84e99ba5..3d388742 100644 --- a/packages/mysql-on-sqlite/tests/tools/verify-native-parser-extension.php +++ b/packages/mysql-on-sqlite/tests/tools/verify-native-parser-extension.php @@ -59,7 +59,7 @@ function wp_sqlite_verify_native_parser_extension(): void { ); $parser_ast = $parser->parse(); - if ( ! ( $parser_ast instanceof WP_MySQL_Native_Parser_Node ) || 'query' !== $parser_ast->rule_name ) { + if ( ! ( $parser_ast instanceof WP_Parser_Node ) || 'query' !== $parser_ast->rule_name ) { wp_sqlite_native_parser_verification_fail( 'Native parser did not produce the expected query AST.' ); } @@ -72,29 +72,24 @@ function wp_sqlite_verify_native_parser_extension(): void { $parser->next_query(); $ast = $parser->get_query_ast(); - if ( ! ( $ast instanceof WP_MySQL_Native_Parser_Node ) ) { + if ( ! ( $ast instanceof WP_Parser_Node ) ) { wp_sqlite_native_parser_verification_fail( 'WP_PDO_MySQL_On_SQLite did not produce a native-backed AST.' ); } - $reflection = new ReflectionObject( $ast ); - if ( $reflection->hasProperty( 'native_ast' ) || $reflection->hasProperty( 'native_node_index' ) ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper still stores Rust AST handle properties.' ); - } - $first = $ast->get_first_child_node(); - if ( ! ( $first instanceof WP_MySQL_Native_Parser_Node ) ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper did not return a native-backed child node.' ); + if ( ! 
( $first instanceof WP_Parser_Node ) ) { + wp_sqlite_native_parser_verification_fail( 'Native wrapper did not return a child node.' ); } if ( $first !== $ast->get_first_child_node() ) { - wp_sqlite_native_parser_verification_fail( 'Native wrapper identity is not stable across reads.' ); + wp_sqlite_native_parser_verification_fail( 'AST node identity is not stable across reads.' ); } $synthetic = new WP_Parser_Node( 0, 'synthetic' ); $first->append_child( $synthetic ); $same_first = $ast->get_first_child_node(); if ( $same_first !== $first || ! in_array( $synthetic, $same_first->get_children(), true ) ) { - wp_sqlite_native_parser_verification_fail( 'Materialized native wrapper was lost from the parent cache.' ); + wp_sqlite_native_parser_verification_fail( 'Mutated child was lost from the parent.' ); } } diff --git a/packages/php-ext-wp-mysql-parser/Cargo.lock b/packages/php-ext-wp-mysql-parser/Cargo.lock index baf9d849..c027761e 100644 --- a/packages/php-ext-wp-mysql-parser/Cargo.lock +++ b/packages/php-ext-wp-mysql-parser/Cargo.lock @@ -1596,7 +1596,7 @@ dependencies = [ [[package]] name = "wp_mysql_parser" -version = "0.1.0" +version = "0.2.0" dependencies = [ "ext-php-rs", "libc", diff --git a/packages/php-ext-wp-mysql-parser/Cargo.toml b/packages/php-ext-wp-mysql-parser/Cargo.toml index 6646c110..57465d0a 100644 --- a/packages/php-ext-wp-mysql-parser/Cargo.toml +++ b/packages/php-ext-wp-mysql-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "wp_mysql_parser" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "GPL-2.0-or-later" diff --git a/packages/php-ext-wp-mysql-parser/README.md b/packages/php-ext-wp-mysql-parser/README.md index 2df0b54c..d47e0812 100644 --- a/packages/php-ext-wp-mysql-parser/README.md +++ b/packages/php-ext-wp-mysql-parser/README.md @@ -4,6 +4,28 @@ When the extension is loaded before `packages/mysql-on-sqlite/src/load.php`, it registers native base classes used by the public `WP_MySQL_Lexer` and `WP_MySQL_Parser` wrappers. 
Without the extension, those public wrappers extend the pure-PHP implementations instead. +## Versioning and the grammar ABI + +The native parser and the PHP driver exchange the parser grammar at runtime via +`wp_sqlite_mysql_native_export_grammar()`. The shape of that data is an ABI shared +between the extension binary and the PHP code, and it can change between releases +(for example, the move from a coarse lookahead table to per-token branch selectors). + +Compatibility is tracked by the extension's **minor** version (the `x` in `0.x`): + +- **Bump the minor version on any backward-incompatible change to the grammar ABI** + (the data exchanged by `wp_sqlite_mysql_native_export_grammar()` or consumed by the + native parser). Patch releases must keep the ABI unchanged. +- The PHP side (`packages/mysql-on-sqlite/src/load.php`) pins the supported minor + line and selects the native lexer/parser only when `phpversion( 'wp_mysql_parser' )` + falls within it. A mismatch — most commonly a plugin update that outpaces the + installed extension binary — falls back cleanly to the pure-PHP path instead of + failing at parse time. + +When you change the grammar ABI, bump `version` in `Cargo.toml` and update the +supported range in `wp_sqlite_mysql_native_grammar_abi_supported()` in `load.php` +together. 
+ ## Published WASM build for Playground Published WASM builds are listed on this repository's GitHub Pages site, with manifest links and a “Run in Playground” link for each release: @@ -86,14 +108,33 @@ The GitHub Pages demo reads published benchmark data from: -Latest local measurement (Apple Silicon macOS, PHP 8.4.5 CLI, 2026-05-26): +Latest local measurement (Apple Silicon macOS, PHP 8.5.5 CLI, over the 69,577-query corpus, +best of five runs, 2026-06-05): + +**Without JIT:** + +| Benchmark | Pure PHP [QPS] | Native [QPS] | Speedup | +| --- | ---: | ---: | ---: | +| MySQL lexer | 178,619 | 354,058 | 1.98x | +| MySQL parser | 28,640 | 60,119 | 2.10x | + +**With opcache + tracing JIT:** + +| Benchmark | Pure PHP [QPS] | Native [QPS] | Speedup | +| --- | ---: | ---: | ---: | +| MySQL lexer | 332,974 | 364,365 | 1.09x | +| MySQL parser | 50,088 | 60,253 | 1.20x | + +The parser rows are parse-only and reuse a single parser instance across the corpus (resetting +tokens per query), mirroring the driver, which reuses its parser across a request's queries. +On this branch the native parser materializes the full +`WP_Parser_Node` tree eagerly, so the number reflects building a complete AST rather than a +deferred handle (the earlier 15x figure measured a lazy parse that never built the tree). -| Benchmark | Implementation | Queries | QPS | Speedup | -| --- | --- | ---: | ---: | ---: | -| MySQL lexer | Pure PHP | 69,577 | 71,553 | — | -| MySQL lexer | Native extension | 69,577 | 343,124 | 4.80x | -| MySQL parser | Pure PHP | 69,577 | 7,015 | — | -| MySQL parser | Native extension | 69,577 | 108,354 | 15.45x | +The native code is essentially JIT-independent, while the pure-PHP path speeds up substantially +under opcache + tracing JIT — so the native edge narrows from roughly 2x to about 1.1x for the +lexer and 1.2x for the parser. The published `benchmark.json` corresponds to the without-JIT +configuration and its numbers.
That file should be updated whenever a new extension build or benchmark environment is published. diff --git a/packages/php-ext-wp-mysql-parser/src/lib.rs b/packages/php-ext-wp-mysql-parser/src/lib.rs index 35f17fbd..8425ac35 100644 --- a/packages/php-ext-wp-mysql-parser/src/lib.rs +++ b/packages/php-ext-wp-mysql-parser/src/lib.rs @@ -1,6 +1,5 @@ #![cfg_attr(windows, feature(abi_vectorcall))] -use std::cell::RefCell; use std::collections::{HashMap, HashSet}; use std::os::raw::c_char; use std::ptr; @@ -61,7 +60,7 @@ fn php_function(name: &str) -> PhpResult> { struct PhpClasses { parser_token: &'static ClassEntry, mysql_token: &'static ClassEntry, - native_parser_node: &'static ClassEntry, + parser_node: &'static ClassEntry, } fn php_classes() -> PhpResult { @@ -70,8 +69,8 @@ fn php_classes() -> PhpResult { .ok_or_else(|| php_error("Missing WP_Parser_Token class"))?, mysql_token: ClassEntry::try_find("WP_MySQL_Token") .ok_or_else(|| php_error("Missing WP_MySQL_Token class"))?, - native_parser_node: ClassEntry::try_find("WP_MySQL_Native_Parser_Node") - .ok_or_else(|| php_error("Missing WP_MySQL_Native_Parser_Node class"))?, + parser_node: ClassEntry::try_find("WP_Parser_Node") + .ok_or_else(|| php_error("Missing WP_Parser_Node class"))?, }) } @@ -921,7 +920,12 @@ struct Grammar { struct Rule { branches: Vec>, - lookahead: Option>, + /// Sorted FIRST set: token ids that can start a match for this rule. + /// `None` means the rule has no FIRST entry at all (cannot match the + /// non-empty case); see `nullable` for the empty case. + first_set: Option>, + /// At least one branch is nullable (matches empty input). 
+ nullable: bool, rule_name: String, is_fragment: bool, } @@ -970,38 +974,6 @@ impl ParserTokenSource { } } } - - fn token_info(&self, index: usize) -> PhpResult { - match self { - Self::Php(tokens) => { - let token = tokens - .get(index) - .ok_or_else(|| php_error("Parser token index is out of range"))?; - let token_object = token - .object() - .ok_or_else(|| php_error("Parser token must be an object"))?; - let id = token_object.get_property::("id").map_err(php_error)?; - let start = token_object - .get_property::("start") - .map_err(php_error)?; - let length = token_object - .get_property::("length") - .map_err(php_error)?; - let start = usize::try_from(start).map_err(php_error)?; - let length = usize::try_from(length).map_err(php_error)?; - - Ok(TokenInfo { - id, - start, - end: start.saturating_add(length), - }) - } - Self::Native { tokens, .. } => tokens - .get(index) - .copied() - .ok_or_else(|| php_error("Parser token index is out of range")), - } - } } #[derive(Clone, Copy)] @@ -1029,9 +1001,6 @@ enum NativeParseMatch { struct NativeAstNode { rule_id: i64, children: Vec, - first_token: Option, - last_token: Option, - descendant_count: usize, } struct NativeAstArena { @@ -1043,27 +1012,6 @@ struct NativeAstArena { struct NativeAstState { arena: Arc, - /// Per-AST identity map: node arena index → live PHP wrapper pointer. - /// - /// `WP_Parser_Node` callers expect stable child identity (mutate a child - /// once, walk past, walk back, the mutation is still there). Each - /// accessor in this extension constructs a fresh wrapper unless we - /// intern it here. The cache intentionally stores raw wrapper pointers, - /// not strong PHP references, so Rust can preserve identity without - /// pinning wrappers after PHP drops them. - node_cache: RefCell>, -} - -struct NativeAstWrapperEntry { - ast: Rc, - node_index: usize, - /// Materialized wrappers still participate in identity lookups but no - /// longer delegate reads through the native AST bridge. 
- is_materialized: bool, -} - -thread_local! { - static NATIVE_AST_WRAPPERS: RefCell> = RefCell::new(HashMap::new()); } impl NativeAstArena { @@ -1078,39 +1026,7 @@ impl NativeAstArena { fn push_node(&mut self, rule_id: i64, children: Vec) -> usize { let index = self.nodes.len(); - let mut first_token = None; - let mut last_token = None; - let mut descendant_count = 0; - for child in &children { - match child { - NativeAstChild::Node(child_index) => { - if let Some(node) = self.nodes.get(*child_index) { - descendant_count += 1 + node.descendant_count; - if first_token.is_none() { - first_token = node.first_token; - } - if node.last_token.is_some() { - last_token = node.last_token; - } - } - } - NativeAstChild::Token(token_index) => { - if first_token.is_none() { - first_token = Some(*token_index); - } - last_token = Some(*token_index); - descendant_count += 1; - } - } - } - - self.nodes.push(NativeAstNode { - rule_id, - children, - first_token, - last_token, - descendant_count, - }); + self.nodes.push(NativeAstNode { rule_id, children }); index } @@ -1119,125 +1035,11 @@ impl NativeAstArena { .get(index) .ok_or_else(|| php_error("Native AST node index is out of range")) } - - fn child_node_matches(&self, child: NativeAstChild, rule_name: Option<&str>) -> bool { - let NativeAstChild::Node(index) = child else { - return false; - }; - let Ok(node) = self.node(index) else { - return false; - }; - rule_name.is_none_or(|expected| { - self.grammar - .rule(node.rule_id) - .map(|rule| rule.rule_name == expected) - .unwrap_or(false) - }) - } - - fn child_token_matches(&self, child: NativeAstChild, token_id: Option) -> bool { - let NativeAstChild::Token(index) = child else { - return false; - }; - token_id.is_none_or(|expected| { - self.token_source - .token_info(index) - .map(|token| token.id == expected) - .unwrap_or(false) - }) - } - - fn descendant_stack(&self, index: usize) -> PhpResult> { - let node = self.node(index)?; - let mut stack = 
Vec::with_capacity(node.descendant_count); - stack.extend(node.children.iter().rev().copied()); - Ok(stack) - } -} - -fn native_ast_wrapper_key(wrapper_zval: &Zval) -> PhpResult { - let object = wrapper_zval - .object() - .ok_or_else(|| php_error("Missing native AST wrapper"))?; - Ok(ptr::from_ref(object) as usize) -} - -fn native_ast_from_wrapper(wrapper_zval: &Zval) -> PhpResult<(Rc, usize)> { - let key = native_ast_wrapper_key(wrapper_zval)?; - NATIVE_AST_WRAPPERS - .with(|wrappers| { - wrappers.borrow().get(&key).and_then(|entry| { - (!entry.is_materialized).then(|| (Rc::clone(&entry.ast), entry.node_index)) - }) - }) - .ok_or_else(|| php_error("Missing native AST handle")) -} - -fn register_native_ast_wrapper( - object: &ZendObject, - ast: &Rc, - node_index: usize, -) -> usize { - let key = ptr::from_ref(object) as usize; - NATIVE_AST_WRAPPERS.with(|wrappers| { - wrappers.borrow_mut().insert( - key, - NativeAstWrapperEntry { - ast: Rc::clone(ast), - node_index, - is_materialized: false, - }, - ); - }); - ast.node_cache.borrow_mut().insert(node_index, key); - key -} - -fn mark_native_ast_wrapper_materialized_key(key: usize) { - NATIVE_AST_WRAPPERS.with(|wrappers| { - if let Some(entry) = wrappers.borrow_mut().get_mut(&key) { - entry.is_materialized = true; - } - }); -} - -fn release_native_ast_wrapper_key(key: usize) { - let entry = NATIVE_AST_WRAPPERS.with(|wrappers| wrappers.borrow_mut().remove(&key)); - if let Some(entry) = entry { - let mut cache = entry.ast.node_cache.borrow_mut(); - if cache.get(&entry.node_index).copied() == Some(key) { - cache.remove(&entry.node_index); - } - } -} - -fn native_ast_wrapper_matches(key: usize, ast: &Rc, node_index: usize) -> bool { - NATIVE_AST_WRAPPERS.with(|wrappers| { - wrappers - .borrow() - .get(&key) - .is_some_and(|entry| Rc::ptr_eq(&entry.ast, ast) && entry.node_index == node_index) - }) -} - -/// Build a Zval that references an existing PHP object. 
-/// -/// Used on cache hits to hand a live wrapper back to PHP without allocating a -/// new object. `Zval::set_object()` bumps the object refcount for the returned -/// zval; the Rust cache only stores the pointer and does not own a reference. -unsafe fn zval_from_cached_object(key: usize) -> Zval { - let obj = &mut *(key as *mut ZendObject); - let mut zv = Zval::new(); - zv.set_object(obj); - zv } impl NativeAstState { fn new(arena: Arc) -> Rc { - Rc::new(Self { - arena, - node_cache: RefCell::new(HashMap::new()), - }) + Rc::new(Self { arena }) } fn create_php_ast(self: &Rc) -> PhpResult { @@ -1253,344 +1055,54 @@ impl NativeAstState { zval.set_bool(true); Ok(zval) } - NativeAstRoot::Node(index) => self.create_php_node_with_classes(index, classes), + NativeAstRoot::Node(index) => create_php_node_with_classes(&self.arena, index, classes), NativeAstRoot::Token(index) => self .arena .token_source .create_php_token_with_classes(index, classes), } } - - /// Resolve a child slot to a Zval, going through the per-AST identity - /// cache for nodes. Tokens are not yet cached — they have no public - /// mutators and no caller in this repo relies on token identity. 
- fn cached_child_zval( - self: &Rc, - child: NativeAstChild, - classes: &PhpClasses, - ) -> PhpResult { - match child { - NativeAstChild::Node(index) => self.cached_node_zval(index, classes), - NativeAstChild::Token(index) => self - .arena - .token_source - .create_php_token_with_classes(index, classes), - } - } - - fn cached_node_zval(self: &Rc, index: usize, classes: &PhpClasses) -> PhpResult { - let cached_key = { - let cache = self.node_cache.borrow(); - cache.get(&index).copied() - }; - if let Some(key) = cached_key { - if native_ast_wrapper_matches(key, self, index) { - return Ok(unsafe { zval_from_cached_object(key) }); - } - self.node_cache.borrow_mut().remove(&index); - } - - self.create_php_node_with_classes(index, classes) - } - - fn create_php_node_with_classes( - self: &Rc, - index: usize, - classes: &PhpClasses, - ) -> PhpResult { - let node = self.arena.node(index)?; - let mut object = classes.native_parser_node.new(); - let rule_name = self - .arena - .grammar - .rule(node.rule_id) - .map(|rule| rule.rule_name.as_str()) - .unwrap_or_default(); - - update_object_property( - &mut object, - classes.native_parser_node, - "rule_id", - node.rule_id, - )?; - update_object_property( - &mut object, - classes.native_parser_node, - "rule_name", - rule_name.to_owned(), - )?; - - register_native_ast_wrapper(object.as_ref(), self, index); - object.into_zval(false).map_err(php_error) - } -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_release_wrapper(wrapper_zval: &Zval) -> PhpResult<()> { - let key = native_ast_wrapper_key(wrapper_zval)?; - release_native_ast_wrapper_key(key); - Ok(()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_materialize_wrapper(wrapper_zval: &Zval) -> PhpResult<()> { - let key = native_ast_wrapper_key(wrapper_zval)?; - mark_native_ast_wrapper_materialized_key(key); - Ok(()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_has_child(wrapper_zval: &Zval) -> PhpResult { - let (ast, node_index) = 
native_ast_from_wrapper(wrapper_zval)?; - Ok(!ast.arena.node(node_index)?.children.is_empty()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_has_child_node( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - Ok(ast - .arena - .node(node_index)? - .children - .iter() - .copied() - .any(|child| ast.arena.child_node_matches(child, rule_name.as_deref()))) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_has_child_token( - wrapper_zval: &Zval, - token_id: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - Ok(ast - .arena - .node(node_index)? - .children - .iter() - .copied() - .any(|child| ast.arena.child_token_matches(child, token_id))) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_child(wrapper_zval: &Zval) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let Some(child) = ast.arena.node(node_index)?.children.first().copied() else { - return Ok(Zval::null()); - }; - ast.cached_child_zval(child, &classes) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_child_node( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - for child in &ast.arena.node(node_index)?.children { - if ast.arena.child_node_matches(*child, rule_name.as_deref()) { - return ast.cached_child_zval(*child, &classes); - } - } - Ok(Zval::null()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_child_token( - wrapper_zval: &Zval, - token_id: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - for child in &ast.arena.node(node_index)?.children { - if ast.arena.child_token_matches(*child, token_id) { - return ast.cached_child_zval(*child, &classes); - } - } - 
Ok(Zval::null()) } -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_descendant_node( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { - if ast.arena.child_node_matches(child, rule_name.as_deref()) { - return ast.cached_child_zval(child, &classes); - } - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); - } - } - } - Ok(Zval::null()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_first_descendant_token( - wrapper_zval: &Zval, - token_id: Option, +/// Build a complete PHP `WP_Parser_Node` Zval, recursively materializing +/// children from the Rust arena. The returned object is a plain +/// `WP_Parser_Node` instance with `rule_id`, `rule_name`, and `children` +/// populated, so callers see no difference from the pure-PHP parser's output. +fn create_php_node_with_classes( + arena: &NativeAstArena, + index: usize, + classes: &PhpClasses, ) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { - if ast.arena.child_token_matches(child, token_id) { - return ast.cached_child_zval(child, &classes); - } - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); - } - } - } - Ok(Zval::null()) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_children(wrapper_zval: &Zval) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - ast.arena - .node(node_index)? 
- .children - .iter() - .copied() - .map(|child| ast.cached_child_zval(child, &classes)) - .collect() -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_child_nodes( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - ast.arena - .node(node_index)? - .children - .iter() - .copied() - .filter(|child| ast.arena.child_node_matches(*child, rule_name.as_deref())) - .map(|child| ast.cached_child_zval(child, &classes)) - .collect() -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_child_tokens( - wrapper_zval: &Zval, - token_id: Option, -) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - ast.arena - .node(node_index)? - .children - .iter() - .copied() - .filter(|child| ast.arena.child_token_matches(*child, token_id)) - .map(|child| ast.cached_child_zval(child, &classes)) - .collect() -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_descendants(wrapper_zval: &Zval) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let root = ast.arena.node(node_index)?; - let mut descendants = Vec::with_capacity(root.descendant_count); - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { - descendants.push(ast.cached_child_zval(child, &classes)?); - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); - } - } - } - Ok(descendants) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_descendant_nodes( - wrapper_zval: &Zval, - rule_name: Option, -) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let mut descendants = Vec::new(); - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { 
- if ast.arena.child_node_matches(child, rule_name.as_deref()) { - descendants.push(ast.cached_child_zval(child, &classes)?); - } - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); + let node = arena.node(index)?; + let rule_name = arena + .grammar + .rule(node.rule_id) + .map(|rule| rule.rule_name.as_str()) + .unwrap_or_default(); + + let mut children: Vec = Vec::with_capacity(node.children.len()); + for child in &node.children { + let child_zval = match child { + NativeAstChild::Node(child_index) => { + create_php_node_with_classes(arena, *child_index, classes)? } - } - } - Ok(descendants) -} - -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_descendant_tokens( - wrapper_zval: &Zval, - token_id: Option, -) -> PhpResult> { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let classes = php_classes()?; - let mut descendants = Vec::new(); - let mut stack = ast.arena.descendant_stack(node_index)?; - while let Some(child) = stack.pop() { - if ast.arena.child_token_matches(child, token_id) { - descendants.push(ast.cached_child_zval(child, &classes)?); - } - if let NativeAstChild::Node(index) = child { - for child in ast.arena.node(index)?.children.iter().rev() { - stack.push(*child); - } - } + NativeAstChild::Token(token_index) => arena + .token_source + .create_php_token_with_classes(*token_index, classes)?, + }; + children.push(child_zval); } - Ok(descendants) -} -#[php_function] -pub fn wp_sqlite_mysql_native_ast_get_start(wrapper_zval: &Zval) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let node = ast.arena.node(node_index)?; - let token_index = node - .first_token - .ok_or_else(|| php_error("Native AST node has no descendant tokens"))?; - let token = ast.arena.token_source.token_info(token_index)?; - i64::try_from(token.start).map_err(php_error) -} - -#[php_function] -pub fn 
wp_sqlite_mysql_native_ast_get_length(wrapper_zval: &Zval) -> PhpResult { - let (ast, node_index) = native_ast_from_wrapper(wrapper_zval)?; - let node = ast.arena.node(node_index)?; - let first_token_index = node - .first_token - .ok_or_else(|| php_error("Native AST node has no descendant tokens"))?; - let last_token_index = node - .last_token - .ok_or_else(|| php_error("Native AST node has no descendant tokens"))?; - let first_token = ast.arena.token_source.token_info(first_token_index)?; - let last_token = ast.arena.token_source.token_info(last_token_index)?; - let length = last_token.end.saturating_sub(first_token.start); - i64::try_from(length).map_err(php_error) + let mut object = classes.parser_node.new(); + update_object_property(&mut object, classes.parser_node, "rule_id", node.rule_id)?; + update_object_property( + &mut object, + classes.parser_node, + "rule_name", + rule_name.to_owned(), + )?; + update_object_property(&mut object, classes.parser_node, "children", children)?; + object.into_zval(false).map_err(php_error) } #[php_class] @@ -1729,11 +1241,13 @@ impl WpMySqlNativeParser { return Ok(NativeParseMatch::No); } - if let Some(lookahead) = rule.lookahead.as_ref() { + if let Some(first_set) = rule.first_set.as_ref() { let token_id = self.token_ids.get(self.position).copied().unwrap_or(0); - if lookahead.binary_search(&token_id).is_err() && lookahead.binary_search(&0).is_err() { + if first_set.binary_search(&token_id).is_err() && !rule.nullable { return Ok(NativeParseMatch::No); } + } else if !rule.nullable { + return Ok(NativeParseMatch::No); } let starting_position = self.position; @@ -1823,11 +1337,17 @@ fn export_grammar(grammar_zval: &mut Zval) -> PhpResult> { .and_then(Zval::array) .ok_or_else(|| php_error("Missing grammar rules"))?, )?; - let parsed_lookahead = parse_lookahead( + let parsed_first_sets = parse_first_sets( + array + .get("first_sets") + .and_then(Zval::array) + .ok_or_else(|| php_error("Missing grammar first_sets"))?, + )?; + let 
parsed_nullable = parse_id_set( array - .get("lookahead_is_match_possible") + .get("nullable_branches") .and_then(Zval::array) - .ok_or_else(|| php_error("Missing grammar lookahead"))?, + .ok_or_else(|| php_error("Missing grammar nullable_branches"))?, )?; let parsed_rule_names = parse_rule_names( array @@ -1850,7 +1370,8 @@ fn export_grammar(grammar_zval: &mut Zval) -> PhpResult> { .find_map(|(id, name)| (name == "selectStatement").then_some(*id)); let rules = build_rules( parsed_rules, - parsed_lookahead, + parsed_first_sets, + parsed_nullable, parsed_rule_names, parsed_fragment_ids, )?; @@ -1928,13 +1449,15 @@ fn export_tokens(tokens: &mut Zval) -> PhpResult<(ParserTokenSource, Vec)> fn build_rules( rules: HashMap>>, - lookahead: HashMap>, + first_sets: HashMap>, + nullable: HashSet, rule_names: HashMap, fragment_ids: HashSet, ) -> PhpResult>> { let max_rule_id = rules .keys() - .chain(lookahead.keys()) + .chain(first_sets.keys()) + .chain(nullable.iter()) .chain(rule_names.keys()) .chain(fragment_ids.iter()) .copied() @@ -1945,18 +1468,19 @@ fn build_rules( for (rule_id, branches) in rules { let index = usize::try_from(rule_id).map_err(php_error)?; - let mut lookahead = lookahead.get(&rule_id).map(|set| { + let mut first_set = first_sets.get(&rule_id).map(|set| { let mut values: Vec = set.iter().copied().collect(); values.sort_unstable(); values }); - if let Some(values) = lookahead.as_mut() { + if let Some(values) = first_set.as_mut() { values.dedup(); } dense_rules[index] = Some(Rule { branches, - lookahead, + first_set, + nullable: nullable.contains(&rule_id), rule_name: rule_names.get(&rule_id).cloned().unwrap_or_default(), is_fragment: fragment_ids.contains(&rule_id), }); @@ -1992,20 +1516,24 @@ fn parse_rules(array: &ZendHashTable) -> PhpResult>>> Ok(rules) } -fn parse_lookahead(array: &ZendHashTable) -> PhpResult>> { - let mut lookahead = HashMap::new(); - for (rule_key, lookup_zval) in array { +/// Parse the per-rule FIRST sets, keyed `[rule_id => 
[token_id => true]]`. +/// The native parser uses these to decide early whether a rule can start +/// with the current token; it builds its own branch candidates from +/// `rules`, so the pure-PHP parser's per-token selector table is not needed. +fn parse_first_sets(array: &ZendHashTable) -> PhpResult>> { + let mut first_sets = HashMap::new(); + for (rule_key, tokens_zval) in array { let rule_id = array_key_to_i64(rule_key)?; - let lookup_array = lookup_zval + let tokens = tokens_zval .array() - .ok_or_else(|| php_error("Grammar lookahead entry must be an array"))?; - let mut set = HashSet::with_capacity(lookup_array.len()); - for (token_key, _) in lookup_array { + .ok_or_else(|| php_error("Grammar first_sets entry must be an array"))?; + let mut set = HashSet::with_capacity(tokens.len()); + for (token_key, _) in tokens { set.insert(array_key_to_i64(token_key)?); } - lookahead.insert(rule_id, set); + first_sets.insert(rule_id, set); } - Ok(lookahead) + Ok(first_sets) } fn parse_rule_names(array: &ZendHashTable) -> PhpResult> { @@ -2050,37 +1578,5 @@ pub fn get_module(module: ModuleBuilder) -> ModuleBuilder { .class::() .class::() .class::() - .function(wrap_function!(wp_sqlite_mysql_native_ast_release_wrapper)) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_materialize_wrapper - )) - .function(wrap_function!(wp_sqlite_mysql_native_ast_has_child)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_has_child_node)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_has_child_token)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_first_child)) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_first_child_node - )) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_first_child_token - )) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_first_descendant_node - )) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_first_descendant_token - )) - 
.function(wrap_function!(wp_sqlite_mysql_native_ast_get_children)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_child_nodes)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_child_tokens)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_descendants)) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_descendant_nodes - )) - .function(wrap_function!( - wp_sqlite_mysql_native_ast_get_descendant_tokens - )) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_start)) - .function(wrap_function!(wp_sqlite_mysql_native_ast_get_length)) .info_function(php_module_info) }