php-toolkit/components/DataLiberation/URL/URLInTextProcessor.php at f2e4f6d67804e5e9568ada5ea481768e6defd398 · WordPress/php-toolkit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
<?php

namespace WordPress\DataLiberation\URL;

use WordPress\DataLiberation\BlockMarkup\URL;
use WordPress\HTML\WP_HTML_Text_Replacement;

/**
 * Finds string fragments that look like URLs and allows replacing them.
 * This is the first, "thick" sieve that yields "URL candidates" that must be
 * validated with a WHATWG-compliant parser. Some of the candidates will be
 * false positives.
 *
 * This is a "thick sieve" that matches too much instead of too little. It
 * will yield false positives, but will not miss a URL.
 *
 * Looks for URLs:
 *
 * * Starting with http:// or https://
 * * Starting with //
 * * Domain-only, e.g. www.example.com
 * * Domain + path, e.g. www.example.com/path
 *
 * ### Protocols
 *
 * As a migration-oriented tool, this processor will only consider http and https protocols.
 *
 * ### Domain names
 *
 * UTF-8 characters in the domain names are supported even if they're
 * not encoded as punycode. For example, scanning the text:
 *
 * > Więcej na łąka.pl
 *
 * Would yield `łąka.pl`
 *
 * ### Paths
 *
 * The path is limited to ASCII characters, as per the URL specification.
 * For example, scanning the text:
 *
 * > Visit the WordPress plugins directory https://w.org/plugins?łąka=1
 *
 * Would yield `https://w.org/plugins?`, not `https://w.org/plugins?łąka=1`.
 * However, scanning this text:
 *
 * > Visit the WordPress plugins directory https://w.org/plugins?%C5%82%C4%85ka=1
 *
 * Would yield `https://w.org/plugins?%C5%82%C4%85ka=1`.
 *
 * ### Parenthesis treatment
 *
 * This scanner captures parentheses as a part of the path, query, or fragment, except
 * when they're seen as the last character in the URL. For example, scanning the text:
 *
 * > Visit the WordPress plugins directory (https://w.org/plugins)
 *
 * Would yield `https://w.org/plugins`, but scanning the text:
 *
 * > Visit the WordPress plugins directory (https://w.org/plug(in)s
 *
 * Would yield `https://w.org/plug(in)s`.
 */
class URLInTextProcessor {

	private $text;
	private $url_starts_at;
	private $url_length;
	private $bytes_already_parsed = 0;
	/**
	 * @var string
	 */
	private $raw_url;
	/**
	 * @var URL
	 */
	private $parsed_url;
	private $did_prepend_protocol;
	/**
	 * The base URL for the parsing algorithm.
	 * See https://url.spec.whatwg.org/.
	 *
	 * @var mixed|null
	 */
	private $base_url;
	private $base_protocol;

	/**
	 * The regular expression pattern used for matching URL candidates
	 * in the text.
	 *
	 * @var string
	 */
	private $regex;

	/**
	 * @see \WP_HTML_Tag_Processor
	 * @var WP_HTML_Text_Replacement[]
	 */
	private $lexical_updates = array();

	/**
	 * @var bool
	 * A flag to indicate whether the URL matching should be strict or not.
	 * If set to true, the matching will be strict, meaning it will only match URLs that strictly adhere to the pattern.
	 * If set to false, the matching will be more lenient, allowing for potential false positives.
	 */
	private $strict = false;
	private static $public_suffix_list;


	public function __construct( $text, $base_url = null ) {
		if ( ! self::$public_suffix_list ) {
			// @TODO: Parse wildcards and exceptions from the public suffix list.
			self::$public_suffix_list = require_once __DIR__ . '/public_suffix_list.php';
		}
		$this->text          = $text;
		$this->base_url      = $base_url;
		$this->base_protocol = $base_url ? parse_url( $base_url, PHP_URL_SCHEME ) : null;

		$prefix = $this->strict ? '^' : '';
		$suffix = $this->strict ? '$' : '';

		// Source: https://github.com/vstelmakh/url-highlight/blob/master/src/Matcher/Matcher.php.
		$this->regex = '/' . $prefix . '
            (?:                                                      # scheme
                (?<scheme>https?:)?                                  # Only consider http and https
                \/\/                                                 # The protocol does not have to be there, but when
                                                                     # it is, is must be followed by \/\/
            )?
            (?:                                                        # userinfo
                (?:
                    (?<=\/{2})                                             # prefixed with \/\/
                    |                                                      # or
                    (?=[^\p{Sm}\p{Sc}\p{Sk}\p{P}])                         # start with not: mathematical, currency, modifier symbol, punctuation
                )
                (?<userinfo>[^\s<>@\/]+)                                   # not: whitespace, < > @ \/
                @                                                          # at
            )?
            (?=[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}])                   # followed by valid host char
            (?|                                                        # host
                (?<host>                                                   # host prefixed by scheme or userinfo (less strict)
                    (?<=\/\/|@)                                               # prefixed with \/\/ or @
                    (?=[^\-])                                                  # label start, not: -
                    (?:[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63}           # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
                    (?<=[^\-])                                                 # label end, not: -
                    (?:                                                        # more label parts
                        \.
                        (?=[^\-])                                                  # label start, not: -
                        (?<tld>(?:[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63})   # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
                        (?<=[^\-])                                                 # label end, not: -
                    )*
                )
                |                                                          # or
                (?<host>                                                   # host with tld (no scheme or userinfo)
                    (?=[^\-])                                                  # label start, not: -
                    (?:[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63}           # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
                    (?<=[^\-])                                                 # label end, not: -
                    (?:                                                        # more label parts
                        \.
                        (?=[^\-])                                                  # label start, not: -
                        (?:[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63}           # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
                        (?<=[^\-])                                                 # label end, not: -
                    )*
                    \.(?<tld>\w{2,63})                                         # tld
                )
            )
            (?:\:(?<port>\d+))?                                        # port
            (?<path>                                                   # path, query, fragment
                [\/?#]                                                 # prefixed with \/ or ? or #
                [^\s<>]*                                               # any chars except whitespace and <>
                (?<=[^\s<>({\[`!;:\'".,?«»“”‘’])                       # end with not a space or some punctuation chars
            )?
        ' . $suffix . '/ixuJ';
	}

	/**
	 * @return string
	 */
	public function next_url() {
		$this->raw_url              = null;
		$this->parsed_url           = null;
		$this->url_starts_at        = null;
		$this->url_length           = null;
		$this->did_prepend_protocol = false;
		while ( true ) {
			/**
			 * Thick sieve – eagerly match things that look like URLs but turn out to not be URLs in the end.
			 */
			$matches = array();
			$found   = preg_match( $this->regex, $this->text, $matches, PREG_OFFSET_CAPTURE, $this->bytes_already_parsed );
			if ( 1 !== $found ) {
				return false;
			}

			$matched_url = $matches[0][0];
			if (
				$matched_url[ strlen( $matched_url ) - 1 ] === ')' ||
				$matched_url[ strlen( $matched_url ) - 1 ] === '.'
			) {
				$matched_url = substr( $matched_url, 0, - 1 );
			}
			$this->bytes_already_parsed = $matches[0][1] + strlen( $matched_url );

			$had_double_slash = WPURL::has_double_slash( $matched_url );

			$url_to_parse = $matched_url;
			if ( $this->base_url && $this->base_protocol && ! $had_double_slash ) {
				$url_to_parse               = WPURL::ensure_protocol( $url_to_parse, $this->base_protocol );
				$this->did_prepend_protocol = true;
			}

			/*
			 * Extra fine sieve – parse the candidates using a WHATWG-compliant parser to rule out false positives.
			 */
			$parsed_url = WPURL::parse( $url_to_parse, $this->base_url );
			if ( false === $parsed_url ) {
				continue;
			}

			// Additional rigor for URLs that are not explicitly preceded by a double slash.
			if ( ! $had_double_slash ) {
				/*
				 * Skip TLDs that are not in the public suffix.
				 * This reduces false positives like `index.html` or `plugins.php`.
				 *
				 * See https://publicsuffix.org/.
				 */
				$last_dot_position = strrpos( $parsed_url->hostname, '.' );
				if ( false === $last_dot_position ) {
					/*
					 * Oh, there was no dot in the hostname AND no double slash at
					 * the beginning! Let's assume this isn't a valid URL and move on.
					 * @TODO: Explore updating the regular expression above to avoid matching
					 *        URLs without a dot in the hostname when they're not preceeded
					 *        by a protocol.
					 */
					continue;
				}

				$tld = strtolower( substr( $parsed_url->hostname, $last_dot_position + 1 ) );
				if ( empty( self::$public_suffix_list[ $tld ] ) && $tld !== 'internal' ) {
					// This TLD is not in the public suffix list. It's not a valid domain name.
					continue;
				}
			}

			$this->parsed_url    = $parsed_url;
			$this->raw_url       = $matched_url;
			$this->url_starts_at = $matches[0][1];
			$this->url_length    = strlen( $matches[0][0] );

			return true;
		}
	}

	public function get_raw_url() {
		if ( null === $this->raw_url ) {
			return false;
		}

		return $this->raw_url;
	}

	public function get_parsed_url() {
		if ( null === $this->parsed_url ) {
			return false;
		}

		return $this->parsed_url;
	}

	public function set_raw_url( $new_url ) {
		if ( null === $this->raw_url ) {
			return false;
		}
		if ( $this->did_prepend_protocol ) {
			$new_url = substr( $new_url, strpos( $new_url, '://' ) + 3 );
		}
		$this->raw_url                                 = $new_url;
		$this->lexical_updates[ $this->url_starts_at ] = new \WordPress\HTML\WP_HTML_Text_Replacement(
			$this->url_starts_at,
			$this->url_length,
			$new_url
		);

		return true;
	}

	private function apply_lexical_updates() {
		if ( ! count( $this->lexical_updates ) ) {
			return 0;
		}

		/*
		 * Attribute updates can be enqueued in any order but updates
		 * to the document must occur in lexical order; that is, each
		 * replacement must be made before all others which follow it
		 * at later string indices in the input document.
		 *
		 * Sorting avoid making out-of-order replacements which
		 * can lead to mangled output, partially-duplicated
		 * attributes, and overwritten attributes.
		 */

		ksort( $this->lexical_updates );

		$bytes_already_copied = 0;
		$output_buffer        = '';
		foreach ( $this->lexical_updates as $diff ) {
			$shift = strlen( $diff->text ) - $diff->length;

			// Adjust the cursor position by however much an update affects it.
			if ( $diff->start < $this->bytes_already_parsed ) {
				$this->bytes_already_parsed += $shift;
			}

			$output_buffer .= substr( $this->text, $bytes_already_copied, $diff->start - $bytes_already_copied );
			if ( $diff->start === $this->url_starts_at ) {
				$this->url_starts_at = strlen( $output_buffer );
				$this->url_length    = strlen( $diff->text );
			}
			$output_buffer        .= $diff->text;
			$bytes_already_copied = $diff->start + $diff->length;
		}

		$this->text            = $output_buffer . substr( $this->text, $bytes_already_copied );
		$this->lexical_updates = array();
	}

	public function get_updated_text() {
		$this->apply_lexical_updates();

		return $this->text;
	}
}