Skip to content

Commit 7efecae

Browse files
committed
reuse the HTMLUnicodeEntitiesParser (like we do for many other things) and simplify the code a bit
1 parent 3d9449b commit 7efecae

1 file changed

Lines changed: 20 additions & 16 deletions

File tree

src/main/java/org/htmlunit/cyberneko/HTMLScanner.java

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,11 @@ public class HTMLScanner implements XMLDocumentSource, XMLLocator, HTMLComponent
517517
* each use to ensure correctness.
518518
* <p>
519519
* Thread safety: Safe because scanner instances are single-threaded.
520-
*/ final boolean[] fSingleBoolean = {false};
520+
*/
521+
final boolean[] fSingleBoolean = {false};
522+
523+
/** Reusable parser for numeric character references ({@code &#x...;} and {@code &#...;}). */
524+
private final HTMLUnicodeEntitiesParser fUnicodeEntitiesParser = new HTMLUnicodeEntitiesParser();
521525

522526
final HTMLConfiguration htmlConfiguration_;
523527

@@ -1343,6 +1347,7 @@ else if (NAMES_LOWERCASE == mode && !Character.isLowerCase(c)) {
13431347
break;
13441348
}
13451349
}
1350+
13461351
final int length = fCurrentEntity.offset_ - offset;
13471352
final String name = length > 0 ? new String(fCurrentEntity.buffer_, offset, length) : null;
13481353
if (DEBUG_BUFFER) {
@@ -1418,7 +1423,6 @@ else if (NAMES_LOWERCASE == fNamesElems && !Character.isLowerCase(c)) {
14181423
if (DEBUG_BUFFER) {
14191424
fCurrentEntity.debugBufferIfNeeded(")scanName: ", " -> \"" + name + '"');
14201425
}
1421-
14221426
return name;
14231427
}
14241428

@@ -1439,17 +1443,17 @@ protected int scanEntityRef(final XMLString str, final XMLString plainValue, fin
14391443
str.append((char) nextChar);
14401444

14411445
if ('#' == nextChar) {
1442-
final HTMLUnicodeEntitiesParser parser = new HTMLUnicodeEntitiesParser();
1446+
fUnicodeEntitiesParser.reset();
14431447

14441448
do {
14451449
nextChar = fCurrentEntity.readPreservingBufferContent();
14461450
if (nextChar != -1) {
14471451
str.append((char) nextChar);
14481452
}
14491453
}
1450-
while (nextChar != -1 && parser.parseNumeric(nextChar));
1454+
while (nextChar != -1 && fUnicodeEntitiesParser.parseNumeric(nextChar));
14511455

1452-
final String match = parser.getMatch();
1456+
final String match = fUnicodeEntitiesParser.getMatch();
14531457
if (match == null) {
14541458
fCurrentEntity.rewind(str.length() - 1);
14551459
if (plainValue != null) {
@@ -1458,7 +1462,7 @@ protected int scanEntityRef(final XMLString str, final XMLString plainValue, fin
14581462
str.clearAndAppend('&');
14591463
}
14601464
else {
1461-
fCurrentEntity.rewind(parser.getRewindCount());
1465+
fCurrentEntity.rewind(fUnicodeEntitiesParser.getRewindCount());
14621466
if (plainValue != null) {
14631467
plainValue.append(str);
14641468
}
@@ -2312,16 +2316,16 @@ else if (c == '/') {
23122316
else if (!fAllowSelfclosingTags_
23132317
&& !fAllowSelfclosingIframe_
23142318
&& "iframe".equals(enameLC)) {
2315-
scanUntilEndTag("iframe");
2319+
scanUntilEndTag("/iframe");
23162320
}
23172321
else if (!fParseNoScriptContent_ && "noscript".equals(enameLC)) {
2318-
scanUntilEndTag("noscript");
2322+
scanUntilEndTag("/noscript");
23192323
}
23202324
else if ("noframes".equals(enameLC)) {
2321-
scanUntilEndTag("noframes");
2325+
scanUntilEndTag("/noframes");
23222326
}
23232327
else if ("noembed".equals(enameLC)) {
2324-
scanUntilEndTag("noembed");
2328+
scanUntilEndTag("/noembed");
23252329
}
23262330
// title inside svg
23272331
else if ("title".equals(enameLC)
@@ -2400,15 +2404,14 @@ private void eof() {
24002404
* plain text when feature {@link HTMLScanner#PARSE_NOSCRIPT_CONTENT} is set to
24012405
* false.
24022406
*
2403-
* @param tagName the tag for which content is scanned (one of "noscript",
2404-
* "noframes", "iframe")
2407+
* @param tagNameWithLeadingSlash the end tag to scan for, including its leading slash (one of "/noscript",
2408+
* "/noframes", "/noembed", "/iframe")
24052409
* @throws IOException on error
24062410
*/
2407-
private void scanUntilEndTag(final String tagName) throws IOException {
2411+
private void scanUntilEndTag(final String tagNameWithLeadingSlash) throws IOException {
24082412
fScanUntilEndTag.clear();
24092413

2410-
final String end = "/" + tagName;
2411-
final int lengthToScan = tagName.length() + 2;
2414+
final int lengthToScan = tagNameWithLeadingSlash.length() + 1;
24122415

24132416
while (true) {
24142417
final int c = fCurrentEntity.read();
@@ -2418,7 +2421,8 @@ private void scanUntilEndTag(final String tagName) throws IOException {
24182421
if (c == '<') {
24192422
final String next = fCurrentEntity.nextContent(lengthToScan) + " ";
24202423
if (next.length() >= lengthToScan
2421-
&& end.equalsIgnoreCase(next.substring(0, end.length()))
2424+
&& tagNameWithLeadingSlash.equalsIgnoreCase(
2425+
next.substring(0, tagNameWithLeadingSlash.length()))
24222426
&& ('>' == next.charAt(lengthToScan - 1)
24232427
|| Character.isWhitespace(next.charAt(lengthToScan - 1)))) {
24242428
fCurrentEntity.rewind();

0 commit comments

Comments
 (0)