From e787820f49a161c76562401c8c63cb2903080571 Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 24 Mar 2026 14:51:55 -0400 Subject: [PATCH 1/2] decapsulate html from rtf within msgs...lol --- .../parser/microsoft/OfficeParserConfig.java | 18 -- .../parser/microsoft/OutlookExtractor.java | 151 ++++++---------- .../msg/RTFEncapsulatedHTMLExtractor.java | 165 +++++++++++++++--- .../parser/microsoft/OutlookParserTest.java | 48 +---- .../msg/RTFEncapsulatedHTMLExtractorTest.java | 108 ++++++++++++ 5 files changed, 310 insertions(+), 180 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index db6d4e78e9..c8886e5fdf 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -39,7 +39,6 @@ public class OfficeParserConfig implements Serializable { private boolean writeSelectHeadersInBody = false; - private boolean extractAllAlternativesFromMSG = false; private String dateOverrideFormat = null; private int maxOverride = 0;//ignore @@ -214,23 +213,6 @@ public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) { this.concatenatePhoneticRuns = concatenatePhoneticRuns; } - public boolean isExtractAllAlternativesFromMSG() { - return extractAllAlternativesFromMSG; - } - - /** - * Some .msg files can contain body content in html, rtf and/or text. - * The default behavior is to pick the first non-null value and include only that. 
- * If you'd like to extract all non-null body content, which is likely duplicative, - * set this value to true. - * - * @param extractAllAlternativesFromMSG whether or not to extract all alternative parts - * @since 1.17 - */ - public void setExtractAllAlternativesFromMSG(boolean extractAllAlternativesFromMSG) { - this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG; - } - public boolean isIncludeMissingRows() { return includeMissingRows; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 0b8db23f45..a2ef6de04f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -78,7 +78,6 @@ import org.apache.tika.metadata.Property; import org.apache.tika.metadata.RTFMetadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.html.HtmlEncodingDetector; @@ -183,7 +182,6 @@ private static void loadMessageClasses() { private final DirectoryNode root; private final MAPIMessage msg; private final ParseContext parseContext; - private final boolean extractAllAlternatives; HtmlEncodingDetector detector = new HtmlEncodingDetector(); @@ -191,8 +189,6 @@ public OutlookExtractor(DirectoryNode root, Metadata metadata, ParseContext cont super(context, metadata); this.root = root; this.parseContext = context; - this.extractAllAlternatives = - 
context.get(OfficeParserConfig.class).isExtractAllAlternativesFromMSG(); try { this.msg = new MAPIMessage(root); } catch (IOException e) { @@ -296,6 +292,7 @@ private void _parse(XHTMLContentHandler xhtml) throws TikaException, SAXExceptio Set contentIdNames = new HashSet<>(); handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml, contentIdNames); + // Process the attachments for (AttachmentChunks attachment : msg.getAttachmentFiles()) { Metadata attachMetadata = Metadata.newInstance(context); @@ -586,36 +583,11 @@ private void handleGeneralDates(MAPIMessage msg, Map headers, private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, XHTMLContentHandler xhtml, Set contentIdNames) throws SAXException, IOException, TikaException { - - if (extractAllAlternatives) { - extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml, contentIdNames); - return; - } - _handleBestBodyChunk(htmlChunk, rtfChunk, textChunk, xhtml, contentIdNames); - - } - private void _handleBestBodyChunk(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, - XHTMLContentHandler xhtml, Set contentIdNames) - throws SAXException, IOException, TikaException { - //try html, then rtf, then text + // Priority: a) HTML chunk, b) HTML extracted from RTF, c) raw RTF, d) text if (htmlChunk != null) { - byte[] data = null; - if (htmlChunk instanceof ByteChunk) { - data = ((ByteChunk) htmlChunk).getValue(); - } else if (htmlChunk instanceof StringChunk) { - data = ((StringChunk) htmlChunk).getRawValue(); - } + byte[] data = getValue(htmlChunk); if (data != null) { - Parser htmlParser = EmbeddedDocumentUtil - .tryToFindExistingLeafParser(JSoupParser.class, parseContext); - if (htmlParser == null) { - htmlParser = new JSoupParser(); - } - Metadata htmlMetadata = Metadata.newInstance(context); - try (TikaInputStream tis = TikaInputStream.get(data)) { - htmlParser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), htmlMetadata, parseContext); - } - 
extractContentIdNamesFromHtml(data, htmlMetadata, contentIdNames); + parseHtmlBody(data, xhtml, contentIdNames); parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.HTML.name()); return; } @@ -623,25 +595,34 @@ private void _handleBestBodyChunk(Chunk htmlChunk, Chunk rtfChunk, Chunk textChu if (rtfChunk != null) { ByteChunk chunk = (ByteChunk) rtfChunk; //avoid buffer underflow TIKA-2530 - //TODO -- would be good to find an example triggering file and - //figure out if this is a bug in POI or a genuine 0 length chunk if (chunk.getValue() != null && chunk.getValue().length > 0) { MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()); + byte[] rtfData = rtf.getData(); + // Try to extract encapsulated HTML — returns null if not present + String html = RTFEncapsulatedHTMLExtractor.extract(rtfData); + if (html != null) { + parseHtmlString(html, xhtml, contentIdNames); + parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, + BODY_TYPES_PROCESSED.RTF.name()); + parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML, "true"); + return; + } + // Fall back to parsing as raw RTF RTFParser rtfParser = (RTFParser) EmbeddedDocumentUtil .tryToFindExistingLeafParser(RTFParser.class, parseContext); if (rtfParser == null) { rtfParser = new RTFParser(); } Metadata rtfMetadata = Metadata.newInstance(context); - try (TikaInputStream tis = TikaInputStream.get(rtf.getData())) { + try (TikaInputStream tis = TikaInputStream.get(rtfData)) { rtfParser.parseInline(tis, xhtml, rtfMetadata, parseContext); } - extractContentIdNamesFromRtf(rtf.getData(), rtfMetadata, contentIdNames); + // Scan raw RTF bytes for cid: references + extractContentIdNames(rtfData, contentIdNames); parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.RTF.name()); - parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML, - rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML)); + 
parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML, "false"); return; } } @@ -651,21 +632,46 @@ private void _handleBestBodyChunk(Chunk htmlChunk, Chunk rtfChunk, Chunk textChu extractContentIdNamesFromText(s, contentIdNames); parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.TEXT.name()); } + } + private void parseHtmlBody(byte[] htmlData, XHTMLContentHandler xhtml, + Set contentIdNames) + throws SAXException, IOException, TikaException { + Parser htmlParser = EmbeddedDocumentUtil + .tryToFindExistingLeafParser(JSoupParser.class, parseContext); + if (htmlParser == null) { + htmlParser = new JSoupParser(); + } + Metadata htmlMetadata = Metadata.newInstance(context); + try (TikaInputStream tis = TikaInputStream.get(htmlData)) { + htmlParser.parse(tis, + new EmbeddedContentHandler(new BodyContentHandler(xhtml)), + htmlMetadata, parseContext); + } + extractContentIdNames(htmlData, contentIdNames); } - private void extractContentIdNamesFromRtf(byte[] data, Metadata metadata, Set contentIdNames) { - // Try to de-encapsulate the HTML from the RTF first - String html = RTFEncapsulatedHTMLExtractor.extract(data); - if (html != null) { - extractContentIdNamesFromHtml(html.getBytes(UTF_8), metadata, contentIdNames); - return; - } - // Fall back to scanning the raw RTF bytes for cid: references - extractContentIdNamesFromHtml(data, metadata, contentIdNames); + /** + * Parse an already-decoded HTML string using JSoupParser.parseString(), + * bypassing encoding detection entirely. Used for HTML de-encapsulated + * from RTF where the charset has already been handled. 
+ */ + private void parseHtmlString(String html, XHTMLContentHandler xhtml, + Set contentIdNames) + throws SAXException, IOException, TikaException { + JSoupParser htmlParser = (JSoupParser) EmbeddedDocumentUtil + .tryToFindExistingLeafParser(JSoupParser.class, parseContext); + if (htmlParser == null) { + htmlParser = new JSoupParser(); + } + Metadata htmlMetadata = Metadata.newInstance(context); + htmlParser.parseString(html, + new EmbeddedContentHandler(new BodyContentHandler(xhtml)), + htmlMetadata, parseContext); + extractContentIdNames(html.getBytes(UTF_8), contentIdNames); } - private void extractContentIdNamesFromHtml(byte[] data, Metadata metadata, Set contentIdNames) { + private void extractContentIdNames(byte[] data, Set contentIdNames) { String html = new String(data, UTF_8); Matcher imageMatcher = IMG_TAG_PATTERN.matcher(html); Matcher cidSrcMatcher = SRC_ATTR_PATTERN.matcher(""); @@ -687,55 +693,6 @@ private void extractContentIdNamesFromText(String s, Set contentIdNames) } } - private void extractAllAlternatives(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, - XHTMLContentHandler xhtml, Set contentIdNames) - throws TikaException, SAXException, IOException { - if (htmlChunk != null) { - byte[] data = getValue(htmlChunk); - if (data != null) { - handleEmbeddedResource(TikaInputStream.get(data), "html-body", null, - MediaType.TEXT_HTML.toString(), xhtml, true); - extractContentIdNamesFromHtml(data, Metadata.newInstance(context), contentIdNames); - parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.HTML.name()); - } - } - if (rtfChunk != null) { - ByteChunk chunk = (ByteChunk) rtfChunk; - MAPIRtfAttribute rtf = - new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), - chunk.getValue()); - - byte[] data = rtf.getData(); - if (data != null) { - Metadata rtfMetadata = Metadata.newInstance(context); - handleEmbeddedResource(TikaInputStream.get(data), rtfMetadata, - "rtf-body", null, null, - "application/rtf", xhtml, true); 
- extractContentIdNamesFromRtf(data, rtfMetadata, contentIdNames); - //copy this info into the parent...what else should we copy? - parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.RTF.name()); - parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML, - rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML)); - - } - } - if (textChunk != null) { - byte[] data = getValue(textChunk); - if (data != null) { - Metadata chunkMetadata = Metadata.newInstance(context); - chunkMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, - MediaType.TEXT_PLAIN.toString()); - handleEmbeddedResource(TikaInputStream.get(data), chunkMetadata, null, "text-body", - null, MediaType.TEXT_PLAIN.toString(), xhtml, true); - if (textChunk instanceof StringChunk) { - extractContentIdNamesFromText(((StringChunk) textChunk).getValue(), contentIdNames); - } - parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.TEXT.name()); - } - } - - } - //can return null! private byte[] getValue(Chunk chunk) { byte[] data = null; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java index 3ef453a48d..faaaa9f010 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java @@ -19,6 +19,8 @@ import java.io.ByteArrayOutputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import 
java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,14 +36,10 @@ * wrapped in {@code \htmlrtf ... \htmlrtf0} (which marks RTF-only rendering hints) * * - *

Within both htmltag groups and inter-tag text, the following RTF escapes are decoded:

- *
    - *
  • {@code \par} → newline
  • - *
  • {@code \tab} → tab character
  • - *
  • {@code \line} → {@code
    }
  • - *
  • {@code \'xx} → single byte (decoded using the document's ANSI code page)
  • - *
  • {@code \\}, {@code \{}, {@code \}} → literal characters
  • - *
+ *

Per the MS-OXRTFEX specification, {@code \'xx} hex escapes in inter-tag text are decoded + * using the code page of the currently selected font ({@code \fN}). The font-to-charset mapping + * is built from the RTF font table's {@code \fcharsetN} declarations. Inside + * {@code {\*\htmltag}} groups, the document's default code page ({@code \ansicpgN}) is used.

*/ public class RTFEncapsulatedHTMLExtractor { @@ -51,6 +49,29 @@ public class RTFEncapsulatedHTMLExtractor { private static final String FROM_HTML_MARKER = "\\fromhtml"; private static final String ANSICPG_PREFIX = "\\ansicpg"; + // Maps RTF \fcharset values to Java Charset objects. + // Based on the Windows CharacterSet enumeration and Tika's TextExtractor.FCHARSET_MAP. + private static final Map FCHARSET_MAP = new HashMap<>(); + + static { + FCHARSET_MAP.put(0, Charset.forName("windows-1252")); // ANSI + FCHARSET_MAP.put(77, Charset.forName("MacRoman")); // Mac Roman + FCHARSET_MAP.put(128, Charset.forName("MS932")); // Shift_JIS (Japanese) + FCHARSET_MAP.put(129, Charset.forName("ms949")); // Hangul (Korean) + FCHARSET_MAP.put(130, charsetOrNull("x-Johab")); // Johab (Korean) + FCHARSET_MAP.put(134, Charset.forName("GBK")); // GB2312 (Simplified Chinese) + FCHARSET_MAP.put(136, Charset.forName("Big5")); // Big5 (Traditional Chinese) + FCHARSET_MAP.put(161, Charset.forName("windows-1253")); // Greek + FCHARSET_MAP.put(162, Charset.forName("windows-1254")); // Turkish + FCHARSET_MAP.put(163, Charset.forName("windows-1258")); // Vietnamese + FCHARSET_MAP.put(177, Charset.forName("windows-1255")); // Hebrew + FCHARSET_MAP.put(178, Charset.forName("windows-1256")); // Arabic + FCHARSET_MAP.put(186, Charset.forName("windows-1257")); // Baltic + FCHARSET_MAP.put(204, Charset.forName("windows-1251")); // Russian + FCHARSET_MAP.put(222, Charset.forName("ms874")); // Thai + FCHARSET_MAP.put(238, Charset.forName("windows-1250")); // Eastern Europe + } + /** * Extracts the HTML content from an encapsulated-HTML RTF document. 
* @@ -69,11 +90,12 @@ public static String extract(byte[] rtfBytes) { return null; } - Charset codePage = detectCodePage(rtf); + Charset defaultCodePage = detectCodePage(rtf); + Map fontCharsets = parseFontTable(rtf); + // Track the current font's charset for inter-tag text decoding + Charset currentFontCharset = defaultCodePage; // Find the start of the document body (after the RTF header). - // We skip past the initial {\rtf1... header by finding the first - // htmltag group or \htmlrtf marker — everything before that is RTF preamble. int bodyStart = rtf.indexOf(HTMLTAG_PREFIX); if (bodyStart < 0) { return null; @@ -88,7 +110,7 @@ public static String extract(byte[] rtfBytes) { while (pos < len) { // Check if we're at an htmltag group if (rtf.startsWith(HTMLTAG_PREFIX, pos)) { - flushPendingBytes(pendingBytes, html, codePage); + flushPendingBytes(pendingBytes, html, currentFontCharset); // Find matching close brace int groupEnd = findMatchingBrace(rtf, pos); @@ -106,9 +128,9 @@ public static String extract(byte[] rtfBytes) { contentStart++; } - // Decode the htmltag content + // Decode the htmltag content using default code page per MS-OXRTFEX spec String inner = rtf.substring(contentStart, groupEnd); - decodeRtfEscapes(inner, html, codePage); + decodeRtfEscapes(inner, html, defaultCodePage); pos = groupEnd + 1; continue; @@ -116,7 +138,7 @@ public static String extract(byte[] rtfBytes) { // Check for \htmlrtf control word (start or end of RTF-only block) if (rtf.startsWith("\\htmlrtf", pos)) { - flushPendingBytes(pendingBytes, html, codePage); + flushPendingBytes(pendingBytes, html, currentFontCharset); int afterWord = pos + "\\htmlrtf".length(); if (afterWord < len && rtf.charAt(afterWord) == '0') { @@ -137,16 +159,33 @@ public static String extract(byte[] rtfBytes) { continue; } - // If we're inside an \htmlrtf skip block, just advance past this character. - // We don't skip nested groups wholesale because \htmlrtf0 may appear inside them. 
+ // Inside \htmlrtf skip blocks: don't emit text, but DO track \fN font switches if (inHtmlRtfSkip) { + if (rtf.charAt(pos) == '\\' && pos + 1 < len && rtf.charAt(pos + 1) == 'f' + && pos + 2 < len && Character.isDigit(rtf.charAt(pos + 2))) { + // Parse \fN control word + int numStart = pos + 2; + int numEnd = numStart; + while (numEnd < len && Character.isDigit(rtf.charAt(numEnd))) { + numEnd++; + } + // Make sure this is \f and not \fcharset, \fi, etc. + if (numEnd == numStart + (numEnd - numStart) && + (numEnd >= len || !Character.isLetter(rtf.charAt(numEnd)))) { + int fontId = Integer.parseInt(rtf.substring(numStart, numEnd)); + Charset fontCs = fontCharsets.get(fontId); + if (fontCs != null) { + currentFontCharset = fontCs; + } + } + } pos++; continue; } // Check for other { groups (nested RTF groups that aren't htmltag) if (rtf.charAt(pos) == '{') { - flushPendingBytes(pendingBytes, html, codePage); + flushPendingBytes(pendingBytes, html, currentFontCharset); int end = findMatchingBrace(rtf, pos); if (end > 0) { pos = end + 1; @@ -158,7 +197,7 @@ public static String extract(byte[] rtfBytes) { // Skip closing braces if (rtf.charAt(pos) == '}') { - flushPendingBytes(pendingBytes, html, codePage); + flushPendingBytes(pendingBytes, html, currentFontCharset); pos++; continue; } @@ -167,7 +206,7 @@ public static String extract(byte[] rtfBytes) { if (rtf.charAt(pos) == '\\' && pos + 1 < len) { char next = rtf.charAt(pos + 1); - // \'xx hex escape + // \'xx hex escape — decode using current font's charset if (next == '\'' && pos + 3 < len) { int hi = Character.digit(rtf.charAt(pos + 2), 16); int lo = Character.digit(rtf.charAt(pos + 3), 16); @@ -178,7 +217,7 @@ public static String extract(byte[] rtfBytes) { continue; } - flushPendingBytes(pendingBytes, html, codePage); + flushPendingBytes(pendingBytes, html, currentFontCharset); // Escaped literals if (next == '\\' || next == '{' || next == '}') { @@ -196,7 +235,8 @@ public static String extract(byte[] rtfBytes) { 
} String word = rtf.substring(wordStart, wordEnd); - // Skip optional numeric parameter + // Parse optional numeric parameter + int paramStart = wordEnd; int paramEnd = wordEnd; if (paramEnd < len && (rtf.charAt(paramEnd) == '-' || Character.isDigit(rtf.charAt(paramEnd)))) { @@ -222,6 +262,17 @@ public static String extract(byte[] rtfBytes) { case "line": html.append("
"); break; + case "f": + // Font switch in inter-tag text — update current charset + if (paramEnd > paramStart) { + int fontId = Integer.parseInt( + rtf.substring(paramStart, paramEnd)); + Charset fontCs = fontCharsets.get(fontId); + if (fontCs != null) { + currentFontCharset = fontCs; + } + } + break; default: // Skip unknown control words break; @@ -242,12 +293,12 @@ public static String extract(byte[] rtfBytes) { } // Regular text character between htmltag groups — this is HTML content - flushPendingBytes(pendingBytes, html, codePage); + flushPendingBytes(pendingBytes, html, currentFontCharset); html.append(rtf.charAt(pos)); pos++; } - flushPendingBytes(pendingBytes, html, codePage); + flushPendingBytes(pendingBytes, html, currentFontCharset); if (html.length() == 0) { return null; @@ -255,6 +306,64 @@ public static String extract(byte[] rtfBytes) { return html.toString(); } + /** + * Parse the RTF font table to build a mapping from font ID to charset. + */ + static Map parseFontTable(String rtf) { + Map result = new HashMap<>(); + int fontTblStart = rtf.indexOf("{\\fonttbl"); + if (fontTblStart < 0) { + return result; + } + int fontTblEnd = findMatchingBrace(rtf, fontTblStart); + if (fontTblEnd < 0) { + return result; + } + String fontTable = rtf.substring(fontTblStart, fontTblEnd + 1); + + int currentFontId = -1; + int pos = 0; + int ftLen = fontTable.length(); + + while (pos < ftLen) { + if (fontTable.charAt(pos) == '\\' && pos + 1 < ftLen + && Character.isLetter(fontTable.charAt(pos + 1))) { + int wordStart = pos + 1; + int wordEnd = wordStart; + while (wordEnd < ftLen && Character.isLetter(fontTable.charAt(wordEnd))) { + wordEnd++; + } + String word = fontTable.substring(wordStart, wordEnd); + + // Parse numeric parameter + int paramStart = wordEnd; + int paramEnd = wordEnd; + if (paramEnd < ftLen && (fontTable.charAt(paramEnd) == '-' + || Character.isDigit(fontTable.charAt(paramEnd)))) { + paramEnd++; + while (paramEnd < ftLen && 
Character.isDigit(fontTable.charAt(paramEnd))) { + paramEnd++; + } + } + + if ("f".equals(word) && paramEnd > paramStart) { + currentFontId = Integer.parseInt(fontTable.substring(paramStart, paramEnd)); + } else if ("fcharset".equals(word) && paramEnd > paramStart + && currentFontId >= 0) { + int fcharset = Integer.parseInt(fontTable.substring(paramStart, paramEnd)); + Charset cs = FCHARSET_MAP.get(fcharset); + if (cs != null) { + result.put(currentFontId, cs); + } + } + pos = paramEnd; + } else { + pos++; + } + } + return result; + } + /** * Find the position of the closing brace that matches the opening brace at * {@code openPos}. Handles nested groups and escaped braces. @@ -443,6 +552,14 @@ static Charset detectCodePage(String rtf) { } } + private static Charset charsetOrNull(String name) { + try { + return Charset.forName(name); + } catch (Exception e) { + return null; + } + } + private static void flushPendingBytes(ByteArrayOutputStream pending, StringBuilder out, Charset codePage) { if (pending.size() > 0) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index eb92465dbe..f4c4b50b73 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -34,7 +34,6 @@ import org.xml.sax.ContentHandler; import org.apache.tika.TikaTest; -import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.MAPI; import 
org.apache.tika.metadata.Message; @@ -42,7 +41,6 @@ import org.apache.tika.metadata.RTFMetadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.utils.XMLReaderUtils; @@ -282,13 +280,13 @@ public void testOutlookHTMLfromRTF() throws Exception { AUTO_DETECT_PARSER.parse(tis, handler, metadata, new ParseContext()); } - // As the HTML version should have been processed, ensure - // we got some of the links + // The encapsulated HTML should have been extracted and parsed through JSoupParser String content = sw.toString().replaceAll("[\\r\\n\\t]+", " ").replaceAll(" +", " "); assertNotContained("
New Outlook User
", content); - assertContains("designed to help you", content); + assertContains("designed to help you", content); assertContains( - "

Cached Exchange Mode", + "" + + "Cached Exchange Mode", content); // Link - check text around it, and the link itself @@ -366,45 +364,13 @@ public void testPostExtendedMetadata() throws Exception { } - @Test - public void testHandlingAllAlternativesBodies() throws Exception { - //test that default only has one body - List metadataList = getRecursiveMetadata("testMSG.msg"); - assertEquals(1, metadataList.size()); - assertContains("breaking your application", - metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); - assertEquals("application/vnd.ms-outlook", metadataList.get(0).get(Metadata.CONTENT_TYPE)); - - //now try extracting all bodies - //they should each appear as standalone attachments - //with no content in the body of the msg level - Parser p = TikaLoader.load( - getConfigPath(OutlookParserTest.class, "tika-config-extract-all-alternatives-msg.json")) - .loadAutoDetectParser(); - - metadataList = getRecursiveMetadata("testMSG.msg", p); - assertEquals(3, metadataList.size()); - - assertNotContained("breaking your application", - metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); - assertEquals("application/vnd.ms-outlook", - metadataList.get(0).get(Metadata.CONTENT_TYPE)); - - assertContains("breaking your application", - metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); - assertEquals("application/rtf", metadataList.get(1).get(Metadata.CONTENT_TYPE)); - - assertContains("breaking your application", - metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); - assertTrue(metadataList.get(2).get(Metadata.CONTENT_TYPE).startsWith("text/plain")); - - } - @Test public void testNewlinesInRTFBody() throws Exception { List metadataList = getRecursiveMetadata("test-outlook.msg", AUTO_DETECT_PARSER, BasicContentHandlerFactory.HANDLER_TYPE.BODY); - assertContains("annuaires\t \n" + " Synchronisation", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + String content = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT); + 
assertContains("annuaires", content); + assertContains("Synchronisation", content); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java index 0c1096f4f2..9c923a8773 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java @@ -22,6 +22,9 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; +import java.nio.charset.Charset; +import java.util.Map; + import org.junit.jupiter.api.Test; public class RTFEncapsulatedHTMLExtractorTest { @@ -214,4 +217,109 @@ public void testLineControlWord() { assertNotNull(html); assertEquals("line1
line2", html); } + + @Test + public void testParseFontTable() { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" + + "{\\f0\\fswiss\\fcharset0 Arial;}\n" + + "{\\f1\\fmodern\\fcharset0 Courier New;}\n" + + "{\\f4\\fswiss\\fcharset134 Simsun;}\n" + + "{\\f5\\fswiss\\fcharset128 MS PGothic;}\n" + + "{\\f6\\fswiss\\fcharset162 Arial Tur;}\n" + + "}\n}"; + Map fonts = RTFEncapsulatedHTMLExtractor.parseFontTable(rtf); + assertEquals(Charset.forName("windows-1252"), fonts.get(0)); + assertEquals(Charset.forName("GBK"), fonts.get(4)); + assertEquals(Charset.forName("MS932"), fonts.get(5)); + assertEquals(Charset.forName("windows-1254"), fonts.get(6)); + } + + @Test + public void testParseFontTableEmpty() { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0 no font table}"; + Map fonts = RTFEncapsulatedHTMLExtractor.parseFontTable(rtf); + assertTrue(fonts.isEmpty()); + } + + @Test + public void testCjkFontCharsetTracking() { + // Simulates the real-world case: \ansicpg1252 but \fcharset134 (GBK) font + // used for inter-tag CJK text. The \htmlrtf block switches to \f1 (GBK font) + // and the \'xx bytes after \htmlrtf0 should be decoded as GBK. + // \u53ef\u4ee5 = 可以, GBK bytes: BF C9 D2 D4 + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" + + "{\\f0\\fswiss\\fcharset0 Arial;}\n" + + "{\\f1\\fswiss\\fcharset134 Simsun;}\n" + + "}\n" + + "{\\*\\htmltag64

}\n" + + "\\htmlrtf {\\f1 \\htmlrtf0\n" + + "\\'bf\\'c9\\'d2\\'d4\n" + + "\\htmlrtf }\\htmlrtf0\n" + + "{\\*\\htmltag72

}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("\u53ef\u4ee5"), + "GBK bytes should be decoded as Chinese characters, got: " + html); + } + + @Test + public void testCjkFontSwitchBackToLatin() { + // After CJK text, font switches back to Latin font for ASCII content + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" + + "{\\f0\\fswiss\\fcharset0 Arial;}\n" + + "{\\f1\\fswiss\\fcharset134 Simsun;}\n" + + "}\n" + + "{\\*\\htmltag64

}\n" + + "\\htmlrtf {\\f1 \\htmlrtf0\n" + + "\\'bf\\'c9\\'d2\\'d4\n" + + "\\htmlrtf\\f0 \\htmlrtf0\n" + + "Hello\n" + + "\\htmlrtf }\\htmlrtf0\n" + + "{\\*\\htmltag72

}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("\u53ef\u4ee5"), + "CJK should be decoded correctly, got: " + html); + assertTrue(html.contains("Hello"), + "Latin text after font switch should be preserved"); + } + + @Test + public void testHtmltagUsesDefaultCodePage() { + // Per MS-OXRTFEX spec, \'xx inside htmltag groups should use the + // default code page (\ansicpg), not the current font's charset. + // \'e9 in windows-1252 = é + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" + + "{\\f0\\fswiss\\fcharset0 Arial;}\n" + + "{\\f1\\fswiss\\fcharset134 Simsun;}\n" + + "}\n" + + "\\htmlrtf {\\f1 \\htmlrtf0\n" + + "{\\*\\htmltag84 caf\\'e9}\n" + + "\\htmlrtf }\\htmlrtf0\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("café", html, + "htmltag content should use default code page, not font charset"); + } + + @Test + public void testFontSwitchInInterTagText() { + // \f control word directly in inter-tag text (outside \htmlrtf blocks) + // should also update the current charset + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" + + "{\\f0\\fswiss\\fcharset0 Arial;}\n" + + "{\\f1\\fswiss\\fcharset134 Simsun;}\n" + + "}\n" + + "{\\*\\htmltag64

}\n" + + "\\f1 \\'bf\\'c9\n" + + "{\\*\\htmltag72

}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("\u53ef"), + "Font switch in inter-tag text should affect charset, got: " + html); + } } From 4a088dfc3e89fe759c01d86f60bf43a34744d5bd Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 24 Mar 2026 14:51:55 -0400 Subject: [PATCH 2/2] improve font tracking --- .../msg/RTFEncapsulatedHTMLExtractor.java | 26 +++++++++++----- .../msg/RTFEncapsulatedHTMLExtractorTest.java | 31 +++++++++++++++++++ 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java index faaaa9f010..e254dc4447 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java @@ -19,6 +19,8 @@ import java.io.ByteArrayOutputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; +import java.util.ArrayDeque; +import java.util.Deque; import java.util.HashMap; import java.util.Map; @@ -92,8 +94,11 @@ public static String extract(byte[] rtfBytes) { Charset defaultCodePage = detectCodePage(rtf); Map fontCharsets = parseFontTable(rtf); - // Track the current font's charset for inter-tag text decoding + // Track the current font's charset for inter-tag text decoding. + // The stack mirrors RTF brace nesting so that font switches inside + // groups (e.g. 
{\f3 ...}) are automatically unwound on '}'. Charset currentFontCharset = defaultCodePage; + Deque charsetStack = new ArrayDeque<>(); // Find the start of the document body (after the RTF header). int bodyStart = rtf.indexOf(HTMLTAG_PREFIX); @@ -159,19 +164,26 @@ public static String extract(byte[] rtfBytes) { continue; } - // Inside \htmlrtf skip blocks: don't emit text, but DO track \fN font switches + // Inside \htmlrtf skip blocks: don't emit text, but track brace + // nesting so that font switches inside groups are properly scoped + // (pushed on '{', popped on '}') — just like the full RTF parser. if (inHtmlRtfSkip) { - if (rtf.charAt(pos) == '\\' && pos + 1 < len && rtf.charAt(pos + 1) == 'f' + char sc = rtf.charAt(pos); + if (sc == '{') { + charsetStack.push(currentFontCharset); + } else if (sc == '}') { + if (!charsetStack.isEmpty()) { + currentFontCharset = charsetStack.pop(); + } + } else if (sc == '\\' && pos + 1 < len && rtf.charAt(pos + 1) == 'f' && pos + 2 < len && Character.isDigit(rtf.charAt(pos + 2))) { - // Parse \fN control word + // Track \fN font switches within the current group int numStart = pos + 2; int numEnd = numStart; while (numEnd < len && Character.isDigit(rtf.charAt(numEnd))) { numEnd++; } - // Make sure this is \f and not \fcharset, \fi, etc. 
- if (numEnd == numStart + (numEnd - numStart) && - (numEnd >= len || !Character.isLetter(rtf.charAt(numEnd)))) { + if (numEnd >= len || !Character.isLetter(rtf.charAt(numEnd))) { int fontId = Integer.parseInt(rtf.substring(numStart, numEnd)); Charset fontCs = fontCharsets.get(fontId); if (fontCs != null) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java index 9c923a8773..f09e6019fc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java @@ -18,6 +18,7 @@ import static java.nio.charset.StandardCharsets.US_ASCII; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -305,6 +306,36 @@ public void testHtmltagUsesDefaultCodePage() { "htmltag content should use default code page, not font charset"); } + @Test + public void testFontSwitchInBracedSkipBlockDoesNotPersist() { + // Reproduces the Hebrew/Chinese bug: a skip block contains {\f3\'a0} + // where \f3 is a Latin font (charset 0). The braces should scope the + // font switch so it doesn't affect subsequent inter-tag text. 
+ // \u05d0\u05d2 = אג, windows-1255 bytes: E0 E2 + String rtf = "{\\rtf1\\ansi\\ansicpg1255\\fromhtml1 \\deff0{\\fonttbl\n" + + "{\\f0\\fswiss\\fcharset177 David;}\n" + + "{\\f3\\fmodern\\fcharset0 Courier New;}\n" + + "}\n" + + "{\\*\\htmltag64

}\n" + + "\\htmlrtf {\\htmlrtf0\n" + + "\\'e0\\'e2\n" + // Hebrew: אג + "{\\*\\htmltag84  }" + + "\\htmlrtf {\\f3\\'a0}\\htmlrtf0\n" + // skip block with braced \f3 + "\\'e8\\'e5\\'e1\n" + // Hebrew: טוב + "\\htmlrtf }\\htmlrtf0\n" + + "{\\*\\htmltag72

}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("\u05d0\u05d2"), + "First Hebrew text should decode correctly, got: " + html); + // \xe8\xe5\xe1 in windows-1255 = טוב; in windows-1252 = èåá + assertTrue(html.contains("\u05d8\u05d5\u05d1"), + "Hebrew text after braced skip block should still use windows-1255, got: " + html); + assertFalse(html.contains("\u00e8\u00e5\u00e1"), + "Should NOT decode as windows-1252 (mojibake), got: " + html); + } + @Test public void testFontSwitchInInterTagText() { // \f control word directly in inter-tag text (outside \htmlrtf blocks)