From 51cb1d51f536346ce8cfe9f3dc8674a2938860b7 Mon Sep 17 00:00:00 2001 From: Nicholas DiPiazza Date: Tue, 12 May 2026 23:44:25 -0500 Subject: [PATCH] TIKA-4727: Add experimental strongly-typed protobuf response to tika-grpc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds TikaTypedResponse as an experimental alternative to the flat map<string, string> fields in FetchAndParseReply (TIKA-4722). Motivation (raised by Kristian Rickert, ai-pipestream): Tika's internal metadata model is already strongly typed, yet booleans, integers, timestamps, and repeated values are all serialised to strings in the current gRPC schema. That forces callers to parse them back, wastes CPU on both sides, and makes cross-language consumption error-prone ("true"/"false" vs. bool, ISO-8601 strings vs. Timestamp, etc.). Design: - New proto file tika_typed_response.proto adds TikaTypedResponse with: - TikaTextContent — plain-text body + summary fields - DublinCoreMetadata — dc:/dcterms: fields strongly typed - oneof document_metadata — PdfTypedMetadata, OfficeTypedMetadata, ImageTypedMetadata, EmailTypedMetadata, MediaTypedMetadata, GenericTypedMetadata (selected by Content-Type) - TikaTypedParseStatus — parse lifecycle info - repeated TikaEmbeddedDocument — embedded doc references - map<string, string> overflow_fields — fields not covered above - FetchAndParseReply gains optional field 5 (typed_response). The field is populated alongside the existing fields map whenever the parse emits metadata — no breaking change for existing clients. - TikaTypedMetadataMapper maps from List<Metadata> → TikaTypedResponse. - TikaGrpcServerImpl wires the mapper in fetchAndParseImpl(). The typed schema is marked experimental. Feedback on field coverage, naming conventions, and the oneof approach is welcome on the JIRA ticket. Credit: Kristian Rickert's ai-pipestream/pipestream-protos served as the reference design for the typed field mapping. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../tika/pipes/grpc/TikaGrpcServerImpl.java | 6 +- .../pipes/grpc/TikaTypedMetadataMapper.java | 665 ++++++++++++++++++ tika-grpc/src/main/proto/tika.proto | 3 + .../src/main/proto/tika_typed_response.proto | 352 +++++++++ 4 files changed, 1025 insertions(+), 1 deletion(-) create mode 100644 tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaTypedMetadataMapper.java create mode 100644 tika-grpc/src/main/proto/tika_typed_response.proto diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java index 9f80afb352..680482b3fd 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java @@ -19,6 +19,7 @@ import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.util.List; import java.util.Map; import java.util.Objects; @@ -248,7 +249,8 @@ private void fetchAndParseImpl(FetchAndParseRequest request, fetchReplyBuilder.setErrorMessage(pipesResult.message()); } if (pipesResult.emitData() != null && pipesResult.emitData().getMetadataList() != null) { - for (Metadata metadata : pipesResult.emitData().getMetadataList()) { + List metadataList = pipesResult.emitData().getMetadataList(); + for (Metadata metadata : metadataList) { for (String name : metadata.names()) { String value = metadata.get(name); if (value != null) { @@ -256,6 +258,8 @@ private void fetchAndParseImpl(FetchAndParseRequest request, } } } + // Populate the experimental strongly-typed response alongside the flat fields map. 
+ fetchReplyBuilder.setTypedResponse(TikaTypedMetadataMapper.map(metadataList)); } responseObserver.onNext(fetchReplyBuilder.build()); } catch (IOException e) { diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaTypedMetadataMapper.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaTypedMetadataMapper.java new file mode 100644 index 0000000000..a9ea471d78 --- /dev/null +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaTypedMetadataMapper.java @@ -0,0 +1,665 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.tika.pipes.grpc;
+
+import java.time.Instant;
+import java.time.format.DateTimeParseException;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.function.Consumer;
+
+import com.google.protobuf.Timestamp;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.DublinCoreMetadata;
+import org.apache.tika.EmailTypedMetadata;
+import org.apache.tika.GenericTypedMetadata;
+import org.apache.tika.ImageTypedMetadata;
+import org.apache.tika.MediaTypedMetadata;
+import org.apache.tika.OfficeTypedMetadata;
+import org.apache.tika.PdfTypedMetadata;
+import org.apache.tika.TikaTextContent;
+import org.apache.tika.TikaTypedParseStatus;
+import org.apache.tika.TikaTypedResponse;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Maps a list of Tika {@link Metadata} objects (as produced by the pipes client) into a
+ * {@link TikaTypedResponse} protobuf message.
+ *
+ * <p>The first metadata entry in the list is treated as the primary document metadata;
+ * subsequent entries represent embedded documents and are currently not mapped by this class.
+ * The plain-text body is read from the key {@code X-TIKA:content} of the first entry.
+ *
+ * <p>Every key that one of the typed mappers successfully reads is remembered for the duration
+ * of the call. All remaining keys are forwarded as strings through {@code overflow_fields}, so
+ * metadata that has no typed home (or whose value failed to parse) is never silently dropped,
+ * and values that were mapped to a typed field are not duplicated in the overflow map.
+ *
+ * <p>This class is experimental (TIKA-4727) and tracks the typed schema introduced to address
+ * the performance and type-safety concerns raised about the existing flat
+ * {@code map<string, string>} representation.
+ *
+ * <p>The class holds no mutable static state; {@link #map(List)} keeps all working state local
+ * to the call.
+ */
+public final class TikaTypedMetadataMapper {
+
+    private static final Logger LOG = LoggerFactory.getLogger(TikaTypedMetadataMapper.class);
+
+    // ---- well-known Tika metadata key constants ----
+
+    private static final String CONTENT_TYPE = "Content-Type";
+    private static final String TIKA_CONTENT = "X-TIKA:content";
+    // Current key for the parser chain; PARSED_BY_LEGACY is the pre-2.x spelling and is only
+    // consulted when the current key is absent.
+    private static final String PARSED_BY = "X-TIKA:Parsed-By";
+    private static final String PARSED_BY_LEGACY = "X-Parsed-By";
+    private static final String TIKA_DETECTED_LANG = "X-TIKA:detected_language";
+    private static final String TIKA_LANG_CONF = "X-TIKA:detected_language_confidence";
+    private static final String TIKA_WARNINGS = "X-TIKA:EXCEPTION:warn";
+    private static final String RESOURCE_NAME = "resourceName";
+
+    // Dublin Core
+    private static final String DC_TITLE = "dc:title";
+    private static final String DC_CREATOR = "dc:creator";
+    private static final String DC_DESCRIPTION = "dc:description";
+    private static final String DC_SUBJECT = "dc:subject";
+    private static final String DC_PUBLISHER = "dc:publisher";
+    private static final String DC_CONTRIBUTOR = "dc:contributor";
+    private static final String DC_TYPE = "dc:type";
+    private static final String DC_FORMAT = "dc:format";
+    private static final String DC_IDENTIFIER = "dc:identifier";
+    private static final String DC_SOURCE = "dc:source";
+    private static final String DC_LANGUAGE = "dc:language";
+    private static final String DC_RELATION = "dc:relation";
+    private static final String DC_COVERAGE = "dc:coverage";
+    private static final String DC_RIGHTS = "dc:rights";
+    private static final String DCTERMS_CREATED = "dcterms:created";
+    private static final String DCTERMS_MODIFIED = "dcterms:modified";
+    private static final String XMP_CREATOR_TOOL = "xmp:CreatorTool";
+    private static final String COMMENTS = "comments";
+    private static final String RATING = "xmp:Rating";
+    private static final String TITLE_ALT = "title";
+    private static final String LAST_MODIFIED = "Last-Modified";
+
+    // PDF
+    private static final String PDF_VERSION = "pdf:PDFVersion";
+    private static final String PDF_ENCRYPTED = "pdf:encrypted";
+    private static final String PDF_PRODUCER = "pdf:producer";
+    private static final String PDF_HAS_XFA = "pdf:hasXFA";
+    private static final String PDF_HAS_XMP = "pdf:hasXMP";
+    private static final String PDF_HAS_ACROFORM = "pdf:hasAcroFormFields";
+    private static final String PDF_HAS_MARKED = "pdf:hasMarkedContent";
+    private static final String PDF_HAS_COLLECTION = "pdf:hasCollection";
+    private static final String PDF_HAS_3D = "pdf:has3D";
+    private static final String PDF_SIGNATURE = "pdf:hasSignature";
+    private static final String PDF_CAN_PRINT = "pdf:print";
+    private static final String PDF_CAN_PRINT_FAITHFUL = "pdf:printFaithful";
+    private static final String PDF_CAN_MODIFY = "pdf:modify";
+    private static final String PDF_CAN_MODIFY_ANNOTS = "pdf:modifyAnnotations";
+    private static final String PDF_CAN_EXTRACT = "pdf:extract";
+    private static final String PDF_CAN_ASSEMBLE = "pdf:assemble";
+    private static final String PDF_CAN_FILL_FORM = "pdf:fillInForm";
+    private static final String PDF_CAN_EXTRACT_ACCESSIBILITY = "pdf:extractForAccessibility";
+    private static final String PDF_TOTAL_UNMAPPED_UNICODE = "pdf:totalUnmappedUnicodeChars";
+    private static final String PDF_PCT_UNMAPPED = "pdf:overallPercentageUnmappedUnicodeChars";
+    private static final String PDF_DAMAGED_FONT = "pdf:containsDamagedFont";
+    private static final String PDF_NON_EMBEDDED_FONT = "pdf:containsNonEmbeddedFont";
+    private static final String PDF_OCR_PAGES = "pdf:ocrPageCount";
+    private static final String PDF_ACTION_TYPES = "pdf:actionTypes";
+    private static final String PDF_ANNOTATION_TYPES = "pdf:annotationTypes";
+    private static final String PDF_INCREMENTAL_UPDATES = "pdf:incrementalUpdateNumber";
+    private static final String PDF_PDFA_VERSION = "pdfa:PDFVersion";
+    private static final String PDF_PDFAID_CONFORMANCE = "pdfaid:conformance";
+    private static final String PDF_PDFAID_PART = "pdfaid:part";
+    private static final String PDF_DOC_INFO_CREATOR = "pdf:docinfo:creator";
+    private static final String PDF_DOC_INFO_CREATOR_TOOL = "pdf:docinfo:creator_tool";
+    private static final String PDF_DOC_INFO_CREATED = "pdf:docinfo:created";
+    private static final String PDF_DOC_INFO_MODIFIED = "pdf:docinfo:modified";
+    private static final String PDF_DOC_INFO_PRODUCER = "pdf:docinfo:producer";
+    private static final String PDF_DOC_INFO_KEYWORDS = "pdf:docinfo:keywords";
+    private static final String PDF_DOC_INFO_SUBJECT = "pdf:docinfo:subject";
+    private static final String PDF_DOC_INFO_TITLE = "pdf:docinfo:title";
+    private static final String XMPTP_N_PAGES = "xmpTPg:NPages";
+
+    // Office
+    private static final String META_AUTHOR = "meta:author";
+    private static final String META_LAST_AUTHOR = "meta:last-author";
+    private static final String META_INIT_AUTHOR = "meta:initial-author";
+    private static final String META_CREATION_DATE = "meta:creation-date";
+    private static final String META_SAVE_DATE = "meta:save-date";
+    private static final String META_PRINT_DATE = "meta:print-date";
+    private static final String META_PAGE_COUNT = "meta:page-count";
+    private static final String META_WORD_COUNT = "meta:word-count";
+    private static final String META_CHAR_COUNT = "meta:character-count";
+    private static final String META_CHAR_SPACES = "meta:character-count-with-spaces";
+    private static final String META_PARA_COUNT = "meta:paragraph-count";
+    private static final String META_LINE_COUNT = "meta:line-count";
+    private static final String META_SLIDE_COUNT = "meta:slide-count";
+    private static final String META_IMAGE_COUNT = "meta:image-count";
+    private static final String META_TABLE_COUNT = "meta:table-count";
+    private static final String EXT_APPLICATION = "extended-properties:Application";
+    private static final String EXT_APP_VERSION = "extended-properties:AppVersion";
+    private static final String EXT_TEMPLATE = "extended-properties:Template";
+    private static final String EXT_COMPANY = "extended-properties:Company";
+    private static final String EXT_MANAGER = "extended-properties:Manager";
+    private static final String EXT_PRES_FORMAT = "extended-properties:PresentationFormat";
+    private static final String EXT_NOTES = "extended-properties:Notes";
+    private static final String CP_REVISION = "cp:revision";
+    private static final String CP_CATEGORY = "cp:category";
+    private static final String CP_CONTENT_STATUS = "cp:contentStatus";
+    private static final String CP_LAST_MOD_BY = "cp:lastModifiedBy";
+    private static final String CP_LAST_PRINTED = "cp:lastPrinted";
+    private static final String DOC_SECURITY = "extended-properties:DocSecurity";
+    private static final String OFFICE_HAS_TRACK_CHANGES = "meta:has-track-changes";
+    private static final String OFFICE_HAS_HIDDEN_TEXT = "meta:has-hidden-text";
+    private static final String OFFICE_HAS_COMMENTS = "meta:has-comments";
+    private static final String OFFICE_HAS_HIDDEN_SHEETS = "meta:has-hidden-sheets";
+    private static final String OFFICE_HAS_ANIMATIONS = "meta:has-animations";
+    private static final String IS_ENCRYPTED = "protected";
+    private static final String HAS_SIGNATURE = "signature";
+
+    // Image / EXIF / TIFF
+    private static final String TIFF_WIDTH = "tiff:ImageWidth";
+    private static final String TIFF_HEIGHT = "tiff:ImageLength";
+    private static final String TIFF_BITS = "tiff:BitsPerSample";
+    private static final String TIFF_SAMPLES = "tiff:SamplesPerPixel";
+    private static final String TIFF_X_RES = "tiff:XResolution";
+    private static final String TIFF_Y_RES = "tiff:YResolution";
+    private static final String TIFF_RES_UNIT = "tiff:ResolutionUnit";
+    private static final String TIFF_COMPRESSION = "tiff:Compression";
+    private static final String TIFF_ORIENTATION = "tiff:Orientation";
+    private static final String EXIF_DATETIME_ORIG = "exif:DateTimeOriginal";
+    private static final String EXIF_DATETIME_DIG = "exif:DateTimeDigitized";
+    private static final String EXIF_MAKE = "tiff:Make";
+    private static final String EXIF_MODEL = "tiff:Model";
+    private static final String EXIF_SOFTWARE = "tiff:Software";
+    private static final String EXIF_EXPOSURE = "exif:ExposureTime";
+    private static final String EXIF_F_NUMBER = "exif:FNumber";
+    private static final String EXIF_ISO = "exif:ISOSpeedRatings";
+    private static final String EXIF_FOCAL = "exif:FocalLength";
+    private static final String EXIF_FLASH = "exif:Flash";
+    private static final String EXIF_METERING = "exif:MeteringMode";
+    private static final String EXIF_WHITE_BAL = "exif:WhiteBalance";
+    private static final String GEO_LAT = "geo:lat";
+    private static final String GEO_LONG = "geo:long";
+    private static final String GEO_ALT = "geo:alt";
+
+    // Email / Message
+    private static final String MSG_FROM = "Message-From";
+    private static final String MSG_TO = "Message-To";
+    private static final String MSG_CC = "Message-Cc";
+    private static final String MSG_BCC = "Message-Bcc";
+    private static final String MSG_SUBJECT = "dc:title";
+    private static final String MSG_DATE = "dcterms:created";
+    private static final String MSG_ID = "Message-ID";
+    private static final String MSG_IN_REPLY_TO = "In-Reply-To";
+
+    // Media / AV
+    private static final String XMPDM_DURATION = "xmpDM:duration";
+    private static final String XMPDM_VIDEO_WIDTH = "xmpDM:videoFrameWidth";
+    private static final String XMPDM_VIDEO_HEIGHT = "xmpDM:videoFrameHeight";
+    private static final String XMPDM_VIDEO_FRAME_RATE = "xmpDM:videoFrameRate";
+    private static final String XMPDM_VIDEO_COMPRESSOR = "xmpDM:videoCompressor";
+    private static final String XMPDM_AUDIO_SAMPLE_RATE = "xmpDM:audioSampleRate";
+    private static final String XMPDM_AUDIO_CHANNELS = "xmpDM:audioChannelType";
+    private static final String XMPDM_AUDIO_COMPRESSOR = "xmpDM:audioCompressor";
+    private static final String XMPDM_AUDIO_BITS = "xmpDM:audioBitsPerSample";
+
+    // Known keys that no mapper in this class reads yet. They are deliberately NOT suppressed:
+    // because nothing consumes them, mapOverflow() forwards them through overflow_fields.
+    private static final String DC_DATE = "dc:date";
+    private static final String EMBED_RESOURCE_TYPE = "X-TIKA:embedded_resource_type";
+    private static final String MSG_MULTIPART = "multipart";
+    private static final String XMPDM_BIT_RATE = "xmpDM:fileDataRate";
+
+    private TikaTypedMetadataMapper() {
+    }
+
+    /**
+     * Builds a {@link TikaTypedResponse} from the list of metadata entries returned by the
+     * Tika pipes client. The first entry is treated as the primary document; further entries
+     * are ignored by this method.
+     *
+     * @param metadataList metadata entries for one parse; may be {@code null} or empty
+     * @return the typed response, or the default (empty) instance when there is no primary
+     *         metadata entry to map
+     */
+    public static TikaTypedResponse map(List<Metadata> metadataList) {
+        if (metadataList == null || metadataList.isEmpty() || metadataList.get(0) == null) {
+            return TikaTypedResponse.getDefaultInstance();
+        }
+
+        MetadataView primary = new MetadataView(metadataList.get(0));
+        TikaTypedResponse.Builder response = TikaTypedResponse.newBuilder();
+
+        // peek(): the dispatch value is normalised; the raw Content-Type is consumed later by
+        // whichever type-specific builder copies it into its content_type field.
+        String mediaType = baseMediaType(primary.peek(CONTENT_TYPE));
+
+        mapTextContent(primary, response);
+        mapDublinCore(primary, response);
+        mapDocumentMetadata(primary, mediaType, response);
+        mapParseStatus(primary, response);
+        // Must run last: forwards whatever the typed mappers above did not consume.
+        mapOverflow(primary, response);
+
+        return response.build();
+    }
+
+    /**
+     * Strips parameters (everything from the first {@code ';'}), surrounding whitespace and
+     * letter case from a Content-Type value.
+     *
+     * @return the bare lower-case media type, or {@code null} if none can be extracted
+     */
+    private static String baseMediaType(String contentType) {
+        if (contentType == null) {
+            return null;
+        }
+        // indexOf/substring instead of split(";")[0]: split() yields an empty array for ";".
+        int semicolon = contentType.indexOf(';');
+        String base = semicolon >= 0 ? contentType.substring(0, semicolon) : contentType;
+        base = base.trim().toLowerCase(Locale.ROOT);
+        return base.isEmpty() ? null : base;
+    }
+
+    /** Fills the text-content message: body, title, description and keywords. */
+    private static void mapTextContent(MetadataView m, TikaTypedResponse.Builder b) {
+        TikaTextContent.Builder c = TikaTextContent.newBuilder();
+        boolean any = false;
+        String body = m.take(TIKA_CONTENT);
+        if (body != null) {
+            c.setBody(body);
+            // Length is reported in Unicode code points, not UTF-16 chars.
+            c.setContentLength(body.codePointCount(0, body.length()));
+            any = true;
+        }
+        // "title" is only a fallback; it stays unconsumed (and reaches overflow) otherwise.
+        any |= setString(m, m.has(DC_TITLE) ? DC_TITLE : TITLE_ALT, c::setTitle);
+        any |= setString(m, DC_DESCRIPTION, c::setDescription);
+        any |= setString(m, DC_SUBJECT, c::setKeywords);
+        if (any) {
+            b.setContent(c.build());
+        }
+    }
+
+    /** Fills the Dublin Core message; the message is only attached if a field was set. */
+    private static void mapDublinCore(MetadataView m, TikaTypedResponse.Builder b) {
+        DublinCoreMetadata.Builder dc = DublinCoreMetadata.newBuilder();
+        boolean any = false;
+
+        any |= setString(m, DC_TITLE, dc::setTitle);
+        any |= addStrings(m, DC_CREATOR, dc::addCreator);
+        any |= setString(m, DC_DESCRIPTION, dc::setDescription);
+        any |= addStrings(m, DC_SUBJECT, dc::addSubject);
+        any |= setString(m, DC_PUBLISHER, dc::setPublisher);
+        any |= addStrings(m, DC_CONTRIBUTOR, dc::addContributor);
+        any |= setString(m, DC_TYPE, dc::setType);
+        any |= setString(m, DC_FORMAT, dc::setFormat);
+        any |= setString(m, DC_IDENTIFIER, dc::setIdentifier);
+        any |= setString(m, DC_SOURCE, dc::setSource);
+        any |= addStrings(m, DC_LANGUAGE, dc::addLanguage);
+        any |= setString(m, DC_RELATION, dc::setRelation);
+        any |= setString(m, DC_COVERAGE, dc::setCoverage);
+        any |= setString(m, DC_RIGHTS, dc::setRights);
+        any |= setString(m, XMP_CREATOR_TOOL, dc::setCreatorTool);
+        any |= setString(m, COMMENTS, dc::setComments);
+        any |= setString(m, RATING, dc::setRating);
+
+        any |= setTimestamp(m, DCTERMS_CREATED, dc::setCreated, dc::setCreatedRaw);
+        // Last-Modified is only a fallback for a missing dcterms:modified.
+        String modifiedKey = m.has(DCTERMS_MODIFIED) ? DCTERMS_MODIFIED : LAST_MODIFIED;
+        any |= setTimestamp(m, modifiedKey, dc::setModified, dc::setModifiedRaw);
+
+        if (any) {
+            b.setDublinCore(dc.build());
+        }
+    }
+
+    /**
+     * Selects the member of the document-metadata oneof from the normalised media type.
+     *
+     * @param contentType bare lower-case media type, or {@code null} when unknown
+     */
+    private static void mapDocumentMetadata(MetadataView m, String contentType,
+                                            TikaTypedResponse.Builder b) {
+        if (contentType == null) {
+            b.setGeneric(buildGeneric(m));
+        } else if (contentType.equals("application/pdf")) {
+            b.setPdf(buildPdf(m));
+        } else if (isEmailMimeType(contentType)) {
+            // Checked before the office test: the broad "application/vnd.ms-" office prefix
+            // would otherwise claim application/vnd.ms-outlook.
+            b.setEmail(buildEmail(m));
+        } else if (isOfficeMimeType(contentType)) {
+            b.setOffice(buildOffice(m));
+        } else if (contentType.startsWith("image/")) {
+            b.setImage(buildImage(m));
+        } else if (contentType.startsWith("audio/") || contentType.startsWith("video/")) {
+            b.setMedia(buildMedia(m));
+        } else {
+            b.setGeneric(buildGeneric(m));
+        }
+    }
+
+    private static boolean isOfficeMimeType(String ct) {
+        // The vnd.ms- prefix already covers vnd.ms-excel and vnd.ms-powerpoint.
+        return ct.startsWith("application/vnd.openxmlformats-officedocument.")
+                || ct.startsWith("application/vnd.ms-")
+                || ct.startsWith("application/vnd.oasis.opendocument.")
+                || ct.equals("application/msword");
+    }
+
+    private static boolean isEmailMimeType(String ct) {
+        // The message/ prefix already covers message/rfc822.
+        return ct.startsWith("message/")
+                || ct.equals("application/mbox")
+                || ct.equals("application/vnd.ms-outlook");
+    }
+
+    private static PdfTypedMetadata buildPdf(MetadataView m) {
+        PdfTypedMetadata.Builder b = PdfTypedMetadata.newBuilder();
+        setString(m, PDF_VERSION, b::setPdfVersion);
+        setBool(m, PDF_ENCRYPTED, b::setIsEncrypted);
+        setInt(m, XMPTP_N_PAGES, b::setPageCount);
+        setString(m, PDF_PRODUCER, b::setProducer);
+        setString(m, PDF_DOC_INFO_CREATOR, b::setDocInfoCreator);
+        setString(m, PDF_DOC_INFO_CREATOR_TOOL, b::setDocInfoCreatorTool);
+        setString(m, PDF_DOC_INFO_PRODUCER, b::setDocInfoProducer);
+        setString(m, PDF_DOC_INFO_KEYWORDS, b::setDocInfoKeywords);
+        setString(m, PDF_DOC_INFO_SUBJECT, b::setDocInfoSubject);
+        setString(m, PDF_DOC_INFO_TITLE, b::setDocInfoTitle);
+        setTimestamp(m, PDF_DOC_INFO_CREATED, b::setDocInfoCreated, b::setDocInfoCreatedRaw);
+        setTimestamp(m, PDF_DOC_INFO_MODIFIED, b::setDocInfoModified, b::setDocInfoModifiedRaw);
+        setString(m, PDF_PDFA_VERSION, b::setPdfaVersion);
+        setString(m, PDF_PDFAID_CONFORMANCE, b::setPdfaidConformance);
+        setInt(m, PDF_PDFAID_PART, b::setPdfaidPart);
+        setBool(m, PDF_HAS_XFA, b::setHasXfa);
+        setBool(m, PDF_HAS_XMP, b::setHasXmp);
+        setBool(m, PDF_HAS_ACROFORM, b::setHasAcroformFields);
+        setBool(m, PDF_HAS_MARKED, b::setHasMarkedContent);
+        setBool(m, PDF_HAS_COLLECTION, b::setHasCollection);
+        setBool(m, PDF_HAS_3D, b::setHas3D);
+        setBool(m, PDF_SIGNATURE, b::setHasSignature);
+        setBool(m, PDF_CAN_PRINT, b::setCanPrint);
+        setBool(m, PDF_CAN_PRINT_FAITHFUL, b::setCanPrintFaithful);
+        setBool(m, PDF_CAN_MODIFY, b::setCanModifyDocument);
+        setBool(m, PDF_CAN_MODIFY_ANNOTS, b::setCanModifyAnnotations);
+        setBool(m, PDF_CAN_EXTRACT, b::setCanExtractContent);
+        setBool(m, PDF_CAN_ASSEMBLE, b::setCanAssembleDocument);
+        setBool(m, PDF_CAN_FILL_FORM, b::setCanFillInForm);
+        setBool(m, PDF_CAN_EXTRACT_ACCESSIBILITY, b::setCanExtractForAccessibility);
+        setInt(m, PDF_TOTAL_UNMAPPED_UNICODE, b::setTotalUnmappedUnicodeChars);
+        setDouble(m, PDF_PCT_UNMAPPED, b::setOverallPctUnmappedUnicodeChars);
+        setBool(m, PDF_DAMAGED_FONT, b::setContainsDamagedFont);
+        setBool(m, PDF_NON_EMBEDDED_FONT, b::setContainsNonEmbeddedFont);
+        setInt(m, PDF_OCR_PAGES, b::setOcrPageCount);
+        setInt(m, PDF_INCREMENTAL_UPDATES, b::setIncrementalUpdateNumber);
+        addStrings(m, PDF_ACTION_TYPES, b::addActionTypes);
+        addStrings(m, PDF_ANNOTATION_TYPES, b::addAnnotationTypes);
+        addParsedBy(m, b::addParsedBy);
+        setString(m, CONTENT_TYPE, b::setContentType);
+        return b.build();
+    }
+
+    private static OfficeTypedMetadata buildOffice(MetadataView m) {
+        OfficeTypedMetadata.Builder b = OfficeTypedMetadata.newBuilder();
+        setString(m, EXT_APPLICATION, b::setApplication);
+        setString(m, EXT_APP_VERSION, b::setAppVersion);
+        setString(m, EXT_TEMPLATE, b::setTemplate);
+        addStrings(m, META_AUTHOR, b::addAuthor);
+        setString(m, META_LAST_AUTHOR, b::setLastAuthor);
+        setString(m, META_INIT_AUTHOR, b::setInitialAuthor);
+        setString(m, CP_LAST_MOD_BY, b::setLastModifiedBy);
+        setString(m, EXT_COMPANY, b::setCompany);
+        addStrings(m, EXT_MANAGER, b::addManager);
+        setTimestamp(m, META_CREATION_DATE, b::setCreationDate, b::setCreationDateRaw);
+        setTimestamp(m, META_SAVE_DATE, b::setSaveDate, b::setSaveDateRaw);
+        setTimestamp(m, META_PRINT_DATE, b::setPrintDate, b::setPrintDateRaw);
+        setTimestamp(m, CP_LAST_PRINTED, b::setLastPrinted, b::setLastPrintedRaw);
+        setInt(m, META_PAGE_COUNT, b::setPageCount);
+        setInt(m, META_WORD_COUNT, b::setWordCount);
+        setInt(m, META_CHAR_COUNT, b::setCharacterCount);
+        setInt(m, META_CHAR_SPACES, b::setCharacterCountWithSpaces);
+        setInt(m, META_PARA_COUNT, b::setParagraphCount);
+        setInt(m, META_LINE_COUNT, b::setLineCount);
+        setInt(m, META_SLIDE_COUNT, b::setSlideCount);
+        setInt(m, META_IMAGE_COUNT, b::setImageCount);
+        setInt(m, META_TABLE_COUNT, b::setTableCount);
+        setString(m, CP_REVISION, b::setRevision);
+        setString(m, CP_CATEGORY, b::setCategory);
+        setString(m, CP_CONTENT_STATUS, b::setContentStatus);
+        setString(m, EXT_PRES_FORMAT, b::setPresentationFormat);
+        setString(m, EXT_NOTES, b::setNotes);
+        setInt(m, DOC_SECURITY, b::setDocSecurity);
+        setBool(m, OFFICE_HAS_TRACK_CHANGES, b::setHasTrackChanges);
+        setBool(m, OFFICE_HAS_HIDDEN_TEXT, b::setHasHiddenText);
+        setBool(m, OFFICE_HAS_COMMENTS, b::setHasComments);
+        setBool(m, IS_ENCRYPTED, b::setIsEncrypted);
+        setBool(m, HAS_SIGNATURE, b::setHasSignature);
+        setBool(m, OFFICE_HAS_HIDDEN_SHEETS, b::setHasHiddenSheets);
+        setBool(m, OFFICE_HAS_ANIMATIONS, b::setHasAnimations);
+        addParsedBy(m, b::addParsedBy);
+        setString(m, CONTENT_TYPE, b::setContentType);
+        return b.build();
+    }
+
+    private static ImageTypedMetadata buildImage(MetadataView m) {
+        ImageTypedMetadata.Builder b = ImageTypedMetadata.newBuilder();
+        setInt(m, TIFF_WIDTH, b::setImageWidth);
+        setInt(m, TIFF_HEIGHT, b::setImageHeight);
+        setInt(m, TIFF_BITS, b::setBitsPerSample);
+        setInt(m, TIFF_SAMPLES, b::setSamplesPerPixel);
+        setDouble(m, TIFF_X_RES, b::setXResolution);
+        setDouble(m, TIFF_Y_RES, b::setYResolution);
+        setString(m, TIFF_RES_UNIT, b::setResolutionUnit);
+        setString(m, TIFF_COMPRESSION, b::setCompression);
+        setString(m, TIFF_ORIENTATION, b::setOrientation);
+        setTimestamp(m, EXIF_DATETIME_ORIG, b::setDatetimeOriginal, b::setDatetimeOriginalRaw);
+        setTimestamp(m, EXIF_DATETIME_DIG, b::setDatetimeDigitized, b::setDatetimeDigitizedRaw);
+        setString(m, EXIF_MAKE, b::setMake);
+        setString(m, EXIF_MODEL, b::setModel);
+        setString(m, EXIF_SOFTWARE, b::setSoftware);
+        setDouble(m, EXIF_EXPOSURE, b::setExposureTime);
+        setDouble(m, EXIF_F_NUMBER, b::setFNumber);
+        setInt(m, EXIF_ISO, b::setIsoSpeedRatings);
+        setDouble(m, EXIF_FOCAL, b::setFocalLength);
+        setString(m, EXIF_FLASH, b::setFlash);
+        setString(m, EXIF_METERING, b::setMeteringMode);
+        setString(m, EXIF_WHITE_BAL, b::setWhiteBalance);
+        setDouble(m, GEO_LAT, b::setGpsLatitude);
+        setDouble(m, GEO_LONG, b::setGpsLongitude);
+        setDouble(m, GEO_ALT, b::setGpsAltitude);
+        addParsedBy(m, b::addParsedBy);
+        setString(m, CONTENT_TYPE, b::setContentType);
+        return b.build();
+    }
+
+    private static EmailTypedMetadata buildEmail(MetadataView m) {
+        EmailTypedMetadata.Builder b = EmailTypedMetadata.newBuilder();
+        setString(m, MSG_FROM, b::setMessageFrom);
+        addStrings(m, MSG_TO, b::addMessageTo);
+        addStrings(m, MSG_CC, b::addMessageCc);
+        addStrings(m, MSG_BCC, b::addMessageBcc);
+        setString(m, MSG_SUBJECT, b::setSubject);
+        setTimestamp(m, MSG_DATE, b::setMessageDate, b::setMessageDateRaw);
+        setString(m, MSG_ID, b::setMessageId);
+        setString(m, MSG_IN_REPLY_TO, b::setInReplyTo);
+        addParsedBy(m, b::addParsedBy);
+        setString(m, CONTENT_TYPE, b::setContentType);
+        return b.build();
+    }
+
+    private static MediaTypedMetadata buildMedia(MetadataView m) {
+        MediaTypedMetadata.Builder b = MediaTypedMetadata.newBuilder();
+        // The duration is exposed both verbatim and, when numeric, as a double.
+        setString(m, XMPDM_DURATION, b::setDurationRaw);
+        setDouble(m, XMPDM_DURATION, b::setDurationSeconds);
+        setInt(m, XMPDM_VIDEO_WIDTH, b::setVideoWidth);
+        setInt(m, XMPDM_VIDEO_HEIGHT, b::setVideoHeight);
+        setDouble(m, XMPDM_VIDEO_FRAME_RATE, b::setVideoFrameRate);
+        setString(m, XMPDM_VIDEO_COMPRESSOR, b::setVideoCompressor);
+        setInt(m, XMPDM_AUDIO_SAMPLE_RATE, b::setAudioSampleRate);
+        setString(m, XMPDM_AUDIO_CHANNELS, b::setAudioChannels);
+        setString(m, XMPDM_AUDIO_COMPRESSOR, b::setAudioCompressor);
+        setInt(m, XMPDM_AUDIO_BITS, b::setAudioBitsPerSample);
+        setString(m, CONTENT_TYPE, b::setContentType);
+        addParsedBy(m, b::addParsedBy);
+        return b.build();
+    }
+
+    private static GenericTypedMetadata buildGeneric(MetadataView m) {
+        GenericTypedMetadata.Builder b = GenericTypedMetadata.newBuilder();
+        setString(m, CONTENT_TYPE, b::setContentType);
+        addParsedBy(m, b::addParsedBy);
+        setString(m, TIKA_DETECTED_LANG, b::setDetectedLanguage);
+        setDouble(m, TIKA_LANG_CONF, b::setDetectedLanguageConfidence);
+        setString(m, RESOURCE_NAME, b::setResourceName);
+        return b.build();
+    }
+
+    /**
+     * Records the parser chain and every parse warning. The status is always reported as
+     * {@code STATUS_SUCCESS}; nothing in the metadata is inspected to derive it.
+     */
+    private static void mapParseStatus(MetadataView m, TikaTypedResponse.Builder b) {
+        TikaTypedParseStatus.Builder s = TikaTypedParseStatus.newBuilder();
+        s.setStatus(TikaTypedParseStatus.Status.STATUS_SUCCESS);
+        addParsedBy(m, s::addParsersUsed);
+        // The warning key is multi-valued: forward all values, not just the first.
+        addStrings(m, TIKA_WARNINGS, s::addWarnings);
+        b.setParseStatus(s.build());
+    }
+
+    /**
+     * Copies every key that no typed mapper consumed into {@code overflow_fields}. Only the
+     * first value of a multi-valued key is forwarded.
+     */
+    private static void mapOverflow(MetadataView m, TikaTypedResponse.Builder b) {
+        for (String name : m.names()) {
+            if (m.isConsumed(name)) {
+                continue;
+            }
+            String value = m.peek(name);
+            if (value != null) {
+                b.putOverflowFields(name, value);
+            }
+        }
+    }
+
+    // ---- helper methods ----
+
+    @FunctionalInterface
+    interface StringSetter {
+        void accept(String s);
+    }
+
+    @FunctionalInterface
+    interface TimestampSetter {
+        void accept(Timestamp t);
+    }
+
+    /**
+     * Read-through view of one {@link Metadata} object that remembers which keys have been
+     * mapped to a typed field, so that {@link #mapOverflow} can forward exactly the rest.
+     */
+    private static final class MetadataView {
+        private final Metadata metadata;
+        private final Set<String> consumed = new HashSet<>();
+
+        MetadataView(Metadata metadata) {
+            this.metadata = metadata;
+        }
+
+        /** Returns whether the key has a (first) value, without consuming it. */
+        boolean has(String key) {
+            return metadata.get(key) != null;
+        }
+
+        /** Returns the first value of the key, or {@code null}, without consuming it. */
+        String peek(String key) {
+            return metadata.get(key);
+        }
+
+        /** Returns the first value of the key and marks the key consumed if it is present. */
+        String take(String key) {
+            String value = metadata.get(key);
+            if (value != null) {
+                consume(key);
+            }
+            return value;
+        }
+
+        /** Returns all values (never {@code null}); marks the key consumed if there are any. */
+        String[] takeAll(String key) {
+            String[] values = metadata.getValues(key);
+            if (values == null || values.length == 0) {
+                return new String[0];
+            }
+            consume(key);
+            return values;
+        }
+
+        void consume(String key) {
+            consumed.add(key);
+        }
+
+        boolean isConsumed(String key) {
+            return consumed.contains(key);
+        }
+
+        String[] names() {
+            return metadata.names();
+        }
+    }
+
+    /** Copies the first value of {@code key}, if any; returns whether the setter was called. */
+    private static boolean setString(MetadataView m, String key, StringSetter setter) {
+        String v = m.take(key);
+        if (v == null) {
+            return false;
+        }
+        setter.accept(v);
+        return true;
+    }
+
+    /** Copies all non-null values of {@code key}; returns whether any value was added. */
+    private static boolean addStrings(MetadataView m, String key, StringSetter adder) {
+        boolean any = false;
+        for (String v : m.takeAll(key)) {
+            if (v != null) {
+                adder.accept(v);
+                any = true;
+            }
+        }
+        return any;
+    }
+
+    /** Adds the parser chain, preferring the current key over the legacy one. */
+    private static boolean addParsedBy(MetadataView m, StringSetter adder) {
+        return addStrings(m, m.has(PARSED_BY) ? PARSED_BY : PARSED_BY_LEGACY, adder);
+    }
+
+    /**
+     * Sets a boolean from the first value of {@code key}. After trimming, {@code "true"} and
+     * {@code "yes"} (any case) and {@code "1"} map to {@code true}; any other present value
+     * maps to {@code false}.
+     */
+    private static boolean setBool(MetadataView m, String key, Consumer<Boolean> setter) {
+        String v = m.take(key);
+        if (v == null) {
+            return false;
+        }
+        String s = v.trim();
+        setter.accept(Boolean.parseBoolean(s) || "yes".equalsIgnoreCase(s) || "1".equals(s));
+        return true;
+    }
+
+    /**
+     * Sets an int from the first value of {@code key}. An unparseable value is logged at debug
+     * level and left unconsumed, so the raw string still reaches {@code overflow_fields}.
+     */
+    private static boolean setInt(MetadataView m, String key, Consumer<Integer> setter) {
+        String v = m.peek(key);
+        if (v == null) {
+            return false;
+        }
+        int parsed;
+        try {
+            parsed = Integer.parseInt(v.trim());
+        } catch (NumberFormatException e) {
+            LOG.debug("Could not parse int for key {}: {}", key, v);
+            return false;
+        }
+        setter.accept(parsed);
+        m.consume(key);
+        return true;
+    }
+
+    /**
+     * Sets a double from the first value of {@code key}. An unparseable value is logged at
+     * debug level and left unconsumed, so the raw string still reaches
+     * {@code overflow_fields}.
+     */
+    private static boolean setDouble(MetadataView m, String key, Consumer<Double> setter) {
+        String v = m.peek(key);
+        if (v == null) {
+            return false;
+        }
+        double parsed;
+        try {
+            parsed = Double.parseDouble(v.trim());
+        } catch (NumberFormatException e) {
+            LOG.debug("Could not parse double for key {}: {}", key, v);
+            return false;
+        }
+        setter.accept(parsed);
+        m.consume(key);
+        return true;
+    }
+
+    /**
+     * Copies the first value of {@code key} verbatim through {@code rawSetter} and, when it
+     * parses as an ISO-8601 instant, additionally as a {@link Timestamp} through
+     * {@code tsSetter}.
+     *
+     * @return whether the key was present
+     */
+    private static boolean setTimestamp(MetadataView m, String key, TimestampSetter tsSetter,
+                                        StringSetter rawSetter) {
+        String v = m.take(key);
+        if (v == null) {
+            return false;
+        }
+        rawSetter.accept(v);
+        Timestamp ts = parseTimestamp(v);
+        if (ts != null) {
+            tsSetter.accept(ts);
+        }
+        return true;
+    }
+
+    /**
+     * Parses an ISO-8601 instant into a protobuf {@link Timestamp}.
+     *
+     * @return the timestamp, or {@code null} when {@code raw} is null, blank or not accepted
+     *         by {@link Instant#parse}; values without a zone or offset are not guessed at
+     */
+    private static Timestamp parseTimestamp(String raw) {
+        if (raw == null || raw.isBlank()) {
+            return null;
+        }
+        try {
+            Instant instant = Instant.parse(raw.trim());
+            return Timestamp.newBuilder()
+                    .setSeconds(instant.getEpochSecond())
+                    .setNanos(instant.getNano())
+                    .build();
+        } catch (DateTimeParseException e) {
+            LOG.debug("Could not parse timestamp: {}", raw);
+            return null;
+        }
+    }
+}
diff --git a/tika-grpc/src/main/proto/tika.proto b/tika-grpc/src/main/proto/tika.proto index 0a64f37115..b06eaaa96e 100644 --- a/tika-grpc/src/main/proto/tika.proto +++ b/tika-grpc/src/main/proto/tika.proto @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. syntax = "proto3"; +import "tika_typed_response.proto"; package tika; option java_multiple_files = true; @@ -115,6 +116,8 @@ message FetchAndParseReply { // The status from the message. 
See javadoc for org.apache.tika.pipes.PipesResult.STATUS for the list of status.
   string status = 3;
   // If there was an error, this will contain the error message.
   string error_message = 4;
+  // Experimental: strongly-typed parse response. See TikaTypedResponse and TIKA-4727.
+  TikaTypedResponse typed_response = 5;
 }
 
diff --git a/tika-grpc/src/main/proto/tika_typed_response.proto b/tika-grpc/src/main/proto/tika_typed_response.proto
new file mode 100644
index 0000000000..5f36cfb4d4
--- /dev/null
+++ b/tika-grpc/src/main/proto/tika_typed_response.proto
@@ -0,0 +1,352 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+syntax = "proto3";
+package tika;
+
+import "google/protobuf/timestamp.proto";
+
+option java_multiple_files = true;
+option java_package = "org.apache.tika";
+option java_outer_classname = "TikaTypedResponseProto";
+option objc_class_prefix = "HLW";
+
+// Experimental: strongly-typed parse response as an alternative to the flat map<string, string>
+// fields in FetchAndParseReply. Motivation: Tika's metadata is already strongly typed internally
+// (booleans, integers, timestamps, lists) — serialising everything to strings wastes CPU cycles
+// during serialisation/deserialisation and forces callers to re-parse values.
+//
+// This message is populated alongside the existing `fields` map in FetchAndParseReply (field 5).
+// Reading it is optional and the change is backward compatible — existing clients that ignore field 5 are unaffected.
+//
+// Design credit: Kristian Rickert (ai-pipestream/pipestream-protos) who produced the original
+// comprehensive typed schema that inspired this implementation.
+//
+// See TIKA-4727.
+message TikaTypedResponse {
+  // Extracted plain-text body and summary fields.
+  TikaTextContent content = 1;
+
+  // Dublin Core standard descriptive metadata — present for most document types.
+  DublinCoreMetadata dublin_core = 2;
+
+  // Document-type-specific typed fields. Exactly one branch is populated based on Content-Type.
+  oneof document_metadata {
+    PdfTypedMetadata pdf = 3;
+    OfficeTypedMetadata office = 4;
+    ImageTypedMetadata image = 5;
+    EmailTypedMetadata email = 6;
+    MediaTypedMetadata media = 7;
+    GenericTypedMetadata generic = 8;
+  }
+
+  // Parse lifecycle details.
+  TikaTypedParseStatus parse_status = 9;
+
+  // Embedded documents discovered during parsing (depth-limited by the Tika config).
+  repeated TikaEmbeddedDocument embedded_documents = 10;
+
+  // Fields present in the original metadata that were not mapped to any typed field above.
+  // This allows callers to receive all Tika metadata without falling back to the full
+  // map<string, string> fields response.
+  map<string, string> overflow_fields = 11;
+}
+
+// Plain-text extraction result.
+message TikaTextContent {
+  // Full extracted body text.
+  string body = 1;
+  optional string title = 2;
+  optional string description = 3;
+  optional string keywords = 4;
+  // Number of Unicode characters in the extracted body (can differ from body.length() for
+  // supplementary characters).
+  optional int64 content_length = 5;
+}
+
+// Dublin Core Metadata Elements (https://www.dublincore.org/specifications/dublin-core/dces/).
+// Field names follow dcterms / dc namespace conventions as used inside Tika.
+message DublinCoreMetadata {
+  optional string title = 1;
+  repeated string creator = 2;
+  optional string description = 3;
+  repeated string subject = 4;
+  optional string publisher = 5;
+  repeated string contributor = 6;
+  optional google.protobuf.Timestamp date = 7;
+  optional string date_raw = 8;  // NOTE(review): presumably the unparsed source string for `date` — confirm
+  optional string type = 9;
+  optional string format = 10;
+  optional string identifier = 11;
+  optional string source = 12;
+  repeated string language = 13;
+  optional string relation = 14;
+  optional string coverage = 15;
+  optional string rights = 16;
+  optional google.protobuf.Timestamp created = 17;
+  optional string created_raw = 18;  // NOTE(review): presumably the unparsed source string for `created` — confirm
+  optional google.protobuf.Timestamp modified = 19;
+  optional string modified_raw = 20;  // NOTE(review): presumably the unparsed source string for `modified` — confirm
+  // xmp:CreatorTool
+  optional string creator_tool = 21;
+  optional string comments = 22;
+  optional string rating = 23;  // NOTE(review): declared as string rather than a numeric type — confirm intended
+}
+
+// Typed PDF metadata — populated when Content-Type is application/pdf.
+// Field names correspond to Tika's PDF and AccessPermissions interfaces.
+message PdfTypedMetadata {
+  optional string pdf_version = 1;
+  optional bool is_encrypted = 2;
+  optional int32 page_count = 3;
+  optional string producer = 4;  // NOTE(review): overlaps doc_info_producer (12) — confirm which source key feeds each
+
+  // PDF DocInfo fields
+  optional string doc_info_creator = 5;
+  optional string doc_info_creator_tool = 6;
+  optional google.protobuf.Timestamp doc_info_created = 7;
+  optional string doc_info_created_raw = 8;  // NOTE(review): presumably the unparsed string for doc_info_created — confirm
+  optional google.protobuf.Timestamp doc_info_modified = 9;
+  optional string doc_info_modified_raw = 10;  // NOTE(review): presumably the unparsed string for doc_info_modified — confirm
+  optional string doc_info_keywords = 11;
+  optional string doc_info_producer = 12;
+  optional string doc_info_subject = 13;
+  optional string doc_info_title = 14;
+
+  // Standards compliance
+  optional string pdfa_version = 15;
+  optional string pdfaid_conformance = 16;
+  optional int32 pdfaid_part = 17;
+
+  // Document features
+  optional bool has_xfa = 18;
+  optional bool has_xmp = 19;
+  optional bool has_acroform_fields = 20;
+  optional bool has_marked_content = 21;
+  optional bool has_collection = 22;
+  optional bool has_3d = 23;
+  optional bool has_signature = 24;
+
+  // Access permissions (from AccessPermissions interface)
+  optional bool can_print = 25;
+  optional bool can_print_faithful = 26;
+  optional bool can_modify_document = 27;
+  optional bool can_modify_annotations = 28;
+  optional bool can_extract_content = 29;
+  optional bool can_assemble_document = 30;
+  optional bool can_fill_in_form = 31;
+  optional bool can_extract_for_accessibility = 32;
+
+  // Text quality
+  optional int32 total_unmapped_unicode_chars = 33;
+  optional double overall_pct_unmapped_unicode_chars = 34;
+  optional bool contains_damaged_font = 35;
+  optional bool contains_non_embedded_font = 36;
+  optional int32 ocr_page_count = 37;
+
+  // Actions / annotations
+  repeated string action_types = 38;
+  repeated string annotation_types = 39;
+
+  // Versioning
+  optional int32 incremental_update_number = 40;
+
+  // Parsing diagnostics
+  repeated string parsed_by = 41;
+  optional string content_type = 42;
+}
+
+// 
Typed Office document metadata — populated when Content-Type matches Microsoft Office
+// (OOXML, OLE2) or OpenDocument formats.
+// Field names correspond to Tika's Office, OfficeOpenXMLCore, and OfficeOpenXMLExtended interfaces.
+message OfficeTypedMetadata {
+  // Application identity
+  optional string application = 1;
+  optional string app_version = 2;
+  optional string template = 3;
+
+  // Authorship
+  repeated string author = 4;
+  optional string last_author = 5;  // NOTE(review): overlaps last_modified_by (7) — confirm the source key of each
+  optional string initial_author = 6;
+  optional string last_modified_by = 7;
+  optional string company = 8;
+  repeated string manager = 9;
+
+  // Dates
+  optional google.protobuf.Timestamp creation_date = 10;
+  optional string creation_date_raw = 11;  // NOTE(review): presumably the unparsed string for creation_date — confirm
+  optional google.protobuf.Timestamp save_date = 12;
+  optional string save_date_raw = 13;
+  optional google.protobuf.Timestamp print_date = 14;  // NOTE(review): overlaps last_printed (16) — confirm the source key of each
+  optional string print_date_raw = 15;
+  optional google.protobuf.Timestamp last_printed = 16;
+  optional string last_printed_raw = 17;
+
+  // Document statistics
+  optional int32 page_count = 18;
+  optional int32 word_count = 19;
+  optional int32 character_count = 20;
+  optional int32 character_count_with_spaces = 21;
+  optional int32 paragraph_count = 22;
+  optional int32 line_count = 23;
+  optional int32 slide_count = 24;
+  optional int32 image_count = 25;
+  optional int32 table_count = 26;
+
+  // Versioning and identity
+  optional string revision = 27;
+  optional string category = 28;
+  optional string content_status = 29;
+  optional string presentation_format = 30;
+  optional string notes = 31;
+  optional int32 doc_security = 32;
+  optional string doc_security_string = 33;
+
+  // Feature flags
+  optional bool has_track_changes = 34;
+  optional bool has_hidden_text = 35;
+  optional bool has_comments = 36;
+  optional bool is_encrypted = 37;
+  optional bool has_signature = 38;
+  optional bool has_hidden_sheets = 39;
+  optional bool has_animations = 40;
+
+  // Parsing diagnostics
+  repeated string parsed_by = 41;
+  optional string content_type = 42;
+}
+
+// Typed image metadata — populated when Content-Type starts with image/.
+// Field names follow Tika's TIFF / EXIF interfaces.
+message ImageTypedMetadata {
+  optional int32 image_width = 1;
+  optional int32 image_height = 2;
+  optional int32 bits_per_sample = 3;
+  optional int32 samples_per_pixel = 4;
+  optional string color_space = 5;
+
+  // Resolution
+  optional double x_resolution = 6;
+  optional double y_resolution = 7;
+  optional string resolution_unit = 8;
+
+  // Compression / encoding
+  optional string compression = 9;
+  optional string pixel_array_dimensions = 10;
+  optional string color_transform = 11;
+
+  // EXIF capture info
+  optional google.protobuf.Timestamp datetime_original = 12;
+  optional string datetime_original_raw = 13;  // NOTE(review): presumably the unparsed string for datetime_original — confirm
+  optional google.protobuf.Timestamp datetime_digitized = 14;
+  optional string datetime_digitized_raw = 15;
+  optional string make = 16;
+  optional string model = 17;
+  optional string software = 18;
+  optional string orientation = 19;
+  optional double exposure_time = 20;
+  optional double f_number = 21;
+  optional int32 iso_speed_ratings = 22;
+  optional double focal_length = 23;
+  optional string flash = 24;
+  optional string metering_mode = 25;
+  optional string white_balance = 26;
+
+  // GPS
+  optional double gps_latitude = 27;  // NOTE(review): unit not stated here; presumably decimal degrees — confirm
+  optional double gps_longitude = 28;
+  optional double gps_altitude = 29;
+
+  // Parsing diagnostics
+  repeated string parsed_by = 30;
+  optional string content_type = 31;
+}
+
+// Typed email / message metadata — populated when Content-Type is message/rfc822 or similar.
+message EmailTypedMetadata {
+  optional string message_from = 1;
+  repeated string message_to = 2;
+  repeated string message_cc = 3;
+  repeated string message_bcc = 4;
+  optional string subject = 5;
+  optional google.protobuf.Timestamp message_date = 6;
+  optional string message_date_raw = 7;  // NOTE(review): presumably the unparsed string for message_date — confirm
+  optional string message_id = 8;
+  optional string in_reply_to = 9;
+  optional bool is_multipart = 10;
+  optional int32 attachment_count = 11;
+  repeated string parsed_by = 12;
+  optional string content_type = 13;
+}
+
+// Typed audio/video metadata — populated when Content-Type starts with audio/ or video/.
+message MediaTypedMetadata {
+  optional string format = 1;
+  optional string duration_raw = 2;  // NOTE(review): presumably the unparsed string for duration_seconds — confirm
+  optional double duration_seconds = 3;
+  optional int32 video_width = 4;
+  optional int32 video_height = 5;
+  optional double video_frame_rate = 6;
+  optional string video_compressor = 7;
+  optional int32 audio_sample_rate = 8;
+  optional int32 audio_channels = 9;
+  optional string audio_compressor = 10;
+  optional int32 audio_bits_per_sample = 11;
+  optional int32 bit_rate = 12;
+  optional string channel_type = 13;
+  repeated string parsed_by = 14;
+  optional string content_type = 15;
+}
+
+// Typed metadata for document types not covered by the specialised messages above
+// (HTML, plain text, XML, archives, fonts, etc.).
+message GenericTypedMetadata {
+  optional string content_type = 1;
+  repeated string parsed_by = 2;
+  optional string detected_language = 3;  // mapper fills this from the TIKA_DETECTED_LANG metadata key
+  optional double detected_language_confidence = 4;  // from TIKA_LANG_CONF; left unset when not parsable as a double
+  optional string detected_encoding = 5;
+  optional int32 version_count = 6;
+  optional int32 version_number = 7;
+  optional string resource_name = 8;  // mapper fills this from the RESOURCE_NAME metadata key
+}
+
+// Parse lifecycle information.
+message TikaTypedParseStatus {
+  enum Status {
+    STATUS_UNSPECIFIED = 0;
+    STATUS_SUCCESS = 1;
+    STATUS_PARTIAL = 2;
+    STATUS_FAILED = 3;
+    STATUS_TIMEOUT = 4;
+  }
+  Status status = 1;  // the mapper's mapParseStatus currently always sets STATUS_SUCCESS
+  repeated string parsers_used = 2;  // one entry per PARSED_BY metadata value
+  repeated string warnings = 3;  // mapParseStatus adds at most one entry, the value of TIKA_WARNINGS
+  optional string error_message = 4;  // not set by mapParseStatus
+}
+
+// Reference to a document embedded within the parent document.
+// The embedded document is reported separately in the emit data list and can be correlated
+// by fetch_key.
+message TikaEmbeddedDocument {
+  string fetch_key = 1;
+  optional string filename = 2;
+  optional string content_type = 3;
+  optional int32 embed_depth = 4;
+}