From 5d3bd56ebb7846839df8fe4bcfb954eb2f1c2f43 Mon Sep 17 00:00:00 2001 From: luisquintanilla Date: Thu, 7 May 2026 10:00:04 -0400 Subject: [PATCH 1/3] Propagate element metadata to chunks in MEDI chunkers Fix #7465: All four IngestionChunker implementations (SectionChunker, HeaderChunker, SemanticSimilarityChunker, DocumentTokenChunker) now propagate IngestionDocumentElement.Metadata to IngestionChunk.Metadata. Design decisions: - First-wins merge strategy (TryAdd) for conflicting keys - Null metadata values skipped (element allows object?, chunk requires object) - Split elements: metadata goes to the first chunk only - Lazy allocation: dictionary only created when elements have metadata ElementsChunker (fixes SectionChunker, HeaderChunker, SemanticSimilarityChunker): - Added AccumulateMetadata/ApplyMetadata static helpers - Accumulates metadata as elements are processed - Applies to chunk on commit, then clears accumulator DocumentTokenChunker: - Added AccumulateMetadata static helper - Accumulates metadata during element iteration - Applies in FinalizeChunk, then clears accumulator Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Chunkers/DocumentTokenChunker.cs | 44 ++- .../Chunkers/ElementsChunker.cs | 49 ++- .../ChunkerMetadataPropagationTests.cs | 315 ++++++++++++++++++ 3 files changed, 403 insertions(+), 5 deletions(-) create mode 100644 test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs index 57025773049..c851b57bf4e 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs @@ -46,6 +46,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge int stringBuilderTokenCount = 0; 
StringBuilder stringBuilder = new(); + Dictionary? accumulatedMetadata = null; foreach (IngestionDocumentElement element in document.EnumerateContent()) { cancellationToken.ThrowIfCancellationRequested(); @@ -55,6 +56,8 @@ public override async IAsyncEnumerable> ProcessAsync(Inge continue; } + AccumulateMetadata(element, ref accumulatedMetadata); + int contentToProcessTokenCount = _tokenizer.CountTokens(elementContent!, considerNormalization: false); ReadOnlyMemory contentToProcess = elementContent.AsMemory(); while (stringBuilderTokenCount + contentToProcessTokenCount >= _maxTokensPerChunk) @@ -73,7 +76,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge _ = stringBuilder.Append(ptr, index); } } - yield return FinalizeChunk(); + yield return FinalizeChunk(ref accumulatedMetadata); contentToProcess = contentToProcess.Slice(index); contentToProcessTokenCount = _tokenizer.CountTokens(contentToProcess.Span, considerNormalization: false); @@ -85,16 +88,27 @@ public override async IAsyncEnumerable> ProcessAsync(Inge if (stringBuilder.Length > 0) { - yield return FinalizeChunk(); + yield return FinalizeChunk(ref accumulatedMetadata); } yield break; - IngestionChunk FinalizeChunk() + IngestionChunk FinalizeChunk(ref Dictionary? metadata) { IngestionChunk chunk = new IngestionChunk( content: stringBuilder.ToString(), document: document, context: string.Empty); + + if (metadata is { Count: > 0 }) + { + foreach (var kvp in metadata) + { + chunk.Metadata[kvp.Key] = kvp.Value; + } + + metadata = null; + } + _ = stringBuilder.Clear(); stringBuilderTokenCount = 0; @@ -121,5 +135,29 @@ IngestionChunk FinalizeChunk() } } + private static void AccumulateMetadata(IngestionDocumentElement element, ref Dictionary? 
accumulated) + { + if (!element.HasMetadata) + { + return; + } + + accumulated ??= []; + foreach (var kvp in element.Metadata) + { + if (kvp.Value is not null) + { +#if NET + accumulated.TryAdd(kvp.Key, kvp.Value); +#else + if (!accumulated.ContainsKey(kvp.Key)) + { + accumulated[kvp.Key] = kvp.Value; + } +#endif + } + } + } + } } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs index a50508f2a5e..a5b15ba526a 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs @@ -36,6 +36,7 @@ internal IEnumerable> Process(IngestionDocument document, { // Not using yield return here as we use ref structs. List> chunks = []; + Dictionary? accumulatedMetadata = null; int contextTokenCount = CountTokens(context.AsSpan()); int totalTokenCount = contextTokenCount; @@ -67,6 +68,8 @@ internal IEnumerable> Process(IngestionDocument document, continue; // An image can come with Markdown, but no AlternativeText or Text. 
} + AccumulateMetadata(element, ref accumulatedMetadata); + int elementTokenCount = CountTokens(semanticContent.AsSpan()); if (elementTokenCount + totalTokenCount <= _maxTokensPerChunk) { @@ -196,7 +199,9 @@ internal IEnumerable> Process(IngestionDocument document, if (totalTokenCount > contextTokenCount) { - chunks.Add(new(_currentChunk.ToString(), document, context)); + var chunk = new IngestionChunk(_currentChunk.ToString(), document, context); + ApplyMetadata(chunk, accumulatedMetadata); + chunks.Add(chunk); } _currentChunk.Clear(); @@ -205,7 +210,10 @@ internal IEnumerable> Process(IngestionDocument document, void Commit() { - chunks.Add(new(_currentChunk.ToString(), document, context)); + var chunk = new IngestionChunk(_currentChunk.ToString(), document, context); + ApplyMetadata(chunk, accumulatedMetadata); + chunks.Add(chunk); + accumulatedMetadata = null; // We keep the context in the current chunk as it's the same for all elements. _currentChunk.Remove( @@ -268,6 +276,43 @@ private static void AddMarkdownTableSeparatorRow(int columnCount, ref ValueStrin vsb.Append(Environment.NewLine); } + private static void AccumulateMetadata(IngestionDocumentElement element, ref Dictionary? accumulated) + { + if (!element.HasMetadata) + { + return; + } + + accumulated ??= []; + foreach (var kvp in element.Metadata) + { + if (kvp.Value is not null) + { +#if NET + accumulated.TryAdd(kvp.Key, kvp.Value); +#else + if (!accumulated.ContainsKey(kvp.Key)) + { + accumulated[kvp.Key] = kvp.Value; + } +#endif + } + } + } + + private static void ApplyMetadata(IngestionChunk chunk, Dictionary? 
accumulated) + { + if (accumulated is null or { Count: 0 }) + { + return; + } + + foreach (var kvp in accumulated) + { + chunk.Metadata[kvp.Key] = kvp.Value; + } + } + private int CountTokens(ReadOnlySpan input) => _tokenizer.CountTokens(input, considerNormalization: false); } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs new file mode 100644 index 00000000000..7ab691c0d49 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs @@ -0,0 +1,315 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.ML.Tokenizers; +using Xunit; + +namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests; + +public class ChunkerMetadataPropagationTests +{ + private static IngestionChunker CreateSectionChunker(int maxTokensPerChunk = 2_000) + { + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + return new SectionChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = 0 }); + } + + private static IngestionChunker CreateHeaderChunker(int maxTokensPerChunk = 2_000) + { + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + return new HeaderChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = 0 }); + } + + private static IngestionChunker CreateDocumentTokenChunker(int maxTokensPerChunk = 2_000) + { + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + return new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = 0 }); + } + + [Fact] + public async Task SectionChunker_SingleElementWithMetadata_PropagatesMetadata() + { + var paragraph = new 
IngestionDocumentParagraph("This is a paragraph."); + paragraph.Metadata["element_type"] = "text"; + paragraph.Metadata["page"] = 1; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(1, chunk.Metadata["page"]); + } + + [Fact] + public async Task SectionChunker_MultipleElementsDifferentKeys_AllKeysAppear() + { + var para1 = new IngestionDocumentParagraph("First paragraph."); + para1.Metadata["element_type"] = "text"; + + var para2 = new IngestionDocumentParagraph("Second paragraph."); + para2.Metadata["confidence"] = 0.95; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(0.95, chunk.Metadata["confidence"]); + } + + [Fact] + public async Task SectionChunker_ConflictingKeys_FirstElementWins() + { + var para1 = new IngestionDocumentParagraph("First paragraph."); + para1.Metadata["element_type"] = "table"; + + var para2 = new IngestionDocumentParagraph("Second paragraph."); + para2.Metadata["element_type"] = "text"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.Equal("table", chunk.Metadata["element_type"]); + } + + [Fact] + public async Task SectionChunker_NullMetadataValue_Skipped() + { + var paragraph = 
new IngestionDocumentParagraph("This is a paragraph."); + paragraph.Metadata["element_type"] = null; + paragraph.Metadata["valid_key"] = "valid_value"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.False(chunk.Metadata.ContainsKey("element_type")); + Assert.Equal("valid_value", chunk.Metadata["valid_key"]); + } + + [Fact] + public async Task SectionChunker_NoMetadata_ChunkHasNoMetadata() + { + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection + { + Elements = + { + new IngestionDocumentParagraph("No metadata here."), + new IngestionDocumentParagraph("Also no metadata.") + } + }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.False(chunk.HasMetadata); + } + + [Fact] + public async Task SectionChunker_ElementSplitAcrossChunks_FirstChunkGetsMetadata() + { + // Create a large paragraph that exceeds the token limit and forces a split + string longText = string.Join(" ", Enumerable.Repeat("word", 600)); + var paragraph = new IngestionDocumentParagraph(longText); + paragraph.Metadata["element_type"] = "body"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count > 1); + + // First chunk gets the metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("body", chunks[0].Metadata["element_type"]); + + // Subsequent chunks from the same element do NOT get metadata (accumulator was cleared on commit) + Assert.False(chunks[1].HasMetadata); + } + + 
[Fact] + public async Task SectionChunker_TwoSectionsWithMetadata_IndependentMetadataPerSection() + { + var para1 = new IngestionDocumentParagraph("First section paragraph."); + para1.Metadata["section"] = "intro"; + + var para2 = new IngestionDocumentParagraph("Second section paragraph."); + para2.Metadata["section"] = "conclusion"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1 } }); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para2 } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.Equal(2, chunks.Count); + Assert.Equal("intro", chunks[0].Metadata["section"]); + Assert.Equal("conclusion", chunks[1].Metadata["section"]); + } + + [Fact] + public async Task HeaderChunker_PropagatesMetadata() + { + var header = new IngestionDocumentHeader("# Title") { Level = 1 }; + var para = new IngestionDocumentParagraph("Body text."); + para.Metadata["element_type"] = "text"; + para.Metadata["page"] = 3; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { header, para } }); + + var chunker = CreateHeaderChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(3, chunk.Metadata["page"]); + } + + [Fact] + public async Task DocumentTokenChunker_SingleElementWithMetadata_PropagatesMetadata() + { + var paragraph = new IngestionDocumentParagraph("This is a paragraph."); + paragraph.Metadata["element_type"] = "text"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateDocumentTokenChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + 
Assert.Equal("text", chunk.Metadata["element_type"]); + } + + [Fact] + public async Task DocumentTokenChunker_MultipleElements_AccumulatesMetadata() + { + var para1 = new IngestionDocumentParagraph("First paragraph."); + para1.Metadata["element_type"] = "text"; + + var para2 = new IngestionDocumentParagraph("Second paragraph."); + para2.Metadata["confidence"] = 0.9; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunker = CreateDocumentTokenChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(0.9, chunk.Metadata["confidence"]); + } + + [Fact] + public async Task DocumentTokenChunker_ConflictingKeys_FirstElementWins() + { + var para1 = new IngestionDocumentParagraph("First paragraph."); + para1.Metadata["element_type"] = "table"; + + var para2 = new IngestionDocumentParagraph("Second paragraph."); + para2.Metadata["element_type"] = "text"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunker = CreateDocumentTokenChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.Equal("table", chunk.Metadata["element_type"]); + } + + [Fact] + public async Task DocumentTokenChunker_ElementSplitAcrossChunks_FirstChunkGetsMetadata() + { + string longText = string.Join(" ", Enumerable.Repeat("word", 600)); + var paragraph = new IngestionDocumentParagraph(longText); + paragraph.Metadata["element_type"] = "body"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateDocumentTokenChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + 
Assert.True(chunks.Count > 1); + + // First chunk gets the metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("body", chunks[0].Metadata["element_type"]); + + // Subsequent chunks from the same element do NOT get metadata (cleared on finalize) + Assert.False(chunks[1].HasMetadata); + } + + [Fact] + public async Task DocumentTokenChunker_NoMetadata_ChunkHasNoMetadata() + { + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection + { + Elements = + { + new IngestionDocumentParagraph("No metadata here.") + } + }); + + var chunker = CreateDocumentTokenChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.False(chunk.HasMetadata); + } + + [Fact] + public async Task SectionChunker_TableWithMetadata_PropagatesMetadata() + { + var cells = new IngestionDocumentElement?[2, 2] + { + { new IngestionDocumentParagraph("Header1"), new IngestionDocumentParagraph("Header2") }, + { new IngestionDocumentParagraph("Value1"), new IngestionDocumentParagraph("Value2") } + }; + var table = new IngestionDocumentTable("| Header1 | Header2 |\n| --- | --- |\n| Value1 | Value2 |", cells); + table.Metadata["element_type"] = "table"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { table } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("table", chunk.Metadata["element_type"]); + } +} From 966e7ececa04d8167492aa276bd65d5b3b8e77be Mon Sep 17 00:00:00 2001 From: luisquintanilla Date: Mon, 8 Jun 2026 11:40:53 -0400 Subject: [PATCH 2/3] fix: correct metadata timing in chunkers and add boundary tests Fix metadata accumulation timing bugs in ElementsChunker and DocumentTokenChunker where AccumulateMetadata was called before determining which chunk the element's content contributes to. 
When a Commit/FinalizeChunk happens before the new element adds content (table pre-commit, non-table overflow, exact-fill boundary), the metadata was incorrectly applied to the previous chunk. ElementsChunker fixes: - Branch 1 (fits): accumulate right before appending - Branch 2 (table): use flag, accumulate before first table content append to _currentChunk, after any pre-commit or row-level commit - Branch 3 (non-table too big): use flag, accumulate when index > 0 (first content contribution in the while loop) DocumentTokenChunker fixes: - Use flag to defer accumulation until first content contribution - In while loop: accumulate only when index > 0 - After while loop: accumulate if not yet done (element fits entirely) New boundary tests (6 tests): - Previous element fills chunk, next element metadata on new chunk - Non-table element too large, metadata on correct chunks - Table pre-commit: table metadata not on pre-committed chunk - DocumentTokenChunker boundary with large filler element - DocumentTokenChunker with overlap enabled - Table split across chunks: first chunk gets metadata Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Chunkers/DocumentTokenChunker.cs | 16 +- .../Chunkers/ElementsChunker.cs | 26 ++- .../ChunkerMetadataPropagationTests.cs | 210 ++++++++++++++++++ 3 files changed, 248 insertions(+), 4 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs index c851b57bf4e..b3a3a7df18a 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs @@ -56,10 +56,9 @@ public override async IAsyncEnumerable> ProcessAsync(Inge continue; } - AccumulateMetadata(element, ref accumulatedMetadata); - int contentToProcessTokenCount = _tokenizer.CountTokens(elementContent!, 
considerNormalization: false); ReadOnlyMemory contentToProcess = elementContent.AsMemory(); + bool elementMetadataAccumulated = false; while (stringBuilderTokenCount + contentToProcessTokenCount >= _maxTokensPerChunk) { int index = _tokenizer.GetIndexByTokenCount( @@ -69,6 +68,13 @@ public override async IAsyncEnumerable> ProcessAsync(Inge out int _, considerNormalization: false); + // Accumulate metadata the first time this element contributes content. + if (!elementMetadataAccumulated && index > 0) + { + AccumulateMetadata(element, ref accumulatedMetadata); + elementMetadataAccumulated = true; + } + unsafe { fixed (char* ptr = &MemoryMarshal.GetReference(contentToProcess.Span)) @@ -82,6 +88,12 @@ public override async IAsyncEnumerable> ProcessAsync(Inge contentToProcessTokenCount = _tokenizer.CountTokens(contentToProcess.Span, considerNormalization: false); } + // Accumulate metadata if the element only contributed content after the loop. + if (!elementMetadataAccumulated) + { + AccumulateMetadata(element, ref accumulatedMetadata); + } + _ = stringBuilder.Append(contentToProcess); stringBuilderTokenCount += contentToProcessTokenCount; } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs index a5b15ba526a..7c811e7a7bd 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs @@ -68,17 +68,18 @@ internal IEnumerable> Process(IngestionDocument document, continue; // An image can come with Markdown, but no AlternativeText or Text. } - AccumulateMetadata(element, ref accumulatedMetadata); - int elementTokenCount = CountTokens(semanticContent.AsSpan()); if (elementTokenCount + totalTokenCount <= _maxTokensPerChunk) { + // Element fits in the current chunk — accumulate its metadata here. 
+ AccumulateMetadata(element, ref accumulatedMetadata); totalTokenCount += elementTokenCount; AppendNewLineAndSpan(_currentChunk, semanticContent.AsSpan()); } else if (element is IngestionDocumentTable table) { ValueStringBuilder tableBuilder = new(initialCapacity: 8000); + bool tableMetadataAccumulated = false; try { @@ -116,6 +117,13 @@ internal IEnumerable> Process(IngestionDocument document, // We append the table as long as it's not just the header. if (rowIndex != 1) { + // Accumulate metadata before first table content append. + if (!tableMetadataAccumulated) + { + AccumulateMetadata(element, ref accumulatedMetadata); + tableMetadataAccumulated = true; + } + AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length)); } @@ -140,6 +148,12 @@ internal IEnumerable> Process(IngestionDocument document, totalTokenCount += lastRowTokens; } + // Accumulate metadata before appending remaining table content. + if (!tableMetadataAccumulated) + { + AccumulateMetadata(element, ref accumulatedMetadata); + } + AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length)); } finally @@ -150,6 +164,7 @@ internal IEnumerable> Process(IngestionDocument document, else { ReadOnlySpan remainingContent = semanticContent.AsSpan(); + bool elementMetadataAccumulated = false; while (!remainingContent.IsEmpty) { @@ -173,6 +188,13 @@ internal IEnumerable> Process(IngestionDocument document, tokenCount = CountTokens(remainingContent.Slice(0, index)); } + // Accumulate metadata the first time this element contributes content. 
+ if (!elementMetadataAccumulated) + { + AccumulateMetadata(element, ref accumulatedMetadata); + elementMetadataAccumulated = true; + } + totalTokenCount += tokenCount; ReadOnlySpan spanToAppend = remainingContent.Slice(0, index); AppendNewLineAndSpan(_currentChunk, spanToAppend); diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs index 7ab691c0d49..b0324f41944 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs @@ -312,4 +312,214 @@ public async Task SectionChunker_TableWithMetadata_PropagatesMetadata() Assert.True(chunk.HasMetadata); Assert.Equal("table", chunk.Metadata["element_type"]); } + + [Fact] + public async Task SectionChunker_PreviousElementFillsChunk_NextElementMetadataOnNewChunk() + { + // First element exceeds the chunk limit, so it fills chunk 0 and overflows into chunk 1. + // Second element is small and goes into the last chunk. + // Each element has a unique metadata key — verify they end up on the correct chunks. 
+ string fillerText = string.Join(" ", Enumerable.Repeat("word", 600)); + var filler = new IngestionDocumentParagraph(fillerText); + filler.Metadata["filler_key"] = "from_filler"; + + var nextElement = new IngestionDocumentParagraph("Next element content here."); + nextElement.Metadata["next_key"] = "from_next"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { filler, nextElement } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 2); + + // First chunk must have filler metadata (it contributed content to this chunk) + Assert.True(chunks[0].HasMetadata); + Assert.Equal("from_filler", chunks[0].Metadata["filler_key"]); + Assert.False(chunks[0].Metadata.ContainsKey("next_key")); + + // The last chunk must have the next element's metadata + var lastChunk = chunks[chunks.Count - 1]; + Assert.True(lastChunk.HasMetadata); + Assert.Equal("from_next", lastChunk.Metadata["next_key"]); + } + + [Fact] + public async Task SectionChunker_NonTableElementTooLargeForCurrentChunk_MetadataOnCorrectChunks() + { + // Two large elements with the same metadata key but different values. + // Each element exceeds chunk limit. 
Verify first-wins semantics per chunk: + // - Chunks containing elem1 content get elem1's metadata (only the first such chunk) + // - Chunks containing elem2 content get elem2's metadata (only the first such chunk) + var elem1 = new IngestionDocumentParagraph(string.Join(" ", Enumerable.Repeat("alpha", 300))); + elem1.Metadata["source"] = "elem1"; + + var elem2 = new IngestionDocumentParagraph(string.Join(" ", Enumerable.Repeat("beta", 300))); + elem2.Metadata["source"] = "elem2"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { elem1, elem2 } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 3); + + // First chunk: elem1's metadata (elem1 contributes content) + Assert.Equal("elem1", chunks[0].Metadata["source"]); + + // Find the first chunk that contains elem2's content + var firstElem2Chunk = chunks.First(c => c.Content.Contains("beta")); + Assert.True(firstElem2Chunk.HasMetadata); + Assert.Equal("elem2", firstElem2Chunk.Metadata["source"]); + } + + [Fact] + public async Task SectionChunker_TablePreCommit_TableMetadataNotOnPreviousChunk() + { + // Previous content fills most of the chunk. Table header doesn't fit, forcing a pre-commit. + // Table metadata must go on the chunk with the table, not the pre-committed chunk. + // Use different metadata keys to distinguish elements. 
+ var filler = new IngestionDocumentParagraph(string.Join(" ", Enumerable.Repeat("fill", 500))); + filler.Metadata["paragraph_key"] = "paragraph_value"; + + var cells = new IngestionDocumentElement?[2, 2] + { + { new IngestionDocumentParagraph("Col1"), new IngestionDocumentParagraph("Col2") }, + { new IngestionDocumentParagraph("Val1"), new IngestionDocumentParagraph("Val2") } + }; + var table = new IngestionDocumentTable("| Col1 | Col2 |\n| --- | --- |\n| Val1 | Val2 |", cells); + table.Metadata["table_key"] = "table_value"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { filler, table } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 2); + + // Find the chunk containing table content + var tableChunk = chunks.FirstOrDefault(c => c.Content.Contains("Col1") || c.Content.Contains("Val1")); + Assert.NotNull(tableChunk); + + // The table chunk must have the table's metadata + Assert.True(tableChunk!.HasMetadata); + Assert.Equal("table_value", tableChunk.Metadata["table_key"]); + + // Chunks before the table chunk should NOT have table metadata + int tableChunkIndex = chunks.IndexOf(tableChunk); + for (int i = 0; i < tableChunkIndex; i++) + { + Assert.False(chunks[i].Metadata.ContainsKey("table_key"), + $"Chunk {i} should not have table metadata"); + } + } + + [Fact] + public async Task DocumentTokenChunker_PreviousElementFillsChunk_NextElementMetadataOnNewChunk() + { + // First element exceeds chunk limit, second element is small. + // Each has unique keys — verify correct chunk association. 
+ string fillerText = string.Join(" ", Enumerable.Repeat("word", 600)); + var filler = new IngestionDocumentParagraph(fillerText); + filler.Metadata["filler_key"] = "from_filler"; + + var nextElement = new IngestionDocumentParagraph("Next element with metadata."); + nextElement.Metadata["next_key"] = "from_next"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { filler, nextElement } }); + + var chunker = CreateDocumentTokenChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 2); + + // First chunk must have filler metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("from_filler", chunks[0].Metadata["filler_key"]); + Assert.False(chunks[0].Metadata.ContainsKey("next_key")); + + // The last chunk must have the next element's metadata + var lastChunk = chunks[chunks.Count - 1]; + Assert.True(lastChunk.HasMetadata); + Assert.Equal("from_next", lastChunk.Metadata["next_key"]); + } + + [Fact] + public async Task DocumentTokenChunker_WithOverlap_PropagatesMetadata() + { + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + var chunker = new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = 200, OverlapTokens = 50 }); + + string text1 = string.Join(" ", Enumerable.Repeat("alpha", 300)); + var para1 = new IngestionDocumentParagraph(text1); + para1.Metadata["section"] = "intro"; + + string text2 = string.Join(" ", Enumerable.Repeat("beta", 100)); + var para2 = new IngestionDocumentParagraph(text2); + para2.Metadata["section"] = "body"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 2); + + // First chunk should have intro metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("intro", chunks[0].Metadata["section"]); + } + + [Fact] + public 
async Task SectionChunker_TableSplitAcrossChunks_FirstChunkGetsMetadata() + { + // A large table that spans multiple chunks — only the first chunk containing table content gets metadata + int rowCount = 30; + int colCount = 3; + var cells = new IngestionDocumentElement?[rowCount, colCount]; + cells[0, 0] = new IngestionDocumentParagraph("HeaderColumn1"); + cells[0, 1] = new IngestionDocumentParagraph("HeaderColumn2"); + cells[0, 2] = new IngestionDocumentParagraph("HeaderColumn3"); + + // Build a proper markdown string that's long enough to exceed the token limit + var mdBuilder = new System.Text.StringBuilder(); + mdBuilder.AppendLine("| HeaderColumn1 | HeaderColumn2 | HeaderColumn3 |"); + mdBuilder.AppendLine("| --- | --- | --- |"); + + for (int i = 1; i < rowCount; i++) + { + string c1 = $"Row{i} first column value with extra text to increase token count"; + string c2 = $"Row{i} second column value with extra text to increase token count"; + string c3 = $"Row{i} third column value with extra text to increase token count"; + cells[i, 0] = new IngestionDocumentParagraph(c1); + cells[i, 1] = new IngestionDocumentParagraph(c2); + cells[i, 2] = new IngestionDocumentParagraph(c3); + mdBuilder.AppendLine($"| {c1} | {c2} | {c3} |"); + } + + var table = new IngestionDocumentTable(mdBuilder.ToString(), cells); + table.Metadata["element_type"] = "data_table"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { table } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count > 1, $"Table should span multiple chunks but got {chunks.Count}"); + + // First chunk gets table metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("data_table", chunks[0].Metadata["element_type"]); + + // Subsequent table chunks do NOT get metadata (cleared on commit, first-wins) + for (int i = 1; i < chunks.Count; i++) + { + 
Assert.False(chunks[i].HasMetadata, $"Chunk {i} should not have metadata"); + } + } } From 8bac32d40f4422a4cb29fa7d2b17c6d24dc22b42 Mon Sep 17 00:00:00 2001 From: luisquintanilla Date: Mon, 8 Jun 2026 14:42:39 -0400 Subject: [PATCH 3/3] test: add SemanticSimilarityChunker metadata propagation tests Add 2 tests covering SemanticSimilarityChunker metadata flow: - Single element with metadata propagates to chunk - Multiple elements with different keys each carry metadata Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ChunkerMetadataPropagationTests.cs | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs index b0324f41944..07f24cefdf4 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; +using Microsoft.Extensions.AI; using Microsoft.ML.Tokenizers; using Xunit; @@ -522,4 +523,77 @@ public async Task SectionChunker_TableSplitAcrossChunks_FirstChunkGetsMetadata() Assert.False(chunks[i].HasMetadata, $"Chunk {i} should not have metadata"); } } + + [Fact] + public async Task SemanticSimilarityChunker_SingleElementWithMetadata_PropagatesMetadata() + { + var paragraph = new IngestionDocumentParagraph("This is a paragraph for semantic chunking."); + paragraph.Metadata["element_type"] = "text"; + paragraph.Metadata["page"] = 1; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + using var embeddingGenerator = new TestEmbeddingGenerator() 
+ { + GenerateAsyncCallback = static (values, options, ct) => + { + var embeddings = values.Select(v => + new Embedding<float>(new float[] { 1.0f, 2.0f, 3.0f, 4.0f })) + .ToArray(); + return Task.FromResult(new GeneratedEmbeddings<Embedding<float>>(embeddings)); + } + }; + var chunker = new SemanticSimilarityChunker( + embeddingGenerator, + new(tokenizer) { MaxTokensPerChunk = 2_000, OverlapTokens = 0 }); + + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(1, chunk.Metadata["page"]); + } + + [Fact] + public async Task SemanticSimilarityChunker_MultipleElementsDifferentKeys_AllKeysAppear() + { + var para1 = new IngestionDocumentParagraph("First paragraph about .NET development."); + para1.Metadata["element_type"] = "text"; + + var para2 = new IngestionDocumentParagraph("Second paragraph about cloud computing."); + para2.Metadata["confidence"] = 0.95; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + using var embeddingGenerator = new TestEmbeddingGenerator() + { + GenerateAsyncCallback = static (values, options, ct) => + { + var embeddings = values.Select(v => + new Embedding<float>(new float[] { 1.0f, 2.0f, 3.0f, 4.0f })) + .ToArray(); + return Task.FromResult(new GeneratedEmbeddings<Embedding<float>>(embeddings)); + } + }; + var chunker = new SemanticSimilarityChunker( + embeddingGenerator, + new(tokenizer) { MaxTokensPerChunk = 2_000, OverlapTokens = 0 }); + + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + // Semantic chunker may split elements into separate chunks based on similarity. + // Verify each chunk carries its originating element's metadata. 
+ Assert.Equal(2, chunks.Count); + + Assert.True(chunks[0].HasMetadata); + Assert.Equal("text", chunks[0].Metadata["element_type"]); + + Assert.True(chunks[1].HasMetadata); + Assert.Equal(0.95, chunks[1].Metadata["confidence"]); + } }