diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs index 57025773049..b3a3a7df18a 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs @@ -46,6 +46,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge int stringBuilderTokenCount = 0; StringBuilder stringBuilder = new(); + Dictionary? accumulatedMetadata = null; foreach (IngestionDocumentElement element in document.EnumerateContent()) { cancellationToken.ThrowIfCancellationRequested(); @@ -57,6 +58,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge int contentToProcessTokenCount = _tokenizer.CountTokens(elementContent!, considerNormalization: false); ReadOnlyMemory contentToProcess = elementContent.AsMemory(); + bool elementMetadataAccumulated = false; while (stringBuilderTokenCount + contentToProcessTokenCount >= _maxTokensPerChunk) { int index = _tokenizer.GetIndexByTokenCount( @@ -66,6 +68,13 @@ public override async IAsyncEnumerable> ProcessAsync(Inge out int _, considerNormalization: false); + // Accumulate metadata the first time this element contributes content. + if (!elementMetadataAccumulated && index > 0) + { + AccumulateMetadata(element, ref accumulatedMetadata); + elementMetadataAccumulated = true; + } + unsafe { fixed (char* ptr = &MemoryMarshal.GetReference(contentToProcess.Span)) @@ -73,28 +82,45 @@ public override async IAsyncEnumerable> ProcessAsync(Inge _ = stringBuilder.Append(ptr, index); } } - yield return FinalizeChunk(); + yield return FinalizeChunk(ref accumulatedMetadata); contentToProcess = contentToProcess.Slice(index); contentToProcessTokenCount = _tokenizer.CountTokens(contentToProcess.Span, considerNormalization: false); } + // Accumulate metadata if the element only contributed content after the loop. + if (!elementMetadataAccumulated) + { + AccumulateMetadata(element, ref accumulatedMetadata); + } + _ = stringBuilder.Append(contentToProcess); stringBuilderTokenCount += contentToProcessTokenCount; } if (stringBuilder.Length > 0) { - yield return FinalizeChunk(); + yield return FinalizeChunk(ref accumulatedMetadata); } yield break; - IngestionChunk FinalizeChunk() + IngestionChunk FinalizeChunk(ref Dictionary? metadata) { IngestionChunk chunk = new IngestionChunk( content: stringBuilder.ToString(), document: document, context: string.Empty); + + if (metadata is { Count: > 0 }) + { + foreach (var kvp in metadata) + { + chunk.Metadata[kvp.Key] = kvp.Value; + } + + metadata = null; + } + _ = stringBuilder.Clear(); stringBuilderTokenCount = 0; @@ -121,5 +147,29 @@ IngestionChunk FinalizeChunk() } } + private static void AccumulateMetadata(IngestionDocumentElement element, ref Dictionary? accumulated) + { + if (!element.HasMetadata) + { + return; + } + + accumulated ??= []; + foreach (var kvp in element.Metadata) + { + if (kvp.Value is not null) + { +#if NET + accumulated.TryAdd(kvp.Key, kvp.Value); +#else + if (!accumulated.ContainsKey(kvp.Key)) + { + accumulated[kvp.Key] = kvp.Value; + } +#endif + } + } + } + } } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs index a50508f2a5e..7c811e7a7bd 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs @@ -36,6 +36,7 @@ internal IEnumerable> Process(IngestionDocument document, { // Not using yield return here as we use ref structs. List> chunks = []; + Dictionary? accumulatedMetadata = null; int contextTokenCount = CountTokens(context.AsSpan()); int totalTokenCount = contextTokenCount; @@ -70,12 +71,15 @@ internal IEnumerable> Process(IngestionDocument document, int elementTokenCount = CountTokens(semanticContent.AsSpan()); if (elementTokenCount + totalTokenCount <= _maxTokensPerChunk) { + // Element fits in the current chunk — accumulate its metadata here. + AccumulateMetadata(element, ref accumulatedMetadata); totalTokenCount += elementTokenCount; AppendNewLineAndSpan(_currentChunk, semanticContent.AsSpan()); } else if (element is IngestionDocumentTable table) { ValueStringBuilder tableBuilder = new(initialCapacity: 8000); + bool tableMetadataAccumulated = false; try { @@ -113,6 +117,13 @@ internal IEnumerable> Process(IngestionDocument document, // We append the table as long as it's not just the header. if (rowIndex != 1) { + // Accumulate metadata before first table content append. + if (!tableMetadataAccumulated) + { + AccumulateMetadata(element, ref accumulatedMetadata); + tableMetadataAccumulated = true; + } + AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length)); } @@ -137,6 +148,12 @@ internal IEnumerable> Process(IngestionDocument document, totalTokenCount += lastRowTokens; } + // Accumulate metadata before appending remaining table content. + if (!tableMetadataAccumulated) + { + AccumulateMetadata(element, ref accumulatedMetadata); + } + AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length)); } finally @@ -147,6 +164,7 @@ internal IEnumerable> Process(IngestionDocument document, else { ReadOnlySpan remainingContent = semanticContent.AsSpan(); + bool elementMetadataAccumulated = false; while (!remainingContent.IsEmpty) { @@ -170,6 +188,13 @@ internal IEnumerable> Process(IngestionDocument document, tokenCount = CountTokens(remainingContent.Slice(0, index)); } + // Accumulate metadata the first time this element contributes content. + if (!elementMetadataAccumulated) + { + AccumulateMetadata(element, ref accumulatedMetadata); + elementMetadataAccumulated = true; + } + totalTokenCount += tokenCount; ReadOnlySpan spanToAppend = remainingContent.Slice(0, index); AppendNewLineAndSpan(_currentChunk, spanToAppend); @@ -196,7 +221,9 @@ internal IEnumerable> Process(IngestionDocument document, if (totalTokenCount > contextTokenCount) { - chunks.Add(new(_currentChunk.ToString(), document, context)); + var chunk = new IngestionChunk(_currentChunk.ToString(), document, context); + ApplyMetadata(chunk, accumulatedMetadata); + chunks.Add(chunk); } _currentChunk.Clear(); @@ -205,7 +232,10 @@ internal IEnumerable> Process(IngestionDocument document, void Commit() { - chunks.Add(new(_currentChunk.ToString(), document, context)); + var chunk = new IngestionChunk(_currentChunk.ToString(), document, context); + ApplyMetadata(chunk, accumulatedMetadata); + chunks.Add(chunk); + accumulatedMetadata = null; // We keep the context in the current chunk as it's the same for all elements. _currentChunk.Remove( @@ -268,6 +298,43 @@ private static void AddMarkdownTableSeparatorRow(int columnCount, ref ValueStrin vsb.Append(Environment.NewLine); } + private static void AccumulateMetadata(IngestionDocumentElement element, ref Dictionary? accumulated) + { + if (!element.HasMetadata) + { + return; + } + + accumulated ??= []; + foreach (var kvp in element.Metadata) + { + if (kvp.Value is not null) + { +#if NET + accumulated.TryAdd(kvp.Key, kvp.Value); +#else + if (!accumulated.ContainsKey(kvp.Key)) + { + accumulated[kvp.Key] = kvp.Value; + } +#endif + } + } + } + + private static void ApplyMetadata(IngestionChunk chunk, Dictionary? accumulated) + { + if (accumulated is null or { Count: 0 }) + { + return; + } + + foreach (var kvp in accumulated) + { + chunk.Metadata[kvp.Key] = kvp.Value; + } + } + private int CountTokens(ReadOnlySpan input) => _tokenizer.CountTokens(input, considerNormalization: false); } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs new file mode 100644 index 00000000000..07f24cefdf4 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/ChunkerMetadataPropagationTests.cs @@ -0,0 +1,599 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Extensions.AI; +using Microsoft.ML.Tokenizers; +using Xunit; + +namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests; + +public class ChunkerMetadataPropagationTests +{ + private static IngestionChunker CreateSectionChunker(int maxTokensPerChunk = 2_000) + { + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + return new SectionChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = 0 }); + } + + private static IngestionChunker CreateHeaderChunker(int maxTokensPerChunk = 2_000) + { + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + return new HeaderChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = 0 }); + } + + private static IngestionChunker CreateDocumentTokenChunker(int maxTokensPerChunk = 2_000) + { + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + return new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = 0 }); + } + + [Fact] + public async Task SectionChunker_SingleElementWithMetadata_PropagatesMetadata() + { + var paragraph = new IngestionDocumentParagraph("This is a paragraph."); + paragraph.Metadata["element_type"] = "text"; + paragraph.Metadata["page"] = 1; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(1, chunk.Metadata["page"]); + } + + [Fact] + public async Task SectionChunker_MultipleElementsDifferentKeys_AllKeysAppear() + { + var para1 = new IngestionDocumentParagraph("First paragraph."); + para1.Metadata["element_type"] = "text"; + + var para2 = new IngestionDocumentParagraph("Second paragraph."); + para2.Metadata["confidence"] = 0.95; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(0.95, chunk.Metadata["confidence"]); + } + + [Fact] + public async Task SectionChunker_ConflictingKeys_FirstElementWins() + { + var para1 = new IngestionDocumentParagraph("First paragraph."); + para1.Metadata["element_type"] = "table"; + + var para2 = new IngestionDocumentParagraph("Second paragraph."); + para2.Metadata["element_type"] = "text"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.Equal("table", chunk.Metadata["element_type"]); + } + + [Fact] + public async Task SectionChunker_NullMetadataValue_Skipped() + { + var paragraph = new IngestionDocumentParagraph("This is a paragraph."); + paragraph.Metadata["element_type"] = null; + paragraph.Metadata["valid_key"] = "valid_value"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.False(chunk.Metadata.ContainsKey("element_type")); + Assert.Equal("valid_value", chunk.Metadata["valid_key"]); + } + + [Fact] + public async Task SectionChunker_NoMetadata_ChunkHasNoMetadata() + { + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection + { + Elements = + { + new IngestionDocumentParagraph("No metadata here."), + new IngestionDocumentParagraph("Also no metadata.") + } + }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.False(chunk.HasMetadata); + } + + [Fact] + public async Task SectionChunker_ElementSplitAcrossChunks_FirstChunkGetsMetadata() + { + // Create a large paragraph that exceeds the token limit and forces a split + string longText = string.Join(" ", Enumerable.Repeat("word", 600)); + var paragraph = new IngestionDocumentParagraph(longText); + paragraph.Metadata["element_type"] = "body"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count > 1); + + // First chunk gets the metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("body", chunks[0].Metadata["element_type"]); + + // Subsequent chunks from the same element do NOT get metadata (accumulator was cleared on commit) + Assert.False(chunks[1].HasMetadata); + } + + [Fact] + public async Task SectionChunker_TwoSectionsWithMetadata_IndependentMetadataPerSection() + { + var para1 = new IngestionDocumentParagraph("First section paragraph."); + para1.Metadata["section"] = "intro"; + + var para2 = new IngestionDocumentParagraph("Second section paragraph."); + para2.Metadata["section"] = "conclusion"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1 } }); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para2 } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.Equal(2, chunks.Count); + Assert.Equal("intro", chunks[0].Metadata["section"]); + Assert.Equal("conclusion", chunks[1].Metadata["section"]); + } + + [Fact] + public async Task HeaderChunker_PropagatesMetadata() + { + var header = new IngestionDocumentHeader("# Title") { Level = 1 }; + var para = new IngestionDocumentParagraph("Body text."); + para.Metadata["element_type"] = "text"; + para.Metadata["page"] = 3; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { header, para } }); + + var chunker = CreateHeaderChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(3, chunk.Metadata["page"]); + } + + [Fact] + public async Task DocumentTokenChunker_SingleElementWithMetadata_PropagatesMetadata() + { + var paragraph = new IngestionDocumentParagraph("This is a paragraph."); + paragraph.Metadata["element_type"] = "text"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateDocumentTokenChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + } + + [Fact] + public async Task DocumentTokenChunker_MultipleElements_AccumulatesMetadata() + { + var para1 = new IngestionDocumentParagraph("First paragraph."); + para1.Metadata["element_type"] = "text"; + + var para2 = new IngestionDocumentParagraph("Second paragraph."); + para2.Metadata["confidence"] = 0.9; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunker = CreateDocumentTokenChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(0.9, chunk.Metadata["confidence"]); + } + + [Fact] + public async Task DocumentTokenChunker_ConflictingKeys_FirstElementWins() + { + var para1 = new IngestionDocumentParagraph("First paragraph."); + para1.Metadata["element_type"] = "table"; + + var para2 = new IngestionDocumentParagraph("Second paragraph."); + para2.Metadata["element_type"] = "text"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunker = CreateDocumentTokenChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.Equal("table", chunk.Metadata["element_type"]); + } + + [Fact] + public async Task DocumentTokenChunker_ElementSplitAcrossChunks_FirstChunkGetsMetadata() + { + string longText = string.Join(" ", Enumerable.Repeat("word", 600)); + var paragraph = new IngestionDocumentParagraph(longText); + paragraph.Metadata["element_type"] = "body"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var chunker = CreateDocumentTokenChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count > 1); + + // First chunk gets the metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("body", chunks[0].Metadata["element_type"]); + + // Subsequent chunks from the same element do NOT get metadata (cleared on finalize) + Assert.False(chunks[1].HasMetadata); + } + + [Fact] + public async Task DocumentTokenChunker_NoMetadata_ChunkHasNoMetadata() + { + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection + { + Elements = + { + new IngestionDocumentParagraph("No metadata here.") + } + }); + + var chunker = CreateDocumentTokenChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.False(chunk.HasMetadata); + } + + [Fact] + public async Task SectionChunker_TableWithMetadata_PropagatesMetadata() + { + var cells = new IngestionDocumentElement?[2, 2] + { + { new IngestionDocumentParagraph("Header1"), new IngestionDocumentParagraph("Header2") }, + { new IngestionDocumentParagraph("Value1"), new IngestionDocumentParagraph("Value2") } + }; + var table = new IngestionDocumentTable("| Header1 | Header2 |\n| --- | --- |\n| Value1 | Value2 |", cells); + table.Metadata["element_type"] = "table"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { table } }); + + var chunker = CreateSectionChunker(); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("table", chunk.Metadata["element_type"]); + } + + [Fact] + public async Task SectionChunker_PreviousElementFillsChunk_NextElementMetadataOnNewChunk() + { + // First element exceeds the chunk limit, so it fills chunk 0 and overflows into chunk 1. + // Second element is small and goes into the last chunk. + // Each element has a unique metadata key — verify they end up on the correct chunks. + string fillerText = string.Join(" ", Enumerable.Repeat("word", 600)); + var filler = new IngestionDocumentParagraph(fillerText); + filler.Metadata["filler_key"] = "from_filler"; + + var nextElement = new IngestionDocumentParagraph("Next element content here."); + nextElement.Metadata["next_key"] = "from_next"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { filler, nextElement } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 2); + + // First chunk must have filler metadata (it contributed content to this chunk) + Assert.True(chunks[0].HasMetadata); + Assert.Equal("from_filler", chunks[0].Metadata["filler_key"]); + Assert.False(chunks[0].Metadata.ContainsKey("next_key")); + + // The last chunk must have the next element's metadata + var lastChunk = chunks[chunks.Count - 1]; + Assert.True(lastChunk.HasMetadata); + Assert.Equal("from_next", lastChunk.Metadata["next_key"]); + } + + [Fact] + public async Task SectionChunker_NonTableElementTooLargeForCurrentChunk_MetadataOnCorrectChunks() + { + // Two large elements with the same metadata key but different values. + // Each element exceeds chunk limit. Verify first-wins semantics per chunk: + // - Chunks containing elem1 content get elem1's metadata (only the first such chunk) + // - Chunks containing elem2 content get elem2's metadata (only the first such chunk) + var elem1 = new IngestionDocumentParagraph(string.Join(" ", Enumerable.Repeat("alpha", 300))); + elem1.Metadata["source"] = "elem1"; + + var elem2 = new IngestionDocumentParagraph(string.Join(" ", Enumerable.Repeat("beta", 300))); + elem2.Metadata["source"] = "elem2"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { elem1, elem2 } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 3); + + // First chunk: elem1's metadata (elem1 contributes content) + Assert.Equal("elem1", chunks[0].Metadata["source"]); + + // Find the first chunk that contains elem2's content + var firstElem2Chunk = chunks.First(c => c.Content.Contains("beta")); + Assert.True(firstElem2Chunk.HasMetadata); + Assert.Equal("elem2", firstElem2Chunk.Metadata["source"]); + } + + [Fact] + public async Task SectionChunker_TablePreCommit_TableMetadataNotOnPreviousChunk() + { + // Previous content fills most of the chunk. Table header doesn't fit, forcing a pre-commit. + // Table metadata must go on the chunk with the table, not the pre-committed chunk. + // Use different metadata keys to distinguish elements. + var filler = new IngestionDocumentParagraph(string.Join(" ", Enumerable.Repeat("fill", 500))); + filler.Metadata["paragraph_key"] = "paragraph_value"; + + var cells = new IngestionDocumentElement?[2, 2] + { + { new IngestionDocumentParagraph("Col1"), new IngestionDocumentParagraph("Col2") }, + { new IngestionDocumentParagraph("Val1"), new IngestionDocumentParagraph("Val2") } + }; + var table = new IngestionDocumentTable("| Col1 | Col2 |\n| --- | --- |\n| Val1 | Val2 |", cells); + table.Metadata["table_key"] = "table_value"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { filler, table } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 2); + + // Find the chunk containing table content + var tableChunk = chunks.FirstOrDefault(c => c.Content.Contains("Col1") || c.Content.Contains("Val1")); + Assert.NotNull(tableChunk); + + // The table chunk must have the table's metadata + Assert.True(tableChunk!.HasMetadata); + Assert.Equal("table_value", tableChunk.Metadata["table_key"]); + + // Chunks before the table chunk should NOT have table metadata + int tableChunkIndex = chunks.IndexOf(tableChunk); + for (int i = 0; i < tableChunkIndex; i++) + { + Assert.False(chunks[i].Metadata.ContainsKey("table_key"), + $"Chunk {i} should not have table metadata"); + } + } + + [Fact] + public async Task DocumentTokenChunker_PreviousElementFillsChunk_NextElementMetadataOnNewChunk() + { + // First element exceeds chunk limit, second element is small. + // Each has unique keys — verify correct chunk association. + string fillerText = string.Join(" ", Enumerable.Repeat("word", 600)); + var filler = new IngestionDocumentParagraph(fillerText); + filler.Metadata["filler_key"] = "from_filler"; + + var nextElement = new IngestionDocumentParagraph("Next element with metadata."); + nextElement.Metadata["next_key"] = "from_next"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { filler, nextElement } }); + + var chunker = CreateDocumentTokenChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 2); + + // First chunk must have filler metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("from_filler", chunks[0].Metadata["filler_key"]); + Assert.False(chunks[0].Metadata.ContainsKey("next_key")); + + // The last chunk must have the next element's metadata + var lastChunk = chunks[chunks.Count - 1]; + Assert.True(lastChunk.HasMetadata); + Assert.Equal("from_next", lastChunk.Metadata["next_key"]); + } + + [Fact] + public async Task DocumentTokenChunker_WithOverlap_PropagatesMetadata() + { + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + var chunker = new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = 200, OverlapTokens = 50 }); + + string text1 = string.Join(" ", Enumerable.Repeat("alpha", 300)); + var para1 = new IngestionDocumentParagraph(text1); + para1.Metadata["section"] = "intro"; + + string text2 = string.Join(" ", Enumerable.Repeat("beta", 100)); + var para2 = new IngestionDocumentParagraph(text2); + para2.Metadata["section"] = "body"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count >= 2); + + // First chunk should have intro metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("intro", chunks[0].Metadata["section"]); + } + + [Fact] + public async Task SectionChunker_TableSplitAcrossChunks_FirstChunkGetsMetadata() + { + // A large table that spans multiple chunks — only the first chunk containing table content gets metadata + int rowCount = 30; + int colCount = 3; + var cells = new IngestionDocumentElement?[rowCount, colCount]; + cells[0, 0] = new IngestionDocumentParagraph("HeaderColumn1"); + cells[0, 1] = new IngestionDocumentParagraph("HeaderColumn2"); + cells[0, 2] = new IngestionDocumentParagraph("HeaderColumn3"); + + // Build a proper markdown string that's long enough to exceed the token limit + var mdBuilder = new System.Text.StringBuilder(); + mdBuilder.AppendLine("| HeaderColumn1 | HeaderColumn2 | HeaderColumn3 |"); + mdBuilder.AppendLine("| --- | --- | --- |"); + + for (int i = 1; i < rowCount; i++) + { + string c1 = $"Row{i} first column value with extra text to increase token count"; + string c2 = $"Row{i} second column value with extra text to increase token count"; + string c3 = $"Row{i} third column value with extra text to increase token count"; + cells[i, 0] = new IngestionDocumentParagraph(c1); + cells[i, 1] = new IngestionDocumentParagraph(c2); + cells[i, 2] = new IngestionDocumentParagraph(c3); + mdBuilder.AppendLine($"| {c1} | {c2} | {c3} |"); + } + + var table = new IngestionDocumentTable(mdBuilder.ToString(), cells); + table.Metadata["element_type"] = "data_table"; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { table } }); + + var chunker = CreateSectionChunker(maxTokensPerChunk: 200); + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + Assert.True(chunks.Count > 1, $"Table should span multiple chunks but got {chunks.Count}"); + + // First chunk gets table metadata + Assert.True(chunks[0].HasMetadata); + Assert.Equal("data_table", chunks[0].Metadata["element_type"]); + + // Subsequent table chunks do NOT get metadata (cleared on commit, first-wins) + for (int i = 1; i < chunks.Count; i++) + { + Assert.False(chunks[i].HasMetadata, $"Chunk {i} should not have metadata"); + } + } + + [Fact] + public async Task SemanticSimilarityChunker_SingleElementWithMetadata_PropagatesMetadata() + { + var paragraph = new IngestionDocumentParagraph("This is a paragraph for semantic chunking."); + paragraph.Metadata["element_type"] = "text"; + paragraph.Metadata["page"] = 1; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { paragraph } }); + + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + using var embeddingGenerator = new TestEmbeddingGenerator() + { + GenerateAsyncCallback = static (values, options, ct) => + { + var embeddings = values.Select(v => + new Embedding(new float[] { 1.0f, 2.0f, 3.0f, 4.0f })) + .ToArray(); + return Task.FromResult(new GeneratedEmbeddings>(embeddings)); + } + }; + var chunker = new SemanticSimilarityChunker( + embeddingGenerator, + new(tokenizer) { MaxTokensPerChunk = 2_000, OverlapTokens = 0 }); + + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + var chunk = Assert.Single(chunks); + Assert.True(chunk.HasMetadata); + Assert.Equal("text", chunk.Metadata["element_type"]); + Assert.Equal(1, chunk.Metadata["page"]); + } + + [Fact] + public async Task SemanticSimilarityChunker_MultipleElementsDifferentKeys_AllKeysAppear() + { + var para1 = new IngestionDocumentParagraph("First paragraph about .NET development."); + para1.Metadata["element_type"] = "text"; + + var para2 = new IngestionDocumentParagraph("Second paragraph about cloud computing."); + para2.Metadata["confidence"] = 0.95; + + var doc = new IngestionDocument("doc"); + doc.Sections.Add(new IngestionDocumentSection { Elements = { para1, para2 } }); + + var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + using var embeddingGenerator = new TestEmbeddingGenerator() + { + GenerateAsyncCallback = static (values, options, ct) => + { + var embeddings = values.Select(v => + new Embedding(new float[] { 1.0f, 2.0f, 3.0f, 4.0f })) + .ToArray(); + return Task.FromResult(new GeneratedEmbeddings>(embeddings)); + } + }; + var chunker = new SemanticSimilarityChunker( + embeddingGenerator, + new(tokenizer) { MaxTokensPerChunk = 2_000, OverlapTokens = 0 }); + + var chunks = await chunker.ProcessAsync(doc).ToListAsync(); + + // Semantic chunker may split elements into separate chunks based on similarity. + // Verify each chunk carries its originating element's metadata. + Assert.Equal(2, chunks.Count); + + Assert.True(chunks[0].HasMetadata); + Assert.Equal("text", chunks[0].Metadata["element_type"]); + + Assert.True(chunks[1].HasMetadata); + Assert.Equal(0.95, chunks[1].Metadata["confidence"]); + } +}