From 9ec123b48a1b7d2b03806395f4703bbbdd480ae5 Mon Sep 17 00:00:00 2001 From: James Ross Date: Sat, 14 Mar 2026 16:40:17 -0700 Subject: [PATCH 1/2] Deduplicate repeated chunk tree entries --- src/domain/services/CasService.js | 39 ++++-- test/integration/round-trip.test.js | 123 +++++++++++++++++- .../domain/services/CasService.merkle.test.js | 31 +++++ test/unit/domain/services/CasService.test.js | 37 +++++- 4 files changed, 219 insertions(+), 11 deletions(-) diff --git a/src/domain/services/CasService.js b/src/domain/services/CasService.js index 851fd75..9b1ccf9 100644 --- a/src/domain/services/CasService.js +++ b/src/domain/services/CasService.js @@ -320,11 +320,37 @@ export default class CasService { return data; } + /** + * Builds unique chunk blob tree entries in first-seen order. + * + * Tree entries keep chunk blobs reachable in Git. The manifest remains the + * authoritative ordered list of chunk occurrences, so repeated digests only + * need one tree entry. + * + * @private + * @param {import('../value-objects/Chunk.js').default[]} chunks + * @returns {string[]} + */ + _createChunkTreeEntries(chunks) { + const treeEntries = []; + const seenDigests = new Set(); + + for (const chunk of chunks) { + if (seenDigests.has(chunk.digest)) { + continue; + } + seenDigests.add(chunk.digest); + treeEntries.push(`100644 blob ${chunk.blob}\t${chunk.digest}`); + } + + return treeEntries; + } + /** * Creates a Git tree object from a manifest. * - * The tree contains the serialized manifest file and one blob entry per chunk, - * keyed by its SHA-256 digest. + * The tree contains the serialized manifest file and one blob entry per + * unique chunk digest, preserving first-seen order. * * @param {Object} options * @param {import('../value-objects/Manifest.js').default} options.manifest - The file manifest. @@ -342,7 +368,7 @@ export default class CasService { const treeEntries = [ `100644 blob ${manifestOid}\tmanifest.${this.codec.extension}`, - ...chunks.map((c) => `100644 blob ${c.blob}\t${c.digest}`), + ...this._createChunkTreeEntries(chunks), ]; return await this.persistence.writeTree(treeEntries); @@ -358,7 +384,6 @@ export default class CasService { async _createMerkleTree({ manifest }) { const chunks = [...manifest.chunks]; const subManifestRefs = []; - const chunkBlobEntries = []; for (let i = 0; i < chunks.length; i += this.merkleThreshold) { const group = chunks.slice(i, i + this.merkleThreshold); @@ -371,10 +396,6 @@ export default class CasService { chunkCount: group.length, startIndex: i, }); - - for (const c of group) { - chunkBlobEntries.push(`100644 blob ${c.blob}\t${c.digest}`); - } } const rootManifestData = { @@ -394,7 +415,7 @@ export default class CasService { const treeEntries = [ `100644 blob ${rootOid}\tmanifest.${this.codec.extension}`, ...subManifestEntries, - ...chunkBlobEntries, + ...this._createChunkTreeEntries(chunks), ]; return await this.persistence.writeTree(treeEntries); diff --git a/test/integration/round-trip.test.js b/test/integration/round-trip.test.js index dba9fb8..53024a5 100644 --- a/test/integration/round-trip.test.js +++ b/test/integration/round-trip.test.js @@ -10,7 +10,7 @@ import { describe, it, expect, beforeAll, afterAll } from 'vitest'; import { mkdtempSync, rmSync, writeFileSync, readFileSync } from 'node:fs'; import { randomBytes } from 'node:crypto'; -import { execSync } from 'node:child_process'; +import { execSync, spawnSync } from 'node:child_process'; import path from 'node:path'; import os from 'node:os'; import GitPlumbing from '@git-stunts/plumbing'; @@ -54,6 +54,37 @@ function tempFile(content) { return { filePath: fp, dir }; } +/** + * Returns chunk entry names from a Git tree listing. + */ +function chunkEntryNames(entries) { + return entries + .map((entry) => entry.name) + .filter((name) => /^[a-f0-9]{64}$/u.test(name)); +} + +/** + * Returns the unique chunk digests recorded by the manifest. + */ +function uniqueChunkDigests(manifest) { + return [...new Set(manifest.chunks.map((chunk) => chunk.digest))]; +} + +/** + * Runs git fsck and returns combined stdout/stderr for assertions. + */ +function runGitFsck() { + const result = spawnSync('git', ['fsck', '--full', '--no-dangling'], { + cwd: repoDir, + encoding: 'utf8', + }); + + return { + status: result.status ?? 0, + output: `${result.stdout}${result.stderr}`, + }; +} + // --------------------------------------------------------------------------- // Plaintext round trip (JSON) – basic // --------------------------------------------------------------------------- @@ -254,6 +285,96 @@ describe('restoreFile (write to disk)', () => { }); }); +// --------------------------------------------------------------------------- +// Repeated chunks — tree emission dedupe + fsck regression +// --------------------------------------------------------------------------- +describe('repeated chunks — v1 tree emission dedupe + fsck regression', () => { + it('deduplicates repeated chunk entries in a v1 tree and still restores correctly', async () => { + const repeatedChunk = Buffer.alloc(1024, 0x41); + const uniqueChunk = Buffer.alloc(1024, 0x42); + const original = Buffer.concat([repeatedChunk, uniqueChunk, repeatedChunk, repeatedChunk]); + const { filePath, dir } = tempFile(original); + const repeatedCas = new ContentAddressableStore({ + plumbing: GitPlumbing.createDefault({ cwd: repoDir }), + chunkSize: 1024, + merkleThreshold: 10, + }); + + try { + const manifest = await repeatedCas.storeFile({ filePath, slug: 'repeat-v1' }); + expect(manifest.chunks.map((chunk) => chunk.digest)).toEqual([ + manifest.chunks[0].digest, + manifest.chunks[1].digest, + manifest.chunks[0].digest, + manifest.chunks[0].digest, + ]); + + const treeOid = await repeatedCas.createTree({ manifest }); + const service = await repeatedCas.getService(); + const entries = await service.persistence.readTree(treeOid); + + const emittedChunkNames = chunkEntryNames(entries); + expect([...emittedChunkNames].sort()).toEqual([...uniqueChunkDigests(manifest)].sort()); + expect(new Set(emittedChunkNames).size).toBe(emittedChunkNames.length); + + const restoredManifest = await service.readManifest({ treeOid }); + const { buffer } = await repeatedCas.restore({ manifest: restoredManifest }); + expect(buffer.equals(original)).toBe(true); + + const fsck = runGitFsck(); + expect(fsck.status).toBe(0); + expect(fsck.output).not.toContain('duplicateEntries'); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe('repeated chunks — Merkle tree emission dedupe + fsck regression', () => { + it('deduplicates repeated chunk entries in a Merkle tree and still restores correctly', async () => { + const chunkA = Buffer.alloc(1024, 0x61); + const chunkB = Buffer.alloc(1024, 0x62); + const chunkC = Buffer.alloc(1024, 0x63); + const original = Buffer.concat([chunkA, chunkB, chunkA, chunkC, chunkA]); + const { filePath, dir } = tempFile(original); + const repeatedCas = new ContentAddressableStore({ + plumbing: GitPlumbing.createDefault({ cwd: repoDir }), + chunkSize: 1024, + merkleThreshold: 2, + }); + + try { + const manifest = await repeatedCas.storeFile({ filePath, slug: 'repeat-v2' }); + expect(manifest.chunks.map((chunk) => chunk.digest)).toEqual([ + manifest.chunks[0].digest, + manifest.chunks[1].digest, + manifest.chunks[0].digest, + manifest.chunks[3].digest, + manifest.chunks[0].digest, + ]); + + const treeOid = await repeatedCas.createTree({ manifest }); + const service = await repeatedCas.getService(); + const entries = await service.persistence.readTree(treeOid); + const emittedChunkNames = chunkEntryNames(entries); + + expect(entries.some((entry) => entry.name.startsWith('sub-manifest-'))).toBe(true); + expect([...emittedChunkNames].sort()).toEqual([...uniqueChunkDigests(manifest)].sort()); + expect(new Set(emittedChunkNames).size).toBe(emittedChunkNames.length); + + const restoredManifest = await service.readManifest({ treeOid }); + const { buffer } = await repeatedCas.restore({ manifest: restoredManifest }); + expect(buffer.equals(original)).toBe(true); + + const fsck = runGitFsck(); + expect(fsck.status).toBe(0); + expect(fsck.output).not.toContain('duplicateEntries'); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + // --------------------------------------------------------------------------- // Fuzz: 50 file sizes around chunk boundaries // --------------------------------------------------------------------------- diff --git a/test/unit/domain/services/CasService.merkle.test.js b/test/unit/domain/services/CasService.merkle.test.js index 685e236..7a57f43 100644 --- a/test/unit/domain/services/CasService.merkle.test.js +++ b/test/unit/domain/services/CasService.merkle.test.js @@ -328,6 +328,37 @@ describe('CasService Merkle – sub-manifest blobs are included as tree entries' }); }); +describe('CasService Merkle – repeated chunk tree entry dedupe', () => { + it('deduplicates repeated chunk entries across sub-manifest groups', async () => { + const { service, trees } = setup(2); + const chunkA = Buffer.alloc(1024, 0x41); + const chunkB = Buffer.alloc(1024, 0x42); + const chunkC = Buffer.alloc(1024, 0x43); + const original = Buffer.concat([chunkA, chunkB, chunkA, chunkC, chunkA]); + + const manifest = await service.store({ + source: bufferSource(original), + slug: 'repeated-merkle', + filename: 'repeated-merkle.bin', + }); + + expect(manifest.chunks).toHaveLength(5); + + const treeOid = await service.createTree({ manifest }); + const treeEntries = trees.get(treeOid); + const chunkEntryNames = treeEntries + .map((entry) => entry.split('\t')[1]) + .filter((name) => /^[a-f0-9]{64}$/u.test(name)); + + expect(chunkEntryNames).toEqual([ + manifest.chunks[0].digest, + manifest.chunks[1].digest, + manifest.chunks[3].digest, + ]); + expect(new Set(chunkEntryNames).size).toBe(chunkEntryNames.length); + }); +}); + // --------------------------------------------------------------------------- // 8. Exactly at threshold boundary uses v1 // --------------------------------------------------------------------------- diff --git a/test/unit/domain/services/CasService.test.js b/test/unit/domain/services/CasService.test.js index 5d63d82..7f80254 100644 --- a/test/unit/domain/services/CasService.test.js +++ b/test/unit/domain/services/CasService.test.js @@ -120,6 +120,41 @@ describe('CasService – createTree', () => { }); }); +describe('CasService – createTree dedupe', () => { + let service; + let mockPersistence; + + beforeEach(() => { + ({ service, mockPersistence } = setup()); + }); + + it('deduplicates repeated chunk digests while preserving first-seen order', async () => { + const duplicateDigest = digestOf('chunk-a'); + const uniqueDigest = digestOf('chunk-b'); + const manifest = new Manifest({ + slug: 'repeat', + filename: 'repeat.txt', + size: 120, + chunks: [ + { index: 0, size: 40, blob: 'b1', digest: duplicateDigest }, + { index: 1, size: 40, blob: 'b1', digest: duplicateDigest }, + { index: 2, size: 40, blob: 'b2', digest: uniqueDigest } + ] + }); + + await service.createTree({ manifest }); + + const treeEntries = mockPersistence.writeTree.mock.calls[0][0]; + const chunkEntries = treeEntries.filter((entry) => !entry.includes('manifest.json')); + + expect(chunkEntries).toEqual([ + `100644 blob b1\t${duplicateDigest}`, + `100644 blob b2\t${uniqueDigest}` + ]); + expect(new Set(chunkEntries.map((entry) => entry.split('\t')[1])).size).toBe(chunkEntries.length); + }); +}); + // --------------------------------------------------------------------------- // verifyIntegrity // --------------------------------------------------------------------------- @@ -147,4 +182,4 @@ describe('CasService – verifyIntegrity', () => { const isValid = await service.verifyIntegrity(manifest); expect(isValid).toBe(true); }); -}); \ No newline at end of file +}); From 94d7a34ba553965d430625cfe7536ccf410ed2a3 Mon Sep 17 00:00:00 2001 From: James Ross Date: Sun, 15 Mar 2026 01:36:20 -0700 Subject: [PATCH 2/2] Tighten fsck regression helper --- test/integration/round-trip.test.js | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/test/integration/round-trip.test.js b/test/integration/round-trip.test.js index 53024a5..479423b 100644 --- a/test/integration/round-trip.test.js +++ b/test/integration/round-trip.test.js @@ -79,9 +79,24 @@ function runGitFsck() { encoding: 'utf8', }); + const output = `${result.stdout ?? ''}${result.stderr ?? ''}`; + if (result.error) { + return { + status: 1, + output: `${output}${output ? '\n' : ''}spawn error: ${result.error.message}`, + }; + } + + if (result.status === null) { + return { + status: 1, + output: `${output}${output ? '\n' : ''}terminated by signal: ${result.signal ?? 'unknown'}`, + }; + } + return { - status: result.status ?? 0, - output: `${result.stdout}${result.stderr}`, + status: result.status, + output, }; } @@ -314,6 +329,8 @@ describe('repeated chunks — v1 tree emission dedupe + fsck regression', () => const entries = await service.persistence.readTree(treeOid); const emittedChunkNames = chunkEntryNames(entries); + // Git stores tree entries in filename-sorted order, so this integration + // check verifies uniqueness/membership while unit tests cover emit order. expect([...emittedChunkNames].sort()).toEqual([...uniqueChunkDigests(manifest)].sort()); expect(new Set(emittedChunkNames).size).toBe(emittedChunkNames.length); @@ -359,6 +376,8 @@ describe('repeated chunks — Merkle tree emission dedupe + fsck regression', () const emittedChunkNames = chunkEntryNames(entries); expect(entries.some((entry) => entry.name.startsWith('sub-manifest-'))).toBe(true); + // Git stores tree entries in filename-sorted order, so this integration + // check verifies uniqueness/membership while unit tests cover emit order. expect([...emittedChunkNames].sort()).toEqual([...uniqueChunkDigests(manifest)].sort()); expect(new Set(emittedChunkNames).size).toBe(emittedChunkNames.length);