Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 30 additions & 9 deletions src/domain/services/CasService.js
Original file line number Diff line number Diff line change
Expand Up @@ -320,11 +320,37 @@ export default class CasService {
return data;
}

/**
* Builds unique chunk blob tree entries in first-seen order.
*
* Tree entries keep chunk blobs reachable in Git. The manifest remains the
* authoritative ordered list of chunk occurrences, so repeated digests only
* need one tree entry.
*
* @private
* @param {import('../value-objects/Chunk.js').default[]} chunks
* @returns {string[]}
*/
_createChunkTreeEntries(chunks) {
const treeEntries = [];
const seenDigests = new Set();

for (const chunk of chunks) {
if (seenDigests.has(chunk.digest)) {
continue;
}
seenDigests.add(chunk.digest);
treeEntries.push(`100644 blob ${chunk.blob}\t${chunk.digest}`);
}

return treeEntries;
}

/**
* Creates a Git tree object from a manifest.
*
* The tree contains the serialized manifest file and one blob entry per chunk,
* keyed by its SHA-256 digest.
* The tree contains the serialized manifest file and one blob entry per
* unique chunk digest, preserving first-seen order.
*
* @param {Object} options
* @param {import('../value-objects/Manifest.js').default} options.manifest - The file manifest.
Expand All @@ -342,7 +368,7 @@ export default class CasService {

const treeEntries = [
`100644 blob ${manifestOid}\tmanifest.${this.codec.extension}`,
...chunks.map((c) => `100644 blob ${c.blob}\t${c.digest}`),
...this._createChunkTreeEntries(chunks),
];

return await this.persistence.writeTree(treeEntries);
Expand All @@ -358,7 +384,6 @@ export default class CasService {
async _createMerkleTree({ manifest }) {
const chunks = [...manifest.chunks];
const subManifestRefs = [];
const chunkBlobEntries = [];

for (let i = 0; i < chunks.length; i += this.merkleThreshold) {
const group = chunks.slice(i, i + this.merkleThreshold);
Expand All @@ -371,10 +396,6 @@ export default class CasService {
chunkCount: group.length,
startIndex: i,
});

for (const c of group) {
chunkBlobEntries.push(`100644 blob ${c.blob}\t${c.digest}`);
}
}

const rootManifestData = {
Expand All @@ -394,7 +415,7 @@ export default class CasService {
const treeEntries = [
`100644 blob ${rootOid}\tmanifest.${this.codec.extension}`,
...subManifestEntries,
...chunkBlobEntries,
...this._createChunkTreeEntries(chunks),
];

return await this.persistence.writeTree(treeEntries);
Expand Down
142 changes: 141 additions & 1 deletion test/integration/round-trip.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { mkdtempSync, rmSync, writeFileSync, readFileSync } from 'node:fs';
import { randomBytes } from 'node:crypto';
import { execSync } from 'node:child_process';
import { execSync, spawnSync } from 'node:child_process';
import path from 'node:path';
import os from 'node:os';
import GitPlumbing from '@git-stunts/plumbing';
Expand Down Expand Up @@ -54,6 +54,52 @@ function tempFile(content) {
return { filePath: fp, dir };
}

/**
 * Picks the chunk entry names out of a Git tree listing.
 *
 * An entry counts as a chunk when its name is exactly 64 lowercase hex
 * characters; every other entry is ignored.
 *
 * @param {{ name: string }[]} entries - Tree entries as returned by readTree.
 * @returns {string[]} Matching names, in listing order.
 */
function chunkEntryNames(entries) {
  const hexDigestName = /^[a-f0-9]{64}$/u;
  const isChunkEntry = (entry) => hexDigestName.test(entry.name);

  return entries.filter(isChunkEntry).map((entry) => entry.name);
}

/**
 * Collects the distinct chunk digests listed in a manifest.
 *
 * @param {{ chunks: { digest: string }[] }} manifest
 * @returns {string[]} Each digest once, ordered by first appearance.
 */
function uniqueChunkDigests(manifest) {
  const seen = new Set();

  manifest.chunks.forEach(({ digest }) => {
    seen.add(digest);
  });

  return Array.from(seen);
}

/**
 * Runs `git fsck --full --no-dangling` in the test repository.
 *
 * stdout and stderr are concatenated into one string so assertions can search
 * both. If the process could not be spawned, or was terminated by a signal,
 * the status is reported as 1 and a description of the failure is appended to
 * the output.
 *
 * @returns {{ status: number, output: string }}
 */
function runGitFsck() {
  const { stdout, stderr, error, status, signal } = spawnSync(
    'git',
    ['fsck', '--full', '--no-dangling'],
    { cwd: repoDir, encoding: 'utf8' },
  );

  const output = `${stdout ?? ''}${stderr ?? ''}`;
  // Put the failure note on its own line only when there is earlier output.
  const separator = output ? '\n' : '';

  if (error) {
    return {
      status: 1,
      output: `${output}${separator}spawn error: ${error.message}`,
    };
  }

  // A null exit status means the process did not exit on its own.
  if (status === null) {
    return {
      status: 1,
      output: `${output}${separator}terminated by signal: ${signal ?? 'unknown'}`,
    };
  }

  return { status, output };
}

// ---------------------------------------------------------------------------
// Plaintext round trip (JSON) – basic
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -254,6 +300,100 @@ describe('restoreFile (write to disk)', () => {
});
});

// ---------------------------------------------------------------------------
// Repeated chunks — tree emission dedupe + fsck regression
// ---------------------------------------------------------------------------
describe('repeated chunks — v1 tree emission dedupe + fsck regression', () => {
  it('deduplicates repeated chunk entries in a v1 tree and still restores correctly', async () => {
    // Payload layout is A, B, A, A: with a 1024-byte chunk size the first,
    // third and fourth chunks carry identical bytes.
    const blockA = Buffer.alloc(1024, 0x41);
    const blockB = Buffer.alloc(1024, 0x42);
    const original = Buffer.concat([blockA, blockB, blockA, blockA]);
    const { filePath, dir } = tempFile(original);
    const plumbing = GitPlumbing.createDefault({ cwd: repoDir });
    const repeatedCas = new ContentAddressableStore({
      plumbing,
      chunkSize: 1024,
      merkleThreshold: 10,
    });

    try {
      const manifest = await repeatedCas.storeFile({ filePath, slug: 'repeat-v1' });

      // The manifest must still record every occurrence, repeats included.
      const { chunks } = manifest;
      const digestSequence = chunks.map(({ digest }) => digest);
      const digestA = chunks[0].digest;
      const digestB = chunks[1].digest;
      expect(digestSequence).toEqual([digestA, digestB, digestA, digestA]);

      const treeOid = await repeatedCas.createTree({ manifest });
      const service = await repeatedCas.getService();
      const entries = await service.persistence.readTree(treeOid);

      const emittedChunkNames = chunkEntryNames(entries);
      // Git returns tree entries sorted by filename, so only membership and
      // uniqueness are checked here; emit order is covered by unit tests.
      const sortedEmitted = [...emittedChunkNames].sort();
      const sortedExpected = [...uniqueChunkDigests(manifest)].sort();
      expect(sortedEmitted).toEqual(sortedExpected);
      expect(new Set(emittedChunkNames).size).toBe(emittedChunkNames.length);

      // Restoring from the deduplicated tree must reproduce the input bytes.
      const restoredManifest = await service.readManifest({ treeOid });
      const restored = await repeatedCas.restore({ manifest: restoredManifest });
      expect(restored.buffer.equals(original)).toBe(true);

      const { status, output } = runGitFsck();
      expect(status).toBe(0);
      expect(output).not.toContain('duplicateEntries');
    } finally {
      rmSync(dir, { recursive: true, force: true });
    }
  });
});

describe('repeated chunks — Merkle tree emission dedupe + fsck regression', () => {
  it('deduplicates repeated chunk entries in a Merkle tree and still restores correctly', async () => {
    // Payload layout is A, B, A, C, A. The store uses merkleThreshold 2, and
    // the test asserts below that sub-manifest entries are present.
    const blockA = Buffer.alloc(1024, 0x61);
    const blockB = Buffer.alloc(1024, 0x62);
    const blockC = Buffer.alloc(1024, 0x63);
    const original = Buffer.concat([blockA, blockB, blockA, blockC, blockA]);
    const { filePath, dir } = tempFile(original);
    const plumbing = GitPlumbing.createDefault({ cwd: repoDir });
    const repeatedCas = new ContentAddressableStore({
      plumbing,
      chunkSize: 1024,
      merkleThreshold: 2,
    });

    try {
      const manifest = await repeatedCas.storeFile({ filePath, slug: 'repeat-v2' });

      // The manifest must still record every occurrence, repeats included.
      const { chunks } = manifest;
      const digestSequence = chunks.map(({ digest }) => digest);
      const digestA = chunks[0].digest;
      const digestB = chunks[1].digest;
      const digestC = chunks[3].digest;
      expect(digestSequence).toEqual([digestA, digestB, digestA, digestC, digestA]);

      const treeOid = await repeatedCas.createTree({ manifest });
      const service = await repeatedCas.getService();
      const entries = await service.persistence.readTree(treeOid);
      const emittedChunkNames = chunkEntryNames(entries);

      const hasSubManifest = entries.some(({ name }) => name.startsWith('sub-manifest-'));
      expect(hasSubManifest).toBe(true);
      // Git returns tree entries sorted by filename, so only membership and
      // uniqueness are checked here; emit order is covered by unit tests.
      const sortedEmitted = [...emittedChunkNames].sort();
      const sortedExpected = [...uniqueChunkDigests(manifest)].sort();
      expect(sortedEmitted).toEqual(sortedExpected);
      expect(new Set(emittedChunkNames).size).toBe(emittedChunkNames.length);

      // Restoring from the deduplicated tree must reproduce the input bytes.
      const restoredManifest = await service.readManifest({ treeOid });
      const restored = await repeatedCas.restore({ manifest: restoredManifest });
      expect(restored.buffer.equals(original)).toBe(true);

      const { status, output } = runGitFsck();
      expect(status).toBe(0);
      expect(output).not.toContain('duplicateEntries');
    } finally {
      rmSync(dir, { recursive: true, force: true });
    }
  });
});

// ---------------------------------------------------------------------------
// Fuzz: 50 file sizes around chunk boundaries
// ---------------------------------------------------------------------------
Expand Down
31 changes: 31 additions & 0 deletions test/unit/domain/services/CasService.merkle.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,37 @@ describe('CasService Merkle – sub-manifest blobs are included as tree entries'
});
});

describe('CasService Merkle – repeated chunk tree entry dedupe', () => {
  it('deduplicates repeated chunk entries across sub-manifest groups', async () => {
    // NOTE(review): setup(2) presumably sets the Merkle threshold to 2 — confirm against the helper.
    const { service, trees } = setup(2);

    // Payload layout is A, B, A, C, A.
    const blockA = Buffer.alloc(1024, 0x41);
    const blockB = Buffer.alloc(1024, 0x42);
    const blockC = Buffer.alloc(1024, 0x43);
    const original = Buffer.concat([blockA, blockB, blockA, blockC, blockA]);

    const manifest = await service.store({
      source: bufferSource(original),
      slug: 'repeated-merkle',
      filename: 'repeated-merkle.bin',
    });
    const { chunks } = manifest;

    expect(chunks).toHaveLength(5);

    const treeOid = await service.createTree({ manifest });

    // Keep only entries whose name has the 64-hex-character digest shape.
    const digestShape = /^[a-f0-9]{64}$/u;
    const emittedDigests = trees
      .get(treeOid)
      .map((line) => line.split('\t')[1])
      .filter((name) => digestShape.test(name));

    // First-seen order: A (index 0), B (index 1), C (index 3).
    const expectedDigests = [chunks[0].digest, chunks[1].digest, chunks[3].digest];
    expect(emittedDigests).toEqual(expectedDigests);
    expect(new Set(emittedDigests).size).toBe(emittedDigests.length);
  });
});

// ---------------------------------------------------------------------------
// 8. Exactly at threshold boundary uses v1
// ---------------------------------------------------------------------------
Expand Down
37 changes: 36 additions & 1 deletion test/unit/domain/services/CasService.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,41 @@ describe('CasService – createTree', () => {
});
});

describe('CasService – createTree dedupe', () => {
  let service;
  let mockPersistence;

  beforeEach(() => {
    const fixture = setup();
    service = fixture.service;
    mockPersistence = fixture.mockPersistence;
  });

  it('deduplicates repeated chunk digests while preserving first-seen order', async () => {
    const duplicateDigest = digestOf('chunk-a');
    const uniqueDigest = digestOf('chunk-b');

    // Chunks 0 and 1 share a blob and digest; chunk 2 is distinct.
    const chunks = [
      { index: 0, size: 40, blob: 'b1', digest: duplicateDigest },
      { index: 1, size: 40, blob: 'b1', digest: duplicateDigest },
      { index: 2, size: 40, blob: 'b2', digest: uniqueDigest },
    ];
    const manifest = new Manifest({
      slug: 'repeat',
      filename: 'repeat.txt',
      size: 120,
      chunks,
    });

    await service.createTree({ manifest });

    // Inspect the entry list passed to the first writeTree call.
    const [firstCall] = mockPersistence.writeTree.mock.calls;
    const [treeEntries] = firstCall;
    const chunkEntries = treeEntries.filter((line) => !line.includes('manifest.json'));

    expect(chunkEntries).toEqual([
      `100644 blob b1\t${duplicateDigest}`,
      `100644 blob b2\t${uniqueDigest}`,
    ]);

    const emittedNames = chunkEntries.map((line) => line.split('\t')[1]);
    expect(new Set(emittedNames).size).toBe(chunkEntries.length);
  });
});

// ---------------------------------------------------------------------------
// verifyIntegrity
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -147,4 +182,4 @@ describe('CasService – verifyIntegrity', () => {
const isValid = await service.verifyIntegrity(manifest);
expect(isValid).toBe(true);
});
});
});
Loading