From cda366b43bc2154d9acc5787fa49ecb3e07f361a Mon Sep 17 00:00:00 2001 From: Shoubhit Dash Date: Sat, 20 Dec 2025 22:44:56 +0530 Subject: [PATCH] wasm instead of cloudflare --- packages/code-chunk/package.json | 10 +- packages/code-chunk/src/parser/index.ts | 125 ++---------- packages/code-chunk/src/parser/shared.ts | 52 +++++ packages/code-chunk/src/parser/wasm.ts | 135 +++++++++++++ packages/code-chunk/src/types.ts | 45 +++++ packages/code-chunk/src/wasm.d.ts | 39 ++++ packages/code-chunk/src/wasm.ts | 187 ++++++++++++++++++ packages/code-chunk/test/wasm.test.ts | 233 +++++++++++++++++++++++ 8 files changed, 711 insertions(+), 115 deletions(-) create mode 100644 packages/code-chunk/src/parser/shared.ts create mode 100644 packages/code-chunk/src/parser/wasm.ts create mode 100644 packages/code-chunk/src/wasm.d.ts create mode 100644 packages/code-chunk/src/wasm.ts create mode 100644 packages/code-chunk/test/wasm.test.ts diff --git a/packages/code-chunk/package.json b/packages/code-chunk/package.json index cb2c216..68c32bf 100644 --- a/packages/code-chunk/package.json +++ b/packages/code-chunk/package.json @@ -15,7 +15,7 @@ "url": "git+https://github.com/supermemoryai/code-chunk.git" }, "scripts": { - "build": "bunup", + "build": "bunup src/index.ts src/wasm.ts", "dev": "bunup --watch", "release": "bumpp --commit --push --tag", "test": "bun test", @@ -46,6 +46,14 @@ "default": "./dist/index.js" } }, + "./wasm": { + "types": "./src/wasm.ts", + "bun": "./src/wasm.ts", + "import": { + "types": "./dist/wasm.d.ts", + "default": "./dist/wasm.js" + } + }, "./package.json": "./package.json" }, "module": "./dist/index.js", diff --git a/packages/code-chunk/src/parser/index.ts b/packages/code-chunk/src/parser/index.ts index 150253a..4b202dd 100644 --- a/packages/code-chunk/src/parser/index.ts +++ b/packages/code-chunk/src/parser/index.ts @@ -1,17 +1,13 @@ import { Effect } from 'effect' -import { - Parser, - type Node as TSNode, - type Tree as TSTree, -} from 'web-tree-sitter' +import { Parser } from 'web-tree-sitter' import type { Language, ParseError, ParseResult } from '../types' import { clearGrammarCache, type GrammarLoadError, getLanguageGrammar, } from './languages' +import { buildParseResult } from './shared' -// Re-export language utilities export { clearGrammarCache, detectLanguage, @@ -19,10 +15,12 @@ export { LANGUAGE_EXTENSIONS, loadGrammar, } from './languages' +export { + buildParseResult, + getParseErrorMessage, + hasParseErrors, +} from './shared' -/** - * Error thrown when parser initialization fails - */ export class ParserInitError extends Error { readonly _tag = 'ParserInitError' override readonly cause?: unknown @@ -34,16 +32,8 @@ export class ParserInitError extends Error { } } -/** - * Flag to track if tree-sitter has been initialized - */ let initialized: boolean = false -/** - * Initialize the tree-sitter WASM module - * - * @returns Effect that initializes tree-sitter - */ export function initParser(): Effect.Effect { return Effect.gen(function* () { if (initialized) { @@ -60,99 +50,28 @@ export function initParser(): Effect.Effect { }) } -/** - * Check if a parse tree has errors - */ -function hasParseErrors(tree: TSTree): boolean { - return tree.rootNode.hasError -} - -/** - * Get error message from a tree with errors - */ -function getParseErrorMessage(tree: TSTree): string { - const errorNodes: string[] = [] - - function findErrors(node: TSNode) { - if (node.isError || node.isMissing) { - const pos = node.startPosition - errorNodes.push( - `${node.isError ? 'ERROR' : 'MISSING'} at line ${pos.row + 1}, column ${pos.column + 1}`, - ) - } - for (const child of node.children) { - findErrors(child) - } - } - - findErrors(tree.rootNode) - return errorNodes.length > 0 - ? errorNodes.slice(0, 3).join('; ') + - (errorNodes.length > 3 ? `; ... and ${errorNodes.length - 3} more` : '') - : 'Unknown parse error' -} - -/** - * Parse source code into an AST - * - * Uses Effect internally for error handling. Tree-sitter always produces a tree - * even with syntax errors (recoverable parsing). - * - * @param parser - The tree-sitter parser instance - * @param code - The source code to parse - * @param language - The programming language - * @returns Effect resolving to ParseResult - */ export function parse( parser: Parser, code: string, language: Language, ): Effect.Effect { return Effect.gen(function* () { - // Load and set the language grammar const grammar = yield* getLanguageGrammar(language) parser.setLanguage(grammar) - // Parse the code const tree = parser.parse(code) + const result = buildParseResult(tree) - if (!tree) { - return yield* Effect.fail({ - message: 'Parser returned null - no language set or parsing cancelled', - recoverable: false, - } satisfies ParseError) + if (result.error && !result.error.recoverable) { + return yield* Effect.fail(result.error) } - // Check for parse errors - if (hasParseErrors(tree)) { - return { - tree, - error: { - message: getParseErrorMessage(tree), - recoverable: true, // Tree-sitter always produces a tree - }, - } satisfies ParseResult - } - - return { - tree, - error: null, - } satisfies ParseResult + return result }) } -// ============================================================================ -// Public API - Unwraps Effect for consumers -// ============================================================================ - -/** - * Shared parser instance for the public API - */ let sharedParser: Parser | null = null -/** - * Get or create the shared parser instance - */ async function getSharedParser(): Promise { if (sharedParser) { return sharedParser @@ -163,14 +82,6 @@ async function getSharedParser(): Promise { return sharedParser } -/** - * Parse source code into an AST (public async API) - * - * @param code - The source code to parse - * @param language - The programming language - * @returns Promise resolving to ParseResult - * @throws ParseError or GrammarLoadError if parsing fails irrecoverably - */ export async function parseCode( code: string, language: Language, @@ -179,24 +90,10 @@ export async function parseCode( return Effect.runPromise(parse(parser, code, language)) } -/** - * Initialize the parser module (public async API) - * - * Call this before using other parser functions to ensure tree-sitter is ready. - * This is called automatically by parseCode, but can be called explicitly for - * early initialization. - * - * @returns Promise that resolves when initialization is complete - * @throws ParserInitError if initialization fails - */ export async function initializeParser(): Promise { await getSharedParser() } -/** - * Reset the shared parser state (useful for testing) - * Also clears the grammar cache to ensure clean reinitialization - */ export function resetParser(): void { if (sharedParser) { sharedParser.delete() diff --git a/packages/code-chunk/src/parser/shared.ts b/packages/code-chunk/src/parser/shared.ts new file mode 100644 index 0000000..19291c4 --- /dev/null +++ b/packages/code-chunk/src/parser/shared.ts @@ -0,0 +1,52 @@ +import type { Node as TSNode, Tree as TSTree } from 'web-tree-sitter' +import type { ParseResult } from '../types' + +export function hasParseErrors(tree: TSTree): boolean { + return tree.rootNode.hasError +} + +export function getParseErrorMessage(tree: TSTree): string { + const errorNodes: string[] = [] + + function findErrors(node: TSNode) { + if (node.isError || node.isMissing) { + const pos = node.startPosition + errorNodes.push( + `${node.isError ? 'ERROR' : 'MISSING'} at line ${pos.row + 1}, column ${pos.column + 1}`, + ) + } + for (const child of node.children) { + findErrors(child) + } + } + + findErrors(tree.rootNode) + return errorNodes.length > 0 + ? errorNodes.slice(0, 3).join('; ') + + (errorNodes.length > 3 ? `; ... and ${errorNodes.length - 3} more` : '') + : 'Unknown parse error' +} + +export function buildParseResult(tree: TSTree | null): ParseResult { + if (!tree) { + return { + tree: undefined as unknown as TSTree, + error: { + message: 'Parser returned null - no language set or parsing cancelled', + recoverable: false, + }, + } + } + + if (hasParseErrors(tree)) { + return { + tree, + error: { + message: getParseErrorMessage(tree), + recoverable: true, + }, + } + } + + return { tree, error: null } +} diff --git a/packages/code-chunk/src/parser/wasm.ts b/packages/code-chunk/src/parser/wasm.ts new file mode 100644 index 0000000..13937a9 --- /dev/null +++ b/packages/code-chunk/src/parser/wasm.ts @@ -0,0 +1,135 @@ +import { Effect } from 'effect' +import { Parser, Language as TSLanguage } from 'web-tree-sitter' + +import type { Language, ParseResult, WasmBinary, WasmConfig } from '../types' +import { buildParseResult } from './shared' + +export class WasmParserError extends Error { + readonly _tag = 'WasmParserError' + override readonly cause?: unknown + + constructor(message: string, cause?: unknown) { + super(message) + this.name = 'WasmParserError' + this.cause = cause + } +} + +export class WasmGrammarError extends Error { + readonly _tag = 'WasmGrammarError' + readonly language: Language + override readonly cause?: unknown + + constructor(language: Language, message?: string, cause?: unknown) { + super(message ?? `No WASM binary provided for language: ${language}`) + this.name = 'WasmGrammarError' + this.language = language + this.cause = cause + } +} + +async function toUint8Array(binary: WasmBinary): Promise { + if (binary instanceof Uint8Array) { + return binary + } + if (binary instanceof ArrayBuffer) { + return new Uint8Array(binary) + } + if (binary instanceof Response) { + const buffer = await binary.arrayBuffer() + return new Uint8Array(buffer) + } + if (typeof binary === 'string') { + const response = await fetch(binary) + const buffer = await response.arrayBuffer() + return new Uint8Array(buffer) + } + throw new WasmParserError('Parser not initialized. Call init() first.') +} + +export class WasmParser { + private config: WasmConfig + private initialized = false + private grammarCache = new Map() + private sharedParser: Parser | null = null + + constructor(config: WasmConfig) { + this.config = config + } + + async init(): Promise { + if (this.initialized) return + + const wasmBinary = await toUint8Array(this.config.treeSitter) + + await Parser.init({ + locateFile: () => '', + wasmBinary: wasmBinary.buffer, + }) + + this.sharedParser = new Parser() + this.initialized = true + } + + private async loadGrammar(language: Language): Promise { + const cached = this.grammarCache.get(language) + if (cached) return cached + + const wasmBinary = this.config.languages[language] + if (!wasmBinary) { + throw new WasmGrammarError(language) + } + + const input = await toUint8Array(wasmBinary) + const grammar = await TSLanguage.load(input) + this.grammarCache.set(language, grammar) + return grammar + } + + async parse(code: string, language: Language): Promise { + if (!this.initialized || !this.sharedParser) { + throw new WasmParserError('Parser not initialized. Call init() first.') + } + + const grammar = await this.loadGrammar(language) + this.sharedParser.setLanguage(grammar) + + const tree = this.sharedParser.parse(code) + return buildParseResult(tree) + } + + parseEffect( + code: string, + language: Language, + ): Effect.Effect { + return Effect.tryPromise({ + try: () => this.parse(code, language), + catch: (error) => { + if ( + error instanceof WasmParserError || + error instanceof WasmGrammarError + ) { + return error + } + return new WasmParserError('Parse failed', error) + }, + }) + } + + reset(): void { + if (this.sharedParser) { + this.sharedParser.delete() + this.sharedParser = null + } + this.grammarCache.clear() + this.initialized = false + } +} + +export async function createWasmParser( + config: WasmConfig, +): Promise { + const parser = new WasmParser(config) + await parser.init() + return parser +} diff --git a/packages/code-chunk/src/types.ts b/packages/code-chunk/src/types.ts index e32d144..b148075 100644 --- a/packages/code-chunk/src/types.ts +++ b/packages/code-chunk/src/types.ts @@ -299,3 +299,48 @@ export interface Chunker { options?: ChunkOptions, ): AsyncIterable } + +// ============================================================================ +// WASM / Cloudflare Workers Types +// ============================================================================ + +/** + * WASM binary input - can be ArrayBuffer, Uint8Array, Response, or URL string + * + * In Cloudflare Workers, WASM files are typically imported as modules that + * resolve to ArrayBuffer or WebAssembly.Module. + */ +export type WasmBinary = + | ArrayBuffer + | Uint8Array + | Response + | WebAssembly.Module + | string + +/** + * Configuration for WASM binaries in Cloudflare Workers + * + * @example + * ```ts + * import treeSitterWasm from 'web-tree-sitter/tree-sitter.wasm' + * import typescriptWasm from 'tree-sitter-typescript/tree-sitter-tsx.wasm' + * + * const config: WasmConfig = { + * treeSitter: treeSitterWasm, + * languages: { + * typescript: typescriptWasm, + * } + * } + * ``` + */ +export interface WasmConfig { + /** + * The core tree-sitter.wasm binary from web-tree-sitter + */ + treeSitter: WasmBinary + /** + * Language grammar WASM binaries + * Only include the languages you need to minimize bundle size + */ + languages: Partial> +} diff --git a/packages/code-chunk/src/wasm.d.ts b/packages/code-chunk/src/wasm.d.ts new file mode 100644 index 0000000..932a440 --- /dev/null +++ b/packages/code-chunk/src/wasm.d.ts @@ -0,0 +1,39 @@ +declare module '*.wasm' { + const content: ArrayBuffer + export default content +} + +declare module 'web-tree-sitter/tree-sitter.wasm' { + const content: ArrayBuffer + export default content +} + +declare module 'tree-sitter-typescript/tree-sitter-tsx.wasm' { + const content: ArrayBuffer + export default content +} + +declare module 'tree-sitter-javascript/tree-sitter-javascript.wasm' { + const content: ArrayBuffer + export default content +} + +declare module 'tree-sitter-python/tree-sitter-python.wasm' { + const content: ArrayBuffer + export default content +} + +declare module 'tree-sitter-rust/tree-sitter-rust.wasm' { + const content: ArrayBuffer + export default content +} + +declare module 'tree-sitter-go/tree-sitter-go.wasm' { + const content: ArrayBuffer + export default content +} + +declare module 'tree-sitter-java/tree-sitter-java.wasm' { + const content: ArrayBuffer + export default content +} diff --git a/packages/code-chunk/src/wasm.ts b/packages/code-chunk/src/wasm.ts new file mode 100644 index 0000000..fa2865a --- /dev/null +++ b/packages/code-chunk/src/wasm.ts @@ -0,0 +1,187 @@ +import { Effect } from 'effect' + +import type { + Chunk, + Chunker, + ChunkOptions, + Language, + WasmConfig, +} from './types' + +import { + chunk as chunkInternal, + DEFAULT_CHUNK_OPTIONS, + streamChunks as streamChunksInternal, +} from './chunking' +import { extractEntities } from './extract' +import { WasmParser } from './parser/wasm' +import { detectLanguage } from './parser/languages' +import { buildScopeTree } from './scope' + +export type { + Chunk, + ChunkContext, + ChunkEntityInfo, + ChunkOptions, + Chunker, + EntityInfo, + EntityType, + ImportInfo, + Language, + LineRange, + SiblingInfo, + WasmBinary, + WasmConfig, +} from './types' + +export { formatChunkWithContext } from './context/format' +export { + WasmGrammarError, + WasmParser, + WasmParserError, + createWasmParser, +} from './parser/wasm' +export { detectLanguage, LANGUAGE_EXTENSIONS } from './parser/languages' + +export class WasmChunkingError extends Error { + readonly _tag = 'WasmChunkingError' + override readonly cause?: unknown + + constructor(message: string, cause?: unknown) { + super(message) + this.name = 'WasmChunkingError' + this.cause = cause + } +} + +export class UnsupportedLanguageError extends Error { + readonly _tag = 'UnsupportedLanguageError' + readonly filepath: string + + constructor(filepath: string) { + super(`Unsupported file type: ${filepath}`) + this.name = 'UnsupportedLanguageError' + this.filepath = filepath + } +} + +class WasmChunker implements Chunker { + private parser: WasmParser + private defaultOptions: ChunkOptions + + constructor(parser: WasmParser, options: ChunkOptions = {}) { + this.parser = parser + this.defaultOptions = { ...DEFAULT_CHUNK_OPTIONS, ...options } + } + + async chunk( + filepath: string, + code: string, + options?: ChunkOptions, + ): Promise { + const opts = { ...this.defaultOptions, ...options } + const language: Language | null = opts.language ?? detectLanguage(filepath) + + if (!language) { + throw new UnsupportedLanguageError(filepath) + } + + const parseResult = await this.parser.parse(code, language) + + const entities = await Effect.runPromise( + Effect.mapError( + extractEntities(parseResult.tree.rootNode, language, code), + (error: unknown) => + new WasmChunkingError('Failed to extract entities', error), + ), + ) + + const scopeTree = await Effect.runPromise( + Effect.mapError( + buildScopeTree(entities), + (error: unknown) => + new WasmChunkingError('Failed to build scope tree', error), + ), + ) + + const chunks = await Effect.runPromise( + Effect.mapError( + chunkInternal( + parseResult.tree.rootNode, + code, + scopeTree, + language, + opts, + filepath, + ), + (error: unknown) => + new WasmChunkingError('Failed to chunk code', error), + ), + ) + + if (parseResult.error) { + return chunks.map((c: Chunk) => ({ + ...c, + context: { + ...c.context, + parseError: parseResult.error ?? undefined, + }, + })) + } + + return chunks + } + + async *stream( + filepath: string, + code: string, + options?: ChunkOptions, + ): AsyncIterable { + const opts = { ...this.defaultOptions, ...options } + const language: Language | null = opts.language ?? detectLanguage(filepath) + + if (!language) { + throw new UnsupportedLanguageError(filepath) + } + + const parseResult = await this.parser.parse(code, language) + + const entities = await Effect.runPromise( + extractEntities(parseResult.tree.rootNode, language, code), + ) + + const scopeTree = await Effect.runPromise(buildScopeTree(entities)) + + const chunkGenerator = streamChunksInternal( + parseResult.tree.rootNode, + code, + scopeTree, + language, + opts, + filepath, + ) + + for await (const chunk of chunkGenerator) { + if (parseResult.error) { + yield { + ...chunk, + context: { + ...chunk.context, + parseError: parseResult.error ?? undefined, + }, + } + } else { + yield chunk + } + } + } +} + +export async function createChunker( + config: WasmConfig, + options?: ChunkOptions, +): Promise { + const parser = new WasmParser(config) + await parser.init() + return new WasmChunker(parser, options) +} diff --git a/packages/code-chunk/test/wasm.test.ts b/packages/code-chunk/test/wasm.test.ts new file mode 100644 index 0000000..53a5123 --- /dev/null +++ b/packages/code-chunk/test/wasm.test.ts @@ -0,0 +1,233 @@ +import { readFile } from 'node:fs/promises' +import { resolve } from 'node:path' +import { describe, expect, test } from 'bun:test' + +import type { WasmConfig } from '../src/types' + +import { + WasmChunkingError, + WasmGrammarError, + WasmParser, + WasmParserError, + createChunker, + UnsupportedLanguageError, +} from '../src/wasm' + +async function loadWasmBinary(packagePath: string): Promise { + // node_modules is at monorepo root, 2 dirs up from packages/code-chunk + const fullPath = resolve( + process.cwd(), + '..', + '..', + 'node_modules', + ...packagePath.split('/'), + ) + return await readFile(fullPath) +} + +async function getWasmConfig(): Promise { + const [treeSitter, typescript, javascript] = await Promise.all([ + loadWasmBinary('web-tree-sitter/web-tree-sitter.wasm'), + loadWasmBinary('tree-sitter-typescript/tree-sitter-tsx.wasm'), + loadWasmBinary('tree-sitter-javascript/tree-sitter-javascript.wasm'), + ]) + + return { + treeSitter, + languages: { + typescript, + javascript, + }, + } +} + +describe('WasmParser', () => { + test('initializes and parses TypeScript', async () => { + const config = await getWasmConfig() + const parser = new WasmParser(config) + await parser.init() + + const result = await parser.parse('const x: number = 1', 'typescript') + + expect(result.tree).toBeDefined() + expect(result.error).toBeNull() + expect(result.tree.rootNode.type).toBe('program') + }) + + test('initializes and parses JavaScript', async () => { + const config = await getWasmConfig() + const parser = new WasmParser(config) + await parser.init() + + const result = await parser.parse('const x = 1', 'javascript') + + expect(result.tree).toBeDefined() + expect(result.error).toBeNull() + }) + + test('throws error for missing language', async () => { + const config = await getWasmConfig() + const parser = new WasmParser(config) + await parser.init() + + await expect(parser.parse('print("hello")', 'python')).rejects.toThrow( + WasmGrammarError, + ) + }) + + test('throws error if not initialized', async () => { + const config = await getWasmConfig() + const parser = new WasmParser(config) + + await expect(parser.parse('const x = 1', 'typescript')).rejects.toThrow( + WasmParserError, + ) + }) + + test('caches grammar after first load', async () => { + const config = await getWasmConfig() + const parser = new WasmParser(config) + await parser.init() + + await parser.parse('const a = 1', 'typescript') + await parser.parse('const b = 2', 'typescript') + await parser.parse('const c = 3', 'typescript') + + expect(true).toBe(true) + }) +}) + +describe('createChunker (wasm)', () => { + test('creates chunker and chunks TypeScript code', async () => { + const config = await getWasmConfig() + const chunker = await createChunker(config) + + const code = ` +export function add(a: number, b: number): number { + return a + b +} + +export function subtract(a: number, b: number): number { + return a - b +} +` + const chunks = await chunker.chunk('math.ts', code) + + expect(chunks.length).toBeGreaterThan(0) + expect(chunks[0].context.language).toBe('typescript') + expect(chunks[0].context.filepath).toBe('math.ts') + }) + + test('streams chunks', async () => { + const config = await getWasmConfig() + const chunker = await createChunker(config) + + const code = ` +function first() { return 1 } +function second() { return 2 } +` + const chunks: Awaited> = [] + for await (const chunk of chunker.stream('test.ts', code)) { + chunks.push(chunk) + } + + expect(chunks.length).toBeGreaterThan(0) + }) + + test('respects maxChunkSize option', async () => { + const config = await getWasmConfig() + const chunker = await createChunker(config, { maxChunkSize: 100 }) + + const code = ` +export function firstFunction() { + const a = 1 + const b = 2 + return a + b +} + +export function secondFunction() { + const x = 10 + const y = 20 + return x * y +} + +export function thirdFunction() { + const result = [] + for (let i = 0; i < 10; i++) { + result.push(i) + } + return result +} +` + const chunks = await chunker.chunk('large.ts', code) + + expect(chunks.length).toBeGreaterThan(1) + }) + + test('throws UnsupportedLanguageError for unknown file type', async () => { + const config = await getWasmConfig() + const chunker = await createChunker(config) + + await expect(chunker.chunk('file.xyz', 'content')).rejects.toThrow( + UnsupportedLanguageError, + ) + }) + + test('includes entities in context', async () => { + const config = await getWasmConfig() + const chunker = await createChunker(config) + + const code = ` +export class Calculator { + add(a: number, b: number): number { + return a + b + } +} +` + const chunks = await chunker.chunk('calc.ts', code) + + expect(chunks[0].context.entities.length).toBeGreaterThan(0) + const entityNames = chunks[0].context.entities.map((e) => e.name) + expect(entityNames).toContain('Calculator') + }) + + test('includes imports in context', async () => { + const config = await getWasmConfig() + const chunker = await createChunker(config) + + const code = ` +import { Effect } from 'effect' +import { pipe } from 'effect/Function' + +export const program = Effect.succeed(42) +` + const chunks = await chunker.chunk('program.ts', code) + + expect(chunks[0].context.imports.length).toBeGreaterThan(0) + }) +}) + +describe('error classes', () => { + test('WasmParserError has correct tag', () => { + const error = new WasmParserError('test error') + expect(error._tag).toBe('WasmParserError') + expect(error.name).toBe('WasmParserError') + }) + + test('WasmGrammarError has correct tag and language', () => { + const error = new WasmGrammarError('python') + expect(error._tag).toBe('WasmGrammarError') + expect(error.language).toBe('python') + }) + + test('WasmChunkingError has correct tag', () => { + const error = new WasmChunkingError('chunk failed') + expect(error._tag).toBe('WasmChunkingError') + }) + + test('UnsupportedLanguageError has correct tag and filepath', () => { + const error = new UnsupportedLanguageError('file.xyz') + expect(error._tag).toBe('UnsupportedLanguageError') + expect(error.filepath).toBe('file.xyz') + }) +})