From 4a61412cd1e0a6953c424f1da6fb71836335768b Mon Sep 17 00:00:00 2001 From: t Date: Mon, 8 Jun 2026 13:25:07 +0800 Subject: [PATCH 1/5] feat(voice): /voice setup check + whisper.cpp detection (slice 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surface the existing core whisper.cpp engine via a `/voice` slash command and add the settings schema for it. No mic capture yet — this is the safe, self-contained foundation per docs/VOICE_INPUT.md. Core: - Add VoiceConfig (provider | binPath | modelPath) to settings types, re-exported from @deepcode/core (the JSON schema already had the block). - New detectVoice() (voice/detect.ts): resolves the whisper binary (settings.binPath, else whisper-cli/whisper on PATH) and the model (settings.modelPath, else ~/.deepcode/models/whisper-base.en.bin), never throws — missing pieces become `problems`. Injectable probes for deterministic tests. - validateSettingsShallow now flags an unknown voice.provider. CLI: - /voice reports readiness or prints actionable setup steps (+ per-issue detail); `/voice setup` always shows install instructions. - SessionContext gains an optional `home` (honors --home) for the default model-path probe; wired in the REPL. Tests: 9 core detection cases, 1 schema case, 3 CLI messaging cases. Updates the /voice BEHAVIOR_PARITY row (✗ → ✓, 🔄 → 🟡). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/cli/src/commands.ts | 55 ++++++++++ apps/cli/src/repl.ts | 1 + apps/cli/src/voice-cmd.test.ts | 85 +++++++++++++++ docs/BEHAVIOR_PARITY.md | 98 ++++++++--------- packages/core/src/config/index.ts | 1 + packages/core/src/config/schema.test.ts | 6 ++ packages/core/src/config/schema.ts | 6 ++ packages/core/src/config/types.ts | 15 +++ packages/core/src/index.ts | 6 +- packages/core/src/voice/detect.test.ts | 105 ++++++++++++++++++ packages/core/src/voice/detect.ts | 137 ++++++++++++++++++++++++ packages/core/src/voice/index.ts | 13 +++ 12 files changed, 478 insertions(+), 50 deletions(-) create mode 100644 apps/cli/src/voice-cmd.test.ts create mode 100644 packages/core/src/voice/detect.test.ts create mode 100644 packages/core/src/voice/detect.ts diff --git a/apps/cli/src/commands.ts b/apps/cli/src/commands.ts index af8a5e1..b7aab71 100644 --- a/apps/cli/src/commands.ts +++ b/apps/cli/src/commands.ts @@ -128,6 +128,9 @@ export interface SessionContext { credsStore?: CredentialsStore; /** User settings.json path (REPL-injected, honors --home) — backs /config set. */ userSettingsPath?: string; + /** Home dir override (REPL-injected from --home) — backs default-path lookups + * like /voice's `~/.deepcode/models/...` model probe. Defaults to os.homedir(). */ + home?: string; sessionId: string; sessions: SessionManager; usage: { @@ -1169,6 +1172,57 @@ export const TasksCommand: SlashCommand = { }, }; +export const VoiceCommand: SlashCommand = { + name: '/voice', + description: 'Check local voice-input (whisper.cpp) setup; `/voice setup` shows install steps.', + async run(args, ctx) { + const { detectVoice } = await import('@deepcode/core'); + const status = await detectVoice(ctx.settings.voice, { home: ctx.home }); + const forceSetup = (args[0] ?? 
'').toLowerCase() === 'setup'; + + if (status.ready && !forceSetup) { + return [ + '🎙 Voice input is ready — whisper.cpp, fully local (no audio leaves your machine).', + ` binary: ${status.binPath}`, + ` model: ${status.modelPath}`, + '', + 'Dictate from the REPL with the voice key (default Ctrl+V; remap in keybindings.json).', + 'Note: live mic capture lands in a follow-up — this step ships setup + detection.', + ]; + } + + const lines: string[] = [ + status.ready + ? '🎙 Voice input is ready. Setup reference below.' + : '🎙 Voice input is not set up yet. Enable local dictation (whisper.cpp — no cloud):', + '', + 'Detected:', + ` ${status.binPath ? '✓' : '✗'} whisper binary ${status.binPath ?? '(not found)'}`, + ` ${status.modelPath ? '✓' : '✗'} model ${status.modelPath ?? '(not found)'}`, + ]; + if (status.problems.length) { + lines.push('', 'Issues:'); + for (const p of status.problems) lines.push(` • ${p}`); + } + lines.push( + '', + 'Setup:', + ' 1. Install whisper.cpp', + ' macOS: brew install whisper-cpp', + ' Linux: build https://github.com/ggerganov/whisper.cpp, put `whisper` on PATH', + ' 2. Download a model (base.en ≈ 140 MB is a good default) and save it:', + ' mkdir -p ~/.deepcode/models', + ' cp ggml-base.en.bin ~/.deepcode/models/whisper-base.en.bin', + ' 3. 
(optional) Point DeepCode at custom paths in ~/.deepcode/settings.json:', + ' { "voice": { "binPath": "/opt/homebrew/bin/whisper-cli",', + ' "modelPath": "~/.deepcode/models/whisper-base.en.bin" } }', + '', + 'Full guide: docs/VOICE_INPUT.md', + ); + return lines; + }, +}; + export const BackgroundCommand: SlashCommand = { name: '/background', aliases: ['/bg'], @@ -1229,6 +1283,7 @@ export const BUILTIN_COMMANDS: SlashCommand[] = [ BtwCommand, TasksCommand, BackgroundCommand, + VoiceCommand, ]; // ────────────────────────────────────────────────────────────────────────── diff --git a/apps/cli/src/repl.ts b/apps/cli/src/repl.ts index 3e49860..5b109c5 100644 --- a/apps/cli/src/repl.ts +++ b/apps/cli/src/repl.ts @@ -437,6 +437,7 @@ export async function startRepl(opts: ReplOpts): Promise { creds, credsStore, userSettingsPath: settingsPaths({ cwd, home: opts.home }).userPath, + home: opts.home, sessionId: session.id, sessions, usage: { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 }, diff --git a/apps/cli/src/voice-cmd.test.ts b/apps/cli/src/voice-cmd.test.ts new file mode 100644 index 0000000..bbdac47 --- /dev/null +++ b/apps/cli/src/voice-cmd.test.ts @@ -0,0 +1,85 @@ +// Tests for the /voice slash command messaging. Detection logic itself is +// unit-tested in core (voice/detect.test.ts); here we drive the command end to +// end with real temp files so the "ready" path is deterministic, and bogus +// configured paths so the "not set up" path never depends on the host's PATH. 
+ +import { afterEach, describe, expect, it } from 'vitest'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { SessionManager } from '@deepcode/core'; +import { CommandRegistry, type SessionContext } from './commands.js'; + +const reg = new CommandRegistry(); +const tmps: string[] = []; +async function tmpDir(): Promise { + const d = await mkdtemp(join(tmpdir(), 'dc-voice-')); + tmps.push(d); + return d; +} +afterEach(async () => { + await Promise.all(tmps.splice(0).map((d) => rm(d, { recursive: true, force: true }))); +}); + +function ctx(overrides: Partial = {}): SessionContext { + return { + cwd: '/tmp/x', + model: 'deepseek-chat', + mode: 'default', + effort: 'medium', + settings: {}, + creds: { apiKey: 'sk-test' }, + sessionId: 's1', + sessions: new SessionManager({ root: '/tmp/x' }), + usage: { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 }, + ...overrides, + }; +} + +const run = (args: string[], c: SessionContext) => reg.match('/voice')!.cmd.run(args, c); + +describe('/voice', () => { + it('reports ready when configured binary + model both exist', async () => { + const dir = await tmpDir(); + const binPath = join(dir, 'whisper-cli'); + const modelPath = join(dir, 'model.bin'); + await writeFile(binPath, '#!/bin/sh\n'); + await writeFile(modelPath, 'GGML'); + const out = (await run([], ctx({ settings: { voice: { binPath, modelPath } } }))).join('\n'); + expect(out).toMatch(/ready/i); + expect(out).toContain(binPath); + expect(out).toContain(modelPath); + expect(out).toMatch(/Ctrl\+V/); + }); + + it('prints setup steps + issues when configured paths are missing', async () => { + const out = ( + await run( + [], + ctx({ settings: { voice: { binPath: '/no/such/whisper', modelPath: '/no/such/m.bin' } } }), + ) + ).join('\n'); + expect(out).toMatch(/not set up yet/i); + expect(out).toMatch(/brew install whisper-cpp/); + 
expect(out).toMatch(/docs\/VOICE_INPUT\.md/); + // The specific configured-but-missing problems surface under "Issues:". + expect(out).toMatch(/Issues:/); + expect(out).toContain('Configured voice.binPath not found: /no/such/whisper'); + expect(out).toContain('Configured voice.modelPath not found: /no/such/m.bin'); + }); + + it('`/voice setup` always shows install steps, even when ready', async () => { + const dir = await tmpDir(); + const binPath = join(dir, 'whisper-cli'); + const modelPath = join(dir, 'model.bin'); + await writeFile(binPath, ''); + await writeFile(modelPath, ''); + const out = (await run(['setup'], ctx({ settings: { voice: { binPath, modelPath } } }))).join( + '\n', + ); + expect(out).toMatch(/Setup:/); + expect(out).toMatch(/brew install whisper-cpp/); + // Still acknowledges it's already ready. + expect(out).toMatch(/ready/i); + }); +}); diff --git a/docs/BEHAVIOR_PARITY.md b/docs/BEHAVIOR_PARITY.md index eb5f547..c5c3821 100644 --- a/docs/BEHAVIOR_PARITY.md +++ b/docs/BEHAVIOR_PARITY.md @@ -21,55 +21,55 @@ Legend: `✅` matches · `🟡` matches with caveats · `🔄` deferred · `⚠ ## Slash commands (30+ in Claude Code, ~32 shipped in DeepCode) -| Command | Claude Code | DeepCode | Status | -| -------------------------- | ----------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------ | -| `/help` | ✓ | ✓ | ✅ | -| `/clear` | ✓ | ✓ | ✅ | -| `/exit` / `/quit` | ✓ | ✓ | ✅ | -| `/status` / `/doctor` | ✓ | ✓ | ✅ | -| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | -| `/mode` | ✓ | ✓ | ✅ | -| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | -| `/cost` / `/usage` | ✓ | ✓ | ✅ | -| `/context` | ✓ | ✓ | ✅ | -| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes 
user settings; no full arrow-key editor | -| `/resume` | ✓ | ✓ | ✅ — lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | -| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | -| `/mcp` | ✓ | ✓ | ✅ | -| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | -| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | -| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | -| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | -| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | -| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | -| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | -| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | -| `/voice` | ✓ | ✗ | 🔄 M8 | -| `/teleport` | ✓ | ✗ | 🔄 M8 | -| `/desktop` | ✓ | ✗ | 🔄 M6 | -| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | -| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | -| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | -| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | -| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | -| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | -| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | -| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | -| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` 
comments for the current branch's PR | -| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | -| `/loop` | ✓ | ✗ (skill avail) | 🟡 | -| `/terminal-setup` | ✓ | ✗ | 🔄 | -| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | -| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | -| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | -| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | -| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | -| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | -| `/privacy-settings` | ✓ | ✓ | ✅ — summarizes local data locations + what's sent to the DeepSeek API (read-only) | -| `/migrate-installer` | ✓ | ✗ | 🔄 | -| `/release-notes` | ✓ | ✓ | ✅ — prints the latest `CHANGELOG.md` entry | +| Command | Claude Code | DeepCode | Status | +| -------------------------- | ----------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `/help` | ✓ | ✓ | ✅ | +| `/clear` | ✓ | ✓ | ✅ | +| `/exit` / `/quit` | ✓ | ✓ | ✅ | +| `/status` / `/doctor` | ✓ | ✓ | ✅ | +| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | +| `/mode` | ✓ | ✓ | ✅ | +| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | +| `/cost` / `/usage` | ✓ | ✓ | ✅ | +| `/context` | ✓ | ✓ | ✅ | +| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes user settings; no full arrow-key editor | +| `/resume` | ✓ | ✓ | ✅ — 
lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | +| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | +| `/mcp` | ✓ | ✓ | ✅ | +| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | +| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | +| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | +| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | +| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | +| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | +| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | +| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | +| `/voice` | ✓ | ✓ | 🟡 — `/voice` detects whisper.cpp + a model and prints setup steps (docs/VOICE_INPUT.md); core `WhisperCppProvider` is wired; live mic capture lands in a follow-up slice | +| `/teleport` | ✓ | ✗ | 🔄 M8 | +| `/desktop` | ✓ | ✗ | 🔄 M6 | +| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | +| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | +| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | +| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | +| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | +| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | +| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | +| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g 
deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | +| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` comments for the current branch's PR | +| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | +| `/loop` | ✓ | ✗ (skill avail) | 🟡 | +| `/terminal-setup` | ✓ | ✗ | 🔄 | +| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | +| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | +| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | +| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | +| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | +| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | +| `/privacy-settings` | ✓ | ✓ | ✅ — summarizes local data locations + what's sent to the DeepSeek API (read-only) | +| `/migrate-installer` | ✓ | ✗ | 🔄 | +| `/release-notes` | ✓ | ✓ | ✅ — prints the latest `CHANGELOG.md` entry | --- diff --git a/packages/core/src/config/index.ts b/packages/core/src/config/index.ts index 02500ee..32bf474 100644 --- a/packages/core/src/config/index.ts +++ b/packages/core/src/config/index.ts @@ -15,6 +15,7 @@ export type { UpdateConfig, WorktreeConfig, AutoModeConfig, + VoiceConfig, } from './types.js'; export { diff --git a/packages/core/src/config/schema.test.ts b/packages/core/src/config/schema.test.ts index 3ee0774..a84fd80 100644 --- a/packages/core/src/config/schema.test.ts +++ b/packages/core/src/config/schema.test.ts @@ -51,6 +51,12 @@ describe('validateSettingsShallow', () => { expect(errs[0]).toMatch(/OnEverything/); }); + it('flags unknown voice provider but accepts whisper.cpp', () => { + expect(validateSettingsShallow({ voice: { provider: 'whisper.cpp' } })).toEqual([]); + const errs = 
validateSettingsShallow({ voice: { provider: 'azure' } }); + expect(errs[0]).toMatch(/voice\.provider "azure"/); + }); + it('returns no errors on empty config', () => { expect(validateSettingsShallow({})).toEqual([]); }); diff --git a/packages/core/src/config/schema.ts b/packages/core/src/config/schema.ts index 3c7d233..d648032 100644 --- a/packages/core/src/config/schema.ts +++ b/packages/core/src/config/schema.ts @@ -89,5 +89,11 @@ export function validateSettingsShallow(settings: Record): stri } } + const voiceProviderEnum = ['whisper.cpp', 'stub']; + const voice = settings['voice'] as { provider?: string } | undefined; + if (voice?.provider && !voiceProviderEnum.includes(voice.provider)) { + errors.push(`voice.provider "${voice.provider}" not in ${voiceProviderEnum.join(' | ')}`); + } + return errors; } diff --git a/packages/core/src/config/types.ts b/packages/core/src/config/types.ts index 6fa1a47..f77af1e 100644 --- a/packages/core/src/config/types.ts +++ b/packages/core/src/config/types.ts @@ -108,6 +108,18 @@ export interface AutoModeConfig { fallback?: 'ask' | 'deny'; } +export interface VoiceConfig { + /** + * Speech-to-text engine. Only 'whisper.cpp' is a real local engine; 'stub' + * returns an empty transcript (tests / "disabled"). Spec: docs/VOICE_INPUT.md. + */ + provider?: 'whisper.cpp' | 'stub'; + /** Path to the whisper CLI binary. Defaults to `whisper-cli`/`whisper` on PATH. */ + binPath?: string; + /** Path to the ggml model file (e.g. ~/.deepcode/models/whisper-base.en.bin). 
*/ + modelPath?: string; +} + export interface DeepCodeSettings { // Identity model?: string; @@ -166,6 +178,9 @@ export interface DeepCodeSettings { // Worktree worktree?: WorktreeConfig; + // Voice input (M8 — local whisper.cpp ASR; see docs/VOICE_INPUT.md) + voice?: VoiceConfig; + // Plugins (M5) plugins?: { globalEnabled?: boolean; diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 344b3d7..757713a 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -104,6 +104,7 @@ export { type UpdateConfig, type WorktreeConfig, type AutoModeConfig, + type VoiceConfig, } from './config/index.js'; // Credentials (M2; M3c adds ApiKeyHelperRefresher) @@ -334,15 +335,18 @@ export { type AgentStreamEvent, } from './ipc/protocol.js'; -// Voice input (M8 — whisper.cpp wrapper + stub provider) +// Voice input (M8 — whisper.cpp wrapper + stub provider + setup detection) export { WhisperCppProvider, StubVoiceProvider, parseWhisperOutput, + detectVoice, type VoiceProvider, type VoiceTranscript, type TranscribeOpts, type WhisperCppOpts, + type VoiceProbe, + type VoiceStatus, } from './voice/index.js'; // Auto-mode classifier (M3c-rest — LLM-judged tool gate when mode === 'auto') diff --git a/packages/core/src/voice/detect.test.ts b/packages/core/src/voice/detect.test.ts new file mode 100644 index 0000000..321da90 --- /dev/null +++ b/packages/core/src/voice/detect.test.ts @@ -0,0 +1,105 @@ +import { describe, expect, it } from 'vitest'; +import { join } from 'node:path'; +import { detectVoice, expandHome, type VoiceProbe } from './detect.js'; +import type { VoiceConfig } from '../config/types.js'; + +const HOME = '/home/u'; + +/** Build a fake probe where `present` is the set of paths/bins that "exist". */ +function probe( + present: Iterable, + overrides: Partial = {}, +): Partial { + const set = new Set(present); + return { + home: HOME, + fileExists: async (p) => set.has(p), + which: async (name) => (set.has(name) ? 
`/usr/bin/${name}` : null), + ...overrides, + }; +} + +describe('expandHome', () => { + it('expands ~ and ~/path, leaves others alone', () => { + expect(expandHome('~', HOME)).toBe(HOME); + expect(expandHome('~/m/x.bin', HOME)).toBe(join(HOME, 'm/x.bin')); + expect(expandHome('/abs/x.bin', HOME)).toBe('/abs/x.bin'); + expect(expandHome('rel/x.bin', HOME)).toBe('rel/x.bin'); + }); +}); + +describe('detectVoice', () => { + it('is ready when configured binPath + modelPath both exist', async () => { + const voice: VoiceConfig = { binPath: '/opt/whisper-cli', modelPath: '/models/base.bin' }; + const s = await detectVoice(voice, probe(['/opt/whisper-cli', '/models/base.bin'])); + expect(s.ready).toBe(true); + expect(s.binPath).toBe('/opt/whisper-cli'); + expect(s.modelPath).toBe('/models/base.bin'); + expect(s.problems).toEqual([]); + }); + + it('finds the binary on PATH when binPath is unset', async () => { + // 'whisper-cli' is the first candidate; PATH has it. + const def = join(HOME, '.deepcode', 'models', 'whisper-base.en.bin'); + const s = await detectVoice({ modelPath: def }, probe(['whisper-cli', def])); + expect(s.ready).toBe(true); + expect(s.binPath).toBe('/usr/bin/whisper-cli'); + }); + + it('falls back to the second PATH candidate (whisper)', async () => { + const s = await detectVoice( + { modelPath: '/m.bin' }, + probe(['whisper', '/m.bin']), // no whisper-cli, but whisper exists + ); + expect(s.binPath).toBe('/usr/bin/whisper'); + expect(s.ready).toBe(true); + }); + + it('uses the default ~/.deepcode model path when modelPath is unset', async () => { + const def = join(HOME, '.deepcode', 'models', 'whisper-base.en.bin'); + const s = await detectVoice({ binPath: '/b' }, probe(['/b', def])); + expect(s.ready).toBe(true); + expect(s.modelPath).toBe(def); + }); + + it('reports both missing pieces when nothing is installed', async () => { + const s = await detectVoice(undefined, probe([])); // empty PATH + fs + expect(s.ready).toBe(false); + 
expect(s.binPath).toBeUndefined(); + expect(s.modelPath).toBeUndefined(); + expect(s.problems.join('\n')).toMatch(/binary not found on PATH/); + expect(s.problems.join('\n')).toMatch(/no model at the default/); + }); + + it('flags a configured binPath / modelPath that does not exist', async () => { + const s = await detectVoice( + { binPath: '/nope/whisper', modelPath: '/nope/model.bin' }, + probe([]), + ); + expect(s.ready).toBe(false); + expect(s.problems).toContain('Configured voice.binPath not found: /nope/whisper'); + expect(s.problems).toContain('Configured voice.modelPath not found: /nope/model.bin'); + }); + + it('expands ~ in configured paths against the probe home', async () => { + const bin = join(HOME, 'bin', 'whisper'); + const model = join(HOME, 'm', 'x.bin'); + const s = await detectVoice( + { binPath: '~/bin/whisper', modelPath: '~/m/x.bin' }, + probe([bin, model]), + ); + expect(s.ready).toBe(true); + expect(s.binPath).toBe(bin); + expect(s.modelPath).toBe(model); + }); + + it('is not ready with an unknown provider even if bin + model resolve', async () => { + const s = await detectVoice( + { provider: 'azure' as unknown as VoiceConfig['provider'], binPath: '/b', modelPath: '/m' }, + probe(['/b', '/m']), + ); + expect(s.ready).toBe(false); + expect(s.provider).toBe('azure'); + expect(s.problems.join('\n')).toMatch(/Unknown voice provider/); + }); +}); diff --git a/packages/core/src/voice/detect.ts b/packages/core/src/voice/detect.ts new file mode 100644 index 0000000..d70d5ef --- /dev/null +++ b/packages/core/src/voice/detect.ts @@ -0,0 +1,137 @@ +// Voice setup detection — resolves the whisper.cpp binary + model so the +// `/voice` command (and, later, the desktop client) can report readiness and +// print actionable setup steps. Pure logic over injectable probes so it is +// unit-testable without touching the real PATH / filesystem. 
+// Spec: docs/VOICE_INPUT.md + +import { access, stat } from 'node:fs/promises'; +import { constants as FS } from 'node:fs'; +import { homedir } from 'node:os'; +import { delimiter, join } from 'node:path'; +import type { VoiceConfig } from '../config/types.js'; + +/** Binary names searched on PATH when `voice.binPath` is unset, in order. */ +export const WHISPER_BIN_CANDIDATES = ['whisper-cli', 'whisper'] as const; + +/** Default model location probed when `voice.modelPath` is unset (under home). */ +export const DEFAULT_MODEL_RELPATH = ['.deepcode', 'models', 'whisper-base.en.bin'] as const; + +/** Filesystem / PATH probes — injectable so detection is deterministic in tests. */ +export interface VoiceProbe { + /** Resolve an executable `name` on PATH to an absolute path, or null. */ + which(name: string): Promise; + /** True if a readable regular file exists at `path`. */ + fileExists(path: string): Promise; + /** Home dir, for ~ expansion + the default model path. */ + home: string; +} + +export interface VoiceStatus { + /** True iff a supported provider, a binary, and a model were all resolved. */ + ready: boolean; + /** Resolved provider name (defaults to 'whisper.cpp'). */ + provider: string; + /** Resolved whisper binary (absolute path), if found. */ + binPath?: string; + /** Resolved model file (absolute path), if found. */ + modelPath?: string; + /** Human-readable reasons it is not ready (empty when ready). */ + problems: string[]; +} + +/** Expand a leading `~` / `~/` to the home dir. Other paths pass through. */ +export function expandHome(p: string, home: string): string { + if (p === '~') return home; + if (p.startsWith('~/')) return join(home, p.slice(2)); + return p; +} + +/** Real PATH lookup — first dir in $PATH holding an executable `name`. */ +async function whichOnPath(name: string): Promise { + const dirs = (process.env['PATH'] ?? 
'').split(delimiter).filter(Boolean); + for (const dir of dirs) { + const candidate = join(dir, name); + try { + await access(candidate, FS.X_OK); + return candidate; + } catch { + /* not here, or not executable */ + } + } + return null; +} + +/** Real existence check — true only for a regular file. */ +async function isFile(path: string): Promise { + try { + return (await stat(path)).isFile(); + } catch { + return false; + } +} + +/** + * Detect whether local voice input (whisper.cpp) is ready to use. + * + * Resolution order: + * - binary: `voice.binPath` (if set) else the first of + * {@link WHISPER_BIN_CANDIDATES} found on PATH. + * - model: `voice.modelPath` (if set) else the documented default + * `~/.deepcode/models/whisper-base.en.bin`. + * + * Never throws — every missing/invalid piece becomes a `problems` entry. + */ +export async function detectVoice( + voice: VoiceConfig | undefined, + probe?: Partial, +): Promise { + const home = probe?.home ?? homedir(); + const which = probe?.which ?? whichOnPath; + const fileExists = probe?.fileExists ?? isFile; + + const provider = voice?.provider ?? 'whisper.cpp'; + const problems: string[] = []; + + if (provider !== 'whisper.cpp' && provider !== 'stub') { + problems.push(`Unknown voice provider "${provider}" — expected "whisper.cpp".`); + } + + // Resolve the binary. + let binPath: string | undefined; + if (voice?.binPath) { + const p = expandHome(voice.binPath, home); + if (await fileExists(p)) binPath = p; + else problems.push(`Configured voice.binPath not found: ${voice.binPath}`); + } else { + for (const name of WHISPER_BIN_CANDIDATES) { + const found = await which(name); + if (found) { + binPath = found; + break; + } + } + if (!binPath) { + problems.push( + `whisper.cpp binary not found on PATH (looked for ${WHISPER_BIN_CANDIDATES.join(', ')}).`, + ); + } + } + + // Resolve the model. 
+ let modelPath: string | undefined; + if (voice?.modelPath) { + const p = expandHome(voice.modelPath, home); + if (await fileExists(p)) modelPath = p; + else problems.push(`Configured voice.modelPath not found: ${voice.modelPath}`); + } else { + const def = join(home, ...DEFAULT_MODEL_RELPATH); + if (await fileExists(def)) modelPath = def; + else + problems.push( + `No voice.modelPath set, and no model at the default ~/${DEFAULT_MODEL_RELPATH.join('/')}.`, + ); + } + + const ready = problems.length === 0 && !!binPath && !!modelPath; + return { ready, provider, binPath, modelPath, problems }; +} diff --git a/packages/core/src/voice/index.ts b/packages/core/src/voice/index.ts index 793a1d3..6efe668 100644 --- a/packages/core/src/voice/index.ts +++ b/packages/core/src/voice/index.ts @@ -137,3 +137,16 @@ export class StubVoiceProvider implements VoiceProvider { return { text: '', latencyMs: 0 }; } } + +// ────────────────────────────────────────────────────────────────────────── +// Setup detection — is whisper.cpp + a model installed and configured? +// ────────────────────────────────────────────────────────────────────────── + +export { + detectVoice, + expandHome, + WHISPER_BIN_CANDIDATES, + DEFAULT_MODEL_RELPATH, + type VoiceProbe, + type VoiceStatus, +} from './detect.js'; From 434869d0c485cbc05bf45e85daa672497dc9aabb Mon Sep 17 00:00:00 2001 From: t Date: Mon, 8 Jun 2026 13:41:58 +0800 Subject: [PATCH 2/5] feat(voice): CLI mic capture + transcribe via /voice (slice 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Type /voice in the REPL to dictate: record from the mic, transcribe locally with whisper.cpp, and pre-fill the input line with the transcript to edit before sending. Builds on slice 1's detection. Spec: docs/VOICE_INPUT.md. 
Core: - voice/record.ts: detectRecorder() finds ffmpeg / rec / sox on PATH; buildRecordArgs() builds the 16 kHz mono WAV command per tool + OS (avfoundation on macOS, alsa on Linux; rec/sox use the default device); recordToWav() spawns it and stops on an AbortSignal (SIGINT so the WAV trailer flushes — a non-zero exit after abort is expected, a non-zero exit without one rejects, e.g. no mic). Injectable which/spawn for tests. - VoiceConfig gains optional inputDevice (ffmpeg override); schema updated. CLI: - voice-capture.ts: orchestrates detect → record (Enter to stop) → WhisperCppProvider.transcribe → delete the temp WAV (+ .txt side-file) → return transcript + status lines. Handles not-ready / no-recorder / no-speech / failures gracefully. - /voice now triggers capture when the REPL wires ctx.voiceCapture; falls back to readiness/setup output otherwise. `/voice setup` still forces the install steps. Setup lines extracted to pure, reused helpers. - REPL wires voiceCapture and pre-fills the next prompt via rl.write() once the transcript is ready (ctx.prefillInput). Docs: VOICE_INPUT.md usage now describes the /voice flow (was Ctrl+V) + a recorder-install section; BEHAVIOR_PARITY /voice row updated for CLI capture. Tests: 9 core recorder cases (detect/buildArgs/record orchestration) + 3 new CLI cases (capture callback, cancel/empty, setup bypass). Real-mic end-to-end is manual (no audio hardware in CI). core 661 / cli 151, typecheck + lint + format all clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/cli/src/commands.ts | 108 ++++++++----- apps/cli/src/repl.ts | 19 ++- apps/cli/src/voice-capture.ts | 108 +++++++++++++ apps/cli/src/voice-cmd.test.ts | 35 ++++- docs/BEHAVIOR_PARITY.md | 98 ++++++------ docs/VOICE_INPUT.md | 37 +++-- packages/core/schemas/settings.schema.json | 3 +- packages/core/src/config/types.ts | 6 + packages/core/src/index.ts | 6 + packages/core/src/voice/index.ts | 15 ++ packages/core/src/voice/record.test.ts | 103 +++++++++++++ packages/core/src/voice/record.ts | 169 +++++++++++++++++++++ 12 files changed, 605 insertions(+), 102 deletions(-) create mode 100644 apps/cli/src/voice-capture.ts create mode 100644 packages/core/src/voice/record.test.ts create mode 100644 packages/core/src/voice/record.ts diff --git a/apps/cli/src/commands.ts b/apps/cli/src/commands.ts index b7aab71..a405449 100644 --- a/apps/cli/src/commands.ts +++ b/apps/cli/src/commands.ts @@ -11,6 +11,7 @@ import type { SessionMeta, StoredMessage, TaskManager, + VoiceStatus, } from '@deepcode/core'; import { contextWindowFor, @@ -131,6 +132,14 @@ export interface SessionContext { /** Home dir override (REPL-injected from --home) — backs default-path lookups * like /voice's `~/.deepcode/models/...` model probe. Defaults to os.homedir(). */ home?: string; + /** + * Interactive voice capture, wired by the REPL (it owns readline + the mic): + * record → press Enter to stop → transcribe → return the text + display lines. + * `transcript` is null on cancel / not-ready / error. Absent in headless mode. + */ + voiceCapture?: () => Promise<{ transcript: string | null; lines: string[] }>; + /** Set by /voice — the REPL pre-fills the next input line with this text. */ + prefillInput?: string; sessionId: string; sessions: SessionManager; usage: { @@ -1172,54 +1181,71 @@ export const TasksCommand: SlashCommand = { }, }; +/** "Ready" status lines for /voice (non-interactive / headless fallback). 
*/ +export function voiceReadyLines(status: VoiceStatus): string[] { + return [ + '🎙 Voice input is ready — whisper.cpp, fully local (no audio leaves your machine).', + ` binary: ${status.binPath}`, + ` model: ${status.modelPath}`, + '', + 'Type /voice in the interactive REPL to dictate (record → Enter to stop → transcribe).', + ]; +} + +/** Setup/troubleshooting instructions for /voice, driven by a detection result. */ +export function voiceSetupLines(status: VoiceStatus): string[] { + const lines: string[] = [ + status.ready + ? '🎙 Voice input is ready. Setup reference below.' + : '🎙 Voice input is not set up yet. Enable local dictation (whisper.cpp — no cloud):', + '', + 'Detected:', + ` ${status.binPath ? '✓' : '✗'} whisper binary ${status.binPath ?? '(not found)'}`, + ` ${status.modelPath ? '✓' : '✗'} model ${status.modelPath ?? '(not found)'}`, + ]; + if (status.problems.length) { + lines.push('', 'Issues:'); + for (const p of status.problems) lines.push(` • ${p}`); + } + lines.push( + '', + 'Setup:', + ' 1. Install whisper.cpp', + ' macOS: brew install whisper-cpp', + ' Linux: build https://github.com/ggerganov/whisper.cpp, put `whisper` on PATH', + ' 2. Download a model (base.en ≈ 140 MB is a good default) and save it:', + ' mkdir -p ~/.deepcode/models', + ' cp ggml-base.en.bin ~/.deepcode/models/whisper-base.en.bin', + ' 3. Install a mic recorder (either): brew install ffmpeg · brew install sox', + ' 4. 
(optional) Point DeepCode at custom paths in ~/.deepcode/settings.json:', + ' { "voice": { "binPath": "/opt/homebrew/bin/whisper-cli",', + ' "modelPath": "~/.deepcode/models/whisper-base.en.bin" } }', + '', + 'Full guide: docs/VOICE_INPUT.md', + ); + return lines; +} + export const VoiceCommand: SlashCommand = { name: '/voice', - description: 'Check local voice-input (whisper.cpp) setup; `/voice setup` shows install steps.', + description: + 'Dictate via local whisper.cpp (record → Enter → transcribe); `/voice setup` for steps.', async run(args, ctx) { - const { detectVoice } = await import('@deepcode/core'); - const status = await detectVoice(ctx.settings.voice, { home: ctx.home }); const forceSetup = (args[0] ?? '').toLowerCase() === 'setup'; - if (status.ready && !forceSetup) { - return [ - '🎙 Voice input is ready — whisper.cpp, fully local (no audio leaves your machine).', - ` binary: ${status.binPath}`, - ` model: ${status.modelPath}`, - '', - 'Dictate from the REPL with the voice key (default Ctrl+V; remap in keybindings.json).', - 'Note: live mic capture lands in a follow-up — this step ships setup + detection.', - ]; + // Interactive REPL: record + transcribe via the wired callback, then let the + // REPL pre-fill the input line with the transcript for the user to edit. + if (!forceSetup && ctx.voiceCapture) { + const r = await ctx.voiceCapture(); + if (r.transcript) ctx.prefillInput = r.transcript; + return r.lines; } - const lines: string[] = [ - status.ready - ? '🎙 Voice input is ready. Setup reference below.' - : '🎙 Voice input is not set up yet. Enable local dictation (whisper.cpp — no cloud):', - '', - 'Detected:', - ` ${status.binPath ? '✓' : '✗'} whisper binary ${status.binPath ?? '(not found)'}`, - ` ${status.modelPath ? '✓' : '✗'} model ${status.modelPath ?? '(not found)'}`, - ]; - if (status.problems.length) { - lines.push('', 'Issues:'); - for (const p of status.problems) lines.push(` • ${p}`); - } - lines.push( - '', - 'Setup:', - ' 1. 
Install whisper.cpp', - ' macOS: brew install whisper-cpp', - ' Linux: build https://github.com/ggerganov/whisper.cpp, put `whisper` on PATH', - ' 2. Download a model (base.en ≈ 140 MB is a good default) and save it:', - ' mkdir -p ~/.deepcode/models', - ' cp ggml-base.en.bin ~/.deepcode/models/whisper-base.en.bin', - ' 3. (optional) Point DeepCode at custom paths in ~/.deepcode/settings.json:', - ' { "voice": { "binPath": "/opt/homebrew/bin/whisper-cli",', - ' "modelPath": "~/.deepcode/models/whisper-base.en.bin" } }', - '', - 'Full guide: docs/VOICE_INPUT.md', - ); - return lines; + // Headless / `/voice setup`: report readiness or print setup instructions. + const { detectVoice } = await import('@deepcode/core'); + const status = await detectVoice(ctx.settings.voice, { home: ctx.home }); + if (status.ready && !forceSetup) return voiceReadyLines(status); + return voiceSetupLines(status); }, }; diff --git a/apps/cli/src/repl.ts b/apps/cli/src/repl.ts index 5b109c5..150854d 100644 --- a/apps/cli/src/repl.ts +++ b/apps/cli/src/repl.ts @@ -52,6 +52,7 @@ import { import { createInterface } from 'node:readline/promises'; import type { Readable, Writable } from 'node:stream'; import { CommandRegistry, type SessionContext } from './commands.js'; +import { captureVoice } from './voice-capture.js'; import { resolveEffort } from './parse-args.js'; import { TrustStore } from './trust.js'; import { resolveBuiltinSkillsDir } from './builtin-skills.js'; @@ -453,6 +454,9 @@ export async function startRepl(opts: ReplOpts): Promise { ...(pluginsWire?.spawnFailures.map((n) => `${n}: failed to start`) ?? []), ], initFlow: () => runInitFlow({ cwd, output, rl, provider, model, maxTokens, temperature }), + // M8: /voice records from the mic + transcribes via whisper.cpp, then the + // loop pre-fills the next input line with the transcript (rl is created below). 
+ voiceCapture: () => captureVoice({ rl, output, settings, home: opts.home }), // M7: /rewind needs access to history + provider. provider, history, @@ -549,10 +553,19 @@ export async function startRepl(opts: ReplOpts): Promise { }; await fireLifecycle('SessionStart', { sessionId: session.id, source: 'cli' }); + // Text to inject into the next prompt's input buffer (e.g. a /voice transcript + // the user can edit before submitting). Written right after the prompt renders. + let pendingPrefill: string | undefined; + while (true) { let userInput: string; try { - userInput = await rl.question('› '); + const question = rl.question('› '); + if (pendingPrefill !== undefined) { + rl.write(pendingPrefill); + pendingPrefill = undefined; + } + userInput = await question; } catch { break; } @@ -590,6 +603,10 @@ export async function startRepl(opts: ReplOpts): Promise { history = ctx.newHistory; ctx.newHistory = undefined; } + if (ctx.prefillInput) { + pendingPrefill = ctx.prefillInput; + ctx.prefillInput = undefined; + } if (ctx.exitRequested) break; continue; } diff --git a/apps/cli/src/voice-capture.ts b/apps/cli/src/voice-capture.ts new file mode 100644 index 0000000..e032eec --- /dev/null +++ b/apps/cli/src/voice-capture.ts @@ -0,0 +1,108 @@ +// Interactive voice capture for the REPL: detect whisper.cpp + a recorder, +// record from the mic until the user presses Enter, transcribe locally, and +// return the text so the REPL can pre-fill the input line. The audio file is +// written to $TMPDIR and deleted right after transcription (see VOICE_INPUT.md). 
+ +import { randomUUID } from 'node:crypto'; +import { rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import type { Interface as ReadlineInterface } from 'node:readline/promises'; +import type { Writable } from 'node:stream'; +import { + detectRecorder, + detectVoice, + recordToWav, + WhisperCppProvider, + type DeepCodeSettings, +} from '@deepcode/core'; +import { voiceSetupLines } from './commands.js'; + +export interface VoiceCaptureDeps { + rl: ReadlineInterface; + output: Writable; + settings: DeepCodeSettings; + /** Home override (honors --home), for the default model-path probe. */ + home?: string; +} + +export interface VoiceCaptureResult { + /** Transcribed text, or null on cancel / not-ready / empty / error. */ + transcript: string | null; + /** Lines for the REPL to print (status, errors, or setup steps). */ + lines: string[]; +} + +export async function captureVoice(deps: VoiceCaptureDeps): Promise<VoiceCaptureResult> { + const { rl, output, settings, home } = deps; + + const status = await detectVoice(settings.voice, { home }); + if (!status.ready) return { transcript: null, lines: voiceSetupLines(status) }; + + const rec = await detectRecorder(); + if (!rec.found || !rec.bin || !rec.binPath) { + return { + transcript: null, + lines: [ + '🎙 whisper.cpp is ready, but no microphone recorder was found.', + ` • ${rec.problems[0] ?? 'no recorder on PATH'}`, + ' Install one: brew install ffmpeg · brew install sox', + ], + }; + } + + const wav = join(tmpdir(), `deepcode-voice-${randomUUID()}.wav`); + const cleanup = async (): Promise<void> => { + await rm(wav, { force: true }); + await rm(`${wav}.txt`, { force: true }); // whisper --output-txt side-file + }; + + // Record until the user presses Enter (abort → SIGINT → recorder flushes WAV).
+ const ac = new AbortController(); + let recErr: Error | undefined; + const recording = recordToWav({ + outPath: wav, + bin: rec.bin, + binPath: rec.binPath, + signal: ac.signal, + device: settings.voice?.inputDevice, + }).catch((e: unknown) => { + recErr = e as Error; + }); + + output.write(` 🎙 Recording with ${rec.bin}… press Enter to stop.\n`); + await rl.question(''); + ac.abort(); + await recording; + + if (recErr) { + await cleanup(); + return { + transcript: null, + lines: [` ⚠ Recording failed: ${recErr.message}`, ' Run `/voice setup` for help.'], + }; + } + + try { + output.write(' … transcribing locally\n'); + const provider = new WhisperCppProvider({ + binPath: status.binPath, + modelPath: status.modelPath!, + }); + const { text } = await provider.transcribe(wav); + await cleanup(); + const transcript = text.trim(); + if (!transcript) { + return { transcript: null, lines: [' (No speech detected — nothing inserted.)'] }; + } + return { + transcript, + lines: [ + ` 🎙 Transcribed (${transcript.length} chars) — review the input line, edit, then press Enter.`, + ], + }; + } catch (e) { + await cleanup(); + return { transcript: null, lines: [` ⚠ Transcription failed: ${(e as Error).message}`] }; + } +} diff --git a/apps/cli/src/voice-cmd.test.ts b/apps/cli/src/voice-cmd.test.ts index bbdac47..55c5ae5 100644 --- a/apps/cli/src/voice-cmd.test.ts +++ b/apps/cli/src/voice-cmd.test.ts @@ -45,11 +45,12 @@ describe('/voice', () => { const modelPath = join(dir, 'model.bin'); await writeFile(binPath, '#!/bin/sh\n'); await writeFile(modelPath, 'GGML'); + // No voiceCapture wired (headless / non-interactive) → report readiness. 
const out = (await run([], ctx({ settings: { voice: { binPath, modelPath } } }))).join('\n'); expect(out).toMatch(/ready/i); expect(out).toContain(binPath); expect(out).toContain(modelPath); - expect(out).toMatch(/Ctrl\+V/); + expect(out).toMatch(/type \/voice/i); }); it('prints setup steps + issues when configured paths are missing', async () => { @@ -82,4 +83,36 @@ describe('/voice', () => { // Still acknowledges it's already ready. expect(out).toMatch(/ready/i); }); + + it('runs the wired capture callback and pre-fills the transcript', async () => { + const c = ctx({ + voiceCapture: async () => ({ transcript: 'refactor the parser', lines: ['🎙 Transcribed'] }), + }); + const out = (await run([], c)).join('\n'); + expect(out).toContain('Transcribed'); + expect(c.prefillInput).toBe('refactor the parser'); // REPL will inject this + }); + + it('does not pre-fill when capture is cancelled / empty', async () => { + const c = ctx({ + voiceCapture: async () => ({ transcript: null, lines: ['(No speech detected)'] }), + }); + const out = (await run([], c)).join('\n'); + expect(out).toMatch(/no speech/i); + expect(c.prefillInput).toBeUndefined(); + }); + + it('`/voice setup` bypasses capture even when a callback is wired', async () => { + let called = false; + const c = ctx({ + settings: { voice: { binPath: '/no/such', modelPath: '/no/such' } }, + voiceCapture: async () => { + called = true; + return { transcript: 'x', lines: [] }; + }, + }); + const out = (await run(['setup'], c)).join('\n'); + expect(called).toBe(false); + expect(out).toMatch(/Setup:/); + }); }); diff --git a/docs/BEHAVIOR_PARITY.md b/docs/BEHAVIOR_PARITY.md index c5c3821..db0003e 100644 --- a/docs/BEHAVIOR_PARITY.md +++ b/docs/BEHAVIOR_PARITY.md @@ -21,55 +21,55 @@ Legend: `✅` matches · `🟡` matches with caveats · `🔄` deferred · `⚠ ## Slash commands (30+ in Claude Code, ~32 shipped in DeepCode) -| Command | Claude Code | DeepCode | Status | -| -------------------------- | ----------- | ------------------ 
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `/help` | ✓ | ✓ | ✅ | -| `/clear` | ✓ | ✓ | ✅ | -| `/exit` / `/quit` | ✓ | ✓ | ✅ | -| `/status` / `/doctor` | ✓ | ✓ | ✅ | -| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | -| `/mode` | ✓ | ✓ | ✅ | -| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | -| `/cost` / `/usage` | ✓ | ✓ | ✅ | -| `/context` | ✓ | ✓ | ✅ | -| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes user settings; no full arrow-key editor | -| `/resume` | ✓ | ✓ | ✅ — lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | -| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | -| `/mcp` | ✓ | ✓ | ✅ | -| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | -| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | -| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | -| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | -| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | -| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | -| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | -| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | -| `/voice` | ✓ | ✓ | 🟡 — `/voice` detects whisper.cpp + a model and prints setup steps (docs/VOICE_INPUT.md); core `WhisperCppProvider` is wired; live mic capture lands in a follow-up slice | -| `/teleport` | ✓ | ✗ | 🔄 
M8 | -| `/desktop` | ✓ | ✗ | 🔄 M6 | -| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | -| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | -| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | -| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | -| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | -| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | -| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | -| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | -| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` comments for the current branch's PR | -| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | -| `/loop` | ✓ | ✗ (skill avail) | 🟡 | -| `/terminal-setup` | ✓ | ✗ | 🔄 | -| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | -| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | -| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | -| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | -| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | -| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | -| `/privacy-settings` | ✓ | ✓ | ✅ — summarizes local data locations + what's sent to the DeepSeek API (read-only) | -| `/migrate-installer` | ✓ | ✗ | 🔄 | -| `/release-notes` | ✓ | ✓ | ✅ — prints the latest `CHANGELOG.md` entry | +| Command | Claude 
Code | DeepCode | Status | +| -------------------------- | ----------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `/help` | ✓ | ✓ | ✅ | +| `/clear` | ✓ | ✓ | ✅ | +| `/exit` / `/quit` | ✓ | ✓ | ✅ | +| `/status` / `/doctor` | ✓ | ✓ | ✅ | +| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | +| `/mode` | ✓ | ✓ | ✅ | +| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | +| `/cost` / `/usage` | ✓ | ✓ | ✅ | +| `/context` | ✓ | ✓ | ✅ | +| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes user settings; no full arrow-key editor | +| `/resume` | ✓ | ✓ | ✅ — lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | +| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | +| `/mcp` | ✓ | ✓ | ✅ | +| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | +| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | +| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | +| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | +| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | +| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | +| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | +| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | +| `/voice` | ✓ | ✓ | 🟡 — CLI: `/voice` records via ffmpeg/sox → whisper.cpp → pre-fills the input line (`/voice setup` for steps; fully 
local). Desktop 🎙 button is a follow-up slice | +| `/teleport` | ✓ | ✗ | 🔄 M8 | +| `/desktop` | ✓ | ✗ | 🔄 M6 | +| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | +| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | +| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | +| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | +| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | +| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | +| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | +| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | +| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` comments for the current branch's PR | +| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | +| `/loop` | ✓ | ✗ (skill avail) | 🟡 | +| `/terminal-setup` | ✓ | ✗ | 🔄 | +| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | +| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | +| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | +| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | +| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | +| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | +| `/privacy-settings` | ✓ | ✓ | ✅ — summarizes local data locations + what's sent to the DeepSeek API (read-only) | +| `/migrate-installer` | ✓ | ✗ | 🔄 | +| `/release-notes` 
| ✓ | ✓ | ✅ — prints the latest `CHANGELOG.md` entry | --- diff --git a/docs/VOICE_INPUT.md b/docs/VOICE_INPUT.md index 58d12de..0112235 100644 --- a/docs/VOICE_INPUT.md +++ b/docs/VOICE_INPUT.md @@ -63,6 +63,19 @@ mkdir -p ~/.deepcode/models cp models/ggml-base.en.bin ~/.deepcode/models/whisper-base.en.bin ``` +## Install a mic recorder + +DeepCode records your microphone with whichever recorder it finds on PATH — +`ffmpeg` is tried first, then sox's `rec` / `sox`: + +```bash +# macOS +brew install ffmpeg # or: brew install sox + +# Linux (Debian/Ubuntu) +sudo apt install ffmpeg # or: sudo apt install sox +``` + ## Configure DeepCode In `~/.deepcode/settings.json`: @@ -77,20 +90,26 @@ In `~/.deepcode/settings.json`: } ``` -(The `binPath` defaults to `whisper` on PATH if you omit it.) +(The `binPath` defaults to `whisper-cli` / `whisper` on PATH if you omit it.) +If ffmpeg captures from the wrong input, set `voice.inputDevice` — e.g. +`":1"` for avfoundation (macOS) or `"hw:1"` for ALSA (Linux). sox/rec always +use the system default device. ## Usage -In the CLI REPL, press the voice toggle key (default `Ctrl+V`; remap in -`~/.deepcode/keybindings.json`). DeepCode: +In the CLI REPL, type `/voice` and press Enter. DeepCode: + +1. Records audio from your default mic (via ffmpeg or sox) into a temp + `.wav` file. +2. Stops when you press Enter again (or after a 60 s safety cap). +3. Spawns whisper.cpp to transcribe the `.wav` locally. +4. Pre-fills the input line with the transcript — edit it if needed, then + press Enter to send. -1. Records audio from your default mic into a temp `.wav` file. -2. Stops recording on the next key press OR after 60 s of silence. -3. Spawns whisper.cpp to transcribe the .wav. -4. Inserts the transcribed text into the input box (you can edit before - submitting). +Run `/voice setup` any time to print install steps and what's detected. -In the Mac client (M6-rest), the same flow appears as a 🎙 button. 
+In the Mac client (M6-rest), the same flow appears as a 🎙 button in the +composer. ## Privacy diff --git a/packages/core/schemas/settings.schema.json b/packages/core/schemas/settings.schema.json index 030f1ae..35b073e 100644 --- a/packages/core/schemas/settings.schema.json +++ b/packages/core/schemas/settings.schema.json @@ -133,7 +133,8 @@ "properties": { "provider": { "type": "string", "enum": ["whisper.cpp", "stub"] }, "binPath": { "type": "string" }, - "modelPath": { "type": "string" } + "modelPath": { "type": "string" }, + "inputDevice": { "type": "string" } } } }, diff --git a/packages/core/src/config/types.ts b/packages/core/src/config/types.ts index f77af1e..81554aa 100644 --- a/packages/core/src/config/types.ts +++ b/packages/core/src/config/types.ts @@ -118,6 +118,12 @@ export interface VoiceConfig { binPath?: string; /** Path to the ggml model file (e.g. ~/.deepcode/models/whisper-base.en.bin). */ modelPath?: string; + /** + * Override the mic input device passed to ffmpeg (e.g. ':1' for avfoundation, + * 'hw:0' for alsa). Default: ':default' (macOS) / 'default' (Linux). sox/rec + * ignore this and use the system default device. 
+ */ + inputDevice?: string; } export interface DeepCodeSettings { diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 757713a..a685970 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -341,12 +341,18 @@ export { StubVoiceProvider, parseWhisperOutput, detectVoice, + detectRecorder, + recordToWav, + buildRecordArgs, type VoiceProvider, type VoiceTranscript, type TranscribeOpts, type WhisperCppOpts, type VoiceProbe, type VoiceStatus, + type RecorderBin, + type RecorderStatus, + type RecordToWavOpts, } from './voice/index.js'; // Auto-mode classifier (M3c-rest — LLM-judged tool gate when mode === 'auto') diff --git a/packages/core/src/voice/index.ts b/packages/core/src/voice/index.ts index 6efe668..401b679 100644 --- a/packages/core/src/voice/index.ts +++ b/packages/core/src/voice/index.ts @@ -150,3 +150,18 @@ export { type VoiceProbe, type VoiceStatus, } from './detect.js'; + +// ────────────────────────────────────────────────────────────────────────── +// Microphone capture — record a WAV for whisper.cpp to transcribe. +// ────────────────────────────────────────────────────────────────────────── + +export { + detectRecorder, + recordToWav, + buildRecordArgs, + RECORDER_CANDIDATES, + type RecorderBin, + type RecorderStatus, + type RecordToWavOpts, + type RecordArgsOpts, +} from './record.js'; diff --git a/packages/core/src/voice/record.test.ts b/packages/core/src/voice/record.test.ts new file mode 100644 index 0000000..7bd035c --- /dev/null +++ b/packages/core/src/voice/record.test.ts @@ -0,0 +1,103 @@ +import { EventEmitter } from 'node:events'; +import type { ChildProcess } from 'node:child_process'; +import { describe, expect, it } from 'vitest'; +import { buildRecordArgs, detectRecorder, recordToWav } from './record.js'; + +describe('detectRecorder', () => { + it('prefers ffmpeg when present', async () => { + const r = await detectRecorder(async (n) => (n === 'ffmpeg' ? 
`/usr/bin/${n}` : null)); + expect(r.found).toBe(true); + expect(r.bin).toBe('ffmpeg'); + expect(r.binPath).toBe('/usr/bin/ffmpeg'); + }); + + it('falls back to rec, then sox', async () => { + const recOnly = await detectRecorder(async (n) => (n === 'rec' ? '/usr/bin/rec' : null)); + expect(recOnly.bin).toBe('rec'); + const soxOnly = await detectRecorder(async (n) => (n === 'sox' ? '/usr/bin/sox' : null)); + expect(soxOnly.bin).toBe('sox'); + }); + + it('reports a problem when nothing is installed', async () => { + const r = await detectRecorder(async () => null); + expect(r.found).toBe(false); + expect(r.problems.join('\n')).toMatch(/No microphone recorder/); + }); +}); + +describe('buildRecordArgs', () => { + it('ffmpeg uses avfoundation on macOS and 16k mono', () => { + const a = buildRecordArgs('ffmpeg', '/t/o.wav', { platform: 'darwin', maxSeconds: 60 }); + expect(a).toEqual( + expect.arrayContaining(['-f', 'avfoundation', '-i', ':default', '-ar', '16000', '-ac', '1']), + ); + expect(a).toContain('-t'); + expect(a[a.length - 1]).toBe('/t/o.wav'); + }); + + it('ffmpeg uses alsa on Linux and honors a custom device', () => { + const a = buildRecordArgs('ffmpeg', '/t/o.wav', { platform: 'linux', device: 'hw:1' }); + expect(a).toEqual(expect.arrayContaining(['-f', 'alsa', '-i', 'hw:1'])); + }); + + it('ffmpeg throws on an unsupported platform without a device', () => { + expect(() => buildRecordArgs('ffmpeg', '/t/o.wav', { platform: 'win32' })).toThrow( + /inputDevice/, + ); + }); + + it('rec records the default device (no -d); sox adds -d', () => { + const rec = buildRecordArgs('rec', '/t/o.wav', { maxSeconds: 30 }); + expect(rec).not.toContain('-d'); + expect(rec).toEqual( + expect.arrayContaining(['-r', '16000', '-c', '1', '/t/o.wav', 'trim', '0', '30']), + ); + const sox = buildRecordArgs('sox', '/t/o.wav'); + expect(sox).toContain('-d'); + }); +}); + +/** Fake ChildProcess whose stderr emits `err` then close(code) on next tick. 
*/ +function fakeChild(code: number, err = ''): ChildProcess { + const ee = new EventEmitter() as unknown as ChildProcess; + const stderr = new EventEmitter() as unknown as NodeJS.ReadableStream; + Object.defineProperty(ee, 'stderr', { value: stderr }); + let killed = false; + (ee as unknown as { kill: (s?: string) => boolean }).kill = () => { + killed = true; + // Emulate ffmpeg/sox finalizing + exiting on SIGINT. + setImmediate(() => ee.emit('close', code)); + return true; + }; + setImmediate(() => { + if (err) (stderr as unknown as EventEmitter).emit('data', Buffer.from(err)); + if (!killed) ee.emit('close', code); // self-exit path (no abort) + }); + return ee; +} + +describe('recordToWav', () => { + it('resolves when stopped via the abort signal (non-zero exit is expected)', async () => { + const ac = new AbortController(); + const exec = (() => fakeChild(255)) as unknown as RecordExec; + const p = recordToWav({ + outPath: '/t/o.wav', + bin: 'ffmpeg', + binPath: '/usr/bin/ffmpeg', + platform: 'darwin', + signal: ac.signal, + exec, + }); + ac.abort(); + await expect(p).resolves.toBeUndefined(); + }); + + it('rejects on a non-zero exit when not aborted (e.g. no mic)', async () => { + const exec = (() => fakeChild(1, 'No such audio device')) as unknown as RecordExec; + await expect( + recordToWav({ outPath: '/t/o.wav', bin: 'rec', binPath: '/usr/bin/rec', exec }), + ).rejects.toThrow(/rec exited 1: No such audio device/); + }); +}); + +type RecordExec = NonNullable<Parameters<typeof recordToWav>[0]['exec']>; diff --git a/packages/core/src/voice/record.ts b/packages/core/src/voice/record.ts new file mode 100644 index 0000000..1dbc8c6 --- /dev/null +++ b/packages/core/src/voice/record.ts @@ -0,0 +1,169 @@ +// Microphone capture — spawns a local recorder (ffmpeg or sox) to write a +// 16 kHz mono WAV that whisper.cpp can transcribe. Like the whisper binary, +// the recorder is a user-installed external tool we detect on PATH and, if +// absent, print setup steps for. Spec: docs/VOICE_INPUT.md.
+ +import { spawn, type ChildProcess } from 'node:child_process'; +import { access } from 'node:fs/promises'; +import { constants as FS } from 'node:fs'; +import { delimiter, join } from 'node:path'; + +/** + * Recorder front-ends we look for, in preference order. `ffmpeg` is the most + * universally installed; `rec` / `sox` are whisper.cpp tutorial favorites and + * pick the default input device automatically. + */ +export const RECORDER_CANDIDATES = ['ffmpeg', 'rec', 'sox'] as const; +export type RecorderBin = (typeof RECORDER_CANDIDATES)[number]; + +export interface RecorderStatus { + /** True if a usable recorder was found on PATH. */ + found: boolean; + /** Which front-end was selected. */ + bin?: RecorderBin; + /** Absolute path to the recorder binary. */ + binPath?: string; + /** Human-readable reason none was found (empty when found). */ + problems: string[]; +} + +/** PATH/`which` probe — injectable for tests. */ +export type WhichFn = (name: string) => Promise<string | null>; + +async function whichOnPath(name: string): Promise<string | null> { + const dirs = (process.env['PATH'] ?? '').split(delimiter).filter(Boolean); + for (const dir of dirs) { + const candidate = join(dir, name); + try { + await access(candidate, FS.X_OK); + return candidate; + } catch { + /* not here, or not executable */ + } + } + return null; +} + +/** Find the first available recorder front-end on PATH. Never throws. */ +export async function detectRecorder(which: WhichFn = whichOnPath): Promise<RecorderStatus> { + for (const bin of RECORDER_CANDIDATES) { + const binPath = await which(bin); + if (binPath) return { found: true, bin, binPath, problems: [] }; + } + return { + found: false, + problems: [ + `No microphone recorder found on PATH (looked for ${RECORDER_CANDIDATES.join(', ')}).`, + ], + }; +} + +export interface RecordArgsOpts { + /** Platform, for ffmpeg's OS-specific input format. Defaults to process.platform. */ + platform?: NodeJS.Platform; + /** Override the input device (ffmpeg only).
Default: ':default' (mac) / 'default' (linux). */ + device?: string; + /** Hard cap on recording length in seconds (safety net). */ + maxSeconds?: number; +} + +/** + * Build the recorder argv for `bin` writing 16 kHz mono WAV to `outPath`. + * Pure + exported so the per-platform/per-tool command is unit-testable. + * + * - ffmpeg: needs an OS-specific input (avfoundation on macOS, alsa on Linux). + * - rec / sox: capture the system default input device directly. + */ +export function buildRecordArgs( + bin: RecorderBin, + outPath: string, + opts: RecordArgsOpts = {}, +): string[] { + const platform = opts.platform ?? process.platform; + const max = opts.maxSeconds; + + if (bin === 'ffmpeg') { + const input: string[] = + platform === 'darwin' + ? ['-f', 'avfoundation', '-i', opts.device ?? ':default'] + : platform === 'linux' + ? ['-f', 'alsa', '-i', opts.device ?? 'default'] + : (() => { + throw new Error( + `ffmpeg mic capture on ${platform} needs an explicit voice.inputDevice; install sox (rec) or set one.`, + ); + })(); + const dur = max ? ['-t', String(max)] : []; + // -y overwrite, quiet logs, 16 kHz mono PCM WAV (what whisper.cpp expects). + return [ + '-hide_banner', + '-loglevel', + 'error', + '-y', + ...input, + ...dur, + '-ar', + '16000', + '-ac', + '1', + outPath, + ]; + } + + // sox family. `rec OUT` == `sox -d OUT`; both grab the default input device. + const head = bin === 'rec' ? ['-q'] : ['-q', '-d']; + const trim = max ? ['trim', '0', String(max)] : []; + return [...head, '-r', '16000', '-c', '1', outPath, ...trim]; +} + +export interface RecordToWavOpts { + outPath: string; + bin: RecorderBin; + binPath: string; + /** Abort to stop recording (the normal "user pressed Enter" path). */ + signal?: AbortSignal; + /** Override spawn for tests. 
*/ + exec?: typeof spawn; + platform?: NodeJS.Platform; + device?: string; + maxSeconds?: number; +} + +/** + * Record from the default mic into `outPath` until `signal` aborts (or the + * recorder exits / hits `maxSeconds`). Aborting sends SIGINT so ffmpeg/sox + * flush a valid WAV trailer; a non-zero exit *after* an abort is expected and + * resolves cleanly. A non-zero exit *without* an abort (e.g. no microphone) + * rejects with the recorder's stderr. + */ +export function recordToWav(opts: RecordToWavOpts): Promise { + const spawnFn = opts.exec ?? spawn; + const args = buildRecordArgs(opts.bin, opts.outPath, { + platform: opts.platform, + device: opts.device, + maxSeconds: opts.maxSeconds ?? 60, + }); + return new Promise((resolve, reject) => { + const child: ChildProcess = spawnFn(opts.binPath, args); + let stderr = ''; + let aborted = false; + child.stderr?.on('data', (c: Buffer) => (stderr += c.toString())); + child.on('error', reject); + child.on('close', (code) => { + if (aborted || code === 0) resolve(); + else reject(new Error(`${opts.bin} exited ${code}: ${stderr.slice(0, 300).trim()}`)); + }); + if (opts.signal) { + if (opts.signal.aborted) stop(); + else opts.signal.addEventListener('abort', stop, { once: true }); + } + function stop(): void { + aborted = true; + try { + child.kill('SIGINT'); // ffmpeg/sox finalize the WAV on SIGINT + } catch { + /* already exited */ + } + } + }); +} From 22d709f4f932bc248c45fa8d4f30dadec5bf21e7 Mon Sep 17 00:00:00 2001 From: t Date: Mon, 8 Jun 2026 14:16:27 +0800 Subject: [PATCH 3/5] ci: trigger checks (PR retargeted to main) From 4d52c008010c3403d86aaefb862351dd724db355 Mon Sep 17 00:00:00 2001 From: t Date: Mon, 8 Jun 2026 13:56:47 +0800 Subject: [PATCH 4/5] =?UTF-8?q?feat(voice):=20desktop=20=F0=9F=8E=99=20com?= =?UTF-8?q?poser=20button=20+=20Tauri=20voice=20commands=20(slice=203)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds local voice dictation to the Mac 
desktop client, mirroring the CLI: click 🎙 to record, click again to stop → transcribe with whisper.cpp → splice the text into the composer. Spec: docs/VOICE_INPUT.md. Rust (src-tauri): - voice.rs: voice_status (detect whisper bin + model + ffmpeg), voice_start / voice_stop / voice_cancel. The in-flight recording Child lives in tauri-managed VoiceState between start/stop. Desktop uses ffmpeg and stops it gracefully by writing `q` to stdin (flushes a valid WAV), then runs whisper and deletes the temp clip. parse_whisper_output ported from core. - Registered in lib.rs (+ .manage(VoiceState)). - Entitlements: com.apple.security.device.audio-input; Info.plist: NSMicrophoneUsageDescription (merged into the bundle by Tauri). Renderer: - lib/voice.ts: typed voice_* wrappers + pure insertTranscript() helper. - lib/use-voice.ts: idle→recording→transcribing state machine; probes voice_status on mount to disable the button (with a tooltip) when unset. - Repl.tsx: 🎙 button in the composer toolbar; transcript splices at the caret. index.css: button styles incl. a recording pulse. - preview-app.tsx: mock voice_* so the dev harness can exercise the button. Docs: VOICE_INPUT.md desktop usage; BEHAVIOR_PARITY /voice row now covers CLI + desktop. Testing: cargo test (4 voice cases: parse/expand-home/ffmpeg-args/status) + cargo build clean (no warnings). Renderer: 6 voice.ts cases (IPC names + insertTranscript); desktop suite 60 pass; typecheck + vite build + lint + format all clean. Verified the full idle→record→stop→insert flow in the mock-Tauri preview harness (screenshots). The real-microphone round-trip (avfoundation capture + TCC permission) needs manual on-device verification — CI compiles neither Rust nor the mic path. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/desktop/src-tauri/Entitlements.plist | 3 + apps/desktop/src-tauri/Info.plist | 10 + apps/desktop/src-tauri/src/lib.rs | 7 + apps/desktop/src-tauri/src/voice.rs | 379 ++++++++++++++++++++++ apps/desktop/src/index.css | 45 +++ apps/desktop/src/lib/use-voice.ts | 76 +++++ apps/desktop/src/lib/voice.test.ts | 57 ++++ apps/desktop/src/lib/voice.ts | 53 +++ apps/desktop/src/preview-app.tsx | 14 + apps/desktop/src/screens/Repl.tsx | 56 +++- docs/BEHAVIOR_PARITY.md | 98 +++--- docs/VOICE_INPUT.md | 6 +- 12 files changed, 752 insertions(+), 52 deletions(-) create mode 100644 apps/desktop/src-tauri/Info.plist create mode 100644 apps/desktop/src-tauri/src/voice.rs create mode 100644 apps/desktop/src/lib/use-voice.ts create mode 100644 apps/desktop/src/lib/voice.test.ts create mode 100644 apps/desktop/src/lib/voice.ts diff --git a/apps/desktop/src-tauri/Entitlements.plist b/apps/desktop/src-tauri/Entitlements.plist index 7c4be01..8d0723e 100644 --- a/apps/desktop/src-tauri/Entitlements.plist +++ b/apps/desktop/src-tauri/Entitlements.plist @@ -20,5 +20,8 @@ com.apple.security.files.user-selected.read-write + + com.apple.security.device.audio-input + diff --git a/apps/desktop/src-tauri/Info.plist b/apps/desktop/src-tauri/Info.plist new file mode 100644 index 0000000..e20cb50 --- /dev/null +++ b/apps/desktop/src-tauri/Info.plist @@ -0,0 +1,10 @@ + + + + + + NSMicrophoneUsageDescription + DeepCode records your microphone locally to transcribe voice input with whisper.cpp. Audio never leaves your machine. 
+ + diff --git a/apps/desktop/src-tauri/src/lib.rs b/apps/desktop/src-tauri/src/lib.rs index ca09a52..866d119 100644 --- a/apps/desktop/src-tauri/src/lib.rs +++ b/apps/desktop/src-tauri/src/lib.rs @@ -14,6 +14,7 @@ mod credentials; mod settings; mod snapshots; mod tools; +mod voice; use commands::{ append_allow_matcher, cli_path, get_app_info, get_settings_path, list_plugins, list_sessions, @@ -24,6 +25,7 @@ use commands::{ use snapshots::session_snapshots; use tauri::Manager; use tools::{tool_bash, tool_edit, tool_glob, tool_grep, tool_read, tool_write}; +use voice::{voice_cancel, voice_start, voice_status, voice_stop, VoiceState}; #[cfg_attr(mobile, tauri::mobile_entry_point)] pub fn run() { @@ -34,6 +36,7 @@ pub fn run() { .plugin(tauri_plugin_shell::init()) .plugin(tauri_plugin_updater::Builder::new().build()) .plugin(tauri_plugin_process::init()) + .manage(VoiceState::default()) .invoke_handler(tauri::generate_handler![ get_app_info, read_credentials, @@ -62,6 +65,10 @@ pub fn run() { tool_glob, tool_grep, session_snapshots, + voice_status, + voice_start, + voice_stop, + voice_cancel, ]) .setup(|app| { // macOS: hide window menu items we don't use. diff --git a/apps/desktop/src-tauri/src/voice.rs b/apps/desktop/src-tauri/src/voice.rs new file mode 100644 index 0000000..d05474b --- /dev/null +++ b/apps/desktop/src-tauri/src/voice.rs @@ -0,0 +1,379 @@ +// Voice input — record the mic with ffmpeg and transcribe locally with +// whisper.cpp, entirely on-device (no audio leaves the machine). The renderer +// drives a start → stop flow; the in-flight recording Child lives in +// Tauri-managed state between the two calls. Mirrors the CLI voice subsystem +// (packages/core/src/voice/*). Spec: docs/VOICE_INPUT.md. 
+// +// Desktop uses ffmpeg specifically because we stop it gracefully by writing +// `q` to its stdin, which flushes a valid WAV trailer — sox/rec have no such +// stdin command, so the CLI (which can send SIGINT) supports them but the +// desktop sticks to ffmpeg. + +use serde::Serialize; +use std::path::{Path, PathBuf}; +use std::process::Stdio; +use std::sync::Mutex; +use tokio::io::AsyncWriteExt; +use tokio::process::{Child, Command}; + +const WHISPER_BINS: [&str; 2] = ["whisper-cli", "whisper"]; +const MODEL_RELPATH: [&str; 3] = [".deepcode", "models", "whisper-base.en.bin"]; +const MAX_SECONDS: u32 = 60; + +#[derive(Serialize, Default)] +#[serde(rename_all = "camelCase")] +pub struct VoiceStatus { + /// True iff whisper binary, model, and ffmpeg were all resolved. + pub ready: bool, + pub bin_path: Option, + pub model_path: Option, + pub recorder_path: Option, + /// Human-readable reasons it is not ready (empty when ready). + pub problems: Vec, +} + +/// In-flight recording, parked in Tauri state between voice_start and voice_stop. +struct Recording { + child: Child, + wav: PathBuf, + bin_path: String, + model_path: String, +} + +#[derive(Default)] +pub struct VoiceState(Mutex>); + +// ── detection ─────────────────────────────────────────────────────────────── + +/// First dir in $PATH holding a file named `name` (no exec-bit check — good +/// enough for reporting; the spawn would surface a real perms error). +fn which_on_path(name: &str) -> Option { + let path = std::env::var_os("PATH")?; + for dir in std::env::split_paths(&path) { + let cand = dir.join(name); + if cand.is_file() { + return Some(cand.to_string_lossy().into_owned()); + } + } + None +} + +/// Expand a leading `~` / `~/` against `home`; other paths pass through. 
+fn expand_home(p: &str, home: &Path) -> PathBuf { + if p == "~" { + return home.to_path_buf(); + } + if let Some(rest) = p.strip_prefix("~/") { + return home.join(rest); + } + PathBuf::from(p) +} + +/// (binPath, modelPath, inputDevice) from ~/.deepcode/settings.json `voice` block. +fn read_voice_settings(home: &Path) -> (Option, Option, Option) { + let path = home.join(".deepcode").join("settings.json"); + let Ok(text) = std::fs::read_to_string(path) else { + return (None, None, None); + }; + let Ok(v) = serde_json::from_str::(&text) else { + return (None, None, None); + }; + let voice = v.get("voice"); + let get = |k: &str| { + voice + .and_then(|o| o.get(k)) + .and_then(|x| x.as_str()) + .map(String::from) + }; + (get("binPath"), get("modelPath"), get("inputDevice")) +} + +/// Resolve whisper + model + ffmpeg under `home`, collecting problems. Never panics. +fn compute_status(home: &Path) -> VoiceStatus { + let (bin_cfg, model_cfg, _device) = read_voice_settings(home); + let mut problems = Vec::new(); + + let bin_path = match bin_cfg { + Some(b) => { + let p = expand_home(&b, home); + if p.is_file() { + Some(p.to_string_lossy().into_owned()) + } else { + problems.push(format!("Configured voice.binPath not found: {b}")); + None + } + } + None => { + let found = WHISPER_BINS.iter().find_map(|n| which_on_path(n)); + if found.is_none() { + problems.push(format!( + "whisper.cpp not found on PATH (looked for {}).", + WHISPER_BINS.join(", ") + )); + } + found + } + }; + + let model_path = match model_cfg { + Some(m) => { + let p = expand_home(&m, home); + if p.is_file() { + Some(p.to_string_lossy().into_owned()) + } else { + problems.push(format!("Configured voice.modelPath not found: {m}")); + None + } + } + None => { + let def = MODEL_RELPATH.iter().fold(home.to_path_buf(), |a, c| a.join(c)); + if def.is_file() { + Some(def.to_string_lossy().into_owned()) + } else { + problems.push(format!( + "No voice.modelPath set, and no model at ~/{}.", + MODEL_RELPATH.join("/") 
+ )); + None + } + } + }; + + let recorder_path = which_on_path("ffmpeg"); + if recorder_path.is_none() { + problems.push("ffmpeg not found on PATH (brew install ffmpeg).".to_string()); + } + + let ready = bin_path.is_some() && model_path.is_some() && recorder_path.is_some(); + VoiceStatus { + ready, + bin_path, + model_path, + recorder_path, + problems, + } +} + +/// ffmpeg argv to record the default mic into a 16 kHz mono WAV (capped length). +fn ffmpeg_record_args(device: &str, wav: &Path) -> Vec { + vec![ + "-hide_banner".into(), + "-loglevel".into(), + "error".into(), + "-y".into(), + "-f".into(), + "avfoundation".into(), + "-i".into(), + device.into(), + "-t".into(), + MAX_SECONDS.to_string(), + "-ar".into(), + "16000".into(), + "-ac".into(), + "1".into(), + wav.to_string_lossy().into_owned(), + ] +} + +/// Strip whisper.cpp's per-line timestamps + log lines to the bare transcript. +/// Ported from packages/core/src/voice/index.ts `parseWhisperOutput`. +fn parse_whisper_output(raw: &str) -> String { + let mut parts: Vec = Vec::new(); + for line in raw.lines() { + let t = line.trim(); + if t.is_empty() || t.starts_with("whisper_") || t.starts_with("system_info:") { + continue; + } + if t.starts_with('[') { + if let Some(idx) = t.find(']') { + let text = t[idx + 1..].trim(); + if !text.is_empty() { + parts.push(text.to_string()); + } + } + } else { + parts.push(t.to_string()); + } + } + parts.join(" ").trim().to_string() +} + +async fn transcribe(bin: &str, model: &str, wav: &Path) -> Result { + if !wav.is_file() { + return Err("recording produced no audio (is a microphone available?)".into()); + } + let out = Command::new(bin) + .args(["-m", model, "-f", &wav.to_string_lossy()]) + .output() + .await + .map_err(|e| format!("spawn whisper: {e}"))?; + if !out.status.success() { + let err: String = String::from_utf8_lossy(&out.stderr).chars().take(300).collect(); + return Err(format!( + "whisper exited {}: {}", + out.status.code().unwrap_or(-1), + err.trim() + )); 
+ } + Ok(parse_whisper_output(&String::from_utf8_lossy(&out.stdout))) +} + +// ── commands ──────────────────────────────────────────────────────────────── + +/// Report whether local voice input is set up (whisper.cpp + model + ffmpeg). +#[tauri::command] +pub fn voice_status() -> VoiceStatus { + match dirs::home_dir() { + Some(home) => compute_status(&home), + None => VoiceStatus { + problems: vec!["could not resolve home directory".into()], + ..Default::default() + }, + } +} + +/// Begin recording from the default mic. Errors if voice isn't set up. +#[tauri::command] +pub async fn voice_start(state: tauri::State<'_, VoiceState>) -> Result<(), String> { + let home = dirs::home_dir().ok_or("could not resolve home directory")?; + let status = compute_status(&home); + if !status.ready { + return Err(status.problems.join("; ")); + } + let (_, _, device) = read_voice_settings(&home); + let device = device.unwrap_or_else(|| ":default".to_string()); + let recorder = status.recorder_path.clone().expect("ready ⇒ recorder"); + let wav = std::env::temp_dir().join(format!( + "deepcode-voice-{}-{}.wav", + std::process::id(), + crate::snapshots::now_ms() + )); + + // Replace any orphaned prior recording. + if let Ok(mut guard) = state.0.lock() { + if let Some(mut old) = guard.take() { + let _ = old.child.start_kill(); + } + } + + let child = Command::new(&recorder) + .args(ffmpeg_record_args(&device, &wav)) + .stdin(Stdio::piped()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .map_err(|e| format!("spawn ffmpeg: {e}"))?; + + let rec = Recording { + child, + wav, + bin_path: status.bin_path.expect("ready ⇒ bin"), + model_path: status.model_path.expect("ready ⇒ model"), + }; + state + .0 + .lock() + .map_err(|_| "voice state poisoned")? + .replace(rec); + Ok(()) +} + +/// Stop recording, transcribe the clip, delete the audio, return the text. 
+#[tauri::command] +pub async fn voice_stop(state: tauri::State<'_, VoiceState>) -> Result { + let mut rec = state + .0 + .lock() + .map_err(|_| "voice state poisoned")? + .take() + .ok_or("not recording")?; + + // Graceful stop: 'q' on ffmpeg's stdin flushes a valid WAV trailer. + if let Some(mut stdin) = rec.child.stdin.take() { + let _ = stdin.write_all(b"q\n").await; + let _ = stdin.flush().await; + } + if tokio::time::timeout(std::time::Duration::from_secs(5), rec.child.wait()) + .await + .is_err() + { + let _ = rec.child.start_kill(); + let _ = rec.child.wait().await; + } + + let result = transcribe(&rec.bin_path, &rec.model_path, &rec.wav).await; + let _ = tokio::fs::remove_file(&rec.wav).await; + let _ = tokio::fs::remove_file(format!("{}.txt", rec.wav.to_string_lossy())).await; + result +} + +/// Abort an in-flight recording without transcribing; deletes the audio. +#[tauri::command] +pub async fn voice_cancel(state: tauri::State<'_, VoiceState>) -> Result<(), String> { + let rec = state.0.lock().map_err(|_| "voice state poisoned")?.take(); + if let Some(mut rec) = rec { + let _ = rec.child.start_kill(); + let _ = rec.child.wait().await; + let _ = tokio::fs::remove_file(&rec.wav).await; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_whisper_strips_timestamps_and_logs() { + let raw = "[00:00:00.000 --> 00:00:02.500] hello world\n\ + [00:00:02.500 --> 00:00:05.000] another line"; + assert_eq!(parse_whisper_output(raw), "hello world another line"); + + let with_logs = "whisper_init_from_file: loading\n\ + system_info: AVX2\n\ + [00:00:00.000 --> 00:00:01.000] real text"; + assert_eq!(parse_whisper_output(with_logs), "real text"); + + assert_eq!(parse_whisper_output("whisper_init\nsystem_info: X"), ""); + } + + #[test] + fn expand_home_handles_tilde() { + let home = Path::new("/home/u"); + assert_eq!(expand_home("~", home), PathBuf::from("/home/u")); + assert_eq!(expand_home("~/m/x.bin", home), 
PathBuf::from("/home/u/m/x.bin")); + assert_eq!(expand_home("/abs", home), PathBuf::from("/abs")); + } + + #[test] + fn ffmpeg_args_are_16k_mono_wav() { + let args = ffmpeg_record_args(":default", Path::new("/t/o.wav")); + assert!(args.windows(2).any(|w| w == ["-ar", "16000"])); + assert!(args.windows(2).any(|w| w == ["-ac", "1"])); + assert!(args.windows(2).any(|w| w == ["-i", ":default"])); + assert_eq!(args.last().unwrap(), "/t/o.wav"); + } + + #[test] + fn compute_status_flags_missing_configured_paths() { + // A temp home with a settings.json pointing at non-existent bin/model. + let home = std::env::temp_dir().join(format!("dc-voice-{}", std::process::id())); + let cfg_dir = home.join(".deepcode"); + std::fs::create_dir_all(&cfg_dir).unwrap(); + std::fs::write( + cfg_dir.join("settings.json"), + r#"{"voice":{"binPath":"/no/such/whisper","modelPath":"/no/such/m.bin"}}"#, + ) + .unwrap(); + + let s = compute_status(&home); + let _ = std::fs::remove_dir_all(&home); + + assert!(!s.ready); + assert!(s.bin_path.is_none()); + assert!(s.model_path.is_none()); + let joined = s.problems.join("\n"); + assert!(joined.contains("Configured voice.binPath not found"), "got {joined}"); + assert!(joined.contains("Configured voice.modelPath not found"), "got {joined}"); + } +} diff --git a/apps/desktop/src/index.css b/apps/desktop/src/index.css index 90252d1..053763d 100644 --- a/apps/desktop/src/index.css +++ b/apps/desktop/src/index.css @@ -1065,6 +1065,51 @@ select { color: var(--text-0); border-color: var(--line); } +.composer .mic-btn { + width: 28px; + height: 28px; + border-radius: 6px; + background: var(--bg-2); + color: var(--text-2); + border: 1px solid var(--line-soft); + display: inline-flex; + align-items: center; + justify-content: center; + font-size: 13px; + cursor: pointer; +} +.composer .mic-btn:hover:not(:disabled) { + color: var(--text-0); + border-color: var(--line); +} +.composer .mic-btn:disabled { + opacity: 0.45; + cursor: not-allowed; +} +.composer 
.mic-btn.recording { + color: var(--error); + background: rgba(255, 84, 112, 0.12); + border-color: rgba(255, 84, 112, 0.4); + animation: mic-pulse 1.2s ease-in-out infinite; +} +.composer .mic-btn.transcribing { + color: #b4c2ff; + border-color: var(--brand-line); +} +@keyframes mic-pulse { + 0%, + 100% { + box-shadow: 0 0 0 0 rgba(255, 84, 112, 0.4); + } + 50% { + box-shadow: 0 0 0 4px rgba(255, 84, 112, 0); + } +} +.composer .voice-error { + font-size: 11px; + color: var(--error); + cursor: default; +} .composer .mode-badge { font-size: 11px; padding: 3px 9px; diff --git a/apps/desktop/src/lib/use-voice.ts b/apps/desktop/src/lib/use-voice.ts new file mode 100644 index 0000000..cd7000d --- /dev/null +++ b/apps/desktop/src/lib/use-voice.ts @@ -0,0 +1,76 @@ +// React hook driving the composer's 🎙 voice button. A small state machine — +// idle → recording → transcribing → idle — over the native voice_* commands. +// `onTranscript` receives the final text so the composer can splice it in. + +import { useCallback, useEffect, useRef, useState } from 'react'; +import { voiceCancel, voiceStart, voiceStatus, voiceStop } from './voice.js'; + +export type VoiceState = 'idle' | 'recording' | 'transcribing'; + +export interface UseVoice { + state: VoiceState; + /** null until the status probe resolves; then whether voice is set up. */ + available: boolean | null; + /** Setup problems from the status probe (for a tooltip when unavailable). */ + problems: string[]; + /** Last error (start/stop failure), or null. */ + error: string | null; + /** idle → start recording; recording → stop + transcribe. No-op while busy. */ + toggle: () => void; + /** Abort an in-flight recording without transcribing. 
*/ + cancel: () => void; +} + +export function useVoice(onTranscript: (text: string) => void): UseVoice { + const [state, setState] = useState('idle'); + const [available, setAvailable] = useState(null); + const [problems, setProblems] = useState([]); + const [error, setError] = useState(null); + const busy = useRef(false); + + useEffect(() => { + let live = true; + voiceStatus() + .then((s) => { + if (!live) return; + setAvailable(s.ready); + setProblems(s.problems); + }) + .catch(() => live && setAvailable(false)); + return () => { + live = false; + }; + }, []); + + const toggle = useCallback(() => { + if (busy.current) return; + setError(null); + if (state === 'idle') { + busy.current = true; + void voiceStart() + .then(() => setState('recording')) + .catch((e: unknown) => setError(String(e))) + .finally(() => (busy.current = false)); + } else if (state === 'recording') { + busy.current = true; + setState('transcribing'); + void voiceStop() + .then((text) => { + const t = text.trim(); + if (t) onTranscript(t); + }) + .catch((e: unknown) => setError(String(e))) + .finally(() => { + setState('idle'); + busy.current = false; + }); + } + }, [state, onTranscript]); + + const cancel = useCallback(() => { + if (state === 'idle') return; + void voiceCancel().finally(() => setState('idle')); + }, [state]); + + return { state, available, problems, error, toggle, cancel }; +} diff --git a/apps/desktop/src/lib/voice.test.ts b/apps/desktop/src/lib/voice.test.ts new file mode 100644 index 0000000..378f00b --- /dev/null +++ b/apps/desktop/src/lib/voice.test.ts @@ -0,0 +1,57 @@ +// voice.ts — IPC wrapper command names (mirrors tauri-api.test.ts) + the pure +// transcript-insertion helper. `invoke` is mocked so no Tauri runtime is needed. 
+ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { invoke } from '@tauri-apps/api/core'; +import { insertTranscript, voiceCancel, voiceStart, voiceStatus, voiceStop } from './voice.js'; + +vi.mock('@tauri-apps/api/core', () => ({ invoke: vi.fn() })); +const invokeMock = vi.mocked(invoke); + +beforeEach(() => invokeMock.mockReset()); + +describe('voice IPC wrappers', () => { + it('call the matching voice_* commands', async () => { + invokeMock.mockResolvedValue(undefined); + await voiceStart(); + expect(invokeMock).toHaveBeenCalledWith('voice_start'); + await voiceCancel(); + expect(invokeMock).toHaveBeenCalledWith('voice_cancel'); + + invokeMock.mockResolvedValue('hello there'); + expect(await voiceStop()).toBe('hello there'); + expect(invokeMock).toHaveBeenCalledWith('voice_stop'); + + invokeMock.mockResolvedValue({ ready: true, problems: [] }); + const s = await voiceStatus(); + expect(invokeMock).toHaveBeenCalledWith('voice_status'); + expect(s.ready).toBe(true); + }); +}); + +describe('insertTranscript', () => { + it('inserts into an empty composer without a leading space', () => { + expect(insertTranscript('', 0, 'hello world')).toEqual({ value: 'hello world', caret: 11 }); + }); + + it('adds a single space when the preceding char is not whitespace', () => { + const r = insertTranscript('write a', 7, 'function'); + expect(r.value).toBe('write a function'); + expect(r.caret).toBe('write a function'.length); + }); + + it('does not double-space after existing whitespace', () => { + expect(insertTranscript('write ', 6, 'tests').value).toBe('write tests'); + expect(insertTranscript('line\n', 5, 'two').value).toBe('line\ntwo'); + }); + + it('splices at the cursor, keeping the tail', () => { + const r = insertTranscript('abXY', 2, 'foo'); // cursor between "ab" and "XY" + expect(r.value).toBe('ab fooXY'); + expect(r.caret).toBe('ab foo'.length); + }); + + it('clamps an out-of-range cursor to the end', () => { + expect(insertTranscript('ab', 
99, 'c').value).toBe('ab c'); + }); +}); diff --git a/apps/desktop/src/lib/voice.ts b/apps/desktop/src/lib/voice.ts new file mode 100644 index 0000000..02cbd43 --- /dev/null +++ b/apps/desktop/src/lib/voice.ts @@ -0,0 +1,53 @@ +// Renderer side of voice input — thin wrappers over the Rust voice_* commands +// (apps/desktop/src-tauri/src/voice.rs) plus a pure transcript-insertion helper. +// The whole record → transcribe flow runs natively; the renderer just toggles it. + +import { invoke } from '@tauri-apps/api/core'; + +export interface VoiceStatus { + ready: boolean; + binPath: string | null; + modelPath: string | null; + recorderPath: string | null; + problems: string[]; +} + +/** Is whisper.cpp + a model + ffmpeg installed/configured? */ +export async function voiceStatus(): Promise { + return invoke('voice_status'); +} + +/** Begin recording from the default mic. Rejects if voice isn't set up. */ +export async function voiceStart(): Promise { + await invoke('voice_start'); +} + +/** Stop recording and return the locally-transcribed text. */ +export async function voiceStop(): Promise { + return invoke('voice_stop'); +} + +/** Abort an in-flight recording without transcribing. */ +export async function voiceCancel(): Promise { + await invoke('voice_cancel'); +} + +/** + * Splice `text` into `value` at the cursor, adding a single space separator when + * needed. Pure — returns the new value and the caret position after the inserted + * text. Used to drop a transcript into the composer without clobbering what's + * already typed. + */ +export function insertTranscript( + value: string, + cursor: number, + text: string, +): { value: string; caret: number } { + const pos = Math.max(0, Math.min(cursor, value.length)); + const before = value.slice(0, pos); + const after = value.slice(pos); + const needsLead = before.length > 0 && !/\s$/.test(before); + const lead = needsLead ? 
' ' : ''; + const insert = lead + text; + return { value: before + insert + after, caret: pos + insert.length }; +} diff --git a/apps/desktop/src/preview-app.tsx b/apps/desktop/src/preview-app.tsx index 0b90f81..89e0a7a 100644 --- a/apps/desktop/src/preview-app.tsx +++ b/apps/desktop/src/preview-app.tsx @@ -170,6 +170,20 @@ const MOCK_MESSAGES = [ // Session snapshots back the file panel's Diff + History tabs. case 'session_snapshots': return MOCK_SNAPSHOTS; + // Voice input (🎙 composer button). Pretend it's set up; stop returns text. + case 'voice_status': + return { + ready: true, + binPath: '/opt/homebrew/bin/whisper-cli', + modelPath: '/Users/oratis/.deepcode/models/whisper-base.en.bin', + recorderPath: '/opt/homebrew/bin/ffmpeg', + problems: [], + }; + case 'voice_start': + case 'voice_cancel': + return null; + case 'voice_stop': + return 'add a dark mode toggle to the settings screen'; default: console.warn('[preview] unmocked invoke:', cmd); return null; diff --git a/apps/desktop/src/screens/Repl.tsx b/apps/desktop/src/screens/Repl.tsx index 69ae1bb..bf25d70 100644 --- a/apps/desktop/src/screens/Repl.tsx +++ b/apps/desktop/src/screens/Repl.tsx @@ -17,7 +17,7 @@ // CSS class names (which now match the design tokens) and the addition // of richer tool-card rendering. 
-import { useEffect, useMemo, useRef, useState } from 'react'; +import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import { DEFAULT_KEYBINDINGS, VimState, @@ -31,6 +31,8 @@ import { Pill } from '../components/Pill.js'; import { PlusMenu } from '../components/PlusMenu.js'; import { ToolCard } from '../components/ToolCard.js'; import { projectName } from '../lib/project.js'; +import { useVoice } from '../lib/use-voice.js'; +import { insertTranscript } from '../lib/voice.js'; import { appendTextDelta, appendToolUse, @@ -264,6 +266,26 @@ export function ReplScreen({ const listRef = useRef(null); const composerRef = useRef(null); + // ── Voice input (🎙) — record + transcribe locally, splice into the composer ── + const handleTranscript = useCallback((text: string) => { + const ta = composerRef.current; + if (!ta) { + setInput((v) => insertTranscript(v, v.length, text).value); + return; + } + const { value, caret } = insertTranscript(ta.value, ta.selectionStart, text); + setInput(value); + ta.focus(); + requestAnimationFrame(() => { + try { + ta.setSelectionRange(caret, caret); + } catch { + /* element gone */ + } + }); + }, []); + const voice = useVoice(handleTranscript); + // ── Load settings + keybindings on mount ── useEffect(() => { void (async () => { @@ -744,6 +766,38 @@ export function ReplScreen({ ]} /> + + {voice.error && ( + + ⚠ voice + + )} + value={mode} onChange={setMode} diff --git a/docs/BEHAVIOR_PARITY.md b/docs/BEHAVIOR_PARITY.md index db0003e..36b1842 100644 --- a/docs/BEHAVIOR_PARITY.md +++ b/docs/BEHAVIOR_PARITY.md @@ -21,55 +21,55 @@ Legend: `✅` matches · `🟡` matches with caveats · `🔄` deferred · `⚠ ## Slash commands (30+ in Claude Code, ~32 shipped in DeepCode) -| Command | Claude Code | DeepCode | Status | -| -------------------------- | ----------- | ------------------ | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `/help` | ✓ | ✓ | ✅ | -| `/clear` | ✓ | ✓ | ✅ | -| `/exit` / `/quit` | ✓ | ✓ | ✅ | -| `/status` / `/doctor` | ✓ | ✓ | ✅ | -| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | -| `/mode` | ✓ | ✓ | ✅ | -| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | -| `/cost` / `/usage` | ✓ | ✓ | ✅ | -| `/context` | ✓ | ✓ | ✅ | -| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes user settings; no full arrow-key editor | -| `/resume` | ✓ | ✓ | ✅ — lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | -| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | -| `/mcp` | ✓ | ✓ | ✅ | -| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | -| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | -| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | -| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | -| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | -| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | -| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | -| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | -| `/voice` | ✓ | ✓ | 🟡 — CLI: `/voice` records via ffmpeg/sox → whisper.cpp → pre-fills the input line (`/voice setup` for steps; fully local). 
Desktop 🎙 button is a follow-up slice | -| `/teleport` | ✓ | ✗ | 🔄 M8 | -| `/desktop` | ✓ | ✗ | 🔄 M6 | -| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | -| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | -| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | -| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | -| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | -| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | -| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | -| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | -| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` comments for the current branch's PR | -| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | -| `/loop` | ✓ | ✗ (skill avail) | 🟡 | -| `/terminal-setup` | ✓ | ✗ | 🔄 | -| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | -| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | -| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | -| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | -| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | -| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | -| `/privacy-settings` | ✓ | ✓ | ✅ — summarizes local data locations + what's sent to the DeepSeek API (read-only) | -| `/migrate-installer` | ✓ | ✗ | 🔄 | -| `/release-notes` | ✓ | ✓ 
| ✅ — prints the latest `CHANGELOG.md` entry | +| Command | Claude Code | DeepCode | Status | +| -------------------------- | ----------- | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `/help` | ✓ | ✓ | ✅ | +| `/clear` | ✓ | ✓ | ✅ | +| `/exit` / `/quit` | ✓ | ✓ | ✅ | +| `/status` / `/doctor` | ✓ | ✓ | ✅ | +| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | +| `/mode` | ✓ | ✓ | ✅ | +| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | +| `/cost` / `/usage` | ✓ | ✓ | ✅ | +| `/context` | ✓ | ✓ | ✅ | +| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes user settings; no full arrow-key editor | +| `/resume` | ✓ | ✓ | ✅ — lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | +| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | +| `/mcp` | ✓ | ✓ | ✅ | +| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | +| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | +| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | +| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | +| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | +| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | +| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | +| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | +| `/voice` | ✓ | ✓ | 🟡 — local 
whisper.cpp dictation on CLI (`/voice`: ffmpeg/sox → transcribe → pre-fill) **and** desktop (🎙 composer button, ffmpeg). Fully on-device; real-mic round-trip needs local verification | +| `/teleport` | ✓ | ✗ | 🔄 M8 | +| `/desktop` | ✓ | ✗ | 🔄 M6 | +| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | +| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | +| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | +| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | +| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | +| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | +| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | +| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | +| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` comments for the current branch's PR | +| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | +| `/loop` | ✓ | ✗ (skill avail) | 🟡 | +| `/terminal-setup` | ✓ | ✗ | 🔄 | +| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | +| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | +| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | +| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | +| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | +| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | +| `/privacy-settings` | ✓ | ✓ | ✅ — 
summarizes local data locations + what's sent to the DeepSeek API (read-only) | +| `/migrate-installer` | ✓ | ✗ | 🔄 | +| `/release-notes` | ✓ | ✓ | ✅ — prints the latest `CHANGELOG.md` entry | --- diff --git a/docs/VOICE_INPUT.md b/docs/VOICE_INPUT.md index 0112235..fd99640 100644 --- a/docs/VOICE_INPUT.md +++ b/docs/VOICE_INPUT.md @@ -108,8 +108,10 @@ In the CLI REPL, type `/voice` and press Enter. DeepCode: Run `/voice setup` any time to print install steps and what's detected. -In the Mac client (M6-rest), the same flow appears as a 🎙 button in the -composer. +In the Mac desktop client, the same flow is a 🎙 button in the composer: +click to record, click again to stop and transcribe. The desktop path uses +ffmpeg specifically (it stops recording by sending `q` to ffmpeg's stdin) and +prompts for microphone access on first use. ## Privacy From cabd3a6175f8e3a193de37d2ba418f434d895049 Mon Sep 17 00:00:00 2001 From: t Date: Mon, 8 Jun 2026 14:16:24 +0800 Subject: [PATCH 5/5] ci: trigger checks (PR retargeted to main)