diff --git a/examples/370-swift-ios-live-transcription/.env.example b/examples/370-swift-ios-live-transcription/.env.example new file mode 100644 index 0000000..99314a3 --- /dev/null +++ b/examples/370-swift-ios-live-transcription/.env.example @@ -0,0 +1,2 @@ +# Deepgram — https://console.deepgram.com/ +DEEPGRAM_API_KEY= diff --git a/examples/370-swift-ios-live-transcription/Package.swift b/examples/370-swift-ios-live-transcription/Package.swift new file mode 100644 index 0000000..4117da6 --- /dev/null +++ b/examples/370-swift-ios-live-transcription/Package.swift @@ -0,0 +1,20 @@ +// swift-tools-version: 5.9 +import PackageDescription + +// This Package.swift exists so the example can be opened as a Swift package. +// For a full Xcode project, create a new iOS App target and drag the files +// from src/ into it. The app has no external dependencies — only Apple +// frameworks (AVFoundation, SwiftUI, Foundation). + +let package = Package( + name: "DeepgramLiveTranscription", + platforms: [.iOS(.v17)], + targets: [ + .executableTarget( + name: "DeepgramLiveTranscription", + path: "src", + // Info.plist lives in src/ for the Xcode app target; it is neither a SwiftPM source nor a resource + exclude: ["Info.plist"] + ), + ] +) diff --git a/examples/370-swift-ios-live-transcription/README.md b/examples/370-swift-ios-live-transcription/README.md new file mode 100644 index 0000000..3bfb7f7 --- /dev/null +++ b/examples/370-swift-ios-live-transcription/README.md @@ -0,0 +1,71 @@ +# Swift iOS Live Transcription + +A native SwiftUI iOS app that streams microphone audio to Deepgram's live speech-to-text API over WebSocket using AVAudioEngine. Displays real-time transcription with interim and final results — no third-party dependencies required. + +## What you'll build + +A SwiftUI screen with a microphone button that captures audio from the device microphone using AVAudioEngine, streams 16 kHz mono PCM audio to Deepgram via URLSessionWebSocketTask, and renders a live transcript. Interim results appear in grey as you speak; final results replace them in the primary text color. 
+ +## Prerequisites + +- Xcode 15+ with iOS 17 SDK +- Physical iOS device recommended (the Simulator has limited microphone support) +- Deepgram account — [get a free API key](https://console.deepgram.com/) + +## Environment variables + +| Variable | Where to find it | +|----------|-----------------| +| `DEEPGRAM_API_KEY` | [Deepgram console](https://console.deepgram.com/) → Settings → API Keys | + +## Install and run + +### Option 1: Open as Swift Package + +```bash +cd examples/370-swift-ios-live-transcription +open Package.swift +``` + +Xcode will open the package. Select an iOS device target and run. + +### Option 2: Add to an existing Xcode project + +1. Create a new iOS App project in Xcode (SwiftUI lifecycle), or open an existing one +2. Drag all files from `src/` into the project navigator, replacing the template's own `@main` App file +3. Merge `Info.plist` entries (microphone permission + background audio) +4. Set `DEEPGRAM_API_KEY` in your scheme's environment variables + +### Setting the API key + +In Xcode: **Product → Scheme → Edit Scheme → Run → Arguments → Environment Variables** — add `DEEPGRAM_API_KEY` with your key. + +## Key parameters + +| Parameter | Value | Description | +|-----------|-------|-------------| +| `model` | `nova-3` | Deepgram's flagship STT model (2025) — best accuracy and speed | +| `encoding` | `linear16` | Raw 16-bit signed integer PCM — the format this app's `AVAudioConverter` step outputs | +| `sample_rate` | `16000` | 16 kHz — sufficient for speech; keeps bandwidth low on mobile | +| `interim_results` | `true` | Get partial transcripts while the user is still speaking | +| `utterance_end_ms` | `1000` | Gap (ms) without new words after which Deepgram sends an `UtteranceEnd` message | +| `tag` | `deepgram-examples` | Tags traffic in the Deepgram console for identification | + +## How it works + +1. User taps the microphone button; `AudioCaptureManager` activates the audio session and starts `AVAudioEngine` (iOS shows the microphone permission prompt on first use) +2. 
The engine's input node tap delivers audio buffers at the hardware sample rate; `AVAudioConverter` resamples to 16 kHz mono Int16 PCM +3. `DeepgramClient` opens a WebSocket to `wss://api.deepgram.com/v1/listen` with model, encoding, and sample rate as query parameters; the API key is sent as an `Authorization: Token <API_KEY>` header +4. PCM buffers are sent as binary WebSocket frames (~100 ms chunks); Deepgram returns JSON `Results` messages with `is_final` and `speech_final` flags +5. `TranscriptionViewModel` accumulates final transcripts and shows interim partials; SwiftUI updates the view reactively + +## Production considerations + +- **Don't ship API keys in the binary.** Use a backend token endpoint that issues short-lived Deepgram API keys or proxies the WebSocket connection +- **Handle network transitions** — mobile apps switch between Wi-Fi and cellular; implement WebSocket reconnection with exponential backoff +- **Battery life** — stop the audio engine and close the WebSocket when the app goes to background (`scenePhase` observer) +- **Background audio** — if you need transcription while backgrounded, enable the Audio background mode in Xcode capabilities and keep the audio session active + +## Starter templates + +[deepgram-starters](https://github.com/orgs/deepgram-starters/repositories) diff --git a/examples/370-swift-ios-live-transcription/src/AudioCaptureManager.swift b/examples/370-swift-ios-live-transcription/src/AudioCaptureManager.swift new file mode 100644 index 0000000..1984d1f --- /dev/null +++ b/examples/370-swift-ios-live-transcription/src/AudioCaptureManager.swift @@ -0,0 +1,105 @@ +import AVFoundation + +// Captures microphone audio using AVAudioEngine and delivers raw PCM buffers. +// AVAudioEngine is preferred over AVAudioRecorder because it provides a +// streaming tap (real-time buffer callback) rather than writing to a file. 
+ +protocol AudioCaptureDelegate: AnyObject { + func audioCaptureDidReceive(pcmData: Data) +} + +final class AudioCaptureManager { + weak var delegate: AudioCaptureDelegate? + + private let engine = AVAudioEngine() + // 16 kHz mono LINEAR16 — matches the DeepgramClient's default encoding + private let desiredSampleRate: Double = 16000.0 + private let desiredChannels: UInt32 = 1 + + func startCapture() throws { + let session = AVAudioSession.sharedInstance() + // .measurement avoids system audio processing (echo cancellation, AGC) that would distort + // speech; no options are passed because .duckOthers is only valid for playback-capable categories + try session.setCategory(.record, mode: .measurement) + try session.setActive(true, options: .notifyOthersOnDeactivation) + + let inputNode = engine.inputNode + let inputFormat = inputNode.outputFormat(forBus: 0) + + // Convert hardware format → 16 kHz mono Int16 for Deepgram + guard let targetFormat = AVAudioFormat( + commonFormat: .pcmFormatInt16, + sampleRate: desiredSampleRate, + channels: AVAudioChannelCount(desiredChannels), + interleaved: true + ) else { + throw AudioCaptureError.formatCreationFailed + } + + guard let converter = AVAudioConverter(from: inputFormat, to: targetFormat) else { + throw AudioCaptureError.converterCreationFailed + } + + // Buffer size: 100 ms of audio at input sample rate + let bufferSize = AVAudioFrameCount(inputFormat.sampleRate * 0.1) + + inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: inputFormat) { [weak self] buffer, _ in + self?.convert(buffer: buffer, converter: converter, targetFormat: targetFormat) + } + + try engine.start() + } + + func stopCapture() { + engine.inputNode.removeTap(onBus: 0) + engine.stop() + + try? 
AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation) + } + + private func convert(buffer: AVAudioPCMBuffer, converter: AVAudioConverter, targetFormat: AVAudioFormat) { + let frameCapacity = AVAudioFrameCount( + Double(buffer.frameLength) * (targetFormat.sampleRate / buffer.format.sampleRate) + ) + guard frameCapacity > 0, + let outputBuffer = AVAudioPCMBuffer(pcmFormat: targetFormat, frameCapacity: frameCapacity) + else { return } + + var error: NSError? + var hasData = true + converter.convert(to: outputBuffer, error: &error) { _, outStatus in + if hasData { + hasData = false + outStatus.pointee = .haveData + return buffer + } + outStatus.pointee = .noDataNow + return nil + } + + if let error = error { + print("Audio conversion error: \(error)") + return + } + + // Extract raw Int16 bytes from the converted buffer (mono, so channel 0 holds every sample) + guard let channelData = outputBuffer.int16ChannelData else { return } + let byteCount = Int(outputBuffer.frameLength) * MemoryLayout<Int16>.size + let data = Data(bytes: channelData[0], count: byteCount) + delegate?.audioCaptureDidReceive(pcmData: data) + } +} + +enum AudioCaptureError: LocalizedError { + case formatCreationFailed + case converterCreationFailed + + var errorDescription: String? { + switch self { + case .formatCreationFailed: + return "Failed to create target audio format (16 kHz mono Int16)" + case .converterCreationFailed: + return "Failed to create audio converter from input to target format" + } + } +} diff --git a/examples/370-swift-ios-live-transcription/src/DeepgramClient.swift b/examples/370-swift-ios-live-transcription/src/DeepgramClient.swift new file mode 100644 index 0000000..eb91bef --- /dev/null +++ b/examples/370-swift-ios-live-transcription/src/DeepgramClient.swift @@ -0,0 +1,134 @@ +import Foundation + +// Deepgram live STT WebSocket client. +// There is no official Deepgram Swift SDK. This wraps the WebSocket API +// (wss://api.deepgram.com/v1/listen) with URLSessionWebSocketTask. 
+// If an official SDK is released, replace this file with SDK calls. + +struct DeepgramTranscriptMessage: Decodable { + let type: String + let channel: Channel? + let isFinal: Bool? + let speechFinal: Bool? + + enum CodingKeys: String, CodingKey { + case type, channel + case isFinal = "is_final" + case speechFinal = "speech_final" + } + + struct Channel: Decodable { + let alternatives: [Alternative] + } + + struct Alternative: Decodable { + let transcript: String + let confidence: Double + } +} + +protocol DeepgramClientDelegate: AnyObject { + func deepgramDidConnect() + func deepgramDidDisconnect(error: Error?) + func deepgramDidReceiveTranscript(_ text: String, isFinal: Bool) +} + +final class DeepgramClient { + weak var delegate: DeepgramClientDelegate? + + private var webSocketTask: URLSessionWebSocketTask? + private let apiKey: String + // nova-3 is the current flagship STT model (2025) + private let model: String + private let sampleRate: Int + private let encoding: String + + init(apiKey: String, model: String = "nova-3", sampleRate: Int = 16000, encoding: String = "linear16") { + self.apiKey = apiKey + self.model = model + self.sampleRate = sampleRate + self.encoding = encoding + } + + func connect() { + // tag=deepgram-examples — required to identify example traffic in the Deepgram console + var components = URLComponents(string: "wss://api.deepgram.com/v1/listen")! + components.queryItems = [ + URLQueryItem(name: "model", value: model), + URLQueryItem(name: "encoding", value: encoding), + URLQueryItem(name: "sample_rate", value: String(sampleRate)), + URLQueryItem(name: "channels", value: "1"), + URLQueryItem(name: "interim_results", value: "true"), + // Requests UtteranceEnd messages after a 1000 ms gap in words (needs interim_results); speech_final comes from endpointing + URLQueryItem(name: "utterance_end_ms", value: "1000"), + URLQueryItem(name: "tag", value: "deepgram-examples"), + ] + + var request = URLRequest(url: components.url!) 
+ // iOS URLSession supports custom headers on WebSocket (unlike browsers) + request.setValue("Token \(apiKey)", forHTTPHeaderField: "Authorization") + + // Use the shared session: a per-connection URLSession that is never invalidated would leak + webSocketTask = URLSession.shared.webSocketTask(with: request) + webSocketTask?.resume() + + delegate?.deepgramDidConnect() + listenForMessages() + } + + func sendAudio(_ data: Data) { + webSocketTask?.send(.data(data)) { error in + if let error = error { + print("WebSocket send error: \(error)") + } + } + } + + func disconnect() { + // Send CloseStream per the Deepgram protocol; the task is captured strongly so the socket is still closed if this client is released first + let closeMessage = #"{"type": "CloseStream"}"# + webSocketTask?.send(.string(closeMessage)) { [weak self, task = webSocketTask] _ in + task?.cancel(with: .normalClosure, reason: nil) + self?.webSocketTask = nil + self?.delegate?.deepgramDidDisconnect(error: nil) + } + } + + private func listenForMessages() { + webSocketTask?.receive { [weak self] result in + switch result { + case .success(let message): + switch message { + case .string(let text): + self?.handleMessage(text) + case .data(let data): + if let text = String(data: data, encoding: .utf8) { + self?.handleMessage(text) + } + @unknown default: + break + } + // Keep listening for more messages + self?.listenForMessages() + + case .failure(let error): + self?.delegate?.deepgramDidDisconnect(error: error) + } + } + } + + private func handleMessage(_ text: String) { + guard let data = text.data(using: .utf8), + let message = try? JSONDecoder().decode(DeepgramTranscriptMessage.self, from: data), + message.type == "Results", + let transcript = message.channel?.alternatives.first?.transcript, + !transcript.isEmpty + else { return } + + // is_final=true means Deepgram won't revise this segment further + let isFinal = message.isFinal ?? 
false + DispatchQueue.main.async { [weak self] in + self?.delegate?.deepgramDidReceiveTranscript(transcript, isFinal: isFinal) + } + } +} diff --git a/examples/370-swift-ios-live-transcription/src/DeepgramLiveTranscriptionApp.swift b/examples/370-swift-ios-live-transcription/src/DeepgramLiveTranscriptionApp.swift new file mode 100644 index 0000000..a7d8138 --- /dev/null +++ b/examples/370-swift-ios-live-transcription/src/DeepgramLiveTranscriptionApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct DeepgramLiveTranscriptionApp: App { + var body: some Scene { + WindowGroup { + TranscriptionView() + } + } +} diff --git a/examples/370-swift-ios-live-transcription/src/Info.plist b/examples/370-swift-ios-live-transcription/src/Info.plist new file mode 100644 index 0000000..997b72a --- /dev/null +++ b/examples/370-swift-ios-live-transcription/src/Info.plist @@ -0,0 +1,12 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> + <key>NSMicrophoneUsageDescription</key> + <string>This app needs microphone access to transcribe your speech in real-time using Deepgram.</string> 
+ <key>UIBackgroundModes</key> + <array> + <string>audio</string> + </array> +</dict> +</plist> diff --git a/examples/370-swift-ios-live-transcription/src/TranscriptionView.swift b/examples/370-swift-ios-live-transcription/src/TranscriptionView.swift new file mode 100644 index 0000000..880f54c --- /dev/null +++ b/examples/370-swift-ios-live-transcription/src/TranscriptionView.swift @@ -0,0 +1,75 @@ +import SwiftUI + +struct TranscriptionView: View { + @StateObject private var viewModel = TranscriptionViewModel() + + var body: some View { + NavigationStack { + VStack(spacing: 24) { + ScrollViewReader { proxy in + ScrollView { + VStack(alignment: .leading, spacing: 8) { + if !viewModel.finalTranscript.isEmpty { + Text(viewModel.finalTranscript) + .font(.body) + .frame(maxWidth: .infinity, alignment: .leading) + } + + if !viewModel.interimText.isEmpty { + Text(viewModel.interimText) + .font(.body) + .foregroundStyle(.secondary) + .frame(maxWidth: .infinity, alignment: .leading) + .id("interim") + } + + if viewModel.finalTranscript.isEmpty && viewModel.interimText.isEmpty { + Text("Tap the microphone to start transcribing.") + .font(.body) + .foregroundStyle(.tertiary) + .frame(maxWidth: .infinity, alignment: .center) + .padding(.top, 40) + } + } + .padding(.horizontal) + } + .onChange(of: viewModel.interimText) { + withAnimation { proxy.scrollTo("interim", anchor: .bottom) } + } + } + + if case .error(let message) = viewModel.state { + Text(message) + .font(.caption) + .foregroundStyle(.red) + .padding(.horizontal) + } + + HStack(spacing: 20) { + Button(action: viewModel.toggleListening) { + Image(systemName: isListening ? "stop.circle.fill" : "mic.circle.fill") + .font(.system(size: 56)) + .foregroundStyle(isListening ? .red : .blue) + } + .accessibilityLabel(isListening ? 
"Stop transcription" : "Start transcription") + + if !viewModel.finalTranscript.isEmpty || !viewModel.interimText.isEmpty { + Button(action: viewModel.clearTranscript) { + Image(systemName: "trash.circle.fill") + .font(.system(size: 40)) + .foregroundStyle(.secondary) + } + .accessibilityLabel("Clear transcript") + } + } + .padding(.bottom, 16) + } + .navigationTitle("Deepgram Live STT") + } + } + + private var isListening: Bool { + if case .listening = viewModel.state { return true } + return false + } +} diff --git a/examples/370-swift-ios-live-transcription/src/TranscriptionViewModel.swift b/examples/370-swift-ios-live-transcription/src/TranscriptionViewModel.swift new file mode 100644 index 0000000..6dc78b5 --- /dev/null +++ b/examples/370-swift-ios-live-transcription/src/TranscriptionViewModel.swift @@ -0,0 +1,111 @@ +import Foundation +import SwiftUI + +@MainActor +final class TranscriptionViewModel: ObservableObject { + enum State { + case idle + case listening + case error(String) + } + + @Published var state: State = .idle + @Published var finalTranscript: String = "" + @Published var interimText: String = "" + + private var deepgramClient: DeepgramClient? + private let audioCapture = AudioCaptureManager() + + // In production, fetch a short-lived key from your backend instead. + // Shipping a long-lived API key in a mobile binary means anyone can + // extract it with a decompiler. + private var apiKey: String { + ProcessInfo.processInfo.environment["DEEPGRAM_API_KEY"] ?? "" + } + + func toggleListening() { + switch state { + case .idle, .error: + startListening() + case .listening: + stopListening() + } + } + + func clearTranscript() { + finalTranscript = "" + interimText = "" + } + + private func startListening() { + guard !apiKey.isEmpty else { + state = .error("DEEPGRAM_API_KEY not set. 
Add it to your environment or scheme.") + return + } + + let client = DeepgramClient(apiKey: apiKey) + let coordinator = Coordinator(viewModel: self) + client.delegate = coordinator + audioCapture.delegate = coordinator + self.deepgramClient = client + self._coordinator = coordinator + + do { + try audioCapture.startCapture() + // Connect only once capture is running so a microphone failure never leaves an open socket + client.connect() + state = .listening + } catch { + state = .error("Microphone access failed: \(error.localizedDescription)") + } + } + + private func stopListening() { + audioCapture.stopCapture() + deepgramClient?.disconnect() + deepgramClient = nil + state = .idle + } + + // Coordinator bridges delegate callbacks to the @MainActor view model + private var _coordinator: Coordinator? + + private final class Coordinator: DeepgramClientDelegate, AudioCaptureDelegate { + private weak var viewModel: TranscriptionViewModel? + + init(viewModel: TranscriptionViewModel) { + self.viewModel = viewModel + } + + func deepgramDidConnect() {} + + func deepgramDidDisconnect(error: Error?) { + Task { @MainActor [weak self] in + guard let vm = self?.viewModel, let error = error else { return } + // The socket is gone: stop the microphone so a retry does not start capture on a still-running engine + vm.audioCapture.stopCapture() + vm.state = .error("Disconnected: \(error.localizedDescription)") + } + } + + func deepgramDidReceiveTranscript(_ text: String, isFinal: Bool) { + Task { @MainActor [weak self] in + guard let vm = self?.viewModel else { return } + if isFinal { + // Append to committed transcript; clear partial + let separator = vm.finalTranscript.isEmpty ? 
"" : " " + vm.finalTranscript += separator + text + vm.interimText = "" + } else { + vm.interimText = text + } + } + } + + func audioCaptureDidReceive(pcmData: Data) { + Task { @MainActor [weak self] in + self?.viewModel?.deepgramClient?.sendAudio(pcmData) + } + } + } +} diff --git a/examples/370-swift-ios-live-transcription/tests/test_example.py b/examples/370-swift-ios-live-transcription/tests/test_example.py new file mode 100644 index 0000000..f6e4a88 --- /dev/null +++ b/examples/370-swift-ios-live-transcription/tests/test_example.py @@ -0,0 +1,136 @@ +"""Test Deepgram live STT integration used by the Swift iOS example. + +The SwiftUI app itself requires an iOS device/simulator with a microphone. +This test verifies the Deepgram WebSocket and REST endpoints that the Swift +client wraps — same API, same parameters — using the Python SDK as a +convenient test harness. +""" + +import os +import sys +from pathlib import Path + +# ── Credential check ──────────────────────────────────────────────────────── +# Exit code convention across all examples in this repo: +# 0 = all tests passed +# 1 = real test failure (code bug, assertion error, unexpected API response) +# 2 = missing credentials (expected in CI until secrets are configured) +env_example = Path(__file__).parent.parent / ".env.example" +required = [ + line.split("=")[0].strip() + for line in env_example.read_text().splitlines() + if line and not line.startswith("#") and "=" in line and line[0].isupper() +] +missing = [k for k in required if not os.environ.get(k)] +if missing: + print(f"MISSING_CREDENTIALS: {','.join(missing)}", file=sys.stderr) + sys.exit(2) +# ──────────────────────────────────────────────────────────────────────────── + +from deepgram import DeepgramClient + + +def test_deepgram_live_stt(): + """Verify the Deepgram API key works and nova-3 returns a transcript. + + This exercises the same REST pre-recorded endpoint with the same model + (nova-3) the Swift app uses for live streaming. 
A pass here confirms the + API key and model only; the streaming path is covered by the WebSocket test below. + """ + client = DeepgramClient() + response = client.listen.v1.media.transcribe_url( + url="https://dpgr.am/spacewalk.wav", + model="nova-3", + smart_format=True, + tag="deepgram-examples", + ) + alt = response.results.channels[0].alternatives[0] + transcript = alt.transcript + assert len(transcript) > 50, f"Transcript too short for a spacewalk audio file: '{transcript}'" + words = alt.words or [] + duration = words[-1].end if words else 0.0 + assert duration > 10, f"Expected audio longer than 10s, got {duration}s" + + print("✓ Deepgram STT integration working (validates API key + nova-3 model)") + print(f" Transcript preview: '{transcript[:80]}...'") + + +def test_deepgram_live_websocket(): + """Verify the WebSocket live endpoint accepts our connection and returns results. + + This mirrors the Swift app's URLSessionWebSocketTask connection to + wss://api.deepgram.com/v1/listen with the same query parameters. 
+ """ + import json + import threading + import websocket + + api_key = os.environ["DEEPGRAM_API_KEY"] + url = ( + "wss://api.deepgram.com/v1/listen" + "?model=nova-3&encoding=linear16&sample_rate=16000" + "&channels=1&interim_results=true&utterance_end_ms=1000" + "&tag=deepgram-examples" + ) + + results = [] + connected = threading.Event() + done = threading.Event() + + def on_open(ws): + connected.set() + # Send a small silent audio buffer then close + # 16000 Hz * 2 bytes * 0.5s = 16000 bytes of silence + silence = b"\x00" * 16000 + ws.send(silence, opcode=websocket.ABNF.OPCODE_BINARY) + ws.send(json.dumps({"type": "CloseStream"})) + + def on_message(ws, message): + data = json.loads(message) + results.append(data) + # Deepgram answers CloseStream with any remaining Results followed by a + # final Metadata message, so Metadata marks the end of the stream + if data.get("type") == "Metadata": + done.set() + + def on_close(ws, close_status, close_msg): + done.set() + + def on_error(ws, error): + msg = str(error) + if "\x03\xe8" in msg or "1000" in msg: + done.set() + return + results.append({"error": msg}) + done.set() + + ws = websocket.WebSocketApp( + url, + header={"Authorization": f"Token {api_key}"}, + on_open=on_open, + on_message=on_message, + on_close=on_close, + on_error=on_error, + ) + + thread = threading.Thread(target=ws.run_forever, daemon=True) + thread.start() + + connected.wait(timeout=10) + assert connected.is_set(), "WebSocket failed to connect within 10s" + + done.wait(timeout=15) + + errors = [r for r in results if "error" in r] + assert not errors, f"WebSocket errors: {errors}" + + has_results = any(r.get("type") in ("Results", "Metadata") for r in results) + assert has_results, f"No Results/Metadata messages received. Got: {[r.get('type') for r in results]}" + + print("✓ Deepgram WebSocket live STT connection working") + print(f" Received {len(results)} message(s)") + + +if __name__ == "__main__": + test_deepgram_live_stt() + test_deepgram_live_websocket()