diff --git a/README.md b/README.md index f7ab5b7..ffe01e6 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,19 @@ scribe transcribe meeting.wav --language es # Spanish scribe transcribe meeting.wav --language fr # French ``` +### Live streaming from microphone + +```bash +scribe stream # multilingual (25 languages, ~11s latency) +scribe stream --engine nemotron # English-only, low latency (~560ms), with punctuation +scribe stream --format jsonl # JSONL output (one JSON object per line) +scribe stream --output meeting.txt # save to file while streaming +``` + +> **Note on streaming engines:** +> - **Default** (Parakeet TDT v3): Supports all 25 languages including Spanish. Higher latency (~11s for confirmed text) because it uses a batch model in sliding windows. Live preview appears on screen while speaking. +> - **Nemotron** (`--engine nemotron`): English-only but much faster (~560ms latency). Includes punctuation and capitalization. Recommended for English-only meetings. + ### Pre-download models ```bash @@ -143,8 +156,10 @@ Tested on Apple Silicon (M-series): |------|-------|---------| | Transcription only | ~130x real-time | 4-min file in 1.7s | | Transcription + diarization | ~30x real-time | 4-min file in 7.5s | +| Live streaming (Nemotron) | ~560ms latency | English, with punctuation | +| Live streaming (default) | ~11s latency | 25 languages | -Models are downloaded automatically on first use (~600MB for ASR, ~50MB for diarization). +Models are downloaded automatically on first use (~600MB per model). First run may take a minute. ## Requirements @@ -159,7 +174,7 @@ Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, G scribe is built on the shoulders of excellent open-source projects: -- **[NVIDIA Parakeet](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3)** (CC-BY-4.0) — The speech recognition model that powers transcription +- **[NVIDIA Parakeet](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3)** (CC-BY-4.0) — Speech recognition models (TDT v3 for batch, Nemotron for streaming) - **[FluidAudio](https://github.com/FluidInference/FluidAudio)** (Apache 2.0) by FluidInference — CoreML speech processing SDK for Apple Silicon - **[pyannote.audio](https://github.com/pyannote/pyannote-audio)** (MIT) by Herve Bredin — The diarization model architecture - **[swift-argument-parser](https://github.com/apple/swift-argument-parser)** (Apache 2.0) by Apple — CLI argument parsing diff --git a/Sources/scribe/Commands/Stream.swift b/Sources/scribe/Commands/Stream.swift new file mode 100644 index 0000000..705dbb2 --- /dev/null +++ b/Sources/scribe/Commands/Stream.swift @@ -0,0 +1,574 @@ +import ArgumentParser +import AVFoundation +import FluidAudio +import Foundation + +/// Thread-safe state for tracking incremental transcript output. +private actor StreamState { + private var lastFullTranscript = "" + private var printedLength = 0 + + /// Given the full accumulated transcript, return only the new portion. + /// Handles model revisions by finding the longest common prefix. + func getNewText(_ fullTranscript: String) -> String? { + let text = fullTranscript.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty else { return nil } + guard text != lastFullTranscript else { return nil } + + lastFullTranscript = text + + // Find how much of the text we've already printed + if text.count > printedLength { + let newPortion = String(text.dropFirst(printedLength)).trimmingCharacters(in: .whitespaces) + if !newPortion.isEmpty { + printedLength = text.count + return newPortion + } + } + + return nil + } + + /// Get the live preview (last N chars of full transcript). + func getPreview(_ fullTranscript: String, maxLen: Int = 100) -> String? { + let text = fullTranscript.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty else { return nil } + guard text != lastFullTranscript else { return nil } + return String(text.suffix(maxLen)) + } +} + +struct Stream: AsyncParsableCommand { + static let configuration = CommandConfiguration( + abstract: "Stream live audio transcription from microphone." + ) + + @Option(name: .long, help: "Output format: text or jsonl.") + var format: StreamOutputFormat = .text + + @Option(name: .long, help: "Streaming engine: default (multilingual, higher latency) or nemotron (English-only, low latency ~560ms).") + var engine: StreamEngine = .default + + @Option(name: .shortAndLong, help: "Save transcript to file.") + var output: String? + + @Option(name: .long, help: "Save captured audio to WAV file. After streaming ends, the saved audio is automatically re-transcribed with the batch engine for higher accuracy.") + var saveAudio: String? + + @Flag(name: .long, help: "Show status information.") + var verbose: Bool = false + + @Option(name: .long, help: "Read audio from a WAV/audio file instead of microphone (for testing/eval). Exits when file is consumed.") + var audioFile: String? + + func run() async throws { + switch engine { + case .nemotron: + try await runNemotron() + case .default: + try await runSlidingWindow() + } + } + + // MARK: - Nemotron Engine (English-only, low latency) + + private func runNemotron() async throws { + log("Initializing streaming ASR (Nemotron 1120ms, English-only)...") + log("Downloading model if needed (first run only, ~600MB)...") + + let engine = StreamingAsrEngineFactory.create(.nemotron1120ms) + + do { + try await engine.loadModels() + } catch { + log("Model loading failed: \(error.localizedDescription)") + log("Cleaning cache and retrying download...") + + let cacheDir = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first! + .appendingPathComponent("FluidAudio/Models/nemotron-streaming") + try? FileManager.default.removeItem(at: cacheDir) + + let freshEngine = StreamingAsrEngineFactory.create(.nemotron1120ms) + try await freshEngine.loadModels() + log("Retry successful. Warming up...") + try await warmUpEngine(freshEngine) + log("Ready.") + try await runNemotronWithEngine(freshEngine) + return + } + + log("Models loaded. Warming up...") + try await warmUpEngine(engine) + log("Ready.") + try await runNemotronWithEngine(engine) + } + + /// Force CoreML model compilation by running one silent chunk through the engine. + /// Without this, the first real audio chunk blocks for 10-20s while CoreML compiles + /// the neural network, causing all live output to batch up and appear at once. + private func warmUpEngine(_ engine: any StreamingAsrEngine) async throws { + guard let fmt = AVAudioFormat( + commonFormat: .pcmFormatFloat32, sampleRate: 16000, channels: 1, interleaved: false + ) else { return } + + // One complete chunk of silence (1120ms = 17920 samples at 16kHz). + let chunkSamples: AVAudioFrameCount = 17920 + guard let silence = AVAudioPCMBuffer(pcmFormat: fmt, frameCapacity: chunkSamples) else { return } + silence.frameLength = chunkSamples + // Buffer is zero-initialized by default — silence. + + nonisolated(unsafe) let buf = silence + try await engine.appendAudio(buf) + try await engine.processBufferedAudio() + // Discard the silence tokens so they don't pollute the real transcript. + try await engine.reset() + } + + /// Unified entry point for the Nemotron pipeline. Both file and mic modes + /// flow through the same drain loop — the only difference is the audio source. + private func runNemotronWithEngine(_ engine: any StreamingAsrEngine) async throws { + let startTime = Date() + let outputFile = openOutputFile() + + // Open audio recording file if --save-audio was passed. + let audioWriter = try openAudioWriter() + + // Shared queue between the audio source (file or mic) and the drain loop. + // AsyncStream.Continuation.yield() is non-async and audio-thread-safe — this is + // FluidAudio's own pattern (see SlidingWindowAsrManager:63-65, 182-184). + let (stream, continuation) = AsyncStream.makeStream() + + // Start the source. Mic mode keeps the AVAudioEngine and DispatchSourceSignal + // alive in `micResources` for the lifetime of the stream. + var micResources: NemotronMicResources? = nil + if let path = audioFile { + try await feedNemotronFromFile(path: path, continuation: continuation) + } else { + micResources = try startMicSource(continuation: continuation) + } + + // Drain loop runs until the continuation finishes (file end or SIGINT). + try await runNemotronDrainLoop( + engine: engine, + stream: stream, + startTime: startTime, + outputFile: outputFile, + audioWriter: audioWriter + ) + + // Cleanup mic resources after the drain loop has flushed via finish(). + if let res = micResources { + res.audioEngine.stop() + res.audioEngine.inputNode.removeTap(onBus: 0) + res.signalSource.cancel() + } + + outputFile?.closeFile() + + // If we recorded audio, run batch transcription for a polished final transcript. + if let audioPath = saveAudio, audioWriter != nil { + log("Saved audio to: \(audioPath)") + log("Running batch transcription for polished output...") + try await runBatchTranscription(audioPath: audioPath, outputFile: outputFile) + } + } + + /// Drain the audio queue, feed the engine, poll the partial transcript, and emit + /// deltas. Used for both mic and file modes — the queue is the only difference. + private func runNemotronDrainLoop( + engine: any StreamingAsrEngine, + stream: AsyncStream, + startTime: Date, + outputFile: FileHandle?, + audioWriter: AVAudioFile? = nil + ) async throws { + let state = StreamState() + + for await buffer in stream { + // Save audio for later batch processing if --save-audio was passed. + if let writer = audioWriter { + try? writer.write(from: buffer) + } + + nonisolated(unsafe) let buf = buffer + try await engine.appendAudio(buf) + try await engine.processBufferedAudio() + + // Live preview to stderr (text format only) — overwritten on each tick. + let current = await engine.getPartialTranscript() + emitLivePreview(current, startTime: startTime) + + // Delta emission to stdout (and output file). + if let newText = await state.getNewText(current) { + emitDelta(newText, startTime: startTime, outputFile: outputFile) + } + } + + // Stream ended — flush any tail audio via finish() and emit remaining content. + // This is the bug fix from the previous phase: finish() may decode tokens that + // never reached the per-chunk path, so we always run its result through the + // delta logic to capture them. + let finalText = try await engine.finish() + if let newText = await state.getNewText(finalText) { + emitDelta(newText, startTime: startTime, outputFile: outputFile) + } + + // Newline so the final stderr preview line doesn't run into the next prompt. + if format == .text { + FileHandle.standardError.write(Data("\n".utf8)) + } + } + + /// File source adapter — reads the whole file via AudioConverter (16kHz mono Float32), + /// pushes it as a single buffer, and finishes the continuation so the drain loop exits. + private func feedNemotronFromFile( + path: String, + continuation: AsyncStream.Continuation + ) async throws { + let url = URL(fileURLWithPath: path) + guard FileManager.default.fileExists(atPath: url.path) else { + throw ValidationError("Audio file not found: \(path)") + } + + if verbose { log("Loading audio file: \(path)") } + let converter = AudioConverter() + let samples = try converter.resampleAudioFile(url) + let duration = Double(samples.count) / 16000.0 + if verbose { + log(String(format: "Loaded %.0fs (%.1f min), %d samples", duration, duration / 60, samples.count)) + } + log("Streaming from file (no microphone)...") + + guard let buffer = makeMonoFloat32Buffer(from: samples) else { + throw ValidationError("Failed to allocate audio buffer") + } + continuation.yield(buffer) + continuation.finish() + } + + /// Mic source adapter — installs the input tap, pre-resamples each tap buffer to + /// 16kHz mono Float32 (matching the file path), yields it onto the shared stream, + /// and wires SIGINT to a DispatchSourceSignal that finishes the continuation + /// (replacing the brutal Darwin.exit(0) that used to drop tail audio). + private func startMicSource( + continuation: AsyncStream.Continuation + ) throws -> NemotronMicResources { + let audioEngine = AVAudioEngine() + let inputNode = audioEngine.inputNode + let inputFormat = inputNode.outputFormat(forBus: 0) + + if verbose { + log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch") + } + + // AudioConverter is a final class with only immutable state — safe to call + // resampleBuffer() from the audio render thread. + nonisolated(unsafe) let converter = AudioConverter() + nonisolated(unsafe) let cont = continuation + + inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in + do { + let samples = try converter.resampleBuffer(buffer) + if let outBuffer = makeMonoFloat32Buffer(from: samples) { + cont.yield(outBuffer) + } + } catch { + // Drop the buffer on resample error rather than crash the audio thread. + } + } + + try audioEngine.start() + log("Listening (English, low latency)... press Ctrl+C to stop") + + // Graceful shutdown via DispatchSourceSignal — replaces Darwin.exit(0). + // The handler runs on a regular dispatch queue (not the signal context), so + // it's safe to call continuation.finish() from here. + signal(SIGINT, SIG_IGN) + let sigSource = DispatchSource.makeSignalSource(signal: SIGINT, queue: .global()) + sigSource.setEventHandler { + FileHandle.standardError.write(Data("\n[scribe] Stopping...\n".utf8)) + cont.finish() + } + sigSource.resume() + + return NemotronMicResources(audioEngine: audioEngine, signalSource: sigSource) + } + + // MARK: - Nemotron emission helpers + + /// Live preview line on stderr (text format only). Overwrites itself with `\r`. + private func emitLivePreview(_ fullTranscript: String, startTime: Date) { + guard format == .text else { return } + let preview = String(fullTranscript.trimmingCharacters(in: .whitespaces).suffix(100)) + guard !preview.isEmpty else { return } + let elapsed = Date().timeIntervalSince(startTime) + let ts = formatStreamTimestamp(elapsed) + FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8)) + } + + /// Emit a confirmed delta to stdout (and output file). Format-aware. + /// + /// Text mode: tokens flow naturally without timestamps. A newline is inserted + /// after sentence-ending punctuation (. ? !) so output reads as prose: + /// "Hello computer. Can you help me with something?\n" + /// + /// JSONL mode: one JSON object per delta, with timestamp (for machine consumption). + private func emitDelta(_ newText: String, startTime: Date, outputFile: FileHandle?) { + let elapsed = Date().timeIntervalSince(startTime) + + switch format { + case .text: + // Clear the live preview line before writing confirmed text. + FileHandle.standardError.write(Data("\r\u{1B}[K".utf8)) + + // Stream text naturally: append tokens, newline on sentence boundaries. + var textToWrite = newText + // Insert newline after sentence-ending punctuation followed by a space or end. + textToWrite = textToWrite.replacingOccurrences(of: ". ", with: ".\n") + textToWrite = textToWrite.replacingOccurrences(of: "? ", with: "?\n") + textToWrite = textToWrite.replacingOccurrences(of: "! ", with: "!\n") + // Also handle punctuation at the very end of the delta. + if textToWrite.hasSuffix(".") || textToWrite.hasSuffix("?") || textToWrite.hasSuffix("!") { + textToWrite += "\n" + } + + // Write WITHOUT trailing newline (tokens append naturally on the same line). + FileHandle.standardOutput.write(Data(textToWrite.utf8)) + fflush(stdout) + outputFile?.write(Data(textToWrite.utf8)) + + case .jsonl: + let jsonObj: [String: Any] = [ + "time": round(elapsed * 10) / 10, + "text": newText, + ] + guard let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), + let jsonStr = String(data: data, encoding: .utf8) else { + return + } + print(jsonStr) + fflush(stdout) + outputFile?.write(Data((jsonStr + "\n").utf8)) + } + } + + private func openOutputFile() -> FileHandle? { + guard let outputPath = output else { return nil } + FileManager.default.createFile(atPath: outputPath, contents: nil) + return FileHandle(forWritingAtPath: outputPath) + } + + /// Open an AVAudioFile for writing 16kHz mono Float32 WAV — the same format the + /// engine sees, so the saved file can be batch-transcribed directly. + private func openAudioWriter() throws -> AVAudioFile? { + guard let path = saveAudio else { return nil } + guard let fmt = AVAudioFormat( + commonFormat: .pcmFormatFloat32, sampleRate: 16000, channels: 1, interleaved: false + ) else { + throw ValidationError("Failed to create audio format for recording") + } + return try AVAudioFile(forWriting: URL(fileURLWithPath: path), settings: fmt.settings) + } + + /// Run batch transcription (Parakeet TDT v3) on a saved audio file for a polished + /// final transcript. Called after the stream ends when --save-audio is used. + private func runBatchTranscription(audioPath: String, outputFile: FileHandle?) async throws { + let converter = AudioConverter() + let samples = try converter.resampleAudioFile(URL(fileURLWithPath: audioPath)) + let duration = Double(samples.count) / 16000.0 + + let asrModels = try await AsrModels.downloadAndLoad(version: .v3) + let asrManager = AsrManager() + try await asrManager.initialize(models: asrModels) + + let startTime = Date() + let asrResult = try await asrManager.transcribe(samples, source: .system) + let elapsed = Date().timeIntervalSince(startTime) + + log(String(format: "Batch transcription: %.0fs audio in %.1fs (%.0fx real-time)", + duration, elapsed, duration / elapsed)) + + // Emit the polished transcript. + let text = asrResult.text.trimmingCharacters(in: .whitespaces) + + FileHandle.standardError.write(Data("\n--- Polished transcript (batch) ---\n".utf8)) + print(text) + fflush(stdout) + + if let file = outputFile { + file.write(Data("\n--- Polished transcript (batch) ---\n".utf8)) + file.write(Data((text + "\n").utf8)) + } + } + + // MARK: - SlidingWindow Engine (Multilingual, default) + + private func runSlidingWindow() async throws { + if audioFile != nil { + throw ValidationError("--audio-file is not yet supported with the multilingual engine. Use --engine nemotron for now.") + } + + log("Initializing streaming ASR (Parakeet TDT v3, multilingual)...") + log("Downloading model if needed (first run only, ~600MB)...") + + let streamManager = SlidingWindowAsrManager(config: .streaming) + try await streamManager.start(source: .microphone) + + log("Models loaded.") + + let audioEngine = AVAudioEngine() + let inputNode = audioEngine.inputNode + let inputFormat = inputNode.outputFormat(forBus: 0) + + if verbose { + log("Microphone: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) ch") + } + + let startTime = Date() + var outputFile: FileHandle? = nil + var lastConfirmedText = "" + var lastVolatileText = "" + + if let outputPath = output { + FileManager.default.createFile(atPath: outputPath, contents: nil) + outputFile = FileHandle(forWritingAtPath: outputPath) + } + + setupSignalHandler() + + inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { buffer, _ in + nonisolated(unsafe) let buf = buffer + streamManager.streamAudio(buf) + } + + try audioEngine.start() + log("Listening (multilingual)... press Ctrl+C to stop") + + let updates = await streamManager.transcriptionUpdates + for await update in updates { + let text = update.text.trimmingCharacters(in: .whitespaces) + guard !text.isEmpty else { continue } + + let elapsed = Date().timeIntervalSince(startTime) + + if update.isConfirmed { + guard text != lastConfirmedText else { continue } + lastConfirmedText = text + lastVolatileText = "" + emitLine(text: text, elapsed: elapsed, outputFile: outputFile) + } else { + guard text != lastVolatileText else { continue } + lastVolatileText = text + + switch format { + case .text: + let ts = formatStreamTimestamp(elapsed) + let preview = String(text.suffix(100)) + FileHandle.standardError.write(Data("\r\u{1B}[K[\(ts)] \(preview)".utf8)) + case .jsonl: + break + } + } + } + + audioEngine.stop() + inputNode.removeTap(onBus: 0) + _ = try await streamManager.finish() + } + + // MARK: - Helpers + + private func emitLine(text: String, elapsed: Double, outputFile: FileHandle?) { + let line: String + + switch format { + case .text: + let ts = formatStreamTimestamp(elapsed) + FileHandle.standardError.write(Data("\r\u{1B}[K".utf8)) + line = "[\(ts)] \(text)" + case .jsonl: + let jsonObj: [String: Any] = [ + "time": round(elapsed * 10) / 10, + "text": text, + ] + if let data = try? JSONSerialization.data(withJSONObject: jsonObj, options: [.sortedKeys]), + let jsonStr = String(data: data, encoding: .utf8) { + line = jsonStr + } else { + return + } + } + + print(line) + fflush(stdout) + + if let file = outputFile { + file.write(Data((line + "\n").utf8)) + } + } + + private func setupSignalHandler() { + signal(SIGINT) { _ in + FileHandle.standardError.write(Data("\n".utf8)) + Darwin.exit(0) + } + } + + private func log(_ message: String) { + FileHandle.standardError.write(Data("[scribe] \(message)\n".utf8)) + } +} + +private func formatStreamTimestamp(_ seconds: Double) -> String { + let totalSeconds = Int(seconds) + let hours = totalSeconds / 3600 + let minutes = (totalSeconds % 3600) / 60 + let secs = totalSeconds % 60 + if hours > 0 { + return String(format: "%02d:%02d:%02d", hours, minutes, secs) + } + return String(format: "%02d:%02d", minutes, secs) +} + +/// Resources held alive by mic mode for the duration of a stream session. +/// The AVAudioEngine and DispatchSourceSignal must outlive the drain loop — +/// dropping them would stop audio capture and cancel the signal handler. +private struct NemotronMicResources { + let audioEngine: AVAudioEngine + let signalSource: any DispatchSourceSignal +} + +/// Wrap a `[Float]` of 16kHz mono samples into a fresh AVAudioPCMBuffer. +/// Used by both the file source (whole-file buffer) and the mic source +/// (per-tap-buffer wrapping after resampling). +private func makeMonoFloat32Buffer(from samples: [Float]) -> AVAudioPCMBuffer? { + guard let format = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 16000, + channels: 1, + interleaved: false + ) else { + return nil + } + let frameCount = AVAudioFrameCount(samples.count) + guard frameCount > 0, + let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else { + return nil + } + buffer.frameLength = frameCount + if let channelData = buffer.floatChannelData { + samples.withUnsafeBufferPointer { src in + memcpy(channelData[0], src.baseAddress!, samples.count * MemoryLayout.stride) + } + } + return buffer +} + +enum StreamOutputFormat: String, ExpressibleByArgument, CaseIterable, Sendable { + case text, jsonl +} + +enum StreamEngine: String, ExpressibleByArgument, CaseIterable, Sendable { + case `default` + case nemotron +} diff --git a/Sources/scribe/ScribeCLI.swift b/Sources/scribe/ScribeCLI.swift index 8d2e877..8a58abb 100644 --- a/Sources/scribe/ScribeCLI.swift +++ b/Sources/scribe/ScribeCLI.swift @@ -6,7 +6,7 @@ struct ScribeCLI: AsyncParsableCommand { commandName: "scribe", abstract: "State-of-the-art local audio transcription with speaker diarization for macOS.", version: "0.2.1", - subcommands: [Transcribe.self, Models.self], + subcommands: [Transcribe.self, Stream.self, Models.self], defaultSubcommand: Transcribe.self ) } diff --git a/experiments/eval/eval.py b/experiments/eval/eval.py index 1ce2a98..0649bef 100644 --- a/experiments/eval/eval.py +++ b/experiments/eval/eval.py @@ -235,24 +235,28 @@ def load_dipco(session_ids: list[str]) -> list[dict]: # ========== Core eval functions ========== def run_scribe(wav_path: Path, model: str = "large-v3-turbo", - scribe_bin: str = "scribe") -> tuple[str, float]: - """Run scribe transcribe on an audio file. Returns (transcript, elapsed_secs).""" - # Build command — v0.2+ doesn't have --model flag (uses Parakeet by default) + scribe_bin: str = "scribe", mode: str = "batch") -> tuple[str, float]: + """Run scribe on an audio file. Returns (transcript, elapsed_secs). + + mode="batch": calls `scribe transcribe ...` (offline, full audio at once) + mode="streaming": calls `scribe stream --audio-file ...` (chunked through streaming engine) + """ + if mode == "streaming": + return _run_scribe_streaming(wav_path, scribe_bin) + return _run_scribe_batch(wav_path, model, scribe_bin) + + +def _run_scribe_batch(wav_path: Path, model: str, scribe_bin: str) -> tuple[str, float]: cmd = [scribe_bin, "transcribe", str(wav_path), "--format", "txt"] - # Check if scribe supports --model flag (v0.1.x only) + # Check if scribe supports --model flag (v0.1.x only — v0.2+ uses Parakeet by default) version_result = subprocess.run([scribe_bin, "--version"], capture_output=True, text=True) version = version_result.stdout.strip() if version.startswith("0.1"): cmd.extend(["--model", model]) start = time.monotonic() - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=1800, - ) + result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800) elapsed = time.monotonic() - start if result.returncode != 0: @@ -263,13 +267,48 @@ def run_scribe(wav_path: Path, model: str = "large-v3-turbo", text_parts = [] for line in lines: cleaned = re.sub(r"^\[[\d:]+\]\s*(?:(?:Speaker\s+\d+|Unknown):\s*)?", "", line) - cleaned = re.sub(r"\*([^*]+)\*", r"\1", cleaned) # Remove italic markers + cleaned = re.sub(r"\*([^*]+)\*", r"\1", cleaned) cleaned = cleaned.strip() if cleaned and cleaned != "-": text_parts.append(cleaned) - transcript = " ".join(text_parts) - return transcript, elapsed + return " ".join(text_parts), elapsed + + +def _run_scribe_streaming(wav_path: Path, scribe_bin: str) -> tuple[str, float]: + """Feed an audio file through `scribe stream --audio-file` and parse JSONL output. + + Each JSONL line is `{"text": "...", "time": 1.2}` representing a confirmed delta. + Concatenate the `text` fields in order to build the full hypothesis. + """ + cmd = [ + scribe_bin, "stream", + "--audio-file", str(wav_path), + "--engine", "nemotron", + "--format", "jsonl", + ] + + start = time.monotonic() + result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800) + elapsed = time.monotonic() - start + + if result.returncode != 0: + raise RuntimeError(f"scribe stream failed on {wav_path}: {result.stderr}") + + text_parts = [] + for line in result.stdout.strip().split("\n"): + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + text = obj.get("text", "").strip() + if text: + text_parts.append(text) + + return " ".join(text_parts), elapsed def compute_wer(reference: str, hypothesis: str) -> dict: @@ -298,7 +337,7 @@ def get_scribe_version(scribe_bin: str = "scribe") -> str: def run_eval(entries: list[dict], model: str, scribe_version: str, - scribe_bin: str = "scribe") -> list[dict]: + scribe_bin: str = "scribe", mode: str = "batch") -> list[dict]: """Run evaluation on all entries.""" results = [] @@ -308,17 +347,21 @@ def run_eval(entries: list[dict], model: str, scribe_version: str, dur_min = entry["duration_secs"] / 60 print(f"\n--- {fid}: {label} ({dur_min:.1f}m, {entry['speakers']} speakers) ---") - print(f" Transcribing with {model}...") + if mode == "streaming": + print(f" Streaming via scribe stream --audio-file (engine: nemotron)...") + else: + print(f" Transcribing with {model}...") try: - hypothesis, elapsed = run_scribe(entry["wav_path"], model=model, scribe_bin=scribe_bin) + hypothesis, elapsed = run_scribe(entry["wav_path"], model=model, scribe_bin=scribe_bin, mode=mode) except Exception as e: print(f" ERROR: {e}") results.append({ "file_id": fid, "dataset": entry["dataset"], "label": label, - "model": model, + "model": "nemotron-streaming" if mode == "streaming" else model, + "mode": mode, "duration_secs": round(entry["duration_secs"], 1), "speakers": entry["speakers"], "scribe_version": scribe_version, @@ -361,7 +404,8 @@ def run_eval(entries: list[dict], model: str, scribe_version: str, "file_id": fid, "dataset": entry["dataset"], "label": label, - "model": model, + "model": "nemotron-streaming" if mode == "streaming" else model, + "mode": mode, "duration_secs": round(entry["duration_secs"], 1), "speakers": entry["speakers"], "scribe_version": scribe_version, @@ -391,7 +435,7 @@ def write_csv(results: list[dict], output_path: Path): return fieldnames = [ - "file_id", "dataset", "label", "model", + "file_id", "dataset", "label", "model", "mode", "duration_secs", "speakers", "scribe_version", "processing_secs", "rtf", "wer", "mer", "wil", @@ -460,10 +504,13 @@ def main(): parser.add_argument("--download-only", action="store_true") parser.add_argument("--output", type=str, help="Output CSV path") parser.add_argument("--scribe-bin", default="scribe", help="Path to scribe binary") + parser.add_argument("--mode", choices=["batch", "streaming"], default="batch", + help="Eval mode: 'batch' (scribe transcribe) or 'streaming' (scribe stream --audio-file)") args = parser.parse_args() scribe_version = get_scribe_version(args.scribe_bin) print(f"scribe version: {scribe_version} ({args.scribe_bin})") + print(f"Mode: {args.mode}") print(f"Model: {args.model}") print(f"Dataset: {args.dataset}") @@ -481,8 +528,11 @@ def main(): entries.extend(load_tedlium(file_ids)) if args.dataset in ("mls-spanish", "all"): - print("\n--- Loading MLS Spanish ---") - entries.extend(load_mls_spanish(args.max_files)) + if args.mode == "streaming": + print("\n--- Skipping MLS Spanish (Nemotron streaming engine is English-only) ---") + else: + print("\n--- Loading MLS Spanish ---") + entries.extend(load_mls_spanish(args.max_files)) if args.dataset == "dipco": print("\n--- Loading DiPCo ---") @@ -499,12 +549,15 @@ def main(): # Run eval print(f"\n--- Running evaluation ({len(entries)} files) ---") - results = run_eval(entries, args.model, scribe_version, scribe_bin=args.scribe_bin) + results = run_eval(entries, args.model, scribe_version, scribe_bin=args.scribe_bin, mode=args.mode) # Write results - output_path = Path(args.output) if args.output else ( - RESULTS_DIR / f"scribe-{scribe_version}-{args.model}-{args.dataset}.csv" - ) + if args.output: + output_path = Path(args.output) + elif args.mode == "streaming": + output_path = RESULTS_DIR / f"scribe-{scribe_version}-streaming-{args.dataset}.csv" + else: + output_path = RESULTS_DIR / f"scribe-{scribe_version}-{args.model}-{args.dataset}.csv" write_csv(results, output_path) print_summary(results) diff --git a/experiments/eval/results/scribe-0.2.1-large-v3-turbo-earnings21.csv b/experiments/eval/results/scribe-0.2.1-large-v3-turbo-earnings21.csv index faf6aed..9c3ce7e 100644 --- a/experiments/eval/results/scribe-0.2.1-large-v3-turbo-earnings21.csv +++ b/experiments/eval/results/scribe-0.2.1-large-v3-turbo-earnings21.csv @@ -1,6 +1,6 @@ -file_id,dataset,label,model,duration_secs,speakers,scribe_version,processing_secs,rtf,wer,mer,wil,substitutions,deletions,insertions,ref_words,hyp_words,proper_noun_total,proper_noun_found,proper_noun_recall,error -4386541,earnings21,Cumulus Media Inc,large-v3-turbo,1097.1,5,0.2.1,8.1,0.0073,0.1473,0.1404,0.2086,208,59,133,2715,2789,30,22,0.7333, -4387332,earnings21,Zagg Inc,large-v3-turbo,1310.2,6,0.2.1,10.5,0.008,0.127,0.1226,0.1878,287,75,142,3969,4036,36,26,0.7222, -4392809,earnings21,Lands End Inc,large-v3-turbo,1438.0,6,0.2.1,11.0,0.0076,0.1081,0.1052,0.1482,183,143,109,4025,3991,30,24,0.8, -4384683,earnings21,One Gas Inc,large-v3-turbo,1476.8,8,0.2.1,10.6,0.0072,0.1487,0.1419,0.1934,199,166,170,3599,3603,40,29,0.725, -4367318,earnings21,AcelRX Pharmaceuticals Inc,large-v3-turbo,1720.8,7,0.2.1,12.5,0.0073,0.1221,0.1192,0.1727,248,174,106,4323,4255,42,25,0.5952, +file_id,dataset,label,model,mode,duration_secs,speakers,scribe_version,processing_secs,rtf,wer,mer,wil,substitutions,deletions,insertions,ref_words,hyp_words,proper_noun_total,proper_noun_found,proper_noun_recall,error +4386541,earnings21,Cumulus Media Inc,large-v3-turbo,batch,1097.1,5,0.2.1,8.2,0.0075,0.1473,0.1404,0.2086,208,59,133,2715,2789,30,22,0.7333, +4387332,earnings21,Zagg Inc,large-v3-turbo,batch,1310.2,6,0.2.1,10.4,0.008,0.127,0.1226,0.1878,287,75,142,3969,4036,36,26,0.7222, +4392809,earnings21,Lands End Inc,large-v3-turbo,batch,1438.0,6,0.2.1,11.0,0.0076,0.1081,0.1052,0.1482,183,143,109,4025,3991,30,24,0.8, +4384683,earnings21,One Gas Inc,large-v3-turbo,batch,1476.8,8,0.2.1,10.7,0.0073,0.1487,0.1419,0.1934,199,166,170,3599,3603,40,29,0.725, +4367318,earnings21,AcelRX Pharmaceuticals Inc,large-v3-turbo,batch,1720.8,7,0.2.1,12.5,0.0073,0.1221,0.1192,0.1727,248,174,106,4323,4255,42,25,0.5952, diff --git a/experiments/eval/results/scribe-0.2.1-large-v3-turbo-mls-spanish.csv b/experiments/eval/results/scribe-0.2.1-large-v3-turbo-mls-spanish.csv index 19ad949..d8aafb7 100644 --- a/experiments/eval/results/scribe-0.2.1-large-v3-turbo-mls-spanish.csv +++ b/experiments/eval/results/scribe-0.2.1-large-v3-turbo-mls-spanish.csv @@ -1,4 +1,4 @@ -file_id,dataset,label,model,duration_secs,speakers,scribe_version,processing_secs,rtf,wer,mer,wil,substitutions,deletions,insertions,ref_words,hyp_words,error -chapter_11991,mls-spanish,MLS Spanish chapter_11991,large-v3-turbo,4019.9,1,0.2.1,49.9,0.0124,0.1489,0.1473,0.2229,786,520,100,9443,9023, -chapter_1378,mls-spanish,MLS Spanish chapter_1378,large-v3-turbo,5640.8,1,0.2.1,70.6,0.0125,0.0499,0.0495,0.0799,461,145,121,14580,14556, -chapter_567,mls-spanish,MLS Spanish chapter_567,large-v3-turbo,5262.6,1,0.2.1,66.1,0.0126,0.0787,0.0777,0.1228,646,230,186,13489,13445, +file_id,dataset,label,model,mode,duration_secs,speakers,scribe_version,processing_secs,rtf,wer,mer,wil,substitutions,deletions,insertions,ref_words,hyp_words,proper_noun_total,proper_noun_found,proper_noun_recall,error +chapter_11991,mls-spanish,MLS Spanish chapter_11991,large-v3-turbo,batch,4019.9,1,0.2.1,50.2,0.0125,0.1489,0.1473,0.2229,786,520,100,9443,9023,0,0,0.0, +chapter_1378,mls-spanish,MLS Spanish chapter_1378,large-v3-turbo,batch,5640.8,1,0.2.1,70.6,0.0125,0.0499,0.0495,0.0799,461,145,121,14580,14556,0,0,0.0, +chapter_567,mls-spanish,MLS Spanish chapter_567,large-v3-turbo,batch,5262.6,1,0.2.1,66.7,0.0127,0.0787,0.0777,0.1228,646,230,186,13489,13445,0,0,0.0, diff --git a/experiments/eval/results/scribe-0.2.1-large-v3-turbo-tedlium.csv b/experiments/eval/results/scribe-0.2.1-large-v3-turbo-tedlium.csv index 07f94a6..8c8b894 100644 --- a/experiments/eval/results/scribe-0.2.1-large-v3-turbo-tedlium.csv +++ b/experiments/eval/results/scribe-0.2.1-large-v3-turbo-tedlium.csv @@ -1,6 +1,6 @@ -file_id,dataset,label,model,duration_secs,speakers,scribe_version,processing_secs,rtf,wer,mer,wil,substitutions,deletions,insertions,ref_words,hyp_words,error -talk_0,tedlium,TED talk talk_0,large-v3-turbo,1249.0,1,0.2.1,8.6,0.0068,0.0652,0.0646,0.0889,72,90,27,2897,2834, -talk_1,tedlium,TED talk talk_1,large-v3-turbo,1505.8,1,0.2.1,11.1,0.0074,0.0663,0.065,0.091,117,82,89,4342,4349, -talk_2,tedlium,TED talk talk_2,large-v3-turbo,834.0,1,0.2.1,6.3,0.0076,0.0734,0.073,0.1027,73,89,13,2383,2307, -talk_3,tedlium,TED talk talk_3,large-v3-turbo,1095.5,1,0.2.1,7.8,0.0071,0.0378,0.0376,0.0545,52,42,20,3015,2993, -talk_4,tedlium,TED talk talk_4,large-v3-turbo,459.1,1,0.2.1,3.6,0.0078,0.0563,0.0552,0.0776,35,19,31,1510,1522, +file_id,dataset,label,model,mode,duration_secs,speakers,scribe_version,processing_secs,rtf,wer,mer,wil,substitutions,deletions,insertions,ref_words,hyp_words,proper_noun_total,proper_noun_found,proper_noun_recall,error +talk_0,tedlium,TED talk talk_0,large-v3-turbo,batch,1249.0,1,0.2.1,8.7,0.007,0.0652,0.0646,0.0889,72,90,27,2897,2834,0,0,0.0, +talk_1,tedlium,TED talk talk_1,large-v3-turbo,batch,1505.8,1,0.2.1,11.2,0.0074,0.0663,0.065,0.091,117,82,89,4342,4349,0,0,0.0, +talk_2,tedlium,TED talk talk_2,large-v3-turbo,batch,834.0,1,0.2.1,6.3,0.0076,0.0734,0.073,0.1027,73,89,13,2383,2307,0,0,0.0, +talk_3,tedlium,TED talk talk_3,large-v3-turbo,batch,1095.5,1,0.2.1,7.8,0.0071,0.0378,0.0376,0.0545,52,42,20,3015,2993,0,0,0.0, +talk_4,tedlium,TED talk talk_4,large-v3-turbo,batch,459.1,1,0.2.1,3.6,0.0079,0.0563,0.0552,0.0776,35,19,31,1510,1522,0,0,0.0, diff --git a/experiments/eval/results/scribe-0.2.1-streaming-earnings21.csv b/experiments/eval/results/scribe-0.2.1-streaming-earnings21.csv new file mode 100644 index 0000000..da380c5 --- /dev/null +++ b/experiments/eval/results/scribe-0.2.1-streaming-earnings21.csv @@ -0,0 +1,6 @@ +file_id,dataset,label,model,mode,duration_secs,speakers,scribe_version,processing_secs,rtf,wer,mer,wil,substitutions,deletions,insertions,ref_words,hyp_words,proper_noun_total,proper_noun_found,proper_noun_recall,error +4386541,earnings21,Cumulus Media Inc,nemotron-streaming,streaming,1097.1,5,0.2.1,102.9,0.0938,0.2063,0.1987,0.2615,182,274,104,2715,2545,30,22,0.7333, +4387332,earnings21,Zagg Inc,nemotron-streaming,streaming,1310.2,6,0.2.1,121.9,0.093,0.2273,0.2216,0.2789,238,562,102,3969,3509,36,23,0.6389, +4392809,earnings21,Lands End Inc,nemotron-streaming,streaming,1438.0,6,0.2.1,130.1,0.0905,0.1588,0.1535,0.1988,189,312,138,4025,3851,30,22,0.7333, +4384683,earnings21,One Gas Inc,nemotron-streaming,streaming,1476.8,8,0.2.1,137.5,0.0931,0.1814,0.1739,0.2262,197,300,156,3599,3455,40,26,0.65, +4367318,earnings21,AcelRX Pharmaceuticals Inc,nemotron-streaming,streaming,1720.8,7,0.2.1,167.1,0.0971,0.1594,0.1558,0.208,238,353,98,4323,4068,42,24,0.5714, diff --git a/experiments/eval/results/scribe-0.2.1-streaming-tedlium.csv b/experiments/eval/results/scribe-0.2.1-streaming-tedlium.csv new file mode 100644 index 0000000..1e4c22c --- /dev/null +++ b/experiments/eval/results/scribe-0.2.1-streaming-tedlium.csv @@ -0,0 +1,6 @@ +file_id,dataset,label,model,mode,duration_secs,speakers,scribe_version,processing_secs,rtf,wer,mer,wil,substitutions,deletions,insertions,ref_words,hyp_words,proper_noun_total,proper_noun_found,proper_noun_recall,error +talk_0,tedlium,TED talk talk_0,nemotron-streaming,streaming,1249.0,1,0.2.1,106.7,0.0854,0.0746,0.0743,0.0986,72,133,11,2897,2775,0,0,0.0, +talk_1,tedlium,TED talk talk_1,nemotron-streaming,streaming,1505.8,1,0.2.1,139.3,0.0925,0.0905,0.0903,0.1215,140,242,11,4342,4111,0,0,0.0, +talk_2,tedlium,TED talk talk_2,nemotron-streaming,streaming,834.0,1,0.2.1,75.3,0.0903,0.0793,0.0788,0.1116,81,93,15,2383,2305,0,0,0.0, +talk_3,tedlium,TED talk talk_3,nemotron-streaming,streaming,1095.5,1,0.2.1,101.8,0.0929,0.0441,0.044,0.0641,62,64,7,3015,2958,0,0,0.0, +talk_4,tedlium,TED talk talk_4,nemotron-streaming,streaming,459.1,1,0.2.1,44.3,0.0965,0.0609,0.0606,0.0776,26,58,8,1510,1460,0,0,0.0, diff --git a/experiments/eval/summarize.py b/experiments/eval/summarize.py index d915d88..8dc6031 100644 --- a/experiments/eval/summarize.py +++ b/experiments/eval/summarize.py @@ -121,25 +121,33 @@ def print_markdown(results: list[dict]): row += " — |" print(row) - # --- Proper noun recall section (Earnings-21 only) --- + # --- Proper noun recall section (Earnings-21 only, split by mode) --- pn_results = [r for r in results if r["dataset"] == "earnings21" and int(r.get("proper_noun_total", 0)) > 0] if pn_results: print("\n## Proper Noun Recall (Earnings-21)\n") print("WER averages all words equally. Proper noun recall measures how many entity-tagged ") print("phrases (PERSON, ORG, GPE, PRODUCT, LAW, WORK_OF_ART, FAC) appear in the hypothesis.\n") - print("| File | Company | WER | Proper Noun Recall |") - print("|---|---|---|---|") - total_pn = found_pn = 0 - for r in sorted(pn_results, key=lambda x: x["file_id"]): - wer = float(r["wer"]) - pn_total = int(r["proper_noun_total"]) - pn_found = int(r["proper_noun_found"]) - pn_recall = float(r["proper_noun_recall"]) - print(f"| {r['file_id']} | {r['label']} | {wer:.1%} | {pn_recall:.1%} ({pn_found}/{pn_total}) |") - total_pn += pn_total - found_pn += pn_found - agg_recall = found_pn / total_pn if total_pn else 0 - print(f"\n**Aggregate**: {agg_recall:.1%} ({found_pn}/{total_pn} entities)") + + modes = sorted(set(r.get("mode", "batch") for r in pn_results)) + for mode in modes: + mode_results = [r for r in pn_results if r.get("mode", "batch") == mode] + if not mode_results: + continue + label_model = mode_results[0]["model"] + print(f"\n### {mode} ({label_model})\n") + print("| File | Company | WER | Proper Noun Recall |") + print("|---|---|---|---|") + total_pn = found_pn = 0 + for r in sorted(mode_results, key=lambda x: x["file_id"]): + wer = float(r["wer"]) + pn_total = int(r["proper_noun_total"]) + pn_found = int(r["proper_noun_found"]) + pn_recall = float(r["proper_noun_recall"]) + print(f"| {r['file_id']} | {r['label']} | {wer:.1%} | {pn_recall:.1%} ({pn_found}/{pn_total}) |") + total_pn += pn_total + found_pn += pn_found + agg_recall = found_pn / total_pn if total_pn else 0 + print(f"\n**Aggregate**: {agg_recall:.1%} ({found_pn}/{total_pn} entities)") # --- Table 2: Per-file detail for primary model --- primary_model = "large-v3-turbo"