diff --git a/CHANGELOG.md b/CHANGELOG.md
index c215a3f..b00415d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,12 @@
## Unreleased
### Added
+- **JARVIS-style TTS status reports** — Neural voice (edge-tts, en-GB-RyanNeural) speaks concise status after each action ("Got it. 12 words captured", "Located 4 files, sir")
+- **TTS HTTP API server** — `POST http://127.0.0.1:7865/api/speak` endpoint lets any external tool (Claude Code, scripts, etc.) trigger spoken feedback
+- **Feedback hotkey mode** (`Cmd+Shift+F`) — Pastes transcription with TTS endpoint instructions so the receiving LLM can speak back
+- **Chunked TTS playback** — Long text splits into rolling chunks of 2-3 sentences; first chunk plays immediately while the rest generate in the background
+- **Cross-platform TTS** — edge-tts with ffplay/afplay for headless playback on Windows, macOS, and Linux; SAPI/AVSpeechSynthesizer/espeak-ng fallback when offline
+- **TTS config keys** — `tts_enabled`, `tts_voice`, `tts_edge_rate`, `tts_edge_pitch`, `tts_rate`, `tts_volume` in `~/.vibetotext/config.json`
- **Gemini LLM integration** — New `llm.py` module that uses Google Gemini to clean up rambling voice transcriptions into clear prompts and generate structured implementation plans
- **Window state persistence** — History app now remembers its position and size between sessions
- **Startup/stop scripts** — `start-all.sh` and `stop-all.sh` to launch and kill both services in one command
diff --git a/README.md b/README.md
index 47ae3d5..6a825bd 100644
--- a/README.md
+++ b/README.md
@@ -20,12 +20,20 @@ All implementations share the same SQLite database at `~/.vibetotext/history.db`
- `Cmd+Shift` — **Greppy** mode with semantic code search
- `Alt+Shift` — **Cleanup** mode (AI refines rambling into clear prompts)
- `Cmd+Alt` — **Plan** mode (generates structured implementation plans)
+- `Cmd+Shift+F` — **Feedback** mode (pastes transcription with TTS endpoint instructions so any LLM can speak back)
**Fast Local Transcription**
- Whisper.cpp for 2-4x faster transcription than Python Whisper
- Technical vocabulary bias for programming terms
- Auto-paste to cursor
+**JARVIS-style TTS Status Reports**
+- Neural voice (edge-tts, en-GB-RyanNeural) speaks status after each action
+- Chunked playback for long text — the first chunk plays immediately while the rest is generated in the background
+- HTTP API server at `http://127.0.0.1:7865` enables any external tool to speak via `POST /api/speak`
+- Configurable voice, rate, pitch, and volume in `~/.vibetotext/config.json`
+- Falls back to platform TTS (SAPI/say/espeak-ng) when offline
+
## Analytics & Settings

diff --git a/macos-native/Sources/Core/ApiServer.swift b/macos-native/Sources/Core/ApiServer.swift
new file mode 100644
index 0000000..93be85a
--- /dev/null
+++ b/macos-native/Sources/Core/ApiServer.swift
@@ -0,0 +1,166 @@
+import Foundation
+import Network
+
+/// Minimal HTTP API server using NWListener — exposes TTS and status
+/// endpoints for external tools (e.g. DevGlide MCP voice server).
+///
+/// Routes: `POST /api/speak` (JSON body `{"text": "..."}`), `POST /api/stop`,
+/// `GET /api/status`, plus `OPTIONS` for CORS preflight. Every response is
+/// followed by closing the connection (no keep-alive).
+final class ApiServer {
+    /// Largest request (headers + body) that is buffered before answering 413.
+    private static let maxRequestBytes = 1 << 20
+    /// Byte sequence separating the HTTP headers from the body.
+    private static let headerTerminator = Data("\r\n\r\n".utf8)
+
+    private var listener: NWListener?
+    let port: UInt16
+
+    /// - Parameter port: TCP port to listen on (loopback only).
+    init(port: UInt16 = 7865) {
+        self.port = port
+    }
+
+    // MARK: - Lifecycle
+
+    /// Starts listening on `127.0.0.1:port`. Failures are logged, not thrown.
+    func start() {
+        guard let endpointPort = NWEndpoint.Port(rawValue: port) else {
+            print("[API] Failed to start: invalid port \(port)")
+            return
+        }
+        // Replace (and cancel) any listener left over from an earlier start().
+        stop()
+
+        do {
+            let params = NWParameters.tcp
+            // Bind to loopback only: the endpoint triggers speech and must not
+            // be reachable from other machines on the network.
+            params.requiredLocalEndpoint = NWEndpoint.hostPort(host: .ipv4(.loopback), port: endpointPort)
+            params.allowLocalEndpointReuse = true
+
+            let newListener = try NWListener(using: params)
+            newListener.newConnectionHandler = { [weak self] connection in
+                self?.handleConnection(connection)
+            }
+            // Capture the port by value so the handler does not retain `self`.
+            let boundPort = port
+            newListener.stateUpdateHandler = { state in
+                switch state {
+                case .ready:
+                    print("[API] Server running on http://127.0.0.1:\(boundPort)")
+                case .failed(let error):
+                    print("[API] Listener failed: \(error)")
+                default:
+                    break
+                }
+            }
+            newListener.start(queue: .global())
+            listener = newListener
+        } catch {
+            print("[API] Failed to start: \(error)")
+        }
+    }
+
+    /// Stops accepting connections. Safe to call when not started.
+    func stop() {
+        listener?.cancel()
+        listener = nil
+    }
+
+    // MARK: - Connection handling
+
+    private func handleConnection(_ connection: NWConnection) {
+        connection.start(queue: .global())
+        receiveRequest(on: connection, buffered: Data())
+    }
+
+    /// Accumulates bytes until the headers and the announced `Content-Length`
+    /// body have arrived, then routes the request. Clients may deliver headers
+    /// and body in separate TCP segments, so one `receive` is not enough.
+    private func receiveRequest(on connection: NWConnection, buffered: Data) {
+        connection.receive(minimumIncompleteLength: 1, maximumLength: 65536) { [weak self] data, _, isComplete, error in
+            if let error {
+                print("[API] Receive error: \(error)")
+                connection.cancel()
+                return
+            }
+            guard let self else {
+                // Server was deallocated mid-request; don't leak the connection.
+                connection.cancel()
+                return
+            }
+
+            var buffer = buffered
+            if let data {
+                buffer.append(data)
+            }
+            // No new bytes and no error means the peer has nothing more to send.
+            let finished = isComplete || (data?.isEmpty ?? true)
+
+            if buffer.count > ApiServer.maxRequestBytes {
+                self.sendResponse(connection, status: 413, body: #"{"error":"request too large"}"#)
+                return
+            }
+
+            if ApiServer.isCompleteRequest(buffer) || finished {
+                // Decode only once everything is buffered so a multi-byte UTF-8
+                // character split across segments cannot break decoding.
+                guard !buffer.isEmpty, let request = String(data: buffer, encoding: .utf8) else {
+                    connection.cancel()
+                    return
+                }
+                self.handleHTTP(request: request, connection: connection)
+                return
+            }
+
+            self.receiveRequest(on: connection, buffered: buffer)
+        }
+    }
+
+    /// True once `buffer` holds the full header section plus at least
+    /// `Content-Length` body bytes (0 when the header is absent or invalid).
+    private static func isCompleteRequest(_ buffer: Data) -> Bool {
+        guard let headerEnd = buffer.range(of: headerTerminator) else { return false }
+        let headerText = String(decoding: buffer[buffer.startIndex..<headerEnd.lowerBound], as: UTF8.self)
+        let bodyBytes = buffer.endIndex - headerEnd.upperBound
+        return bodyBytes >= contentLength(inHeaders: headerText)
+    }
+
+    /// Parses the `Content-Length` header (case-insensitive) from the raw
+    /// header section; returns 0 when it is missing or not an integer.
+    private static func contentLength(inHeaders headers: String) -> Int {
+        for line in headers.components(separatedBy: "\r\n").dropFirst() {
+            let fields = line.split(separator: ":", maxSplits: 1)
+            guard fields.count == 2,
+                  fields[0].trimmingCharacters(in: .whitespaces).lowercased() == "content-length" else {
+                continue
+            }
+            return Int(fields[1].trimmingCharacters(in: .whitespaces)) ?? 0
+        }
+        return 0
+    }
+
+    // MARK: - HTTP routing
+
+    private func handleHTTP(request: String, connection: NWConnection) {
+        let lines = request.components(separatedBy: "\r\n")
+        guard let requestLine = lines.first else {
+            sendResponse(connection, status: 400, body: #"{"error":"malformed request"}"#)
+            return
+        }
+
+        let parts = requestLine.split(separator: " ", maxSplits: 2)
+        guard parts.count >= 2 else {
+            sendResponse(connection, status: 400, body: #"{"error":"malformed request line"}"#)
+            return
+        }
+
+        let method = String(parts[0])
+        // Route on the path only; ignore any query string.
+        let path = String(parts[1].prefix { $0 != "?" })
+
+        // Handle CORS preflight
+        if method == "OPTIONS" {
+            sendCORSPreflight(connection)
+            return
+        }
+
+        switch (method, path) {
+        case ("POST", "/api/speak"):
+            handleSpeak(request: request, connection: connection)
+
+        case ("POST", "/api/stop"):
+            TtsService.shared.stop()
+            sendResponse(connection, status: 200, body: #"{"status":"stopped"}"#)
+
+        case ("GET", "/api/status"):
+            sendResponse(connection, status: 200, body: #"{"status":"ok","tts_enabled":\#(ConfigStore.shared.ttsEnabled)}"#)
+
+        default:
+            sendResponse(connection, status: 404, body: #"{"error":"not found"}"#)
+        }
+    }
+
+    // MARK: - Endpoint handlers
+
+    /// Handles `POST /api/speak`: expects a JSON object with a non-blank
+    /// string `text` field and hands it to `TtsService`.
+    private func handleSpeak(request: String, connection: NWConnection) {
+        // Extract JSON body after the blank line separating headers from body
+        guard let bodyRange = request.range(of: "\r\n\r\n") else {
+            sendResponse(connection, status: 400, body: #"{"error":"no body"}"#)
+            return
+        }
+        let bodyData = Data(request[bodyRange.upperBound...].utf8)
+
+        guard let json = try? JSONSerialization.jsonObject(with: bodyData) as? [String: Any],
+              let text = (json["text"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines),
+              !text.isEmpty else {
+            sendResponse(connection, status: 400, body: #"{"error":"missing or empty \"text\" field"}"#)
+            return
+        }
+
+        TtsService.shared.speak(text)
+        sendResponse(connection, status: 200, body: #"{"status":"speaking"}"#)
+    }
+
+    // MARK: - Response helpers
+
+    /// Sends a JSON response with CORS headers, then closes the connection.
+    private func sendResponse(_ connection: NWConnection, status: Int, body: String) {
+        let statusText: String
+        switch status {
+        case 200: statusText = "OK"
+        case 400: statusText = "Bad Request"
+        case 404: statusText = "Not Found"
+        case 405: statusText = "Method Not Allowed"
+        case 413: statusText = "Payload Too Large"
+        default: statusText = "Error"
+        }
+
+        send([
+            "HTTP/1.1 \(status) \(statusText)",
+            "Content-Type: application/json",
+            "Access-Control-Allow-Origin: *",
+            "Access-Control-Allow-Methods: GET, POST, OPTIONS",
+            "Access-Control-Allow-Headers: Content-Type",
+            "Content-Length: \(body.utf8.count)",
+            "",
+            body,
+        ], on: connection)
+    }
+
+    /// Answers a CORS preflight with 204 and the allowed methods/headers.
+    private func sendCORSPreflight(_ connection: NWConnection) {
+        send([
+            "HTTP/1.1 204 No Content",
+            "Access-Control-Allow-Origin: *",
+            "Access-Control-Allow-Methods: GET, POST, OPTIONS",
+            "Access-Control-Allow-Headers: Content-Type",
+            "Content-Length: 0",
+            "",
+            "",
+        ], on: connection)
+    }
+
+    /// Writes CRLF-joined `lines` and cancels the connection once processed.
+    private func send(_ lines: [String], on connection: NWConnection) {
+        let payload = Data(lines.joined(separator: "\r\n").utf8)
+        connection.send(content: payload, completion: .contentProcessed { _ in
+            connection.cancel()
+        })
+    }
+}
diff --git a/macos-native/Sources/Core/GeminiService.swift b/macos-native/Sources/Core/GeminiService.swift
index aa2fc04..1310beb 100644
--- a/macos-native/Sources/Core/GeminiService.swift
+++ b/macos-native/Sources/Core/GeminiService.swift
@@ -29,6 +29,18 @@ final class GeminiService {
return try await generateContent(prompt: prompt, apiKey: apiKey, temperature: 0.4, maxTokens: 4096)
}
+ // MARK: - Feedback mode
+
+ /// Produces a short spoken-style reply to `text` using `feedbackPrompt`.
+ ///
+ /// - Parameter text: The transcription to respond to.
+ /// - Returns: The result of `generateContent`, or `nil` when no Gemini API key is configured.
+ /// - Throws: Whatever `generateContent` throws.
+ func feedback(text: String) async throws -> String? {
+     if let apiKey = ConfigStore.shared.geminiAPIKey {
+         // Substitute the transcription into the template's {text} placeholder.
+         let filledPrompt = Self.feedbackPrompt.replacingOccurrences(of: "{text}", with: text)
+         return try await generateContent(
+             prompt: filledPrompt,
+             apiKey: apiKey,
+             temperature: 0.5,
+             maxTokens: 256
+         )
+     }
+
+     print("[Gemini] No API key configured")
+     return nil
+ }
+
// MARK: - REST API call
private func generateContent(prompt: String, apiKey: String, temperature: Double, maxTokens: Int) async throws -> String? {
@@ -88,6 +100,23 @@ final class GeminiService {
Refined output:
"""
+ /// Prompt template for feedback mode. `feedback(text:)` replaces the
+ /// `{text}` placeholder with the transcription before calling `generateContent`.
+ static let feedbackPrompt = """
+ You are JARVIS, a calm and concise AI assistant. The user has spoken to you and needs a brief verbal response.
+
+ Rules:
+ - Respond in 1-3 short sentences MAX. This will be spoken aloud via TTS.
+ - Be direct and helpful. No filler, no "I think", no hedging.
+ - Use natural spoken English — contractions, simple words. No markdown, no bullet points.
+ - If they asked a question, answer it. If they described something, give concise feedback.
+ - Address the user as "sir" occasionally but not every sentence.
+ - Sound like a knowledgeable, confident AI assistant.
+
+ User said:
+ {text}
+
+ Your spoken response:
+ """
+
static let planPrompt = """
You are a senior software architect. Transform a rambling voice description into a concise implementation plan.
diff --git a/macos-native/Sources/Core/HotkeyManager.swift b/macos-native/Sources/Core/HotkeyManager.swift
index d72cadc..2b19870 100644
--- a/macos-native/Sources/Core/HotkeyManager.swift
+++ b/macos-native/Sources/Core/HotkeyManager.swift
@@ -33,6 +33,8 @@ final class HotkeyManager {
private let hotkeys: [HotkeyDef] = [
// cmd+alt+p (key code 35 = 'p')
HotkeyDef(modifiers: [.maskCommand, .maskAlternate], keyCode: 35, mode: "plan"),
+ // cmd+shift+f (key code 3 = 'f')
+ HotkeyDef(modifiers: [.maskCommand, .maskShift], keyCode: 3, mode: "feedback"),
// cmd+alt+shift (modifiers only) — must be before alt+shift (more specific)
HotkeyDef(modifiers: [.maskCommand, .maskAlternate, .maskShift], keyCode: nil, mode: "greppy"),
// alt+shift (modifiers only)
@@ -76,6 +78,7 @@ final class HotkeyManager {
print(" [alt+shift] = cleanup")
print(" [cmd+alt+shift] = greppy")
print(" [cmd+alt+p] = plan")
+ print(" [cmd+shift+f] = feedback")
}
func stop() {
diff --git a/macos-native/Sources/Core/TranscriptionPipeline.swift b/macos-native/Sources/Core/TranscriptionPipeline.swift
index 6ea648e..4d436e6 100644
--- a/macos-native/Sources/Core/TranscriptionPipeline.swift
+++ b/macos-native/Sources/Core/TranscriptionPipeline.swift
@@ -10,6 +10,7 @@ final class TranscriptionPipeline {
private var geminiService: GeminiService?
private var greppyService: GreppyService?
private var waveformController: WaveformOverlayController?
+ private var apiServer: ApiServer?
private var isRecording = false
private var currentMode: String?
@@ -32,6 +33,10 @@ final class TranscriptionPipeline {
}
hotkeyManager?.start()
+
+ apiServer = ApiServer()
+ apiServer?.start()
+
print("[Pipeline] Started — hold hotkey to record")
}
@@ -41,6 +46,8 @@ final class TranscriptionPipeline {
_ = recorder?.stop()
}
waveformController?.hide()
+ TtsService.shared.stop()
+ apiServer?.stop()
}
private func startRecording(mode: String) {
@@ -90,6 +97,7 @@ final class TranscriptionPipeline {
// 2. Process based on mode
var output = text
+ var fileCount = 0
switch mode {
case "cleanup":
if let refined = try await geminiService?.cleanup(text: text) {
@@ -103,7 +111,15 @@ final class TranscriptionPipeline {
let context = await greppyService?.search(query: text) ?? ""
if !context.isEmpty {
output = text + "\n\n" + context
+ fileCount = context.components(separatedBy: "### ").count - 1
}
+ case "feedback":
+ if let feedback = try await geminiService?.feedback(text: text) {
+ TtsService.shared.speak(feedback)
+ } else {
+ TtsService.shared.speak("I couldn't generate feedback, sir")
+ }
+ // output stays as original text for paste
default:
break // transcribe mode: use raw text
}
@@ -119,8 +135,14 @@ final class TranscriptionPipeline {
PasteService.pasteAtCursor(output)
print("[Pipeline] Pasted at cursor.")
+ // 5. Speak status report
+ let status = TtsService.generateStatusMessage(
+ mode: mode, text: text, output: output, fileCount: fileCount)
+ TtsService.shared.speak(status)
+
} catch {
print("[Pipeline] Error: \(error)")
+ TtsService.shared.speak("Processing failed")
}
}
}
diff --git a/macos-native/Sources/Core/TtsService.swift b/macos-native/Sources/Core/TtsService.swift
new file mode 100644
index 0000000..851712c
--- /dev/null
+++ b/macos-native/Sources/Core/TtsService.swift
@@ -0,0 +1,144 @@
+import AVFoundation
+
+/// Fire-and-forget text-to-speech — neural TTS via edge-tts CLI, with
+/// AVSpeechSynthesizer as fallback when edge-tts is unavailable.
+///
+/// Only one utterance is active at a time: each `speak(_:)` or `stop()` call
+/// supersedes whatever was being generated or played before it.
+final class TtsService: NSObject, AVSpeechSynthesizerDelegate {
+    static let shared = TtsService()
+
+    private let synthesizer = AVSpeechSynthesizer()
+
+    /// Guards `activeProcess` and `generation`, which are touched both by
+    /// callers of `speak`/`stop` and by the detached synthesis task.
+    private let stateLock = NSLock()
+
+    /// Active edge-tts or afplay process for single-slot cancellation.
+    private var activeProcess: Process?
+
+    /// Incremented on every `speak`/`stop`; a background task whose token no
+    /// longer matches has been superseded and must not produce audio.
+    private var generation = 0
+
+    private override init() {
+        super.init()
+        synthesizer.delegate = self
+    }
+
+    // MARK: - Public
+
+    /// Speaks `text` asynchronously, cancelling any previous speech.
+    /// Does nothing when TTS is disabled in the config or `text` is blank.
+    func speak(_ text: String) {
+        let config = ConfigStore.shared
+        guard config.ttsEnabled else { return }
+        guard !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { return }
+
+        // Cancel any previous speech and claim the slot for this request.
+        let token = cancelAll()
+
+        Task.detached { [weak self] in
+            let voice = TtsService.nonEmpty(config.ttsVoice) ?? "en-GB-RyanNeural"
+            let rate = TtsService.nonEmpty(config.ttsEdgeRate) ?? "+12%"
+            let pitch = TtsService.nonEmpty(config.ttsEdgePitch) ?? "+1Hz"
+            self?.synthesizeAndPlay(text, voice: voice, rate: rate, pitch: pitch, token: token)
+        }
+    }
+
+    /// Stops generation, playback and fallback speech immediately.
+    func stop() {
+        cancelAll()
+    }
+
+    // MARK: - edge-tts pipeline
+
+    /// Generates an mp3 with the edge-tts CLI and plays it with afplay.
+    /// Falls back to `AVSpeechSynthesizer` when edge-tts fails; stays silent
+    /// when the request identified by `token` has been superseded.
+    private func synthesizeAndPlay(_ text: String, voice: String, rate: String, pitch: String, token: Int) {
+        // Unique file per request so overlapping requests never clobber each other.
+        let mediaURL = FileManager.default.temporaryDirectory
+            .appendingPathComponent("vibetotext_tts_\(UUID().uuidString).mp3")
+        var handedToPlayer = false
+        defer {
+            // Once afplay owns the file its termination handler deletes it.
+            if !handedToPlayer {
+                try? FileManager.default.removeItem(at: mediaURL)
+            }
+        }
+
+        // 1. Generate mp3 via edge-tts CLI
+        let generator = Process()
+        generator.executableURL = URL(fileURLWithPath: "/usr/bin/env")
+        generator.arguments = [
+            "edge-tts",
+            "--voice", voice,
+            // `--opt=value` form: values starting with "-" (e.g. "-10%") would
+            // otherwise be parsed as a separate option by the CLI.
+            "--rate=\(rate)",
+            "--pitch=\(pitch)",
+            "--text=\(text)",
+            "--write-media", mediaURL.path,
+        ]
+        generator.standardOutput = FileHandle.nullDevice
+        generator.standardError = FileHandle.nullDevice
+
+        do {
+            try generator.run()
+            // Register the generator so stop() can interrupt a slow synthesis.
+            guard adopt(generator, token: token) else {
+                generator.terminate()
+                return
+            }
+            generator.waitUntilExit()
+            guard isCurrent(token) else { return }
+
+            guard generator.terminationStatus == 0 else {
+                print("[TTS] edge-tts exited with status \(generator.terminationStatus), falling back")
+                speakFallback(text)
+                return
+            }
+
+            // 2. Play with afplay
+            let player = Process()
+            player.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
+            player.arguments = [mediaURL.path]
+            player.terminationHandler = { _ in
+                try? FileManager.default.removeItem(at: mediaURL)
+            }
+            try player.run()
+            handedToPlayer = true
+            if !adopt(player, token: token) {
+                // Superseded between generation and playback.
+                player.terminate()
+            }
+        } catch {
+            guard isCurrent(token) else { return }
+            print("[TTS] edge-tts failed: \(error), falling back")
+            speakFallback(text)
+        }
+    }
+
+    // MARK: - Cancellation state
+
+    /// Invalidates all in-flight work, terminates the active process, stops
+    /// fallback speech, and returns the token for the next request.
+    @discardableResult
+    private func cancelAll() -> Int {
+        stateLock.lock()
+        generation += 1
+        let token = generation
+        let superseded = activeProcess
+        activeProcess = nil
+        stateLock.unlock()
+
+        if let superseded, superseded.isRunning {
+            superseded.terminate()
+        }
+
+        // Also stop AVSpeechSynthesizer fallback
+        if synthesizer.isSpeaking {
+            synthesizer.stopSpeaking(at: .immediate)
+        }
+        return token
+    }
+
+    /// Records `process` as the active one if `token` is still current.
+    /// Returns `false` when the request has been superseded.
+    private func adopt(_ process: Process, token: Int) -> Bool {
+        stateLock.lock()
+        defer { stateLock.unlock() }
+        guard token == generation else { return false }
+        activeProcess = process
+        return true
+    }
+
+    private func isCurrent(_ token: Int) -> Bool {
+        stateLock.lock()
+        defer { stateLock.unlock() }
+        return token == generation
+    }
+
+    /// Treats `nil` and the empty string alike as "not configured".
+    private static func nonEmpty(_ value: String?) -> String? {
+        guard let value, !value.isEmpty else { return nil }
+        return value
+    }
+
+    // MARK: - Fallback (AVSpeechSynthesizer)
+
+    private func speakFallback(_ text: String) {
+        let config = ConfigStore.shared
+
+        // Cancel any previous fallback speech
+        if synthesizer.isSpeaking {
+            synthesizer.stopSpeaking(at: .immediate)
+        }
+
+        let utterance = AVSpeechUtterance(string: text)
+
+        // Convert WPM (default 200) to AVSpeechUtterance rate (0.0 - 1.0)
+        let wpm = Float(config.ttsRate)
+        utterance.rate = max(AVSpeechUtteranceMinimumSpeechRate,
+                             min(AVSpeechUtteranceMaximumSpeechRate,
+                                 (wpm / 200.0) * AVSpeechUtteranceDefaultSpeechRate))
+
+        // Volume: 0-100 -> 0.0-1.0, clamped so out-of-range config values stay valid
+        utterance.volume = max(0.0, min(1.0, Float(config.ttsVolume) / 100.0))
+
+        // `tts_voice` may hold an edge-tts name that AVSpeechSynthesisVoice does
+        // not know; in that case fall through to the default voice below.
+        let configuredVoice = TtsService.nonEmpty(config.ttsVoice).flatMap { id in
+            AVSpeechSynthesisVoice(identifier: id) ?? AVSpeechSynthesisVoice(language: id)
+        }
+        // Default: Daniel (British male) for Jarvis feel
+        utterance.voice = configuredVoice
+            ?? AVSpeechSynthesisVoice(identifier: "com.apple.voice.compact.en-GB.Daniel")
+            ?? AVSpeechSynthesisVoice(language: "en-GB")
+
+        synthesizer.speak(utterance)
+    }
+
+    // MARK: - Status messages
+
+    /// Builds the short spoken status line for a finished action.
+    ///
+    /// - Parameters:
+    ///   - mode: Pipeline mode ("greppy", "cleanup", "plan", "feedback"; anything else is plain transcription).
+    ///   - text: The raw transcription.
+    ///   - output: The text that was pasted.
+    ///   - fileCount: Number of files found (used by "greppy" only).
+    static func generateStatusMessage(mode: String, text: String, output: String, fileCount: Int = 0) -> String {
+        switch mode {
+        case "greppy":
+            return fileCount == 1 ? "Located one file, sir" : "Located \(fileCount) files, sir"
+        case "cleanup":
+            // Paragraphs are blank-line separated, non-empty chunks.
+            let n = output.components(separatedBy: "\n\n")
+                .filter { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }
+                .count
+            return "All tidied up. \(counted(n, "paragraph")) ready"
+        case "plan":
+            // Steps are lines starting with "1." / "1)" / "1:" or a "-" / "*" bullet.
+            let n = output.components(separatedBy: "\n")
+                .filter { line in
+                    let trimmed = line.trimmingCharacters(in: .whitespaces)
+                    return trimmed.range(of: #"^(\d+[\.\):]|[-*])\s"#, options: .regularExpression) != nil
+                }
+                .count
+            return "Plan's ready. \(counted(n, "step")) laid out"
+        case "feedback":
+            return "Feedback spoken, sir"
+        default:
+            // Split on any whitespace so multi-line transcriptions are counted correctly.
+            let n = text.split(whereSeparator: \.isWhitespace).count
+            return "Got it. \(counted(n, "word")) captured"
+        }
+    }
+
+    /// Formats `n` with a correctly pluralised noun ("1 word", "2 words").
+    private static func counted(_ n: Int, _ singular: String) -> String {
+        n == 1 ? "\(n) \(singular)" : "\(n) \(singular)s"
+    }
+}
diff --git a/macos-native/Sources/Data/ConfigStore.swift b/macos-native/Sources/Data/ConfigStore.swift
index db7283b..3bf9e87 100644
--- a/macos-native/Sources/Data/ConfigStore.swift
+++ b/macos-native/Sources/Data/ConfigStore.swift
@@ -11,6 +11,12 @@ final class ConfigStore: ObservableObject {
@Published var audioDeviceName: String?
@Published var codebasePath: String?
@Published var customDictionary: [String]
+ @Published var ttsEnabled: Bool = true
+ @Published var ttsRate: Int = 200
+ @Published var ttsVolume: Int = 80
+ @Published var ttsVoice: String?
+ @Published var ttsEdgeRate: String?
+ @Published var ttsEdgePitch: String?
private init() {
let dir = FileManager.default.homeDirectoryForCurrentUser
@@ -32,6 +38,12 @@ final class ConfigStore: ObservableObject {
audioDeviceName = json["audio_device_name"] as? String
codebasePath = json["codebase_path"] as? String
customDictionary = json["custom_dictionary"] as? [String] ?? []
+ ttsEnabled = json["tts_enabled"] as? Bool ?? true
+ ttsRate = json["tts_rate"] as? Int ?? 200
+ ttsVolume = json["tts_volume"] as? Int ?? 80
+ ttsVoice = json["tts_voice"] as? String
+ ttsEdgeRate = json["tts_edge_rate"] as? String
+ ttsEdgePitch = json["tts_edge_pitch"] as? String
} catch {
print("[ConfigStore] Failed to load config: \(error)")
}
@@ -68,6 +80,24 @@ final class ConfigStore: ObservableObject {
json.removeValue(forKey: "codebase_path")
}
json["custom_dictionary"] = customDictionary
+ json["tts_enabled"] = ttsEnabled
+ json["tts_rate"] = ttsRate
+ json["tts_volume"] = ttsVolume
+ if let voice = ttsVoice {
+ json["tts_voice"] = voice
+ } else {
+ json.removeValue(forKey: "tts_voice")
+ }
+ if let edgeRate = ttsEdgeRate {
+ json["tts_edge_rate"] = edgeRate
+ } else {
+ json.removeValue(forKey: "tts_edge_rate")
+ }
+ if let edgePitch = ttsEdgePitch {
+ json["tts_edge_pitch"] = edgePitch
+ } else {
+ json.removeValue(forKey: "tts_edge_pitch")
+ }
let data = try JSONSerialization.data(withJSONObject: json, options: [.prettyPrinted, .sortedKeys])
try data.write(to: configURL, options: .atomic)
diff --git a/src/vibetotext/__main__.py b/src/vibetotext/__main__.py
index b463bfc..4306bc2 100644
--- a/src/vibetotext/__main__.py
+++ b/src/vibetotext/__main__.py
@@ -18,8 +18,10 @@
from vibetotext.greppy import search_files, format_files_for_context
from vibetotext.llm import cleanup_text, generate_implementation_plan
from vibetotext.output import paste_at_cursor
+from vibetotext.tts import speak, speak_status
from vibetotext.history import TranscriptionHistory
from vibetotext.socket_server import TranscriptionSocketServer
+from vibetotext.api_server import ApiServer
IS_WINDOWS = platform.system() == "Windows"
@@ -117,6 +119,11 @@ def main():
default="cmd+alt+/",
help="Hotkey for implementation plan mode (default: cmd+alt+/)",
)
+ parser.add_argument(
+ "--feedback-hotkey",
+ default="cmd+shift+f",
+ help="Hotkey for voice feedback mode (default: cmd+shift+f)",
+ )
parser.add_argument(
"--history-hotkey",
default="ctrl+alt",
@@ -215,6 +222,7 @@ def main():
args.greppy_hotkey: "greppy",
args.cleanup_hotkey: "cleanup",
args.plan_hotkey: "plan",
+ args.feedback_hotkey: "feedback",
args.history_hotkey: "history",
args.viz_hotkey: "viz",
}
@@ -239,8 +247,13 @@ def main():
socket_server = TranscriptionSocketServer(transcriber)
socket_server.start()
+ # Start HTTP API server (TTS endpoint for external tools)
+ api_server = ApiServer(port=saved_config.get("api_port", 7865))
+ api_server.start()
+
# Ensure cleanup on any exit (crash, signal, etc.)
def _cleanup():
+ api_server.stop()
if socket_server:
try:
socket_server.stop()
@@ -330,6 +343,7 @@ def on_stop(mode):
if text_lower in noise_markers:
return
+ files = []
if mode == "greppy":
# Greppy mode: search for relevant files and attach them
files = search_files(text, limit=args.greppy_limit, codebase=args.codebase)
@@ -347,6 +361,16 @@ def on_stop(mode):
plan = generate_implementation_plan(text)
output = plan if plan else text
+ elif mode == "feedback":
+ # Feedback mode: paste transcription + TTS endpoint instructions
+ # so the receiving LLM knows how to speak back
+ speak_url = f"{api_server.url}/api/speak"
+ output = (
+ f"{text}\n\n"
+ f"[To respond verbally to the user, POST to {speak_url} "
+ f'with body: {{"text": "your response here"}}]'
+ )
+
else:
# Regular transcribe mode - just transcribe, no context search
output = text
@@ -357,7 +381,12 @@ def on_stop(mode):
# Paste at cursor
paste_at_cursor(output)
+ # Speak status report (skipped in feedback mode, where the receiving LLM is expected to speak via the TTS endpoint)
+ if mode != "feedback":
+ speak_status(mode, text, output, file_count=len(files))
+
except Exception:
+ speak("Processing failed")
# Log error to file
error_log = os.path.join(tempfile.gettempdir(), "vibetotext_crash.log")
error_msg = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Error in on_stop (mode={mode}):\n"
diff --git a/src/vibetotext/api_server.py b/src/vibetotext/api_server.py
new file mode 100644
index 0000000..fdbe562
--- /dev/null
+++ b/src/vibetotext/api_server.py
@@ -0,0 +1,92 @@
+"""HTTP API server — exposes TTS and status endpoints for external tools."""
+
+import json
+import threading
+from http.server import HTTPServer, BaseHTTPRequestHandler
+
+from vibetotext.tts import speak
+
+# Port ApiServer listens on when constructed without an explicit port.
+DEFAULT_PORT = 7865
+
+
+class _Handler(BaseHTTPRequestHandler):
+    """Handle API requests: ``GET /api/status`` and ``POST /api/speak``.
+
+    All responses are JSON and carry a permissive CORS header.
+    """
+
+    # Largest /api/speak body accepted; anything bigger is rejected with 413.
+    _MAX_BODY_BYTES = 1 << 20
+
+    def log_message(self, format, *args):
+        # Suppress default stderr logging
+        pass
+
+    def _send_json(self, status: int, data: dict):
+        """Send ``data`` as a JSON response with the given HTTP status."""
+        payload = json.dumps(data).encode("utf-8")
+        self.send_response(status)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(payload)))
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.end_headers()
+        self.wfile.write(payload)
+
+    def do_OPTIONS(self):
+        """Handle CORS preflight."""
+        self.send_response(204)
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
+        self.send_header("Access-Control-Allow-Headers", "Content-Type")
+        self.send_header("Content-Length", "0")
+        self.end_headers()
+
+    def do_GET(self):
+        if self.path == "/api/status":
+            self._send_json(200, {"status": "ok", "service": "vibetotext"})
+        else:
+            self._send_json(404, {"error": "not found"})
+
+    def do_POST(self):
+        if self.path == "/api/speak":
+            self._handle_speak()
+        else:
+            self._send_json(404, {"error": "not found"})
+
+    def _handle_speak(self):
+        """Validate the request body and speak its ``text`` field.
+
+        Responds 400 for a bad Content-Length, invalid JSON, or a missing /
+        non-string / blank ``text``; 413 for an oversized body; 500 if the
+        TTS call itself raises; 200 otherwise.
+        """
+        try:
+            length = int(self.headers.get("Content-Length", 0))
+        except (TypeError, ValueError):
+            length = -1
+        if length < 0:
+            # A negative length would make rfile.read() block until EOF.
+            self._send_json(400, {"error": "invalid Content-Length"})
+            return
+        if length > self._MAX_BODY_BYTES:
+            self._send_json(413, {"error": "request body too large"})
+            return
+
+        try:
+            data = json.loads(self.rfile.read(length))
+        except ValueError:
+            # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
+            self._send_json(400, {"error": "invalid JSON"})
+            return
+
+        # Only a JSON object with a string "text" is acceptable.
+        raw_text = data.get("text") if isinstance(data, dict) else None
+        text = raw_text.strip() if isinstance(raw_text, str) else ""
+        if not text:
+            self._send_json(400, {"error": "missing 'text' field"})
+            return
+
+        try:
+            speak(text)
+        except Exception as e:
+            self._send_json(500, {"error": str(e)})
+            return
+        self._send_json(200, {"status": "speaking", "text": text})
+
+
+class ApiServer:
+    """Lightweight HTTP server for TTS and status endpoints.
+
+    Listens on 127.0.0.1 only and serves requests on a daemon thread.
+    """
+
+    def __init__(self, port: int = DEFAULT_PORT):
+        self.port = port
+        self._server: HTTPServer | None = None
+        self._thread: threading.Thread | None = None
+
+    def start(self):
+        """Bind the port and start serving in the background.
+
+        A bind failure is printed rather than raised. Calling start() while
+        already running is a no-op.
+        """
+        if self._server is not None:
+            return
+        try:
+            server = HTTPServer(("127.0.0.1", self.port), _Handler)
+        except OSError as e:
+            print(f"[API] Failed to start server on port {self.port}: {e}")
+            return
+
+        # Reflect the real port (matters when port 0 asked the OS to pick one)
+        # so that `url` is always accurate.
+        self.port = server.server_address[1]
+        self._server = server
+        self._thread = threading.Thread(target=server.serve_forever, daemon=True)
+        self._thread.start()
+        print(f"[API] Server running on http://127.0.0.1:{self.port}")
+        print(f"[API] POST /api/speak {{\"text\": \"...\"}}")
+        print(f"[API] GET /api/status")
+
+    def stop(self):
+        """Stop serving and release the listening socket. Safe to call twice."""
+        server, thread = self._server, self._thread
+        self._server = None
+        self._thread = None
+        if server is None:
+            return
+        server.shutdown()
+        # shutdown() only ends serve_forever(); the socket must be closed
+        # explicitly or the port stays bound.
+        server.server_close()
+        if thread is not None and thread is not threading.current_thread():
+            thread.join(timeout=1)
+
+    @property
+    def url(self) -> str:
+        """Base URL of the server, e.g. ``http://127.0.0.1:7865``."""
+        return f"http://127.0.0.1:{self.port}"
diff --git a/src/vibetotext/cli.py b/src/vibetotext/cli.py
index c2f9a73..b8841c7 100644
--- a/src/vibetotext/cli.py
+++ b/src/vibetotext/cli.py
@@ -15,8 +15,10 @@
from .greppy import search_files, format_files_for_context
from .llm import cleanup_text, generate_implementation_plan
from .output import paste_at_cursor
+from .tts import speak, speak_status
from .history import TranscriptionHistory
from .history_ui import toggle_history, refresh_history
+from .api_server import ApiServer
def main():
@@ -49,6 +51,11 @@ def main():
default="cmd+alt+p",
help="Hotkey for implementation plan mode (default: cmd+alt+p)",
)
+ parser.add_argument(
+ "--feedback-hotkey",
+ default="cmd+shift+f",
+ help="Hotkey for voice feedback mode (default: cmd+shift+f)",
+ )
parser.add_argument(
"--history-hotkey",
default="ctrl+alt",
@@ -139,6 +146,7 @@ def main():
args.greppy_hotkey: "greppy",
args.cleanup_hotkey: "cleanup",
args.plan_hotkey: "plan",
+ args.feedback_hotkey: "feedback",
args.history_hotkey: "history",
args.viz_hotkey: "viz",
}
@@ -151,6 +159,10 @@ def main():
if ui:
recorder.on_level = ui.update_waveform
+ # Start HTTP API server (TTS endpoint for external tools)
+ api_server = ApiServer()
+ api_server.start()
+
print(f"vibetotext ready. Hold hotkey to record, release to process.")
print(f" [{args.hotkey}] = transcribe + paste")
print(f" [{args.greppy_hotkey}] = Greppy search + attach files")
@@ -277,6 +289,7 @@ def on_stop(mode):
print(f"Transcribed: {text}")
_log(f"on_stop: transcribed text='{text[:80]}'")
+ files = []
if mode == "greppy":
# Greppy mode: search for relevant files and attach them
print("Searching with Greppy...", end="", flush=True)
@@ -324,6 +337,17 @@ def on_stop(mode):
print(" failed, using original.")
output = text
+ elif mode == "feedback":
+ # Feedback mode: paste transcription + TTS endpoint instructions
+ _log("on_stop: feedback mode — appending TTS instructions")
+ speak_url = f"{api_server.url}/api/speak"
+ output = (
+ f"{text}\n\n"
+ f"[To respond verbally to the user, POST to {speak_url} "
+ f'with body: {{"text": "your response here"}}]'
+ )
+ print(f"Feedback mode: pasting with TTS endpoint {speak_url}")
+
else:
# Regular transcribe mode
if not args.no_context:
@@ -352,10 +376,15 @@ def on_stop(mode):
_log(f"on_stop: paste_at_cursor() done ({time.time() - t:.3f}s)")
print("Pasted at cursor.\n")
+ # Speak status report (skipped in feedback mode, where the receiving LLM is expected to speak via the TTS endpoint)
+ if mode != "feedback":
+ speak_status(mode, text, output, file_count=len(files))
+
_log(f"on_stop: END total={time.time() - stop_t0:.3f}s")
except Exception as e:
_log(f"on_stop: EXCEPTION after {time.time() - stop_t0:.3f}s: {e}")
+ speak("Processing failed")
# Log error to file and print to console
error_log = os.path.join(tempfile.gettempdir(), "vibetotext_crash.log")
diff --git a/src/vibetotext/llm.py b/src/vibetotext/llm.py
index 0f3b7b1..74e4190 100644
--- a/src/vibetotext/llm.py
+++ b/src/vibetotext/llm.py
@@ -178,3 +178,51 @@ def generate_implementation_plan(text: str) -> Optional[str]:
except Exception as e:
print(f"Gemini plan generation error: {e}")
return None
+
+
+# Prompt template for generate_feedback(), filled in with str.format().
+# Placeholders: {text} (the user's transcription) and {speak_url} (TTS endpoint URL).
+# The doubled braces ({{ }}) are str.format escapes that render as literal JSON braces.
+FEEDBACK_PROMPT = """You are JARVIS, a calm and concise AI assistant. The user has spoken to you and needs a brief verbal response.
+
+Rules:
+- Respond in 1-3 short sentences MAX. This will be spoken aloud via TTS.
+- Be direct and helpful. No filler, no "I think", no hedging.
+- Use natural spoken English — contractions, simple words. No markdown, no bullet points.
+- If they asked a question, answer it. If they described something, give concise feedback.
+- Address the user as "sir" occasionally but not every sentence.
+- Sound like a knowledgeable, confident AI assistant.
+
+To speak your response aloud, send a POST request to the user's TTS endpoint:
+ POST {speak_url}
+ Content-Type: application/json
+ {{"text": "your response here"}}
+
+User said:
+{text}
+
+Your spoken response:"""
+
+
+def generate_feedback(text: str, speak_url: str = "http://127.0.0.1:7865/api/speak") -> Optional[str]:
+    """Ask Gemini for a brief, speakable reply to ``text``.
+
+    Args:
+        text: The user's transcription.
+        speak_url: TTS endpoint URL inserted into the prompt.
+
+    Returns:
+        The stripped model reply, or None when the client is unavailable,
+        the reply is empty, or the request fails (the error is printed).
+    """
+    if not _genai_available:
+        print("Gemini feedback error: google-genai package not installed")
+        return None
+    if not _client:
+        print("Gemini feedback error: No API key configured")
+        return None
+
+    try:
+        filled_prompt = FEEDBACK_PROMPT.format(text=text, speak_url=speak_url)
+        generation_config = genai_types.GenerateContentConfig(
+            temperature=0.5,
+            max_output_tokens=256,
+        )
+        response = _client.models.generate_content(
+            model="gemini-2.5-flash",
+            contents=filled_prompt,
+            config=generation_config,
+        )
+        return response.text.strip() if response.text else None
+    except Exception as e:
+        print(f"Gemini feedback error: {e}")
+        return None
diff --git a/src/vibetotext/tts.py b/src/vibetotext/tts.py
new file mode 100644
index 0000000..71dff6e
--- /dev/null
+++ b/src/vibetotext/tts.py
@@ -0,0 +1,291 @@
+"""Text-to-speech status reports — neural TTS via edge-tts, fire-and-forget."""
+
+import atexit
+import asyncio
+import json
+import os
+import platform
+import subprocess
+import re
+import tempfile
+import threading
+from pathlib import Path
+
+# OS name as reported by platform.system(); compared against "Windows" and
+# "Darwin" below, anything else takes the generic branch.
+SYSTEM = platform.system()
+
+# Most recently started playback/speech subprocess, so stop() can kill it.
+_active_process: subprocess.Popen | None = None
+# Guards reads/writes of _active_process in stop() and _play_mp3_blocking().
+_tts_lock = threading.Lock()
+# Set by stop(); the chunked playback loop polls it to abandon remaining chunks.
+_stop_event = threading.Event()
+
+# edge-tts voice settings (JARVIS-style: Ryan British, fast & crisp)
+# These are defaults; the tts_voice / tts_edge_rate / tts_edge_pitch config
+# keys override them when set.
+_EDGE_VOICE = "en-GB-RyanNeural"
+_EDGE_RATE = "+12%"
+_EDGE_PITCH = "+1Hz"
+
+# Chunking threshold: texts longer than this many chars get chunked
+_CHUNK_THRESHOLD = 100
+
+
+def _load_config() -> dict:
+ """Load TTS settings from ~/.vibetotext/config.json."""
+ try:
+ config_file = Path.home() / ".vibetotext" / "config.json"
+ if config_file.exists():
+ with open(config_file, "r") as f:
+ return json.load(f)
+ except Exception:
+ pass
+ return {}
+
+
+def stop():
+ """Kill the active TTS subprocess if still running."""
+ global _active_process
+ _stop_event.set()
+ with _tts_lock:
+ if _active_process is not None:
+ try:
+ _active_process.kill()
+ _active_process.wait(timeout=1)
+ except Exception:
+ pass
+ _active_process = None
+
+
+def _play_mp3(path: str, first_chunk: bool = True):
+ """Play an mp3 file in the background. first_chunk adds startup delay to prevent clipping."""
+ try:
+ af_args = ["-af", "adelay=300|300"] if first_chunk else []
+ if SYSTEM == "Windows":
+ try:
+ return subprocess.Popen(
+ ["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet"] + af_args + [path],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ )
+ except FileNotFoundError:
+ return subprocess.Popen(
+ f'start "" "{path}"',
+ shell=True,
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ )
+ elif SYSTEM == "Darwin":
+ return subprocess.Popen(
+ ["afplay", path],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ )
+ else:
+ for player in [["mpv", "--no-video", path], ["ffplay", "-nodisp", "-autoexit"] + af_args + [path]]:
+ try:
+ return subprocess.Popen(player, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+ except FileNotFoundError:
+ continue
+ except Exception:
+ pass
+ return None
+
+
+def _play_mp3_blocking(path: str, first_chunk: bool = True) -> bool:
+ """Play an mp3 and wait for it to finish. Returns True if completed."""
+ proc = _play_mp3(path, first_chunk=first_chunk)
+ if proc is None:
+ return False
+ global _active_process
+ with _tts_lock:
+ _active_process = proc
+ proc.wait()
+ return proc.returncode == 0
+
+
+def _split_sentences(text: str) -> list[str]:
+ """Split text into sentences for chunked playback."""
+ # Split on sentence-ending punctuation followed by space or end of string
+ parts = re.split(r'(?<=[.!?])\s+', text.strip())
+ # Merge very short fragments with the previous sentence
+ sentences = []
+ for part in parts:
+ if sentences and len(sentences[-1]) < 30:
+ sentences[-1] += " " + part
+ else:
+ sentences.append(part)
+ return [s for s in sentences if s.strip()]
+
+
+def _generate_mp3(text: str, voice: str, rate: str, pitch: str, path: str):
+ """Generate mp3 from text using edge-tts."""
+ import edge_tts
+ loop = asyncio.new_event_loop()
+ try:
+ communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
+ loop.run_until_complete(communicate.save(path))
+ finally:
+ loop.close()
+
+
+def _speak_edge_tts(text: str, config: dict):
+ """Generate and play speech using edge-tts (neural voice)."""
+ global _active_process
+
+ voice = config.get("tts_voice") or _EDGE_VOICE
+ rate = config.get("tts_edge_rate") or _EDGE_RATE
+ pitch = config.get("tts_edge_pitch") or _EDGE_PITCH
+
+ # Short text: generate and play in one shot
+ if len(text) <= _CHUNK_THRESHOLD:
+ mp3_path = os.path.join(tempfile.gettempdir(), "vibetotext_tts.mp3")
+ _generate_mp3(text, voice, rate, pitch, mp3_path)
+ with _tts_lock:
+ _active_process = _play_mp3(mp3_path)
+ return
+
+ # Long text: rolling chunks of 2-3 sentences — generate next while playing current
+ sentences = _split_sentences(text)
+ if not sentences:
+ return
+
+ # Group sentences into chunks of ~2-3
+ chunks = []
+ current_chunk = []
+ current_len = 0
+ for s in sentences:
+ current_chunk.append(s)
+ current_len += len(s)
+ # Target ~120-200 chars per chunk (2-3 sentences)
+ if current_len >= 120:
+ chunks.append(" ".join(current_chunk))
+ current_chunk = []
+ current_len = 0
+ if current_chunk:
+ chunks.append(" ".join(current_chunk))
+
+ if not chunks:
+ return
+
+ tmp_dir = tempfile.gettempdir()
+
+ # Generate first chunk
+ path_0 = os.path.join(tmp_dir, "vibetotext_tts_0.mp3")
+ _generate_mp3(chunks[0], voice, rate, pitch, path_0)
+
+ if _stop_event.is_set():
+ return
+
+ for i, chunk in enumerate(chunks):
+ if _stop_event.is_set():
+ return
+
+ current_path = os.path.join(tmp_dir, f"vibetotext_tts_{i}.mp3")
+
+ # Start generating next chunk in parallel
+ next_thread = None
+ if i + 1 < len(chunks):
+ next_path = os.path.join(tmp_dir, f"vibetotext_tts_{i + 1}.mp3")
+ next_thread = threading.Thread(
+ target=_generate_mp3,
+ args=(chunks[i + 1], voice, rate, pitch, next_path),
+ daemon=True,
+ )
+ next_thread.start()
+
+ # Play current chunk
+ _play_mp3_blocking(current_path, first_chunk=(i == 0))
+
+ # Wait for next chunk to finish generating
+ if next_thread:
+ next_thread.join()
+
+
+def _speak_fallback(text: str, config: dict):
+ """Fallback to platform TTS when edge-tts is unavailable."""
+ global _active_process
+ rate = config.get("tts_rate", 185)
+ volume = config.get("tts_volume", 80)
+
+ if SYSTEM == "Darwin":
+ cmd = ["say", "-r", str(rate), "-v", "Daniel", text]
+ _active_process = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+ elif SYSTEM == "Windows":
+ escaped = text.replace("'", "''")
+ sapi_rate = max(-10, min(10, round((rate - 200) / 20)))
+ ps_cmd = (
+ "Add-Type -AssemblyName System.Speech; "
+ "$s = New-Object System.Speech.Synthesis.SpeechSynthesizer; "
+ f"$s.Rate = {sapi_rate}; $s.Volume = {volume}; "
+ f"$s.Speak('{escaped}')"
+ )
+ _active_process = subprocess.Popen(
+ ["powershell", "-NoProfile", "-Command", ps_cmd],
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+ creationflags=0x08000000,
+ )
+ else:
+ try:
+ cmd = ["espeak-ng", "-s", str(rate), "-a", str(min(200, volume * 2)), text]
+ _active_process = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+ except FileNotFoundError:
+ cmd = ["spd-say", "-r", str(max(-100, min(100, rate - 200))), text]
+ _active_process = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+
+def speak(text: str) -> None:
+ """Speak text — fire-and-forget, cancels previous."""
+ config = _load_config()
+ if not config.get("tts_enabled", True):
+ return
+ if not text or not text.strip():
+ return
+
+ # Cancel previous speech
+ stop()
+ _stop_event.clear()
+
+ def _run():
+ try:
+ _speak_edge_tts(text, config)
+ except Exception:
+ try:
+ _speak_fallback(text, config)
+ except Exception:
+ pass
+
+ # Run in background thread to avoid blocking
+ t = threading.Thread(target=_run, daemon=True)
+ t.start()
+
+
+def _count_words(text: str) -> int:
+ return len(text.split())
+
+
+def _count_paragraphs(text: str) -> int:
+ return len([p for p in text.strip().split("\n\n") if p.strip()])
+
+
+def _count_steps(text: str) -> int:
+ return len(re.findall(r"(?m)^[\s]*(?:\d+[\.\):]|[-*])\s", text))
+
+
+def speak_status(mode: str, text: str, output: str, file_count: int = 0) -> None:
+ """Generate and speak a concise JARVIS-style status message."""
+ try:
+ if mode == "greppy":
+ msg = f"Located {file_count} files, sir" if file_count != 1 else "Located one file, sir"
+ elif mode == "cleanup":
+ n = _count_paragraphs(output)
+ msg = f"All tidied up. {n} paragraphs ready"
+ elif mode == "plan":
+ n = _count_steps(output)
+ msg = f"Plan's ready. {n} steps laid out"
+ else:
+ n = _count_words(text)
+ msg = f"Got it. {n} words captured"
+ speak(msg)
+ except Exception:
+ pass
+
+
+@atexit.register
+def _cleanup():
+    """Interpreter-exit hook: call stop() so no TTS subprocess outlives us."""
+    stop()
diff --git a/windows-native/src/VibeToText/App.xaml.cs b/windows-native/src/VibeToText/App.xaml.cs
index 68a93e0..8c9d995 100644
--- a/windows-native/src/VibeToText/App.xaml.cs
+++ b/windows-native/src/VibeToText/App.xaml.cs
@@ -17,6 +17,7 @@ public partial class App : Application
private MainWindow? _mainWindow;
private TranscriptionPipeline? _pipeline;
private HotkeyManager? _hotkeyManager;
+ private ApiServer? _apiServer;
private Mutex? _singleInstanceMutex;
// Shared services
@@ -87,10 +88,15 @@ protected override void OnStartup(StartupEventArgs e)
var transcriber = new WhisperTranscriber(Config.WhisperModel);
var pasteService = new PasteService();
Gemini = new GeminiService();
+ var ttsService = new TtsService(Config);
- _pipeline = new TranscriptionPipeline(recorder, transcriber, Database, pasteService, Gemini, Config);
+ _pipeline = new TranscriptionPipeline(recorder, transcriber, Database, pasteService, Gemini, ttsService, Config);
Pipeline = _pipeline;
+ // Start HTTP API server for external integrations
+ _apiServer = new ApiServer(ttsService);
+ _apiServer.Start();
+
Log("Services initialized.");
// Set up hotkey manager
@@ -318,6 +324,7 @@ private void OnHotkeyReleased(object? sender, HotkeyEventArgs e)
private void ExitApplication()
{
_hotkeyManager?.Stop();
+ _apiServer?.Dispose();
_pipeline?.Dispose();
_trayIcon?.Dispose();
_singleInstanceMutex?.ReleaseMutex();
@@ -327,6 +334,7 @@ private void ExitApplication()
protected override void OnExit(ExitEventArgs e)
{
_hotkeyManager?.Stop();
+ _apiServer?.Dispose();
_pipeline?.Dispose();
_trayIcon?.Dispose();
_singleInstanceMutex?.ReleaseMutex();
diff --git a/windows-native/src/VibeToText/Core/ApiServer.cs b/windows-native/src/VibeToText/Core/ApiServer.cs
new file mode 100644
index 0000000..062e807
--- /dev/null
+++ b/windows-native/src/VibeToText/Core/ApiServer.cs
@@ -0,0 +1,182 @@
+using System.Net;
+using System.Text;
+using System.Text.Json;
+
+namespace VibeToText.Core;
+
+/// <summary>
+/// Lightweight HTTP API server for external integrations.
+/// Listens on http://127.0.0.1:7865/ (default port) and exposes:
+///   GET  /api/status → {"status": "ok", "service": "vibetotext"}
+///   POST /api/speak  → {"text": "..."} → calls TtsService.Speak()
+/// </summary>
+public class ApiServer : IDisposable
+{
+    private readonly HttpListener _listener;
+    private readonly TtsService _ttsService;
+    private readonly int _port;
+    private CancellationTokenSource? _cts;
+    private bool _disposed;
+
+    /// <summary>Creates a server bound to the loopback address; call <see cref="Start"/> to listen.</summary>
+    /// <param name="ttsService">Service that speaks the text posted to /api/speak.</param>
+    /// <param name="port">TCP port to listen on.</param>
+    public ApiServer(TtsService ttsService, int port = 7865)
+    {
+        _ttsService = ttsService;
+        _port = port;
+        _listener = new HttpListener();
+        _listener.Prefixes.Add($"http://127.0.0.1:{_port}/");
+    }
+
+    /// <summary>
+    /// Starts listening and runs the accept loop on a background task.
+    /// A startup failure (for example, the port is taken) is logged, not thrown.
+    /// </summary>
+    public void Start()
+    {
+        try
+        {
+            _listener.Start();
+            _cts = new CancellationTokenSource();
+            var token = _cts.Token;
+            Console.WriteLine($"[API] Server listening on http://127.0.0.1:{_port}/");
+            Task.Run(() => AcceptLoop(token));
+        }
+        catch (Exception ex)
+        {
+            Console.WriteLine($"[API] Failed to start server: {ex.Message}");
+        }
+    }
+
+    /// <summary>Cancels the accept loop and stops the listener. Errors are ignored.</summary>
+    public void Stop()
+    {
+        try
+        {
+            _cts?.Cancel();
+            _listener.Stop();
+            Console.WriteLine("[API] Server stopped.");
+        }
+        catch { }
+    }
+
+    /// <summary>Stops the server and releases the listener. Safe to call more than once.</summary>
+    public void Dispose()
+    {
+        // The application disposes the server from more than one exit path.
+        if (_disposed)
+            return;
+        _disposed = true;
+
+        Stop();
+        _cts?.Dispose();
+        ((IDisposable)_listener).Dispose();
+    }
+
+    /// <summary>Accepts requests until cancelled or the listener stops listening.</summary>
+    private async Task AcceptLoop(CancellationToken ct)
+    {
+        while (!ct.IsCancellationRequested)
+        {
+            try
+            {
+                var ctx = await _listener.GetContextAsync().WaitAsync(ct);
+                // Handle each request in its own task (don't await — fire and forget)
+                _ = Task.Run(() => HandleRequest(ctx), ct);
+            }
+            catch (OperationCanceledException)
+            {
+                break;
+            }
+            catch (ObjectDisposedException)
+            {
+                break;
+            }
+            catch (Exception) when (ct.IsCancellationRequested || !_listener.IsListening)
+            {
+                // A listener that is no longer listening fails every accept
+                // immediately; retrying would spin forever.
+                break;
+            }
+            catch (Exception ex)
+            {
+                Console.WriteLine($"[API] Accept error: {ex.Message}");
+            }
+        }
+    }
+
+    /// <summary>Routes one request; any failure is logged and answered with a 500 when still possible.</summary>
+    private async Task HandleRequest(HttpListenerContext ctx)
+    {
+        var req = ctx.Request;
+        var resp = ctx.Response;
+
+        try
+        {
+            // Add CORS headers for local dev tools
+            // NOTE(review): a wildcard origin lets any web page open in a local
+            // browser call this API and trigger speech — confirm this is intended.
+            resp.Headers.Add("Access-Control-Allow-Origin", "*");
+            resp.Headers.Add("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
+            resp.Headers.Add("Access-Control-Allow-Headers", "Content-Type");
+
+            // Handle CORS preflight
+            if (req.HttpMethod == "OPTIONS")
+            {
+                resp.StatusCode = 204;
+                resp.Close();
+                return;
+            }
+
+            var path = req.Url?.AbsolutePath ?? "";
+
+            switch (path)
+            {
+                case "/api/status" when req.HttpMethod == "GET":
+                    await WriteJson(resp, 200, new { status = "ok", service = "vibetotext" });
+                    break;
+
+                case "/api/speak" when req.HttpMethod == "POST":
+                    await HandleSpeak(req, resp);
+                    break;
+
+                default:
+                    await WriteJson(resp, 404, new { error = "not_found", message = $"No route for {req.HttpMethod} {path}" });
+                    break;
+            }
+        }
+        catch (Exception ex)
+        {
+            Console.WriteLine($"[API] Request error: {ex.Message}");
+            try
+            {
+                await WriteJson(resp, 500, new { error = "internal_error", message = ex.Message });
+            }
+            catch { }
+        }
+    }
+
+    /// <summary>
+    /// Handles POST /api/speak: validates the JSON body and passes its
+    /// <c>text</c> string to <see cref="TtsService.Speak"/>. Every malformed
+    /// request is answered with 400.
+    /// </summary>
+    private async Task HandleSpeak(HttpListenerRequest req, HttpListenerResponse resp)
+    {
+        string body;
+        using (var reader = new System.IO.StreamReader(req.InputStream, req.ContentEncoding))
+        {
+            body = await reader.ReadToEndAsync();
+        }
+
+        if (string.IsNullOrWhiteSpace(body))
+        {
+            await WriteJson(resp, 400, new { error = "bad_request", message = "Empty request body" });
+            return;
+        }
+
+        string? text = null;
+        try
+        {
+            using var doc = JsonDocument.Parse(body);
+            var root = doc.RootElement;
+            // TryGetProperty throws on a non-object root and GetString throws
+            // on a non-string value; check the kinds so both become a 400.
+            if (root.ValueKind == JsonValueKind.Object
+                && root.TryGetProperty("text", out var textProp)
+                && textProp.ValueKind == JsonValueKind.String)
+            {
+                text = textProp.GetString();
+            }
+        }
+        catch (JsonException)
+        {
+            await WriteJson(resp, 400, new { error = "bad_request", message = "Invalid JSON" });
+            return;
+        }
+
+        if (string.IsNullOrWhiteSpace(text))
+        {
+            await WriteJson(resp, 400, new { error = "bad_request", message = "Missing or empty 'text' field" });
+            return;
+        }
+
+        _ttsService.Speak(text);
+        await WriteJson(resp, 200, new { status = "speaking" });
+    }
+
+    /// <summary>Serializes <paramref name="data"/> as the JSON response body and closes the response.</summary>
+    private static async Task WriteJson(HttpListenerResponse resp, int statusCode, object data)
+    {
+        try
+        {
+            resp.StatusCode = statusCode;
+            resp.ContentType = "application/json";
+            var json = JsonSerializer.Serialize(data);
+            var bytes = Encoding.UTF8.GetBytes(json);
+            resp.ContentLength64 = bytes.Length;
+            await resp.OutputStream.WriteAsync(bytes);
+        }
+        finally
+        {
+            // Release the connection even when the write fails.
+            resp.Close();
+        }
+    }
+}
diff --git a/windows-native/src/VibeToText/Core/GeminiService.cs b/windows-native/src/VibeToText/Core/GeminiService.cs
index 9c52d13..4a76e75 100644
--- a/windows-native/src/VibeToText/Core/GeminiService.cs
+++ b/windows-native/src/VibeToText/Core/GeminiService.cs
@@ -36,6 +36,23 @@ public class GeminiService
Refined output:
""";
+ private const string FeedbackPrompt = """
+ You are JARVIS, a calm and concise AI assistant. The user has spoken to you and needs a brief verbal response.
+
+ Rules:
+ - Respond in 1-3 short sentences MAX. This will be spoken aloud via TTS.
+ - Be direct and helpful. No filler, no "I think", no hedging.
+ - Use natural spoken English — contractions, simple words. No markdown, no bullet points.
+ - If they asked a question, answer it. If they described something, give concise feedback.
+ - Address the user as "sir" occasionally but not every sentence.
+ - Sound like a knowledgeable, confident AI assistant.
+
+ User said:
+ {text}
+
+ Your spoken response:
+ """;
+
private const string PlanPrompt = """
You are a senior software architect. Transform a rambling voice description into a concise implementation plan.
@@ -146,6 +163,11 @@ public void LoadApiKey()
return await CallGeminiAsync(PlanPrompt.Replace("{text}", text), 0.4f, 4096);
}
+    /// <summary>
+    /// Substitutes the transcription into <c>FeedbackPrompt</c> and sends it to
+    /// Gemini through <c>CallGeminiAsync</c> with temperature 0.5 and a
+    /// 256-token output limit.
+    /// </summary>
+    /// <param name="text">Transcribed speech that replaces the <c>{text}</c> placeholder.</param>
+    public async Task GenerateFeedbackAsync(string text)
+    {
+        return await CallGeminiAsync(FeedbackPrompt.Replace("{text}", text), 0.5f, 256);
+    }
+
private async Task CallGeminiAsync(string prompt, float temperature, int maxTokens)
{
if (string.IsNullOrEmpty(_apiKey))
diff --git a/windows-native/src/VibeToText/Core/HotkeyManager.cs b/windows-native/src/VibeToText/Core/HotkeyManager.cs
index 9d98d1f..4ff0b6a 100644
--- a/windows-native/src/VibeToText/Core/HotkeyManager.cs
+++ b/windows-native/src/VibeToText/Core/HotkeyManager.cs
@@ -9,6 +9,7 @@ public enum RecordingMode
Greppy,
Cleanup,
Plan,
+ Feedback,
History
}
@@ -28,6 +29,7 @@ public class HotkeyManager : IDisposable
{
[new HashSet { ModKey.Ctrl, ModKey.Shift }] = RecordingMode.Transcribe,
[new HashSet { ModKey.Alt, ModKey.Shift }] = RecordingMode.Cleanup,
+ [new HashSet { ModKey.Ctrl, ModKey.Shift, ModKey.Alt }] = RecordingMode.Feedback,
[new HashSet { ModKey.Ctrl, ModKey.Alt }] = RecordingMode.History,
};
diff --git a/windows-native/src/VibeToText/Core/TranscriptionPipeline.cs b/windows-native/src/VibeToText/Core/TranscriptionPipeline.cs
index d0b3d0f..ac6d4b4 100644
--- a/windows-native/src/VibeToText/Core/TranscriptionPipeline.cs
+++ b/windows-native/src/VibeToText/Core/TranscriptionPipeline.cs
@@ -16,6 +16,7 @@ public class TranscriptionPipeline : IDisposable
private readonly HistoryDatabase _database;
private readonly PasteService _pasteService;
private readonly GeminiService _geminiService;
+ private readonly TtsService _ttsService;
private readonly ConfigStore _config;
private WaveformOverlay? _overlay;
@@ -46,6 +47,7 @@ public TranscriptionPipeline(
HistoryDatabase database,
PasteService pasteService,
GeminiService geminiService,
+ TtsService ttsService,
ConfigStore config)
{
_recorder = recorder;
@@ -53,6 +55,7 @@ public TranscriptionPipeline(
_database = database;
_pasteService = pasteService;
_geminiService = geminiService;
+ _ttsService = ttsService;
_config = config;
_recorder.OnLevelUpdate += levels =>
@@ -194,6 +197,18 @@ public async Task StopRecordingAndProcess(RecordingMode mode)
var plan = await _geminiService.GeneratePlanAsync(text);
output = plan ?? text;
break;
+ case RecordingMode.Feedback:
+ var feedback = await _geminiService.GenerateFeedbackAsync(text);
+ if (feedback != null)
+ {
+ _ttsService.Speak(feedback);
+ }
+ else
+ {
+ _ttsService.Speak("I couldn't generate feedback, sir");
+ }
+ // Still paste the original transcription
+ break;
case RecordingMode.Transcribe:
default:
break;
@@ -207,11 +222,16 @@ public async Task StopRecordingAndProcess(RecordingMode mode)
await _pasteService.PasteAtCursorAsync(output);
Log("Pasted at cursor.");
+ // Speak status report
+ var status = TtsService.GenerateStatusMessage(mode, text, output);
+ _ttsService.Speak(status);
+
_currentMode = null;
}
catch (Exception ex)
{
Log($"ERROR in processing: {ex}");
+ _ttsService.Speak("Processing failed");
System.Windows.Application.Current?.Dispatcher.Invoke(() =>
{
_overlay?.SetRecording(false);
@@ -224,5 +244,6 @@ public void Dispose()
{
_recorder.Dispose();
_transcriber.Dispose();
+ _ttsService.Dispose();
}
}
diff --git a/windows-native/src/VibeToText/Core/TtsService.cs b/windows-native/src/VibeToText/Core/TtsService.cs
new file mode 100644
index 0000000..a59dcc3
--- /dev/null
+++ b/windows-native/src/VibeToText/Core/TtsService.cs
@@ -0,0 +1,231 @@
+using System.Diagnostics;
+using System.IO;
+using System.Speech.Synthesis;
+using VibeToText.Data;
+
+namespace VibeToText.Core;
+
+/// <summary>
+/// Fire-and-forget text-to-speech for status reports.
+/// Primary: edge-tts (neural voice) + ffplay for playback.
+/// Fallback: System.Speech.Synthesis (built-in SAPI).
+/// </summary>
+public class TtsService : IDisposable
+{
+    private SpeechSynthesizer? _synth;
+    private readonly ConfigStore _config;
+    private readonly object _lock = new();
+    private Process? _ffplayProcess;
+
+    // Bumped (under _lock) whenever playback is cancelled or superseded. A
+    // background request compares the value it was started with against this
+    // field and goes silent once they differ.
+    private int _generation;
+
+    // edge-tts defaults
+    private const string DefaultVoice = "en-GB-RyanNeural";
+    private const int EdgeTtsTimeoutMs = 15_000;
+
+    /// <summary>Creates the service; a failure to initialize the SAPI fallback is logged, not thrown.</summary>
+    public TtsService(ConfigStore config)
+    {
+        _config = config;
+        try
+        {
+            _synth = new SpeechSynthesizer();
+        }
+        catch (Exception ex)
+        {
+            Console.WriteLine($"[TTS] Failed to initialize SAPI fallback: {ex.Message}");
+        }
+    }
+
+    /// <summary>
+    /// Speak text using edge-tts (neural) with SAPI fallback. Fire-and-forget.
+    /// Cancels whatever was being spoken. Does nothing when TTS is disabled or
+    /// <paramref name="text"/> is empty (in particular, nothing is cancelled).
+    /// </summary>
+    public void Speak(string text)
+    {
+        if (!_config.TtsEnabled || string.IsNullOrWhiteSpace(text))
+            return;
+
+        // Cancel any previous speech and claim the new generation
+        var generation = CancelCurrent();
+
+        // Run everything in background so we don't block
+        Task.Run(() =>
+        {
+            try
+            {
+                SpeakEdgeTts(text, generation);
+            }
+            catch (Exception ex)
+            {
+                // A superseded request must not start talking through SAPI.
+                if (!IsCurrent(generation))
+                    return;
+
+                Console.WriteLine($"[TTS] edge-tts failed, falling back to SAPI: {ex.Message}");
+                try
+                {
+                    SpeakSapiFallback(text, generation);
+                }
+                catch (Exception ex2)
+                {
+                    Console.WriteLine($"[TTS] SAPI fallback also failed: {ex2.Message}");
+                }
+            }
+        });
+    }
+
+    /// <summary>
+    /// Stop current playback (kill ffplay or cancel SAPI).
+    /// </summary>
+    public void StopPlayback()
+    {
+        CancelCurrent();
+    }
+
+    /// <summary>
+    /// Invalidates every in-flight request, kills ffplay, cancels SAPI speech
+    /// and returns the generation a new request should run under.
+    /// </summary>
+    private int CancelCurrent()
+    {
+        lock (_lock)
+        {
+            var generation = ++_generation;
+
+            // Kill ffplay if running. The Process object is owned (and
+            // disposed) by the thread that started it, so it is only killed here.
+            if (_ffplayProcess != null)
+            {
+                try
+                {
+                    if (!_ffplayProcess.HasExited)
+                        _ffplayProcess.Kill();
+                }
+                catch { }
+                _ffplayProcess = null;
+            }
+
+            // Cancel SAPI if running
+            try { _synth?.SpeakAsyncCancelAll(); }
+            catch { }
+
+            return generation;
+        }
+    }
+
+    /// <summary>True while no newer Speak/StopPlayback call has superseded <paramref name="generation"/>.</summary>
+    private bool IsCurrent(int generation)
+    {
+        lock (_lock)
+        {
+            return generation == _generation;
+        }
+    }
+
+    /// <summary>Synthesizes <paramref name="text"/> to a private temp mp3 and plays it to completion.</summary>
+    private void SpeakEdgeTts(string text, int generation)
+    {
+        // One file per request: overlapping requests must not write or play
+        // the same path.
+        var mp3Path = Path.Combine(Path.GetTempPath(), $"vibetotext_tts_{Guid.NewGuid():N}.mp3");
+        try
+        {
+            GenerateMp3(text, mp3Path);
+            PlayMp3(mp3Path, generation);
+        }
+        finally
+        {
+            try { File.Delete(mp3Path); }
+            catch { }
+        }
+    }
+
+    /// <summary>Step 1: generate the mp3 with the edge-tts CLI. Throws on timeout, non-zero exit or missing output.</summary>
+    private void GenerateMp3(string text, string mp3Path)
+    {
+        var voice = _config.TtsVoice;
+        if (string.IsNullOrEmpty(voice))
+            voice = DefaultVoice;
+        var rate = _config.TtsEdgeRate;
+        var pitch = _config.TtsEdgePitch;
+
+        var startInfo = new ProcessStartInfo
+        {
+            FileName = "edge-tts",
+            UseShellExecute = false,
+            CreateNoWindow = true,
+            RedirectStandardOutput = true,
+            RedirectStandardError = true,
+        };
+        // ArgumentList quotes each argument correctly. The --name=value form
+        // keeps values that start with '-' (e.g. a negative rate or pitch)
+        // from being parsed as options.
+        startInfo.ArgumentList.Add($"--voice={voice}");
+        if (!string.IsNullOrEmpty(rate))
+            startInfo.ArgumentList.Add($"--rate={rate}");
+        if (!string.IsNullOrEmpty(pitch))
+            startInfo.ArgumentList.Add($"--pitch={pitch}");
+        startInfo.ArgumentList.Add($"--text={FlattenLines(text)}");
+        startInfo.ArgumentList.Add($"--write-media={mp3Path}");
+
+        using var edgeProcess = new Process { StartInfo = startInfo };
+        edgeProcess.Start();
+
+        // Drain both pipes concurrently. Reading them one after the other can
+        // deadlock when the child fills the pipe that is not being read, and a
+        // blocking read would also defeat the timeout below.
+        _ = edgeProcess.StandardOutput.ReadToEndAsync();
+        _ = edgeProcess.StandardError.ReadToEndAsync();
+
+        if (!edgeProcess.WaitForExit(EdgeTtsTimeoutMs))
+        {
+            try { edgeProcess.Kill(); } catch { }
+            throw new TimeoutException("edge-tts timed out after 15 seconds");
+        }
+
+        if (edgeProcess.ExitCode != 0)
+            throw new InvalidOperationException($"edge-tts exited with code {edgeProcess.ExitCode}");
+
+        if (!File.Exists(mp3Path) || new FileInfo(mp3Path).Length == 0)
+            throw new FileNotFoundException("edge-tts did not produce an output file");
+    }
+
+    /// <summary>Step 2: play the mp3 with ffplay and wait for it to exit, unless the request was superseded.</summary>
+    private void PlayMp3(string mp3Path, int generation)
+    {
+        var startInfo = new ProcessStartInfo
+        {
+            FileName = "ffplay",
+            UseShellExecute = false,
+            CreateNoWindow = true,
+        };
+        startInfo.ArgumentList.Add("-nodisp");
+        startInfo.ArgumentList.Add("-autoexit");
+        startInfo.ArgumentList.Add("-loglevel");
+        startInfo.ArgumentList.Add("quiet");
+        startInfo.ArgumentList.Add(mp3Path);
+
+        using var ffplay = new Process { StartInfo = startInfo };
+
+        lock (_lock)
+        {
+            // Checked under the same lock CancelCurrent takes: either the
+            // canceller sees this player and kills it, or we never start it.
+            if (generation != _generation)
+                return;
+            ffplay.Start();
+            _ffplayProcess = ffplay;
+        }
+
+        try
+        {
+            // Wait for playback to complete (don't leave zombie processes)
+            ffplay.WaitForExit();
+        }
+        finally
+        {
+            // Unregister before the `using` disposes the process, so a
+            // canceller never touches a disposed object.
+            lock (_lock)
+            {
+                if (ReferenceEquals(_ffplayProcess, ffplay))
+                    _ffplayProcess = null;
+            }
+        }
+    }
+
+    /// <summary>Speaks through SAPI (asynchronously) unless the request was superseded.</summary>
+    private void SpeakSapiFallback(string text, int generation)
+    {
+        lock (_lock)
+        {
+            if (generation != _generation)
+                return;
+
+            if (_synth == null)
+                throw new InvalidOperationException("SAPI SpeechSynthesizer not available");
+
+            // Apply settings
+            // SAPI rate: -10 to 10, WPM 185 ~ -1 (slightly slower for Jarvis feel)
+            _synth.Rate = Math.Clamp((_config.TtsRate - 200) / 20, -10, 10);
+            _synth.Volume = Math.Clamp(_config.TtsVolume, 0, 100);
+
+            var voice = _config.TtsVoice;
+            if (string.IsNullOrEmpty(voice))
+                voice = "Microsoft David Desktop"; // Deep male - Jarvis vibe
+            try { _synth.SelectVoice(voice); }
+            catch { /* voice not found, use default */ }
+
+            _synth.SpeakAsync(text);
+        }
+    }
+
+    /// <summary>Replaces line breaks with spaces so the text travels as a single-line CLI argument.</summary>
+    private static string FlattenLines(string text)
+    {
+        return text
+            .Replace("\r", " ")
+            .Replace("\n", " ");
+    }
+
+    /// <summary>
+    /// Builds the status line spoken after an action completes. Returns an
+    /// empty string for <see cref="RecordingMode.Feedback"/>: that mode has
+    /// just started speaking its own response, and <see cref="Speak"/> ignores
+    /// empty text, so the response is not cut off by a status report.
+    /// </summary>
+    public static string GenerateStatusMessage(RecordingMode mode, string text, string output)
+    {
+        return mode switch
+        {
+            RecordingMode.Greppy => "Files located, sir",
+            RecordingMode.Cleanup => $"All tidied up. {CountParagraphs(output)} paragraphs ready",
+            RecordingMode.Plan => $"Plan's ready. {CountSteps(output)} steps laid out",
+            RecordingMode.Feedback => string.Empty,
+            _ => $"Got it. {CountWords(text)} words captured",
+        };
+    }
+
+    private static int CountWords(string text) =>
+        text.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length;
+
+    // Line endings are normalized first so CRLF text splits on blank lines too.
+    private static int CountParagraphs(string text) =>
+        text.Replace("\r\n", "\n")
+            .Split("\n\n", StringSplitOptions.RemoveEmptyEntries)
+            .Count(p => !string.IsNullOrWhiteSpace(p));
+
+    // A step is a line starting with "1." / "1)" / "1:" or a "-" / "*" bullet.
+    private static int CountSteps(string text) =>
+        text.Split('\n')
+            .Count(line =>
+            {
+                var trimmed = line.TrimStart();
+                return System.Text.RegularExpressions.Regex.IsMatch(trimmed, @"^(\d+[\.\):]|[-*])\s");
+            });
+
+    /// <summary>Stops playback and releases the SAPI synthesizer.</summary>
+    public void Dispose()
+    {
+        StopPlayback();
+        lock (_lock)
+        {
+            try
+            {
+                _synth?.Dispose();
+            }
+            catch { }
+            _synth = null;
+        }
+    }
+}
diff --git a/windows-native/src/VibeToText/Data/ConfigStore.cs b/windows-native/src/VibeToText/Data/ConfigStore.cs
index 0da6d49..4e76655 100644
--- a/windows-native/src/VibeToText/Data/ConfigStore.cs
+++ b/windows-native/src/VibeToText/Data/ConfigStore.cs
@@ -23,6 +23,12 @@ public partial class ConfigStore : ObservableObject
private string? _geminiApiKey;
private string _whisperModel = "base";
private List _customDictionary = new();
+ private bool _ttsEnabled = true;
+ private int _ttsRate = 200;
+ private int _ttsVolume = 80;
+ private string? _ttsVoice;
+ private string _ttsEdgeRate = "+12%";
+ private string _ttsEdgePitch = "+1Hz";
private JsonObject? _rawJson; // Preserve unknown keys
public int? AudioDeviceIndex
@@ -61,6 +67,42 @@ public List CustomDictionary
set { SetProperty(ref _customDictionary, value); Save(); }
}
+    /// <summary>Whether spoken feedback is enabled (config key <c>tts_enabled</c>, default true). Setting it saves the config.</summary>
+    public bool TtsEnabled
+    {
+        get => _ttsEnabled;
+        set { SetProperty(ref _ttsEnabled, value); Save(); }
+    }
+
+    /// <summary>Speech rate for the fallback synthesizer (config key <c>tts_rate</c>, default 200). Setting it saves the config.</summary>
+    public int TtsRate
+    {
+        get => _ttsRate;
+        set { SetProperty(ref _ttsRate, value); Save(); }
+    }
+
+    /// <summary>Volume for the fallback synthesizer (config key <c>tts_volume</c>, default 80). Setting it saves the config.</summary>
+    public int TtsVolume
+    {
+        get => _ttsVolume;
+        set { SetProperty(ref _ttsVolume, value); Save(); }
+    }
+
+    /// <summary>Voice name (config key <c>tts_voice</c>); null means unset, and the key is then removed on save.</summary>
+    public string? TtsVoice
+    {
+        get => _ttsVoice;
+        set { SetProperty(ref _ttsVoice, value); Save(); }
+    }
+
+    /// <summary>edge-tts rate adjustment (config key <c>tts_edge_rate</c>, default "+12%"). Setting it saves the config.</summary>
+    public string TtsEdgeRate
+    {
+        get => _ttsEdgeRate;
+        set { SetProperty(ref _ttsEdgeRate, value); Save(); }
+    }
+
+    /// <summary>edge-tts pitch adjustment (config key <c>tts_edge_pitch</c>, default "+1Hz"). Setting it saves the config.</summary>
+    public string TtsEdgePitch
+    {
+        get => _ttsEdgePitch;
+        set { SetProperty(ref _ttsEdgePitch, value); Save(); }
+    }
+
public ConfigStore()
{
Directory.CreateDirectory(ConfigDir);
@@ -91,6 +133,13 @@ public void Load()
.Where(s => !string.IsNullOrEmpty(s))
.ToList();
}
+
+ _ttsEnabled = _rawJson["tts_enabled"]?.GetValue() ?? true;
+ _ttsRate = _rawJson["tts_rate"]?.GetValue() ?? 200;
+ _ttsVolume = _rawJson["tts_volume"]?.GetValue() ?? 80;
+ _ttsVoice = _rawJson["tts_voice"]?.GetValue();
+ _ttsEdgeRate = _rawJson["tts_edge_rate"]?.GetValue() ?? "+12%";
+ _ttsEdgePitch = _rawJson["tts_edge_pitch"]?.GetValue() ?? "+1Hz";
}
catch (Exception ex)
{
@@ -127,6 +176,17 @@ public void Save()
_rawJson["whisper_model"] = _whisperModel;
+ _rawJson["tts_enabled"] = _ttsEnabled;
+ _rawJson["tts_rate"] = _ttsRate;
+ _rawJson["tts_volume"] = _ttsVolume;
+ if (_ttsVoice != null)
+ _rawJson["tts_voice"] = _ttsVoice;
+ else
+ _rawJson.Remove("tts_voice");
+
+ _rawJson["tts_edge_rate"] = _ttsEdgeRate;
+ _rawJson["tts_edge_pitch"] = _ttsEdgePitch;
+
var dictArray = new JsonArray();
foreach (var word in _customDictionary)
dictArray.Add(word);
diff --git a/windows-native/src/VibeToText/VibeToText.csproj b/windows-native/src/VibeToText/VibeToText.csproj
index 2065255..1cae332 100644
--- a/windows-native/src/VibeToText/VibeToText.csproj
+++ b/windows-native/src/VibeToText/VibeToText.csproj
@@ -38,6 +38,9 @@
+
+
+