diff --git a/CHANGELOG.md b/CHANGELOG.md index c215a3f..b00415d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ ## Unreleased ### Added +- **JARVIS-style TTS status reports** — Neural voice (edge-tts, en-GB-RyanNeural) speaks concise status after each action ("Got it. 12 words captured", "Located 4 files, sir") +- **TTS HTTP API server** — `POST http://127.0.0.1:7865/api/speak` endpoint lets any external tool (Claude Code, scripts, etc.) trigger spoken feedback +- **Feedback hotkey mode** (`Cmd+Shift+F`) — Pastes transcription with TTS endpoint instructions so the receiving LLM can speak back +- **Chunked TTS playback** — Long text splits into rolling chunks of 2-3 sentences; first chunk plays immediately while the rest generate in the background +- **Cross-platform TTS** — edge-tts with ffplay/afplay for headless playback on Windows, macOS, and Linux; SAPI/AVSpeechSynthesizer/espeak-ng fallback when offline +- **TTS config keys** — `tts_enabled`, `tts_voice`, `tts_edge_rate`, `tts_edge_pitch`, `tts_rate`, `tts_volume` in `~/.vibetotext/config.json` - **Gemini LLM integration** — New `llm.py` module that uses Google Gemini to clean up rambling voice transcriptions into clear prompts and generate structured implementation plans - **Window state persistence** — History app now remembers its position and size between sessions - **Startup/stop scripts** — `start-all.sh` and `stop-all.sh` to launch and kill both services in one command diff --git a/README.md b/README.md index 47ae3d5..6a825bd 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,20 @@ All implementations share the same SQLite database at `~/.vibetotext/history.db` - `Cmd+Shift` — **Greppy** mode with semantic code search - `Alt+Shift` — **Cleanup** mode (AI refines rambling into clear prompts) - `Cmd+Alt` — **Plan** mode (generates structured implementation plans) +- `Cmd+Shift+F` — **Feedback** mode (pastes transcription with TTS endpoint instructions so any LLM can speak back) **Fast 
import Foundation
import Network

/// Minimal HTTP API server using NWListener — exposes TTS and status
/// endpoints for external tools (e.g. DevGlide MCP voice server).
///
/// The listener is bound to the IPv4 loopback address only: every response
/// carries `Access-Control-Allow-Origin: *` and the API has no authentication,
/// so the socket must not be reachable from other hosts.
///
/// Each connection serves exactly one request and is closed after the
/// response has been handed to the network stack.
final class ApiServer {
    private var listener: NWListener?
    let port: UInt16

    /// Byte sequence separating the HTTP head from the body.
    private static let headerTerminator = Data("\r\n\r\n".utf8)

    /// Upper bound on a buffered request (head + body); larger requests get a 413.
    private static let maxRequestBytes = 1 << 20

    init(port: UInt16 = 7865) {
        self.port = port
    }

    // MARK: - Lifecycle

    /// Starts listening on `127.0.0.1:<port>`. Failures are logged, not thrown.
    func start() {
        guard let nwPort = NWEndpoint.Port(rawValue: port) else {
            print("[API] Failed to start: invalid port \(port)")
            return
        }
        // Copied so the state handler does not have to retain `self`
        // (self -> listener -> handler -> self would otherwise be a cycle).
        let port = self.port

        do {
            let params = NWParameters.tcp
            // Bind to loopback only; `NWListener(using:on:)` alone listens on every interface.
            params.requiredLocalEndpoint = NWEndpoint.hostPort(host: .ipv4(.loopback), port: nwPort)

            let listener = try NWListener(using: params)
            listener.newConnectionHandler = { [weak self] connection in
                self?.handleConnection(connection)
            }
            listener.stateUpdateHandler = { state in
                switch state {
                case .ready:
                    print("[API] Server running on http://127.0.0.1:\(port)")
                case .failed(let error):
                    print("[API] Listener failed: \(error)")
                default:
                    break
                }
            }
            listener.start(queue: .global())
            self.listener = listener
        } catch {
            print("[API] Failed to start: \(error)")
        }
    }

    /// Stops accepting connections and releases the listener.
    func stop() {
        listener?.cancel()
        listener = nil
    }

    // MARK: - Connection handling

    private func handleConnection(_ connection: NWConnection) {
        connection.start(queue: .global())
        receiveRequest(on: connection, buffered: Data())
    }

    /// Reads from `connection` until a whole request (head plus `Content-Length`
    /// bytes of body) is buffered, then routes it. A single `receive` is not
    /// enough: clients may send the head and the body in separate TCP segments.
    private func receiveRequest(on connection: NWConnection, buffered: Data) {
        connection.receive(minimumIncompleteLength: 1, maximumLength: 65536) { [weak self] data, _, isComplete, error in
            if let error {
                print("[API] Receive error: \(error)")
                connection.cancel()
                return
            }
            guard let self else {
                connection.cancel()
                return
            }

            var buffer = buffered
            if let data {
                buffer.append(data)
            }

            guard buffer.count <= Self.maxRequestBytes else {
                self.sendResponse(connection, status: 413, body: #"{"error":"request too large"}"#)
                return
            }

            // Keep reading while the peer may still send the rest of the request.
            guard Self.isRequestComplete(buffer) || isComplete else {
                self.receiveRequest(on: connection, buffered: buffer)
                return
            }

            guard !buffer.isEmpty, let request = String(data: buffer, encoding: .utf8) else {
                connection.cancel()
                return
            }
            self.handleHTTP(request: request, connection: connection)
        }
    }

    /// Returns `true` once `buffer` holds the full head and at least as many
    /// body bytes as the `Content-Length` header announces (0 when absent).
    private static func isRequestComplete(_ buffer: Data) -> Bool {
        guard let terminator = buffer.range(of: headerTerminator) else { return false }
        let head = String(decoding: buffer[..<terminator.lowerBound], as: UTF8.self)
        return buffer.endIndex - terminator.upperBound >= contentLength(inHead: head)
    }

    /// Extracts the `Content-Length` value from a request head; 0 when the
    /// header is missing or not a non-negative integer.
    private static func contentLength(inHead head: String) -> Int {
        // dropFirst() skips the request line.
        for line in head.components(separatedBy: "\r\n").dropFirst() {
            let fields = line.split(separator: ":", maxSplits: 1)
            guard fields.count == 2,
                  fields[0].trimmingCharacters(in: .whitespaces).lowercased() == "content-length" else {
                continue
            }
            return max(0, Int(fields[1].trimmingCharacters(in: .whitespaces)) ?? 0)
        }
        return 0
    }

    // MARK: - HTTP routing

    /// Parses the request line and dispatches on (method, path).
    private func handleHTTP(request: String, connection: NWConnection) {
        let lines = request.components(separatedBy: "\r\n")
        guard let requestLine = lines.first else {
            sendResponse(connection, status: 400, body: #"{"error":"malformed request"}"#)
            return
        }

        let parts = requestLine.split(separator: " ", maxSplits: 2)
        guard parts.count >= 2 else {
            sendResponse(connection, status: 400, body: #"{"error":"malformed request line"}"#)
            return
        }

        let method = String(parts[0])
        let path = String(parts[1])

        // Handle CORS preflight
        if method == "OPTIONS" {
            sendCORSPreflight(connection)
            return
        }

        switch (method, path) {
        case ("POST", "/api/speak"):
            handleSpeak(request: request, connection: connection)

        case ("POST", "/api/stop"):
            TtsService.shared.stop()
            sendResponse(connection, status: 200, body: #"{"status":"stopped"}"#)

        case ("GET", "/api/status"):
            sendResponse(connection, status: 200, body: #"{"status":"ok","tts_enabled":\#(ConfigStore.shared.ttsEnabled)}"#)

        default:
            sendResponse(connection, status: 404, body: #"{"error":"not found"}"#)
        }
    }

    // MARK: - Endpoint handlers

    /// `POST /api/speak` — expects a JSON object with a non-empty string `text`.
    private func handleSpeak(request: String, connection: NWConnection) {
        // Extract JSON body after the blank line separating headers from body
        guard let bodyRange = request.range(of: "\r\n\r\n") else {
            sendResponse(connection, status: 400, body: #"{"error":"no body"}"#)
            return
        }
        let bodyStr = String(request[bodyRange.upperBound...])

        guard let bodyData = bodyStr.data(using: .utf8),
              let json = try? JSONSerialization.jsonObject(with: bodyData) as? [String: Any],
              let text = json["text"] as? String, !text.isEmpty else {
            sendResponse(connection, status: 400, body: #"{"error":"missing or empty \"text\" field"}"#)
            return
        }

        TtsService.shared.speak(text)
        sendResponse(connection, status: 200, body: #"{"status":"speaking"}"#)
    }

    // MARK: - Response helpers

    /// Sends a JSON response with CORS headers, then closes the connection.
    private func sendResponse(_ connection: NWConnection, status: Int, body: String) {
        let statusText: String
        switch status {
        case 200: statusText = "OK"
        case 400: statusText = "Bad Request"
        case 404: statusText = "Not Found"
        case 405: statusText = "Method Not Allowed"
        case 413: statusText = "Payload Too Large"
        default: statusText = "Error"
        }

        let response = [
            "HTTP/1.1 \(status) \(statusText)",
            "Content-Type: application/json",
            "Access-Control-Allow-Origin: *",
            "Access-Control-Allow-Methods: GET, POST, OPTIONS",
            "Access-Control-Allow-Headers: Content-Type",
            "Content-Length: \(body.utf8.count)",
            "",
            body,
        ].joined(separator: "\r\n")

        connection.send(content: response.data(using: .utf8), completion: .contentProcessed { _ in
            connection.cancel()
        })
    }

    /// Answers a CORS preflight (`OPTIONS`) with 204 and closes the connection.
    private func sendCORSPreflight(_ connection: NWConnection) {
        let response = [
            "HTTP/1.1 204 No Content",
            "Access-Control-Allow-Origin: *",
            "Access-Control-Allow-Methods: GET, POST, OPTIONS",
            "Access-Control-Allow-Headers: Content-Type",
            "Content-Length: 0",
            "",
            "",
        ].joined(separator: "\r\n")

        connection.send(content: response.data(using: .utf8), completion: .contentProcessed { _ in
            connection.cancel()
        })
    }
}
Transform a rambling voice description into a concise implementation plan. diff --git a/macos-native/Sources/Core/HotkeyManager.swift b/macos-native/Sources/Core/HotkeyManager.swift index d72cadc..2b19870 100644 --- a/macos-native/Sources/Core/HotkeyManager.swift +++ b/macos-native/Sources/Core/HotkeyManager.swift @@ -33,6 +33,8 @@ final class HotkeyManager { private let hotkeys: [HotkeyDef] = [ // cmd+alt+p (key code 35 = 'p') HotkeyDef(modifiers: [.maskCommand, .maskAlternate], keyCode: 35, mode: "plan"), + // cmd+shift+f (key code 3 = 'f') + HotkeyDef(modifiers: [.maskCommand, .maskShift], keyCode: 3, mode: "feedback"), // cmd+alt+shift (modifiers only) — must be before alt+shift (more specific) HotkeyDef(modifiers: [.maskCommand, .maskAlternate, .maskShift], keyCode: nil, mode: "greppy"), // alt+shift (modifiers only) @@ -76,6 +78,7 @@ final class HotkeyManager { print(" [alt+shift] = cleanup") print(" [cmd+alt+shift] = greppy") print(" [cmd+alt+p] = plan") + print(" [cmd+shift+f] = feedback") } func stop() { diff --git a/macos-native/Sources/Core/TranscriptionPipeline.swift b/macos-native/Sources/Core/TranscriptionPipeline.swift index 6ea648e..4d436e6 100644 --- a/macos-native/Sources/Core/TranscriptionPipeline.swift +++ b/macos-native/Sources/Core/TranscriptionPipeline.swift @@ -10,6 +10,7 @@ final class TranscriptionPipeline { private var geminiService: GeminiService? private var greppyService: GreppyService? private var waveformController: WaveformOverlayController? + private var apiServer: ApiServer? private var isRecording = false private var currentMode: String? 
import AVFoundation

/// Fire-and-forget text-to-speech — neural TTS via edge-tts CLI, with
/// AVSpeechSynthesizer as fallback when edge-tts is unavailable.
///
/// Only one utterance is live at a time. Every `speak(_:)` or `stop()` call
/// issues a new ticket; work belonging to an older ticket is terminated if it
/// is running and refuses to start anything new once it notices it is stale.
final class TtsService: NSObject, AVSpeechSynthesizerDelegate {
    static let shared = TtsService()

    private let synthesizer = AVSpeechSynthesizer()

    /// Guards `activeProcess` and `currentTicket`, which are touched from the
    /// caller's thread (`speak`/`stop`) and from `workQueue`.
    private let stateLock = NSLock()

    /// Process (edge-tts or afplay) currently running for `currentTicket`.
    private var activeProcess: Process?

    /// Identifier of the most recent request; older tickets are stale.
    private var currentTicket: UInt64 = 0

    /// Blocking `Process` work runs here instead of on the Swift concurrency pool.
    private let workQueue = DispatchQueue(label: "vibetotext.tts", qos: .userInitiated, attributes: .concurrent)

    private override init() {
        super.init()
        synthesizer.delegate = self
    }

    // MARK: - Public

    /// Speaks `text`, cancelling any previous speech. Returns immediately.
    /// Does nothing when TTS is disabled in the config or `text` is blank.
    func speak(_ text: String) {
        let config = ConfigStore.shared
        guard config.ttsEnabled else { return }
        guard !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { return }

        // Cancel any previous speech and claim a fresh ticket
        let ticket = supersedeCurrentRequest()

        let voice = config.ttsVoice ?? "en-GB-RyanNeural"
        let rate = config.ttsEdgeRate ?? "+12%"
        let pitch = config.ttsEdgePitch ?? "+1Hz"

        workQueue.async { [weak self] in
            self?.synthesizeAndPlay(text, voice: voice, rate: rate, pitch: pitch, ticket: ticket)
        }
    }

    /// Stops whatever is being generated or played, including fallback speech.
    func stop() {
        supersedeCurrentRequest()
    }

    // MARK: - Request bookkeeping

    /// Invalidates the current request, terminates its process and stops the
    /// fallback synthesizer. Returns the ticket for the next request.
    @discardableResult
    private func supersedeCurrentRequest() -> UInt64 {
        stateLock.lock()
        currentTicket &+= 1
        let ticket = currentTicket
        let running = activeProcess
        activeProcess = nil
        stateLock.unlock()

        if let running, running.isRunning {
            running.terminate()
        }
        if synthesizer.isSpeaking {
            synthesizer.stopSpeaking(at: .immediate)
        }
        return ticket
    }

    private func isCurrent(_ ticket: UInt64) -> Bool {
        stateLock.lock()
        defer { stateLock.unlock() }
        return ticket == currentTicket
    }

    /// Runs `process` and registers it as the cancellable process for `ticket`.
    /// Returns `false` without launching when the ticket is already stale.
    /// Launch and registration happen under the lock so that `stop()` can never
    /// miss a process that has just been started.
    private func launch(_ process: Process, for ticket: UInt64) throws -> Bool {
        stateLock.lock()
        defer { stateLock.unlock() }
        guard ticket == currentTicket else { return false }
        try process.run()
        activeProcess = process
        return true
    }

    private static func discard(_ url: URL) {
        try? FileManager.default.removeItem(at: url)
    }

    // MARK: - edge-tts pipeline

    /// Generates an mp3 with the edge-tts CLI and plays it with afplay,
    /// falling back to AVSpeechSynthesizer when edge-tts cannot be used.
    private func synthesizeAndPlay(_ text: String, voice: String, rate: String, pitch: String, ticket: UInt64) {
        // One file per request: overlapping requests must not overwrite each other's audio.
        let mediaURL = FileManager.default.temporaryDirectory
            .appendingPathComponent("vibetotext_tts_\(ticket).mp3")

        // 1. Generate mp3 via edge-tts CLI
        let generator = Process()
        generator.executableURL = URL(fileURLWithPath: "/usr/bin/env")
        // `--option=value` form: values starting with "-" (e.g. a rate of "-10%",
        // or text beginning with a dash) would otherwise be parsed as options.
        generator.arguments = [
            "edge-tts",
            "--voice=\(voice)",
            "--rate=\(rate)",
            "--pitch=\(pitch)",
            "--text=\(text)",
            "--write-media=\(mediaURL.path)",
        ]
        generator.standardOutput = FileHandle.nullDevice
        generator.standardError = FileHandle.nullDevice

        do {
            guard try launch(generator, for: ticket) else { return }
            generator.waitUntilExit()

            // Superseded (or stopped) while generating: stay silent.
            guard isCurrent(ticket) else {
                Self.discard(mediaURL)
                return
            }

            guard generator.terminationStatus == 0 else {
                print("[TTS] edge-tts exited with status \(generator.terminationStatus), falling back")
                Self.discard(mediaURL)
                speakFallback(text)
                return
            }

            // 2. Play with afplay
            let player = Process()
            player.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
            player.arguments = [mediaURL.path]
            // Clean up the temp file whether playback finished or was terminated.
            player.terminationHandler = { _ in Self.discard(mediaURL) }

            if try !launch(player, for: ticket) {
                Self.discard(mediaURL)
            }
        } catch {
            print("[TTS] edge-tts failed: \(error), falling back")
            Self.discard(mediaURL)
            if isCurrent(ticket) {
                speakFallback(text)
            }
        }
    }

    // MARK: - Fallback (AVSpeechSynthesizer)

    /// Speaks `text` with the system synthesizer using `ttsRate`, `ttsVolume`
    /// and, when it resolves to an installed voice, `ttsVoice`.
    private func speakFallback(_ text: String) {
        let config = ConfigStore.shared

        // Cancel any previous fallback speech
        if synthesizer.isSpeaking {
            synthesizer.stopSpeaking(at: .immediate)
        }

        let utterance = AVSpeechUtterance(string: text)

        // Convert WPM (default 200) to AVSpeechUtterance rate (0.0 - 1.0)
        let wpm = Float(config.ttsRate)
        utterance.rate = max(AVSpeechUtteranceMinimumSpeechRate,
                             min(AVSpeechUtteranceMaximumSpeechRate,
                                 (wpm / 200.0) * AVSpeechUtteranceDefaultSpeechRate))

        // Volume: 0-100 -> 0.0-1.0, clamped because the config value is unchecked
        utterance.volume = max(0.0, min(1.0, Float(config.ttsVolume) / 100.0))

        // `ttsVoice` is shared with edge-tts, whose voice names (e.g.
        // "en-GB-RyanNeural") do not resolve here; fall through to the default then.
        let configuredVoice = config.ttsVoice.flatMap { id -> AVSpeechSynthesisVoice? in
            guard !id.isEmpty else { return nil }
            return AVSpeechSynthesisVoice(identifier: id) ?? AVSpeechSynthesisVoice(language: id)
        }
        // Default: Daniel (British male) for Jarvis feel
        utterance.voice = configuredVoice
            ?? AVSpeechSynthesisVoice(identifier: "com.apple.voice.compact.en-GB.Daniel")
            ?? AVSpeechSynthesisVoice(language: "en-GB")

        synthesizer.speak(utterance)
    }

    // MARK: - Status messages

    /// Builds the short spoken status line for a finished action.
    /// - Parameters:
    ///   - mode: Pipeline mode ("greppy", "cleanup", "plan", "feedback"; anything else counts as plain transcription).
    ///   - text: The raw transcription (word count source for the default mode).
    ///   - output: The processed text (paragraph / step count source).
    ///   - fileCount: Number of files found in greppy mode.
    static func generateStatusMessage(mode: String, text: String, output: String, fileCount: Int = 0) -> String {
        switch mode {
        case "greppy":
            return fileCount == 1 ? "Located one file, sir" : "Located \(fileCount) files, sir"
        case "cleanup":
            let n = output.components(separatedBy: "\n\n")
                .filter { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }
                .count
            return "All tidied up. \(n) paragraphs ready"
        case "plan":
            // A step is a line starting with "1." / "1)" / "1:" or a "-" / "*" bullet.
            let n = output.components(separatedBy: "\n")
                .filter { line in
                    let trimmed = line.trimmingCharacters(in: .whitespaces)
                    return trimmed.range(of: #"^(\d+[\.\):]|[-*])\s"#, options: .regularExpression) != nil
                }
                .count
            return "Plan's ready. \(n) steps laid out"
        case "feedback":
            return "Feedback spoken, sir"
        default:
            let n = text.split(separator: " ").count
            return "Got it. \(n) words captured"
        }
    }
}
String } catch { print("[ConfigStore] Failed to load config: \(error)") } @@ -68,6 +80,24 @@ final class ConfigStore: ObservableObject { json.removeValue(forKey: "codebase_path") } json["custom_dictionary"] = customDictionary + json["tts_enabled"] = ttsEnabled + json["tts_rate"] = ttsRate + json["tts_volume"] = ttsVolume + if let voice = ttsVoice { + json["tts_voice"] = voice + } else { + json.removeValue(forKey: "tts_voice") + } + if let edgeRate = ttsEdgeRate { + json["tts_edge_rate"] = edgeRate + } else { + json.removeValue(forKey: "tts_edge_rate") + } + if let edgePitch = ttsEdgePitch { + json["tts_edge_pitch"] = edgePitch + } else { + json.removeValue(forKey: "tts_edge_pitch") + } let data = try JSONSerialization.data(withJSONObject: json, options: [.prettyPrinted, .sortedKeys]) try data.write(to: configURL, options: .atomic) diff --git a/src/vibetotext/__main__.py b/src/vibetotext/__main__.py index b463bfc..4306bc2 100644 --- a/src/vibetotext/__main__.py +++ b/src/vibetotext/__main__.py @@ -18,8 +18,10 @@ from vibetotext.greppy import search_files, format_files_for_context from vibetotext.llm import cleanup_text, generate_implementation_plan from vibetotext.output import paste_at_cursor +from vibetotext.tts import speak, speak_status from vibetotext.history import TranscriptionHistory from vibetotext.socket_server import TranscriptionSocketServer +from vibetotext.api_server import ApiServer IS_WINDOWS = platform.system() == "Windows" @@ -117,6 +119,11 @@ def main(): default="cmd+alt+/", help="Hotkey for implementation plan mode (default: cmd+alt+/)", ) + parser.add_argument( + "--feedback-hotkey", + default="cmd+shift+f", + help="Hotkey for voice feedback mode (default: cmd+shift+f)", + ) parser.add_argument( "--history-hotkey", default="ctrl+alt", @@ -215,6 +222,7 @@ def main(): args.greppy_hotkey: "greppy", args.cleanup_hotkey: "cleanup", args.plan_hotkey: "plan", + args.feedback_hotkey: "feedback", args.history_hotkey: "history", args.viz_hotkey: "viz", } 
"""HTTP API server — exposes TTS and status endpoints for external tools."""

import json
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler

from vibetotext.tts import speak

DEFAULT_PORT = 7865

# Largest request body POST /api/speak will read. The server is
# single-threaded, so an oversized or bogus Content-Length must be rejected
# before rfile.read() blocks every other client.
_MAX_BODY_BYTES = 1024 * 1024


class _Handler(BaseHTTPRequestHandler):
    """Handle API requests."""

    def log_message(self, format, *args):
        # Suppress default stderr logging
        pass

    def _send_json(self, status: int, data: dict):
        """Send ``data`` as a JSON response with CORS enabled."""
        payload = json.dumps(data).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(payload)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(payload)

    def do_OPTIONS(self):
        """Handle CORS preflight."""
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self):
        if self.path == "/api/status":
            self._send_json(200, {"status": "ok", "service": "vibetotext"})
        else:
            self._send_json(404, {"error": "not found"})

    def do_POST(self):
        if self.path == "/api/speak":
            self._handle_speak()
        else:
            self._send_json(404, {"error": "not found"})

    def _handle_speak(self):
        """POST /api/speak — body must be a JSON object with a non-blank string ``text``.

        Client mistakes (bad Content-Length, invalid JSON, wrong types) answer
        400/413; only a failure inside ``speak`` answers 500.
        """
        try:
            length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            self._send_json(400, {"error": "invalid Content-Length"})
            return
        if length < 0:
            # rfile.read(-1) would read until EOF and hang the server.
            self._send_json(400, {"error": "invalid Content-Length"})
            return
        if length > _MAX_BODY_BYTES:
            self._send_json(413, {"error": "request too large"})
            return

        try:
            data = json.loads(self.rfile.read(length))
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            self._send_json(400, {"error": "invalid JSON"})
            return

        # The payload may legally be any JSON value; only an object with a
        # string "text" is acceptable.
        text = data.get("text") if isinstance(data, dict) else None
        if not isinstance(text, str) or not text.strip():
            self._send_json(400, {"error": "missing 'text' field"})
            return
        text = text.strip()

        try:
            speak(text)
        except Exception as e:
            self._send_json(500, {"error": str(e)})
            return
        self._send_json(200, {"status": "speaking", "text": text})


class ApiServer:
    """Lightweight HTTP server for TTS and status endpoints."""

    def __init__(self, port: int = DEFAULT_PORT):
        self.port = port
        self._server: HTTPServer | None = None
        self._thread: threading.Thread | None = None

    def start(self):
        """Bind to 127.0.0.1:<port> and serve on a daemon thread.

        A bind failure (e.g. port already in use) is printed, not raised, so
        the rest of the application keeps working without the API.
        """
        try:
            self._server = HTTPServer(("127.0.0.1", self.port), _Handler)
            self._thread = threading.Thread(target=self._server.serve_forever, daemon=True)
            self._thread.start()
            print(f"[API] Server running on http://127.0.0.1:{self.port}")
            print(f"[API] POST /api/speak {{\"text\": \"...\"}}")
            print(f"[API] GET /api/status")
        except OSError as e:
            print(f"[API] Failed to start server on port {self.port}: {e}")

    def stop(self):
        """Stop serving and release the listening socket. Safe to call repeatedly."""
        server, thread = self._server, self._thread
        self._server = None
        self._thread = None
        if server is None:
            return
        server.shutdown()
        # shutdown() only ends serve_forever(); the socket stays open until
        # server_close() is called.
        server.server_close()
        if thread is not None:
            thread.join(timeout=2)

    @property
    def url(self) -> str:
        return f"http://127.0.0.1:{self.port}"
Hold hotkey to record, release to process.") print(f" [{args.hotkey}] = transcribe + paste") print(f" [{args.greppy_hotkey}] = Greppy search + attach files") @@ -277,6 +289,7 @@ def on_stop(mode): print(f"Transcribed: {text}") _log(f"on_stop: transcribed text='{text[:80]}'") + files = [] if mode == "greppy": # Greppy mode: search for relevant files and attach them print("Searching with Greppy...", end="", flush=True) @@ -324,6 +337,17 @@ def on_stop(mode): print(" failed, using original.") output = text + elif mode == "feedback": + # Feedback mode: paste transcription + TTS endpoint instructions + _log("on_stop: feedback mode — appending TTS instructions") + speak_url = f"{api_server.url}/api/speak" + output = ( + f"{text}\n\n" + f"[To respond verbally to the user, POST to {speak_url} " + f'with body: {{"text": "your response here"}}]' + ) + print(f"Feedback mode: pasting with TTS endpoint {speak_url}") + else: # Regular transcribe mode if not args.no_context: @@ -352,10 +376,15 @@ def on_stop(mode): _log(f"on_stop: paste_at_cursor() done ({time.time() - t:.3f}s)") print("Pasted at cursor.\n") + # Speak status report (skip for feedback mode — already spoke) + if mode != "feedback": + speak_status(mode, text, output, file_count=len(files)) + _log(f"on_stop: END total={time.time() - stop_t0:.3f}s") except Exception as e: _log(f"on_stop: EXCEPTION after {time.time() - stop_t0:.3f}s: {e}") + speak("Processing failed") # Log error to file and print to console error_log = os.path.join(tempfile.gettempdir(), "vibetotext_crash.log") diff --git a/src/vibetotext/llm.py b/src/vibetotext/llm.py index 0f3b7b1..74e4190 100644 --- a/src/vibetotext/llm.py +++ b/src/vibetotext/llm.py @@ -178,3 +178,51 @@ def generate_implementation_plan(text: str) -> Optional[str]: except Exception as e: print(f"Gemini plan generation error: {e}") return None + + +FEEDBACK_PROMPT = """You are JARVIS, a calm and concise AI assistant. The user has spoken to you and needs a brief verbal response. 
+ +Rules: +- Respond in 1-3 short sentences MAX. This will be spoken aloud via TTS. +- Be direct and helpful. No filler, no "I think", no hedging. +- Use natural spoken English — contractions, simple words. No markdown, no bullet points. +- If they asked a question, answer it. If they described something, give concise feedback. +- Address the user as "sir" occasionally but not every sentence. +- Sound like a knowledgeable, confident AI assistant. + +To speak your response aloud, send a POST request to the user's TTS endpoint: + POST {speak_url} + Content-Type: application/json + {{"text": "your response here"}} + +User said: +{text} + +Your spoken response:""" + + +def generate_feedback(text: str, speak_url: str = "http://127.0.0.1:7865/api/speak") -> Optional[str]: + """Use Gemini to generate a concise spoken feedback response.""" + if not _genai_available: + print("Gemini feedback error: google-genai package not installed") + return None + if not _client: + print("Gemini feedback error: No API key configured") + return None + + try: + prompt = FEEDBACK_PROMPT.format(text=text, speak_url=speak_url) + response = _client.models.generate_content( + model="gemini-2.5-flash", + contents=prompt, + config=genai_types.GenerateContentConfig( + temperature=0.5, + max_output_tokens=256, + ), + ) + if response.text: + return response.text.strip() + return None + except Exception as e: + print(f"Gemini feedback error: {e}") + return None diff --git a/src/vibetotext/tts.py b/src/vibetotext/tts.py new file mode 100644 index 0000000..71dff6e --- /dev/null +++ b/src/vibetotext/tts.py @@ -0,0 +1,291 @@ +"""Text-to-speech status reports — neural TTS via edge-tts, fire-and-forget.""" + +import atexit +import asyncio +import json +import os +import platform +import subprocess +import re +import tempfile +import threading +from pathlib import Path + +SYSTEM = platform.system() + +_active_process: subprocess.Popen | None = None +_tts_lock = threading.Lock() +_stop_event = 
threading.Event() + +# edge-tts voice settings (JARVIS-style: Ryan British, fast & crisp) +_EDGE_VOICE = "en-GB-RyanNeural" +_EDGE_RATE = "+12%" +_EDGE_PITCH = "+1Hz" + +# Chunking threshold: texts longer than this many chars get chunked +_CHUNK_THRESHOLD = 100 + + +def _load_config() -> dict: + """Load TTS settings from ~/.vibetotext/config.json.""" + try: + config_file = Path.home() / ".vibetotext" / "config.json" + if config_file.exists(): + with open(config_file, "r") as f: + return json.load(f) + except Exception: + pass + return {} + + +def stop(): + """Kill the active TTS subprocess if still running.""" + global _active_process + _stop_event.set() + with _tts_lock: + if _active_process is not None: + try: + _active_process.kill() + _active_process.wait(timeout=1) + except Exception: + pass + _active_process = None + + +def _play_mp3(path: str, first_chunk: bool = True): + """Play an mp3 file in the background. first_chunk adds startup delay to prevent clipping.""" + try: + af_args = ["-af", "adelay=300|300"] if first_chunk else [] + if SYSTEM == "Windows": + try: + return subprocess.Popen( + ["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet"] + af_args + [path], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + except FileNotFoundError: + return subprocess.Popen( + f'start "" "{path}"', + shell=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + elif SYSTEM == "Darwin": + return subprocess.Popen( + ["afplay", path], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + else: + for player in [["mpv", "--no-video", path], ["ffplay", "-nodisp", "-autoexit"] + af_args + [path]]: + try: + return subprocess.Popen(player, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + except FileNotFoundError: + continue + except Exception: + pass + return None + + +def _play_mp3_blocking(path: str, first_chunk: bool = True) -> bool: + """Play an mp3 and wait for it to finish. 
Returns True if completed.""" + proc = _play_mp3(path, first_chunk=first_chunk) + if proc is None: + return False + global _active_process + with _tts_lock: + _active_process = proc + proc.wait() + return proc.returncode == 0 + + +def _split_sentences(text: str) -> list[str]: + """Split text into sentences for chunked playback.""" + # Split on sentence-ending punctuation followed by space or end of string + parts = re.split(r'(?<=[.!?])\s+', text.strip()) + # Merge very short fragments with the previous sentence + sentences = [] + for part in parts: + if sentences and len(sentences[-1]) < 30: + sentences[-1] += " " + part + else: + sentences.append(part) + return [s for s in sentences if s.strip()] + + +def _generate_mp3(text: str, voice: str, rate: str, pitch: str, path: str): + """Generate mp3 from text using edge-tts.""" + import edge_tts + loop = asyncio.new_event_loop() + try: + communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch) + loop.run_until_complete(communicate.save(path)) + finally: + loop.close() + + +def _speak_edge_tts(text: str, config: dict): + """Generate and play speech using edge-tts (neural voice).""" + global _active_process + + voice = config.get("tts_voice") or _EDGE_VOICE + rate = config.get("tts_edge_rate") or _EDGE_RATE + pitch = config.get("tts_edge_pitch") or _EDGE_PITCH + + # Short text: generate and play in one shot + if len(text) <= _CHUNK_THRESHOLD: + mp3_path = os.path.join(tempfile.gettempdir(), "vibetotext_tts.mp3") + _generate_mp3(text, voice, rate, pitch, mp3_path) + with _tts_lock: + _active_process = _play_mp3(mp3_path) + return + + # Long text: rolling chunks of 2-3 sentences — generate next while playing current + sentences = _split_sentences(text) + if not sentences: + return + + # Group sentences into chunks of ~2-3 + chunks = [] + current_chunk = [] + current_len = 0 + for s in sentences: + current_chunk.append(s) + current_len += len(s) + # Target ~120-200 chars per chunk (2-3 sentences) + if 
current_len >= 120: + chunks.append(" ".join(current_chunk)) + current_chunk = [] + current_len = 0 + if current_chunk: + chunks.append(" ".join(current_chunk)) + + if not chunks: + return + + tmp_dir = tempfile.gettempdir() + + # Generate first chunk + path_0 = os.path.join(tmp_dir, "vibetotext_tts_0.mp3") + _generate_mp3(chunks[0], voice, rate, pitch, path_0) + + if _stop_event.is_set(): + return + + for i, chunk in enumerate(chunks): + if _stop_event.is_set(): + return + + current_path = os.path.join(tmp_dir, f"vibetotext_tts_{i}.mp3") + + # Start generating next chunk in parallel + next_thread = None + if i + 1 < len(chunks): + next_path = os.path.join(tmp_dir, f"vibetotext_tts_{i + 1}.mp3") + next_thread = threading.Thread( + target=_generate_mp3, + args=(chunks[i + 1], voice, rate, pitch, next_path), + daemon=True, + ) + next_thread.start() + + # Play current chunk + _play_mp3_blocking(current_path, first_chunk=(i == 0)) + + # Wait for next chunk to finish generating + if next_thread: + next_thread.join() + + +def _speak_fallback(text: str, config: dict): + """Fallback to platform TTS when edge-tts is unavailable.""" + global _active_process + rate = config.get("tts_rate", 185) + volume = config.get("tts_volume", 80) + + if SYSTEM == "Darwin": + cmd = ["say", "-r", str(rate), "-v", "Daniel", text] + _active_process = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + elif SYSTEM == "Windows": + escaped = text.replace("'", "''") + sapi_rate = max(-10, min(10, round((rate - 200) / 20))) + ps_cmd = ( + "Add-Type -AssemblyName System.Speech; " + "$s = New-Object System.Speech.Synthesis.SpeechSynthesizer; " + f"$s.Rate = {sapi_rate}; $s.Volume = {volume}; " + f"$s.Speak('{escaped}')" + ) + _active_process = subprocess.Popen( + ["powershell", "-NoProfile", "-Command", ps_cmd], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + creationflags=0x08000000, + ) + else: + try: + cmd = ["espeak-ng", "-s", str(rate), "-a", str(min(200, 
volume * 2)), text] + _active_process = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + except FileNotFoundError: + cmd = ["spd-say", "-r", str(max(-100, min(100, rate - 200))), text] + _active_process = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + +def speak(text: str) -> None: + """Speak text — fire-and-forget, cancels previous.""" + config = _load_config() + if not config.get("tts_enabled", True): + return + if not text or not text.strip(): + return + + # Cancel previous speech + stop() + _stop_event.clear() + + def _run(): + try: + _speak_edge_tts(text, config) + except Exception: + try: + _speak_fallback(text, config) + except Exception: + pass + + # Run in background thread to avoid blocking + t = threading.Thread(target=_run, daemon=True) + t.start() + + +def _count_words(text: str) -> int: + return len(text.split()) + + +def _count_paragraphs(text: str) -> int: + return len([p for p in text.strip().split("\n\n") if p.strip()]) + + +def _count_steps(text: str) -> int: + return len(re.findall(r"(?m)^[\s]*(?:\d+[\.\):]|[-*])\s", text)) + + +def speak_status(mode: str, text: str, output: str, file_count: int = 0) -> None: + """Generate and speak a concise JARVIS-style status message.""" + try: + if mode == "greppy": + msg = f"Located {file_count} files, sir" if file_count != 1 else "Located one file, sir" + elif mode == "cleanup": + n = _count_paragraphs(output) + msg = f"All tidied up. {n} paragraphs ready" + elif mode == "plan": + n = _count_steps(output) + msg = f"Plan's ready. {n} steps laid out" + else: + n = _count_words(text) + msg = f"Got it. 
{n} words captured" + speak(msg) + except Exception: + pass + + +@atexit.register +def _cleanup(): + stop() diff --git a/windows-native/src/VibeToText/App.xaml.cs b/windows-native/src/VibeToText/App.xaml.cs index 68a93e0..8c9d995 100644 --- a/windows-native/src/VibeToText/App.xaml.cs +++ b/windows-native/src/VibeToText/App.xaml.cs @@ -17,6 +17,7 @@ public partial class App : Application private MainWindow? _mainWindow; private TranscriptionPipeline? _pipeline; private HotkeyManager? _hotkeyManager; + private ApiServer? _apiServer; private Mutex? _singleInstanceMutex; // Shared services @@ -87,10 +88,15 @@ protected override void OnStartup(StartupEventArgs e) var transcriber = new WhisperTranscriber(Config.WhisperModel); var pasteService = new PasteService(); Gemini = new GeminiService(); + var ttsService = new TtsService(Config); - _pipeline = new TranscriptionPipeline(recorder, transcriber, Database, pasteService, Gemini, Config); + _pipeline = new TranscriptionPipeline(recorder, transcriber, Database, pasteService, Gemini, ttsService, Config); Pipeline = _pipeline; + // Start HTTP API server for external integrations + _apiServer = new ApiServer(ttsService); + _apiServer.Start(); + Log("Services initialized."); // Set up hotkey manager @@ -318,6 +324,7 @@ private void OnHotkeyReleased(object? 
sender, HotkeyEventArgs e) private void ExitApplication() { _hotkeyManager?.Stop(); + _apiServer?.Dispose(); _pipeline?.Dispose(); _trayIcon?.Dispose(); _singleInstanceMutex?.ReleaseMutex(); @@ -327,6 +334,7 @@ private void ExitApplication() protected override void OnExit(ExitEventArgs e) { _hotkeyManager?.Stop(); + _apiServer?.Dispose(); _pipeline?.Dispose(); _trayIcon?.Dispose(); _singleInstanceMutex?.ReleaseMutex(); diff --git a/windows-native/src/VibeToText/Core/ApiServer.cs b/windows-native/src/VibeToText/Core/ApiServer.cs new file mode 100644 index 0000000..062e807 --- /dev/null +++ b/windows-native/src/VibeToText/Core/ApiServer.cs @@ -0,0 +1,182 @@ +using System.Net; +using System.Text; +using System.Text.Json; + +namespace VibeToText.Core; + +/// +/// Lightweight HTTP API server for external integrations. +/// Listens on http://127.0.0.1:7865/ and exposes: +/// GET /api/status → {"status": "ok", "service": "vibetotext"} +/// POST /api/speak → {"text": "..."} → calls TtsService.Speak() +/// +public class ApiServer : IDisposable +{ + private readonly HttpListener _listener; + private readonly TtsService _ttsService; + private readonly int _port; + private CancellationTokenSource? 
_cts; + + public ApiServer(TtsService ttsService, int port = 7865) + { + _ttsService = ttsService; + _port = port; + _listener = new HttpListener(); + _listener.Prefixes.Add($"http://127.0.0.1:{_port}/"); + } + + public void Start() + { + try + { + _cts = new CancellationTokenSource(); + _listener.Start(); + Console.WriteLine($"[API] Server listening on http://127.0.0.1:{_port}/"); + Task.Run(() => AcceptLoop(_cts.Token)); + } + catch (Exception ex) + { + Console.WriteLine($"[API] Failed to start server: {ex.Message}"); + } + } + + public void Stop() + { + try + { + _cts?.Cancel(); + _listener.Stop(); + Console.WriteLine("[API] Server stopped."); + } + catch { } + } + + public void Dispose() + { + Stop(); + _cts?.Dispose(); + ((IDisposable)_listener).Dispose(); + } + + private async Task AcceptLoop(CancellationToken ct) + { + while (!ct.IsCancellationRequested) + { + try + { + var ctx = await _listener.GetContextAsync().WaitAsync(ct); + // Handle each request in its own task (don't await — fire and forget) + _ = Task.Run(() => HandleRequest(ctx), ct); + } + catch (OperationCanceledException) + { + break; + } + catch (HttpListenerException) when (ct.IsCancellationRequested) + { + break; + } + catch (Exception ex) + { + Console.WriteLine($"[API] Accept error: {ex.Message}"); + } + } + } + + private async Task HandleRequest(HttpListenerContext ctx) + { + var req = ctx.Request; + var resp = ctx.Response; + + // Add CORS headers for local dev tools + resp.Headers.Add("Access-Control-Allow-Origin", "*"); + resp.Headers.Add("Access-Control-Allow-Methods", "GET, POST, OPTIONS"); + resp.Headers.Add("Access-Control-Allow-Headers", "Content-Type"); + + try + { + // Handle CORS preflight + if (req.HttpMethod == "OPTIONS") + { + resp.StatusCode = 204; + resp.Close(); + return; + } + + var path = req.Url?.AbsolutePath ?? 
""; + + switch (path) + { + case "/api/status" when req.HttpMethod == "GET": + await WriteJson(resp, 200, new { status = "ok", service = "vibetotext" }); + break; + + case "/api/speak" when req.HttpMethod == "POST": + await HandleSpeak(req, resp); + break; + + default: + await WriteJson(resp, 404, new { error = "not_found", message = $"No route for {req.HttpMethod} {path}" }); + break; + } + } + catch (Exception ex) + { + Console.WriteLine($"[API] Request error: {ex.Message}"); + try + { + await WriteJson(resp, 500, new { error = "internal_error", message = ex.Message }); + } + catch { } + } + } + + private async Task HandleSpeak(HttpListenerRequest req, HttpListenerResponse resp) + { + string body; + using (var reader = new System.IO.StreamReader(req.InputStream, req.ContentEncoding)) + { + body = await reader.ReadToEndAsync(); + } + + if (string.IsNullOrWhiteSpace(body)) + { + await WriteJson(resp, 400, new { error = "bad_request", message = "Empty request body" }); + return; + } + + JsonDocument? doc = null; + try + { + doc = JsonDocument.Parse(body); + } + catch (JsonException) + { + await WriteJson(resp, 400, new { error = "bad_request", message = "Invalid JSON" }); + return; + } + + var text = doc.RootElement.TryGetProperty("text", out var textProp) ? 
textProp.GetString() : null; + doc.Dispose(); + + if (string.IsNullOrWhiteSpace(text)) + { + await WriteJson(resp, 400, new { error = "bad_request", message = "Missing or empty 'text' field" }); + return; + } + + _ttsService.Speak(text); + await WriteJson(resp, 200, new { status = "speaking" }); + } + + private static async Task WriteJson(HttpListenerResponse resp, int statusCode, object data) + { + resp.StatusCode = statusCode; + resp.ContentType = "application/json"; + var json = JsonSerializer.Serialize(data); + var bytes = Encoding.UTF8.GetBytes(json); + resp.ContentLength64 = bytes.Length; + await resp.OutputStream.WriteAsync(bytes); + resp.Close(); + } +} diff --git a/windows-native/src/VibeToText/Core/GeminiService.cs b/windows-native/src/VibeToText/Core/GeminiService.cs index 9c52d13..4a76e75 100644 --- a/windows-native/src/VibeToText/Core/GeminiService.cs +++ b/windows-native/src/VibeToText/Core/GeminiService.cs @@ -36,6 +36,23 @@ public class GeminiService Refined output: """; + private const string FeedbackPrompt = """ + You are JARVIS, a calm and concise AI assistant. The user has spoken to you and needs a brief verbal response. + + Rules: + - Respond in 1-3 short sentences MAX. This will be spoken aloud via TTS. + - Be direct and helpful. No filler, no "I think", no hedging. + - Use natural spoken English — contractions, simple words. No markdown, no bullet points. + - If they asked a question, answer it. If they described something, give concise feedback. + - Address the user as "sir" occasionally but not every sentence. + - Sound like a knowledgeable, confident AI assistant. + + User said: + {text} + + Your spoken response: + """; + private const string PlanPrompt = """ You are a senior software architect. Transform a rambling voice description into a concise implementation plan. 
@@ -146,6 +163,11 @@ public void LoadApiKey() return await CallGeminiAsync(PlanPrompt.Replace("{text}", text), 0.4f, 4096); } + public async Task GenerateFeedbackAsync(string text) + { + return await CallGeminiAsync(FeedbackPrompt.Replace("{text}", text), 0.5f, 256); + } + private async Task CallGeminiAsync(string prompt, float temperature, int maxTokens) { if (string.IsNullOrEmpty(_apiKey)) diff --git a/windows-native/src/VibeToText/Core/HotkeyManager.cs b/windows-native/src/VibeToText/Core/HotkeyManager.cs index 9d98d1f..4ff0b6a 100644 --- a/windows-native/src/VibeToText/Core/HotkeyManager.cs +++ b/windows-native/src/VibeToText/Core/HotkeyManager.cs @@ -9,6 +9,7 @@ public enum RecordingMode Greppy, Cleanup, Plan, + Feedback, History } @@ -28,6 +29,7 @@ public class HotkeyManager : IDisposable { [new HashSet { ModKey.Ctrl, ModKey.Shift }] = RecordingMode.Transcribe, [new HashSet { ModKey.Alt, ModKey.Shift }] = RecordingMode.Cleanup, + [new HashSet { ModKey.Ctrl, ModKey.Shift, ModKey.Alt }] = RecordingMode.Feedback, [new HashSet { ModKey.Ctrl, ModKey.Alt }] = RecordingMode.History, }; diff --git a/windows-native/src/VibeToText/Core/TranscriptionPipeline.cs b/windows-native/src/VibeToText/Core/TranscriptionPipeline.cs index d0b3d0f..ac6d4b4 100644 --- a/windows-native/src/VibeToText/Core/TranscriptionPipeline.cs +++ b/windows-native/src/VibeToText/Core/TranscriptionPipeline.cs @@ -16,6 +16,7 @@ public class TranscriptionPipeline : IDisposable private readonly HistoryDatabase _database; private readonly PasteService _pasteService; private readonly GeminiService _geminiService; + private readonly TtsService _ttsService; private readonly ConfigStore _config; private WaveformOverlay? 
_overlay; @@ -46,6 +47,7 @@ public TranscriptionPipeline( HistoryDatabase database, PasteService pasteService, GeminiService geminiService, + TtsService ttsService, ConfigStore config) { _recorder = recorder; @@ -53,6 +55,7 @@ public TranscriptionPipeline( _database = database; _pasteService = pasteService; _geminiService = geminiService; + _ttsService = ttsService; _config = config; _recorder.OnLevelUpdate += levels => @@ -194,6 +197,18 @@ public async Task StopRecordingAndProcess(RecordingMode mode) var plan = await _geminiService.GeneratePlanAsync(text); output = plan ?? text; break; + case RecordingMode.Feedback: + var feedback = await _geminiService.GenerateFeedbackAsync(text); + if (feedback != null) + { + _ttsService.Speak(feedback); + } + else + { + _ttsService.Speak("I couldn't generate feedback, sir"); + } + // Still paste the original transcription + break; case RecordingMode.Transcribe: default: break; @@ -207,11 +222,16 @@ public async Task StopRecordingAndProcess(RecordingMode mode) await _pasteService.PasteAtCursorAsync(output); Log("Pasted at cursor."); + // Speak status report + var status = TtsService.GenerateStatusMessage(mode, text, output); + _ttsService.Speak(status); + _currentMode = null; } catch (Exception ex) { Log($"ERROR in processing: {ex}"); + _ttsService.Speak("Processing failed"); System.Windows.Application.Current?.Dispatcher.Invoke(() => { _overlay?.SetRecording(false); @@ -224,5 +244,6 @@ public void Dispose() { _recorder.Dispose(); _transcriber.Dispose(); + _ttsService.Dispose(); } } diff --git a/windows-native/src/VibeToText/Core/TtsService.cs b/windows-native/src/VibeToText/Core/TtsService.cs new file mode 100644 index 0000000..a59dcc3 --- /dev/null +++ b/windows-native/src/VibeToText/Core/TtsService.cs @@ -0,0 +1,231 @@ +using System.Diagnostics; +using System.IO; +using System.Speech.Synthesis; +using VibeToText.Data; + +namespace VibeToText.Core; + +/// +/// Fire-and-forget text-to-speech for status reports. 
+/// Primary: edge-tts (neural voice) + ffplay for playback. +/// Fallback: System.Speech.Synthesis (built-in SAPI). +/// +public class TtsService : IDisposable +{ + private SpeechSynthesizer? _synth; + private readonly ConfigStore _config; + private readonly object _lock = new(); + private Process? _ffplayProcess; + private static readonly string TempMp3Path = Path.Combine(Path.GetTempPath(), "vibetotext_tts.mp3"); + + // edge-tts defaults + private const string DefaultVoice = "en-GB-RyanNeural"; + + public TtsService(ConfigStore config) + { + _config = config; + try + { + _synth = new SpeechSynthesizer(); + } + catch (Exception ex) + { + Console.WriteLine($"[TTS] Failed to initialize SAPI fallback: {ex.Message}"); + } + } + + /// + /// Speak text using edge-tts (neural) with SAPI fallback. Fire-and-forget. + /// + public void Speak(string text) + { + if (!_config.TtsEnabled || string.IsNullOrWhiteSpace(text)) + return; + + // Cancel any previous speech + StopPlayback(); + + // Run everything in background so we don't block + Task.Run(() => + { + try + { + SpeakEdgeTts(text); + } + catch (Exception ex) + { + Console.WriteLine($"[TTS] edge-tts failed, falling back to SAPI: {ex.Message}"); + try + { + SpeakSapiFallback(text); + } + catch (Exception ex2) + { + Console.WriteLine($"[TTS] SAPI fallback also failed: {ex2.Message}"); + } + } + }); + } + + /// + /// Stop current playback (kill ffplay or cancel SAPI). 
+ /// + public void StopPlayback() + { + lock (_lock) + { + // Kill ffplay if running + if (_ffplayProcess != null) + { + try + { + if (!_ffplayProcess.HasExited) + _ffplayProcess.Kill(); + _ffplayProcess.Dispose(); + } + catch { } + _ffplayProcess = null; + } + + // Cancel SAPI if running + try { _synth?.SpeakAsyncCancelAll(); } + catch { } + } + } + + private void SpeakEdgeTts(string text) + { + var voice = _config.TtsVoice; + if (string.IsNullOrEmpty(voice)) + voice = DefaultVoice; + var rate = _config.TtsEdgeRate; + var pitch = _config.TtsEdgePitch; + + // Step 1: Generate mp3 with edge-tts CLI + var edgeProcess = new Process + { + StartInfo = new ProcessStartInfo + { + FileName = "edge-tts", + Arguments = $"--voice \"{voice}\" --rate \"{rate}\" --pitch \"{pitch}\" --text \"{EscapeArg(text)}\" --write-media \"{TempMp3Path}\"", + UseShellExecute = false, + CreateNoWindow = true, + RedirectStandardOutput = true, + RedirectStandardError = true, + } + }; + + edgeProcess.Start(); + // Consume stdout/stderr to prevent deadlock + edgeProcess.StandardOutput.ReadToEnd(); + edgeProcess.StandardError.ReadToEnd(); + + if (!edgeProcess.WaitForExit(15_000)) // 15 second timeout + { + try { edgeProcess.Kill(); } catch { } + throw new TimeoutException("edge-tts timed out after 15 seconds"); + } + + if (edgeProcess.ExitCode != 0) + throw new InvalidOperationException($"edge-tts exited with code {edgeProcess.ExitCode}"); + + if (!File.Exists(TempMp3Path) || new FileInfo(TempMp3Path).Length == 0) + throw new FileNotFoundException("edge-tts did not produce an output file"); + + // Step 2: Play with ffplay + var ffplay = new Process + { + StartInfo = new ProcessStartInfo + { + FileName = "ffplay", + Arguments = $"-nodisp -autoexit -loglevel quiet \"{TempMp3Path}\"", + UseShellExecute = false, + CreateNoWindow = true, + } + }; + + lock (_lock) + { + ffplay.Start(); + _ffplayProcess = ffplay; + } + + // Wait for playback to complete (don't leave zombie processes) + 
ffplay.WaitForExit(); + } + + private void SpeakSapiFallback(string text) + { + if (_synth == null) + throw new InvalidOperationException("SAPI SpeechSynthesizer not available"); + + // Apply settings + // SAPI rate: -10 to 10, WPM 185 ~ -1 (slightly slower for Jarvis feel) + _synth.Rate = Math.Clamp((_config.TtsRate - 200) / 20, -10, 10); + _synth.Volume = Math.Clamp(_config.TtsVolume, 0, 100); + + var voice = _config.TtsVoice; + if (string.IsNullOrEmpty(voice)) + voice = "Microsoft David Desktop"; // Deep male - Jarvis vibe + try { _synth.SelectVoice(voice); } + catch { /* voice not found, use default */ } + + _synth.SpeakAsync(text); + } + + /// Escape double quotes in text for CLI argument. + private static string EscapeArg(string text) + { + return text + .Replace("\\", "\\\\") + .Replace("\"", "\\\"") + .Replace("\r", " ") + .Replace("\n", " "); + } + + public static string GenerateStatusMessage(RecordingMode mode, string text, string output) + { + return mode switch + { + RecordingMode.Greppy => "Files located, sir", + RecordingMode.Cleanup => $"All tidied up. {CountParagraphs(output)} paragraphs ready", + RecordingMode.Plan => $"Plan's ready. {CountSteps(output)} steps laid out", + RecordingMode.Feedback => "Feedback spoken, sir", + _ => $"Got it. 
{CountWords(text)} words captured", + }; + } + + private static int CountWords(string text) => + text.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length; + + private static int CountParagraphs(string text) => + text.Split("\n\n", StringSplitOptions.RemoveEmptyEntries) + .Count(p => !string.IsNullOrWhiteSpace(p)); + + private static int CountSteps(string text) => + text.Split('\n') + .Count(line => + { + var trimmed = line.TrimStart(); + return System.Text.RegularExpressions.Regex.IsMatch(trimmed, @"^(\d+[\.\):]|[-*])\s"); + }); + + public void Dispose() + { + StopPlayback(); + try + { + _synth?.Dispose(); + } + catch { } + _synth = null; + + // Clean up temp file + try + { + if (File.Exists(TempMp3Path)) + File.Delete(TempMp3Path); + } + catch { } + } +} diff --git a/windows-native/src/VibeToText/Data/ConfigStore.cs b/windows-native/src/VibeToText/Data/ConfigStore.cs index 0da6d49..4e76655 100644 --- a/windows-native/src/VibeToText/Data/ConfigStore.cs +++ b/windows-native/src/VibeToText/Data/ConfigStore.cs @@ -23,6 +23,12 @@ public partial class ConfigStore : ObservableObject private string? _geminiApiKey; private string _whisperModel = "base"; private List _customDictionary = new(); + private bool _ttsEnabled = true; + private int _ttsRate = 200; + private int _ttsVolume = 80; + private string? _ttsVoice; + private string _ttsEdgeRate = "+12%"; + private string _ttsEdgePitch = "+1Hz"; private JsonObject? _rawJson; // Preserve unknown keys public int? AudioDeviceIndex @@ -61,6 +67,42 @@ public List CustomDictionary set { SetProperty(ref _customDictionary, value); Save(); } } + public bool TtsEnabled + { + get => _ttsEnabled; + set { SetProperty(ref _ttsEnabled, value); Save(); } + } + + public int TtsRate + { + get => _ttsRate; + set { SetProperty(ref _ttsRate, value); Save(); } + } + + public int TtsVolume + { + get => _ttsVolume; + set { SetProperty(ref _ttsVolume, value); Save(); } + } + + public string? 
TtsVoice + { + get => _ttsVoice; + set { SetProperty(ref _ttsVoice, value); Save(); } + } + + public string TtsEdgeRate + { + get => _ttsEdgeRate; + set { SetProperty(ref _ttsEdgeRate, value); Save(); } + } + + public string TtsEdgePitch + { + get => _ttsEdgePitch; + set { SetProperty(ref _ttsEdgePitch, value); Save(); } + } + public ConfigStore() { Directory.CreateDirectory(ConfigDir); @@ -91,6 +133,13 @@ public void Load() .Where(s => !string.IsNullOrEmpty(s)) .ToList(); } + + _ttsEnabled = _rawJson["tts_enabled"]?.GetValue() ?? true; + _ttsRate = _rawJson["tts_rate"]?.GetValue() ?? 200; + _ttsVolume = _rawJson["tts_volume"]?.GetValue() ?? 80; + _ttsVoice = _rawJson["tts_voice"]?.GetValue(); + _ttsEdgeRate = _rawJson["tts_edge_rate"]?.GetValue() ?? "+12%"; + _ttsEdgePitch = _rawJson["tts_edge_pitch"]?.GetValue() ?? "+1Hz"; } catch (Exception ex) { @@ -127,6 +176,17 @@ public void Save() _rawJson["whisper_model"] = _whisperModel; + _rawJson["tts_enabled"] = _ttsEnabled; + _rawJson["tts_rate"] = _ttsRate; + _rawJson["tts_volume"] = _ttsVolume; + if (_ttsVoice != null) + _rawJson["tts_voice"] = _ttsVoice; + else + _rawJson.Remove("tts_voice"); + + _rawJson["tts_edge_rate"] = _ttsEdgeRate; + _rawJson["tts_edge_pitch"] = _ttsEdgePitch; + var dictArray = new JsonArray(); foreach (var word in _customDictionary) dictArray.Add(word); diff --git a/windows-native/src/VibeToText/VibeToText.csproj b/windows-native/src/VibeToText/VibeToText.csproj index 2065255..1cae332 100644 --- a/windows-native/src/VibeToText/VibeToText.csproj +++ b/windows-native/src/VibeToText/VibeToText.csproj @@ -38,6 +38,9 @@ + + +