diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e509ee2..1f0ddfd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,11 +27,25 @@ jobs: needs: test steps: - uses: actions/checkout@v5 + with: + submodules: recursive - uses: actions/setup-go@v6 with: go-version: '1.24' + # Cache the on-device Parakeet ggufs (~940MB) so the local-model tests + # run without re-downloading every time. Bump the key when the models + # release changes (localmodel.Version). + - name: Cache local models + uses: actions/cache@v4 + with: + path: models/parakeet/v1 + key: parakeet-models-v1 + + - name: Download local models + run: make download-models + - name: Integration tests env: GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 92497d6..f2a1e6f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -23,6 +23,8 @@ jobs: runs-on: macos-15 steps: - uses: actions/checkout@v5 + with: + submodules: recursive - uses: actions/setup-go@v6 with: @@ -36,6 +38,8 @@ jobs: runs-on: macos-15 steps: - uses: actions/checkout@v5 + with: + submodules: recursive - uses: actions/setup-go@v6 with: @@ -54,6 +58,7 @@ jobs: - uses: actions/checkout@v5 with: fetch-depth: 0 + submodules: recursive - uses: actions/setup-go@v6 with: @@ -72,9 +77,9 @@ jobs: VERSION: ${{ github.ref_name }} GITHUB_TOKEN: ${{ github.token }} run: | - universal=$(find dist -path "*universal*" -name "zee" -type f | head -1) - test -n "$universal" || { echo "universal binary not found"; find dist -type f; exit 1; } - chmod +x "$universal" - packaging/mkdmg.sh "$universal" "$VERSION" "Zee-${VERSION}.dmg" + bin=$(find dist -name "zee" -type f | head -1) + test -n "$bin" || { echo "arm64 binary not found"; find dist -type f; exit 1; } + chmod +x "$bin" + packaging/mkdmg.sh "$bin" "$VERSION" "Zee-${VERSION}.dmg" shasum -a 256 "Zee-${VERSION}.dmg" >> dist/checksums.txt gh release upload "$VERSION" 
"Zee-${VERSION}.dmg" dist/checksums.txt --clobber diff --git a/.gitignore b/.gitignore index 00e7e6e..653fa32 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ zee-gui Zee-*.dmg packaging/Zee.icns +# Local STT models (downloaded / dev-placed ggufs — never committed) +/models/ + # Environment .env diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..1ad600b --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "parakeet.cpp"] + path = third_party/parakeet.cpp + url = https://github.com/mudler/parakeet.cpp + ignore = dirty diff --git a/.goreleaser.yml b/.goreleaser.yml index 7c3ca58..5ff83b7 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -2,26 +2,28 @@ version: 2 before: hooks: + # Init the parakeet.cpp submodule and build its static archives before the + # arm64 cgo build links them (darwin/arm64 only; no-op elsewhere). + - git submodule update --init --recursive + - make parakeet-lib - packaging/mkicns.sh packaging/appicon.png builds: - id: zee env: - CGO_ENABLED=1 + # Stamp the macOS 11.0 deploy target so the binary runs on every supported + # Mac (matches the -mcpu=apple-m1 / deploy target the archives were built with). 
+ - CGO_CFLAGS=-mmacosx-version-min=11.0 + - CGO_LDFLAGS=-mmacosx-version-min=11.0 + - MACOSX_DEPLOYMENT_TARGET=11.0 goos: - darwin goarch: - - amd64 - - arm64 + - arm64 # Apple Silicon only — local STT (parakeet.cpp) is arm64-only ldflags: - -s -w -X main.version={{ .Version }} -universal_binaries: - - id: zee-universal - ids: - - zee - replace: false - archives: - builds: - zee diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f05719..c5205a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,22 @@ # Changelog ## Unreleased + +- Add offline, on-device transcription via Parakeet (parakeet.cpp, CPU) on Apple Silicon +- Works out of the box with no API key on Apple Silicon (falls back to the local 110M model) +- Add local model picker in the tray: 110M English (default), 0.6B v3 multilingual, 0.6B v2 English (opt-in download) +- Download missing local models on demand from the tray with progress +- `-transcribe` supports local WAV (16 kHz mono) transcription without a network call, and accepts multiple files in one invocation (model loaded once; one transcript printed per line) +- Block starting a new recording while the previous transcription is still in progress (the recording guard now spans inference, not just capture); show a blue status-dot tray icon during transcription and play a short "denied" beep if the hotkey is pressed then +- `-doctor` reports local model status (present, path, size, decoder) +- `-doctor` transcription test uses the app's default engine (local Parakeet, else first cloud key) instead of prompting for a provider + API key +- Idle tray icon adapts to the menubar appearance (template tinting) — renders white on dark/transparent menubars instead of black +- Diagnostics log per-transcription process RSS (`rss_mb`, from gopsutil — includes cgo/mmap model memory) for both batch and stream sessions +- "Save Last Recording" now works for the local (Parakeet) model — captured PCM is saved as WAV (was cloud-only before) +- Add `-provider` and 
`-model` flags to override the saved provider/model from the CLI (an unavailable explicit `-provider` is now a hard error) +- Fix a crash (SIGSEGV/SIGABRT double-free in `ma_device_uninit`) when recording after a sleep/wake or audio-device change: the per-call device reinit left the device pointer dangling when reinit failed, so the next call uninited the already-freed device. Null the device after uninit in both capture and beep playback. Also serialize all miniaudio device lifecycle calls behind a process-wide lock (`internal/malgolock`) as defense against concurrent capture/playback init/uninit +- Fix the tray language menu to always reflect the active model's languages — both at startup and on model switch. English-only Parakeet models no longer offer Auto-detect, and switching providers (e.g. to Groq) now updates the list + ## v0.3.8 - Update dialog points to install instructions diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..da56e94 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Sumer Cip + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile index e595292..54f6198 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,46 @@ -.PHONY: build build-linux-amd64 build-linux-arm64 test test-integration benchmark integration-test clean bump-version release icns app +.PHONY: build build-linux-amd64 build-linux-arm64 test test-integration benchmark integration-test clean bump-version release icns app parakeet-lib download-models VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo "dev") -build: - go build -ldflags="-X main.version=$(VERSION)" -o zee +# Local STT (Parakeet) is a darwin/arm64-only cgo feature. On that host we build +# the static parakeet.cpp + ggml archives first and stamp the macOS deploy +# target; everywhere else the no-cgo stub is compiled and these are no-ops. +MACOS_MIN := 11.0 +PARAKEET_DIR := third_party/parakeet.cpp +PARAKEET_LIB := $(PARAKEET_DIR)/build-release/libparakeet.a +HOST := $(shell go env GOOS)/$(shell go env GOARCH) +ifeq ($(HOST),darwin/arm64) +CGO_ENV := MACOSX_DEPLOYMENT_TARGET=$(MACOS_MIN) CGO_CFLAGS=-mmacosx-version-min=$(MACOS_MIN) CGO_LDFLAGS=-mmacosx-version-min=$(MACOS_MIN) +endif + +build: parakeet-lib download-models + $(CGO_ENV) go build -ldflags="-X main.version=$(VERSION)" -o zee + +# Fetch the mandatory (PreFetch) local models into the dev folder from the +# pinned models- GitHub release. Reuses the localmodel registry + +# downloader (single source of truth) and is a per-file no-op when present. +download-models: + go run ./cmd/modeldl + +# Configure once (submodule init + cmake, which auto-applies the in-tree ggml +# patches), then always `cmake --build` so source changes recompile incrementally +# and relink — a no-op when nothing changed. 
After a submodule bump, delete +# build-release to force a reconfigure (re-applies the patch to the new ggml). +parakeet-lib: + @if [ "$(HOST)" != "darwin/arm64" ]; then exit 0; fi; \ + if [ ! -f $(PARAKEET_DIR)/CMakeLists.txt ]; then \ + echo "==> initializing parakeet.cpp submodule (first checkout)"; \ + git submodule update --init --recursive $(PARAKEET_DIR); \ + fi; \ + if [ ! -d $(PARAKEET_DIR)/build-release ]; then \ + echo "==> configuring parakeet.cpp (one-time)"; \ + cmake -S $(PARAKEET_DIR) -B $(PARAKEET_DIR)/build-release \ + -DBUILD_SHARED_LIBS=OFF -DPARAKEET_SHARED=OFF -DPARAKEET_BUILD_CLI=OFF \ + -DPARAKEET_GGML_METAL=OFF -DGGML_NATIVE=OFF \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=$(MACOS_MIN) \ + -DCMAKE_C_FLAGS="-mcpu=apple-m1" -DCMAKE_CXX_FLAGS="-mcpu=apple-m1"; \ + fi && \ + cmake --build $(PARAKEET_DIR)/build-release -j build-linux-amd64: GOOS=linux GOARCH=amd64 go build -ldflags="-X main.version=$(VERSION) -s -w" -o zee-linux-amd64 @@ -11,24 +48,24 @@ build-linux-amd64: build-linux-arm64: GOOS=linux GOARCH=arm64 go build -ldflags="-X main.version=$(VERSION) -s -w" -o zee-linux-arm64 -test: - go test -race -v ./... +test: parakeet-lib + $(CGO_ENV) go test -race -v ./... -integration-test: +integration-test: parakeet-lib @test -n "$(WAV)" || (echo "Usage: make integration-test WAV=file.wav" && exit 1) @if [ -f .env ]; then export $$(grep -v '^#' .env | xargs); fi; \ test -n "$$GROQ_API_KEY" || (echo "Error: GROQ_API_KEY not set (create .env or export it)" && exit 1); \ - go run test/integration_test.go $(WAV) + $(CGO_ENV) go run test/integration_test.go $(WAV) benchmark: build @test -n "$(WAV)" || (echo "Usage: make benchmark WAV=file.wav [RUNS=5]" && exit 1) @if [ -f .env ]; then export $$(grep -v '^#' .env | xargs); fi; \ ./zee -benchmark $(WAV) -runs $(or $(RUNS),3) -test-integration: +test-integration: parakeet-lib @tmp=$$(mktemp -d) && \ - go build -o "$$tmp/zee-test-bin" . 
&& \ - ZEE_TEST_BIN="$$tmp/zee-test-bin" go test -race -tags integration -v -timeout 120s -count=1 ./test/ ; \ + $(CGO_ENV) go build -o "$$tmp/zee-test-bin" . && \ + ZEE_TEST_BIN="$$tmp/zee-test-bin" $(CGO_ENV) go test -race -tags integration -v -timeout 600s -count=1 ./test/ ; \ status=$$? ; rm -rf "$$tmp" ; exit $$status icns: diff --git a/README.md b/README.md index 18cf162..3345b18 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ ## Highlights +- **Offline, on-device** — on Apple Silicon, transcribes fully locally via Parakeet (parakeet.cpp, CPU) with **no API key and no network**. Cloud providers are optional and switchable from the tray. - **System tray app** — lives in the menu bar. Switch microphones, transcription providers, and languages from the tray menu. Dynamic icons show recording and warning states. - **Two recording modes** — push-to-talk (hold hotkey) or tap-to-toggle (tap to start/stop). - **Real-time streaming** — when a streaming-capable model is selected (e.g. Deepgram Nova-3), words appear as you speak and auto-paste into the focused window incrementally. @@ -25,7 +26,7 @@ - **Multiple providers** — Groq, OpenAI, Mistral, ElevenLabs, and Deepgram, switchable from the tray menu at runtime. - **36 languages** — select transcription language from the tray menu or via `-lang` flag. - **Cross-platform** — minimal dependencies, pure Go where possible. 
- - [x] macOS + - [x] macOS (Apple Silicon) - [ ] Linux - [ ] Windows @@ -50,15 +51,13 @@ Downloads the latest DMG, verifies its SHA256 against `checksums.txt`, copies `Z For terminal usage: ```bash -# Apple Silicon +# Apple Silicon (the only supported target) curl -L https://github.com/sumerc/zee/releases/latest/download/zee_darwin_arm64.tar.gz | tar xz - -# Intel -curl -L https://github.com/sumerc/zee/releases/latest/download/zee_darwin_amd64.tar.gz | tar xz ``` ```bash -GROQ_API_KEY=xxx ./zee # Groq Whisper +./zee # offline, on-device (no key needed) +GROQ_API_KEY=xxx ./zee # Groq Whisper (cloud) DEEPGRAM_API_KEY=xxx ./zee # Deepgram (streaming auto-enabled when a streaming model is selected from the tray) ./zee -debug-transcribe # include transcription text logs ``` @@ -67,15 +66,20 @@ DEEPGRAM_API_KEY=xxx ./zee # Deepgram (streaming auto-enabled when a st ### Build from source +Requires **Apple Silicon**, plus `cmake` and the Xcode Command Line Tools (for the one-time on-device STT engine build). + ```bash git clone https://github.com/sumerc/zee && cd zee -make build # CLI binary +make build # builds the local STT engine (cmake) + CLI binary; + # first run also fetches the default models (~900 MB) into models/parakeet/v1/ make app # macOS DMG ``` +The submodule, static libraries, and models are all set up automatically by `make build` — no manual steps. + ## Usage -Set at least one API key, then run zee: +On Apple Silicon, zee works offline out of the box — no key required. 
To use a cloud provider instead, set its key (pick the provider from the tray), then run zee: ```bash export GROQ_API_KEY=your_key # batch mode (Groq Whisper) diff --git a/THIRD_PARTY_LICENSES b/THIRD_PARTY_LICENSES new file mode 100644 index 0000000..1082bff --- /dev/null +++ b/THIRD_PARTY_LICENSES @@ -0,0 +1,88 @@ +Third-Party Licenses and Attribution +===================================== + +Zee statically links the components below and bundles/downloads the local +speech-to-text models below. Their licenses and the required attributions are +reproduced here. + + +-------------------------------------------------------------------------------- +1. parakeet.cpp — local speech-to-text engine (statically linked) + https://github.com/mudler/parakeet.cpp + License: MIT +-------------------------------------------------------------------------------- + +MIT License + +Copyright (c) 2026 the parakeet.cpp authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ + +-------------------------------------------------------------------------------- +2. ggml — tensor library (statically linked via parakeet.cpp) + https://github.com/ggml-org/ggml + License: MIT +-------------------------------------------------------------------------------- + +MIT License + +Copyright (c) 2023-2026 The ggml authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +-------------------------------------------------------------------------------- +3. 
Parakeet speech-to-text models (GGUF) + License: Creative Commons Attribution 4.0 International (CC-BY-4.0) + https://creativecommons.org/licenses/by/4.0/ +-------------------------------------------------------------------------------- + +The local transcription models that Zee bundles and/or downloads are GGUF +conversions of the NVIDIA NeMo Parakeet checkpoints: + + - nvidia/parakeet-tdt_ctc-110m (110M, English) + - nvidia/parakeet-tdt-0.6b-v2 (0.6B, English) + - nvidia/parakeet-tdt-0.6b-v3 (0.6B, multilingual) + + Source: NVIDIA NeMo — https://github.com/NVIDIA/NeMo + License: CC-BY-4.0 — https://creativecommons.org/licenses/by/4.0/ + +Changes made: the original NeMo checkpoints were converted to the GGUF format +for use with parakeet.cpp; the v3 model is additionally quantized (q4_k). No +other modifications were made to the model weights. + +"Parakeet", "NeMo", and "NVIDIA" are trademarks of NVIDIA Corporation. Zee is +not affiliated with, sponsored by, or endorsed by NVIDIA. diff --git a/audio/audio.go b/audio/audio.go index 02a5017..7b66407 100644 --- a/audio/audio.go +++ b/audio/audio.go @@ -1,9 +1,89 @@ package audio -import "strings" +import ( + "encoding/binary" + "fmt" + "strings" +) const WAVHeaderSize = 44 +// WAVToPCM parses a RIFF/WAVE file and returns the raw PCM data chunk, after +// validating it is 16 kHz mono signed-16-bit little-endian — the format the +// local Parakeet engine expects. It walks the chunk list so padding chunks +// (FLLR, LIST, fact, …) between the header and `data` are handled correctly. 
func WAVToPCM(b []byte) ([]byte, error) {
	// A RIFF/WAVE file opens with "RIFF", a 4-byte length, then "WAVE".
	if len(b) < 12 || string(b[0:4]) != "RIFF" || string(b[8:12]) != "WAVE" {
		return nil, fmt.Errorf("not a RIFF/WAVE file")
	}
	var (
		gotFmt         bool
		channels, bits uint16
		sampleRate     uint32
		data           []byte
	)
	// Walk the chunk list. Each chunk is a 4-byte id, a 4-byte little-endian
	// size, then the body; unknown chunk ids are skipped.
	for off := 12; off+8 <= len(b); {
		id := string(b[off : off+4])
		body := off + 8

		// Clamp the declared size to the bytes actually present, which
		// tolerates a truncated final chunk. The comparison happens in uint64
		// *before* converting to int: on 32-bit platforms a declared size of
		// 2 GiB or more (e.g. a 0xFFFFFFFF placeholder size) would otherwise
		// wrap negative, slip past the bounds check and panic when slicing.
		// len(b)-body is never negative because the loop guarantees
		// off+8 <= len(b).
		size := len(b) - body
		if declared := uint64(binary.LittleEndian.Uint32(b[off+4 : off+8])); declared < uint64(size) {
			size = int(declared)
		}

		switch id {
		case "fmt ":
			if size < 16 {
				return nil, fmt.Errorf("short fmt chunk")
			}
			// fmt layout: +2 channel count, +4 sample rate, +14 bits per sample.
			channels = binary.LittleEndian.Uint16(b[body+2 : body+4])
			sampleRate = binary.LittleEndian.Uint32(b[body+4 : body+8])
			bits = binary.LittleEndian.Uint16(b[body+14 : body+16])
			gotFmt = true
		case "data":
			// Aliases b (no copy); if several data chunks exist the last wins.
			data = b[body : body+size]
		}

		off = body + size
		if size%2 == 1 {
			off++ // chunks are word-aligned
		}
	}
	if !gotFmt {
		return nil, fmt.Errorf("no fmt chunk")
	}
	if data == nil {
		return nil, fmt.Errorf("no data chunk")
	}
	if channels != 1 || sampleRate != 16000 || bits != 16 {
		return nil, fmt.Errorf("unsupported WAV format: %d-bit %d ch %d Hz (need 16-bit mono 16000 Hz)", bits, channels, sampleRate)
	}
	return data, nil
}

// PCMToWAV wraps raw 16 kHz mono signed-16-bit little-endian PCM in a minimal
// 44-byte RIFF/WAVE container — the inverse of WAVToPCM. Local (Parakeet)
// recordings are never encoded, so this is how they're persisted to disk.
+func PCMToWAV(pcm []byte) []byte { + const sampleRate, channels, bits = 16000, 1, 16 + byteRate := sampleRate * channels * bits / 8 + blockAlign := channels * bits / 8 + + buf := make([]byte, WAVHeaderSize+len(pcm)) + copy(buf[0:4], "RIFF") + binary.LittleEndian.PutUint32(buf[4:8], uint32(36+len(pcm))) + copy(buf[8:12], "WAVE") + copy(buf[12:16], "fmt ") + binary.LittleEndian.PutUint32(buf[16:20], 16) // PCM fmt chunk size + binary.LittleEndian.PutUint16(buf[20:22], 1) // format = PCM + binary.LittleEndian.PutUint16(buf[22:24], channels) + binary.LittleEndian.PutUint32(buf[24:28], sampleRate) + binary.LittleEndian.PutUint32(buf[28:32], uint32(byteRate)) + binary.LittleEndian.PutUint16(buf[32:34], uint16(blockAlign)) + binary.LittleEndian.PutUint16(buf[34:36], bits) + copy(buf[36:40], "data") + binary.LittleEndian.PutUint32(buf[40:44], uint32(len(pcm))) + copy(buf[WAVHeaderSize:], pcm) + return buf +} + var btKeywords = []string{ "airpods", "beats", "bose", "wh-1000", "wf-1000", "sony wh-", "sony wf-", diff --git a/audio/audio_other.go b/audio/audio_other.go index 601c9fb..d117f73 100644 --- a/audio/audio_other.go +++ b/audio/audio_other.go @@ -8,6 +8,8 @@ import ( "sync/atomic" "github.com/gen2brain/malgo" + + "zee/internal/malgolock" ) type malgoContext struct { @@ -15,7 +17,9 @@ type malgoContext struct { } func NewContext() (Context, error) { + malgolock.Lock() ctx, err := malgo.InitContext(nil, malgo.ContextConfig{}, nil) + malgolock.Unlock() if err != nil { return nil, err } @@ -23,7 +27,9 @@ func NewContext() (Context, error) { } func (m *malgoContext) Devices() ([]DeviceInfo, error) { + malgolock.Lock() devices, err := m.ctx.Devices(malgo.Capture) + malgolock.Unlock() if err != nil { return nil, fmt.Errorf("malgo devices: %w", err) } @@ -44,7 +50,10 @@ func (m *malgoContext) NewCapture(device *DeviceInfo, config CaptureConfig) (Cap config: config, } - if err := c.initDevice(); err != nil { + malgolock.Lock() + err := c.initDevice() + malgolock.Unlock() + if 
err != nil { return nil, err } @@ -52,8 +61,10 @@ func (m *malgoContext) NewCapture(device *DeviceInfo, config CaptureConfig) (Cap } func (m *malgoContext) Close() { + malgolock.Lock() m.ctx.Uninit() m.ctx.Free() + malgolock.Unlock() } type malgoCapture struct { @@ -64,6 +75,7 @@ type malgoCapture struct { callback atomic.Pointer[DataCallback] } +// initDevice is lock-free; callers must hold malgolock around it. func (c *malgoCapture) initDevice() error { deviceConfig := malgo.DefaultDeviceConfig(malgo.Capture) deviceConfig.Capture.Format = malgo.FormatS16 @@ -98,9 +110,17 @@ func (c *malgoCapture) initDevice() error { } func (c *malgoCapture) Start() error { - // Always reinitialize before starting — handles macOS sleep/wake - // where the device handle goes stale without returning errors - c.device.Uninit() + malgolock.Lock() + defer malgolock.Unlock() + // Always reinitialize before starting — handles macOS sleep/wake where the + // device handle goes stale without returning errors. Null the pointer after + // Uninit: if the reinit below fails (transient CoreAudio error during a + // route/sleep-wake change), c.device must not be left pointing at the freed + // device, or the next Start uninits it again and double-frees. 
+ if c.device != nil { + c.device.Uninit() + c.device = nil + } if err := c.initDevice(); err != nil { return fmt.Errorf("device reinit failed: %w", err) } @@ -108,11 +128,20 @@ func (c *malgoCapture) Start() error { } func (c *malgoCapture) Stop() { - c.device.Stop() + malgolock.Lock() + if c.device != nil { + c.device.Stop() + } + malgolock.Unlock() } func (c *malgoCapture) Close() { - c.device.Uninit() + malgolock.Lock() + if c.device != nil { + c.device.Uninit() + c.device = nil + } + malgolock.Unlock() } func (c *malgoCapture) SetCallback(cb DataCallback) { diff --git a/beep/beep.go b/beep/beep.go index 5099361..4749a8c 100644 --- a/beep/beep.go +++ b/beep/beep.go @@ -21,6 +21,12 @@ const ( errorFreq = 350 errorVolume = 0.6 errorDecay = 30 + + // Denied beep: low, short single tick — a press was ignored (e.g. while a + // transcription is still in progress). + deniedFreq = 240 + deniedVolume = 0.45 + deniedDecay = 40 ) // Platform-specific durations (darwin uses shorter durations) diff --git a/beep/beep_darwin.go b/beep/beep_darwin.go index 9783497..70a700c 100644 --- a/beep/beep_darwin.go +++ b/beep/beep_darwin.go @@ -8,15 +8,18 @@ import ( "sync/atomic" "github.com/gen2brain/malgo" + + "zee/internal/malgolock" ) var ( malgoCtx *malgo.AllocatedContext device *malgo.Device - startSamples []byte - endSamples []byte - errorSamples []byte - soundOnce sync.Once + startSamples []byte + endSamples []byte + errorSamples []byte + deniedSamples []byte + soundOnce sync.Once // Playback state - accessed atomically from callback playSamples atomic.Pointer[[]byte] @@ -24,6 +27,7 @@ var ( playMu sync.Mutex ) +// initDevice is lock-free; callers must hold malgolock around it. 
func initDevice() error { config := malgo.DefaultDeviceConfig(malgo.Playback) config.Playback.Format = malgo.FormatS16 @@ -40,6 +44,9 @@ func initDevice() error { } func initSound() { + malgolock.Lock() + defer malgolock.Unlock() + var err error malgoCtx, err = malgo.InitContext(nil, malgo.ContextConfig{}, nil) if err != nil { @@ -49,6 +56,7 @@ func initSound() { startSamples = generateTickBytes(sampleRate, startFreq, 0.03, startVolume, startDecay) endSamples = generateTickBytes(sampleRate, endFreq, 0.05, endVolume, endDecay) errorSamples = generateDoubleBeepBytes(sampleRate, errorFreq, 0.08, 0.05, errorVolume, errorDecay) + deniedSamples = generateTickBytes(sampleRate, deniedFreq, 0.06, deniedVolume, deniedDecay) if err := initDevice(); err != nil { malgoCtx.Uninit() @@ -124,13 +132,20 @@ func playBytes(samples []byte) { playMu.Lock() defer playMu.Unlock() + malgolock.Lock() + defer malgolock.Unlock() + // Always reinitialize to pick up current default output device - // (handles BT connect/disconnect, sleep/wake) + // (handles BT connect/disconnect, sleep/wake). Null device after Uninit + // so a failed reinit can't leave it pointing at the freed device — the + // next call would otherwise Uninit it again and double-free. 
if device != nil { device.Stop() device.Uninit() + device = nil } if err := initDevice(); err != nil { + device = nil return } @@ -169,3 +184,11 @@ func PlayError() { soundOnce.Do(initSound) playBytes(errorSamples) } + +func PlayDenied() { + if disabled { + return + } + soundOnce.Do(initSound) + playBytes(deniedSamples) +} diff --git a/beep/beep_linux.go b/beep/beep_linux.go index 18b47dc..e5a3daf 100644 --- a/beep/beep_linux.go +++ b/beep/beep_linux.go @@ -11,16 +11,18 @@ import ( ) var ( - startSamples []int16 - endSamples []int16 - errorSamples []int16 - soundOnce sync.Once + startSamples []int16 + endSamples []int16 + errorSamples []int16 + deniedSamples []int16 + soundOnce sync.Once ) func initSound() { startSamples = generateTick(sampleRate, startFreq, 0.2, startVolume, startDecay) endSamples = generateTick(sampleRate, endFreq, 0.2, endVolume, endDecay) errorSamples = generateDoubleBeep(sampleRate, errorFreq, 0.08, 0.05, errorVolume, errorDecay) + deniedSamples = generateTick(sampleRate, deniedFreq, 0.12, deniedVolume, deniedDecay) } func generateTick(sampleRate int, freq float64, duration float64, volume float64, decay float64) []int16 { @@ -109,3 +111,11 @@ func PlayError() { soundOnce.Do(initSound) go playSamples(errorSamples) } + +func PlayDenied() { + if disabled { + return + } + soundOnce.Do(initSound) + go playSamples(deniedSamples) +} diff --git a/beep/beep_windows.go b/beep/beep_windows.go index 630d83d..b59b241 100644 --- a/beep/beep_windows.go +++ b/beep/beep_windows.go @@ -4,7 +4,8 @@ package beep // No audio playback on Windows - beeps disabled. 
-func Init() {} -func PlayStart() {} -func PlayEnd() {} -func PlayError() {} +func Init() {} +func PlayStart() {} +func PlayEnd() {} +func PlayError() {} +func PlayDenied() {} diff --git a/cmd/modeldl/main.go b/cmd/modeldl/main.go new file mode 100644 index 0000000..6d5901e --- /dev/null +++ b/cmd/modeldl/main.go @@ -0,0 +1,35 @@ +// Command modeldl downloads the mandatory (PreFetch) local models into the +// versioned dev folder (models/parakeet/), used by `make +// download-models` and the build step. Opt-in models are skipped — fetch those +// from the tray at runtime. Best-effort: a failure warns but never fails the +// build (a dev may symlink/copy the ggufs instead, or the release isn't up yet). +package main + +import ( + "fmt" + "os" + "path/filepath" + + "zee/localmodel" +) + +func main() { + dir := filepath.Join("models", "parakeet", localmodel.Version) + os.Setenv("ZEE_MODELS_DIR", dir) // force the dev folder regardless of cwd state + + for _, m := range localmodel.All() { + if !m.PreFetch { + continue // opt-in — not auto-downloaded + } + if localmodel.Present(m) { + fmt.Printf("✓ %s present\n", m.Filename) + continue + } + fmt.Printf("↓ %s (%d MB)...\n", m.Filename, m.SizeBytes>>20) + if err := localmodel.Download(m, nil); err != nil { + fmt.Fprintf(os.Stderr, " warning: %v\n", err) + fmt.Fprintf(os.Stderr, " place %s in %s/ manually, or publish the models-%s release\n", + m.Filename, dir, localmodel.Version) + } + } +} diff --git a/config/config.go b/config/config.go index c3dd740..2d52957 100644 --- a/config/config.go +++ b/config/config.go @@ -5,6 +5,7 @@ import ( "os" "path/filepath" "runtime" + "strings" "sync" "zee/log" @@ -62,6 +63,18 @@ func settingsPath() string { return filepath.Join(Dir(), settingsFile) } +// IsAppBundle reports whether this binary is the installed Zee.app rather than a +// local dev build, keyed off the executable path (not cwd). It's the single +// "am I the installed app?" 
signal — used to pick app-vs-dev locations +// consistently (login-item plist, local models dir). +func IsAppBundle() bool { + exe, err := os.Executable() + if err != nil { + return false + } + return strings.Contains(exe, ".app/Contents/MacOS/") +} + func Load() error { dir = Dir() current = defaults diff --git a/docs/design-notes.md b/docs/design-notes.md new file mode 100644 index 0000000..173065f --- /dev/null +++ b/docs/design-notes.md @@ -0,0 +1,37 @@ +# Design Notes + +Ideas and tradeoffs decided while building zee. Each entry captures *why* a +choice was made, so we don't relitigate it (or silently regress it) later. + +## Why the audio device is re-init'd on every recording + +`audio.Start()` tears down the capture device (`Uninit`) and rebuilds it +(`InitDevice`) on **every** recording, instead of initializing it once at +startup and reusing it. + +**Reason:** after macOS sleep/wake (and some device/route changes) the malgo +(miniaudio/CoreAudio) device handle goes **stale silently** — `device.Start()` +returns no error, but the mic produces only silence. There's no reliable signal +to detect this, so the blunt-but-safe fix is to rebuild the device every time. +This replaced an earlier "start, and only recreate on error" approach that +didn't work because the stale handle never reported an error. + +**Tradeoff:** constant `Uninit`/`InitDevice` churn enlarges the surface for +miniaudio lifecycle bugs. It directly caused a double-free crash: if the rebuild +failed transiently, the old (already-freed) device pointer was retained and the +next `Start` uninited it a second time (SIGABRT/SIGSEGV in `ma_device_uninit`). +Fixed by never keeping a freed pointer — store `nil` if the rebuild fails (see +`audio/reinit.go`). The same teardown-on-use pattern (and the same fix) applies +to `beep` playback. 
+ +**Better long-term options (not yet done):** +- Init once + reinit only on a real signal — macOS `NSWorkspace` sleep/wake + notification, or miniaudio's reroute/stop notification callback. Most correct, + but only manually verifiable (`pmset sleepnow`, wake, record). +- Init once + stale-by-behavior detection — keep the device, and after `Start` + watch the data callback for actual frames; reinit only if none arrive. Catches + every stale cause, and is unit-testable with a fake device, but it's heuristic. + +All miniaudio device lifecycle calls (capture + playback) are also serialized +behind a process-wide lock (`internal/malgolock`) as defense against concurrent +init/uninit across the two malgo contexts. diff --git a/doctor/doctor.go b/doctor/doctor.go index 36dacf9..cc738ea 100644 --- a/doctor/doctor.go +++ b/doctor/doctor.go @@ -13,6 +13,8 @@ import ( "zee/clipboard" "zee/encoder" "zee/hotkey" + "zee/internal/parakeet" + "zee/localmodel" "zee/transcriber" ) @@ -24,6 +26,8 @@ func Run(_ string) int { fmt.Println("zee doctor - interactive system diagnostics") fmt.Println("============================================") + printLocalModels() + allPass := true if !checkHotkey() { @@ -49,6 +53,25 @@ func Run(_ string) int { return 1 } +// printLocalModels reports the offline engine + which models are on disk. +// Informational only — it never fails the run (cloud-only users have none). 
+func printLocalModels() { + fmt.Println() + fmt.Println("Local models (Parakeet)") + if !parakeet.Available() { + fmt.Println(" engine: not available on this platform (Apple Silicon only)") + return + } + fmt.Printf(" engine available — models dir: %s\n", localmodel.Dir()) + for _, m := range localmodel.All() { + state := "missing" + if localmodel.Present(m) { + state = "present" + } + fmt.Printf(" %-34s %5d MB %s\n", m.Label, m.SizeBytes>>20, state) + } +} + func checkHotkey() bool { fmt.Println() fmt.Println("[1/3] Hotkey detection") @@ -132,49 +155,18 @@ func checkMicAndTranscription() bool { fmt.Printf("Selected: %s\n", device.Name) } - // Select provider - fmt.Println() - fmt.Println("Select transcription provider:") - fmt.Println(" 1. Groq") - fmt.Println(" 2. DeepGram") - fmt.Println(" 3. OpenAI") - fmt.Print("Choice [1/2/3]: ") - - choice, _ := reader.ReadString('\n') - choice = strings.TrimSpace(choice) - - var provider string - switch choice { - case "1", "": - provider = "groq" - case "2": - provider = "deepgram" - case "3": - provider = "openai" - default: - fmt.Printf(" FAIL: invalid choice %q\n", choice) - return false - } - - // Get API key - fmt.Printf("Enter %s API key: ", provider) - apiKey, _ := reader.ReadString('\n') - apiKey = strings.TrimSpace(apiKey) - if apiKey == "" { - fmt.Println(" FAIL: API key required") + // Use the same engine the app would: local Parakeet when a model is present, + // else the first cloud provider whose API key is set. No interactive prompt. 
+ trans, err := transcriber.New() + if err != nil { + fmt.Printf(" FAIL: no transcription engine available: %v\n", err) return false } - - // Create transcriber - var trans transcriber.Transcriber - switch provider { - case "groq": - trans = transcriber.NewGroq(apiKey) - case "deepgram": - trans = transcriber.NewDeepgram(apiKey) - case "openai": - trans = transcriber.NewOpenAI(apiKey) + engine := "cloud" + if transcriber.IsLocal(trans) { + engine = "Local (Parakeet)" } + fmt.Printf("Using engine: %s\n", engine) fmt.Println() fmt.Print("Press Enter and speak for 3 seconds...") diff --git a/go.mod b/go.mod index 3f92e0f..d1efb3b 100644 --- a/go.mod +++ b/go.mod @@ -19,12 +19,20 @@ require ( ) require ( + github.com/ebitengine/purego v0.10.0 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect github.com/godbus/dbus/v5 v5.1.0 // indirect github.com/icza/bitio v1.1.0 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/mewkiz/pkg v0.0.0-20250417130911-3f050ff8c56d // indirect github.com/mewpkg/term v0.0.0-20241026122259-37a80af23985 // indirect + github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect + github.com/shirou/gopsutil/v4 v4.26.5 // indirect + github.com/tklauser/go-sysconf v0.3.16 // indirect + github.com/tklauser/numcpus v0.11.0 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect golang.design/x/mainthread v0.3.0 // indirect - golang.org/x/sys v0.40.0 // indirect + golang.org/x/sys v0.41.0 // indirect ) diff --git a/go.sum b/go.sum index 57c34d2..f197db4 100644 --- a/go.sum +++ b/go.sum @@ -1,19 +1,26 @@ github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4= github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= 
+github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= +github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/energye/systray v1.0.3 h1:XnyjJCeRU5z00bpNOic2fGTKz/7yHZMZjWiGIVXDS+4= github.com/energye/systray v1.0.3/go.mod h1:HelKhC3PXwv3ryDxbuQqV+7kAxAYNzE5cfdrerGOZTc= github.com/gen2brain/malgo v0.11.25-0.20251120102819-856f60956a65 h1:fcKNzdcFB4fVELL3TludxzRVGWEPSQ/ICyDTbJ9LK5Y= github.com/gen2brain/malgo v0.11.25-0.20251120102819-856f60956a65/go.mod h1:f9TtuN7DVrXMiV/yIceMeWpvanyVzJQMlBecJFVMxww= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/icza/bitio v1.1.0 h1:ysX4vtldjdi3Ygai5m1cWy4oLkhWTAi+SyO6HC8L9T0= github.com/icza/bitio v1.1.0/go.mod h1:0jGnlLAx8MKMr9VGnn/4YrvZiprkvBelsVIbA9Jjr9A= github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6 h1:8UsGZ2rr2ksmEru6lToqnXgA8Mz1DP11X4zSJ159C3k= github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6/go.mod h1:xQig96I1VNBDIWGCdTt54nHt6EeI639SmHycLYL7FkA= github.com/jfreymuth/pulse v0.1.1 h1:9WLNBNCijmtZ14ZJpatgJPu/NjwAl3TIKItSFnTh+9A= github.com/jfreymuth/pulse v0.1.1/go.mod h1:cpYspI6YljhkUf1WLXLLDmeaaPFc3CnGLjDZf9dZ4no= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable 
v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= @@ -31,20 +38,35 @@ github.com/mewpkg/term v0.0.0-20241026122259-37a80af23985/go.mod h1:uiPmbdUbdt1N github.com/micmonay/keybd_event v1.1.2 h1:RpgvPJKOh4Jc+ZYe0OrVzGd2eNMCfuVg3dFTCsuSah4= github.com/micmonay/keybd_event v1.1.2/go.mod h1:CGMWMDNgsfPljzrAWoybUOSKafQPZpv+rLigt2LzNGI= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= github.com/rs/zerolog v1.34.0 h1:k43nTLIwcTVQAncfCw4KZ2VY6ukYoZaBPNOE8txlOeY= github.com/rs/zerolog v1.34.0/go.mod h1:bJsvje4Z08ROH4Nhs5iH600c3IkWhwp44iRc54W6wYQ= +github.com/shirou/gopsutil/v4 v4.26.5 h1:RPcBXkpz7kOj9PqGFQOlBPZHsyaPvPVQc098y9RmCNM= +github.com/shirou/gopsutil/v4 v4.26.5/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= golang.design/x/hotkey v0.4.1 h1:zLP/2Pztl4WjyxURdW84GoZ5LUrr6hr69CzJFJ5U1go= golang.design/x/hotkey v0.4.1/go.mod h1:M8SGcwFYHnKRa83FpTFQoZvPO5vVT+kWPztFqTQKmXA= golang.design/x/mainthread v0.3.0 h1:UwFus0lcPodNpMOGoQMe87jSFwbSsEY//CA7yVmu4j8= golang.design/x/mainthread 
v0.3.0/go.mod h1:vYX7cF2b3pTJMGM/hc13NmN6kblKnf4/IyvHeu259L0= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201022201747-fb209a7c41cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= +golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= nhooyr.io/websocket v1.8.17 h1:KEVeLJkUywCKVsnLIDlD/5gtayKp8VoCkksHCGGfT9Y= nhooyr.io/websocket v1.8.17/go.mod h1:rN9OFWIUwuxg4fR5tELlYC04bXYowCP9GX47ivo2l+c= diff --git a/install.sh b/install.sh index 21d0fe9..3894a46 100755 --- a/install.sh +++ b/install.sh @@ -34,6 +34,40 @@ log "Installing Zee ${VERSION}" DMG="Zee-${VERSION}.dmg" BASE="https://github.com/${REPO}/releases/download/${VERSION}" +# Offline models live under an immutable, app-version-independent tag. +MODELS_TAG="models-v1" +MODELS_BASE="https://github.com/${REPO}/releases/download/${MODELS_TAG}" +MODELS_DIR="${HOME}/Library/Application Support/zee/models" +# filenamesha256 — the default 110M + the multilingual v3 (v2 is opt-in). 
+PREFETCH_MODELS=( + "tdt_ctc-110m-f16.gguf 7f9a6376edde6a74592ace48b2ebdc27a1ac972d0be9dfcc29e668d99381faf1" + "tdt-0.6b-v3-q4_k.gguf 993d73feb4206dadda865ab25bd64b50c48dc4d013c3bf6126a721f28b1d5ee8" +) + +# Best-effort: pre-download the offline models so Apple Silicon works with no +# API key on first launch. Never fails the install — the in-app downloader +# recovers anything missing. +prefetch_models() { + [[ "$(uname -m)" == "arm64" ]] || return 0 + mkdir -p "$MODELS_DIR" 2>/dev/null || return 0 + local entry f sum dest + for entry in "${PREFETCH_MODELS[@]}"; do + f="${entry%% *}"; sum="${entry##* }"; dest="${MODELS_DIR}/${f}" + if [[ -f "$dest" ]] && shasum -a 256 "$dest" | grep -q "$sum"; then + log "Model ${f} already present"; continue + fi + log "Downloading model ${f} (best-effort)..." + if curl -fL --progress-bar "${MODELS_BASE}/${f}" -o "${dest}.part" \ + && shasum -a 256 "${dest}.part" | grep -q "$sum"; then + mv -f "${dest}.part" "$dest" + log "Model ${f} OK" + else + log "Model ${f} unavailable — the app will fetch it on first launch" + rm -f "${dest}.part" + fi + done +} + log "Downloading ${DMG}..." curl -fL --progress-bar "${BASE}/${DMG}" -o "${TMP}/${DMG}" \ || err "download failed: ${BASE}/${DMG}" @@ -64,21 +98,26 @@ run_or_sudo cp -R "$MOUNT/Zee.app" "${APP_DIR}/" log "Clearing quarantine attribute..." run_or_sudo xattr -cr "${APP_DIR}/Zee.app" +log "Fetching offline models (best-effort)..." +prefetch_models || true + cat < +#include "parakeet_capi.h" +*/ +import "C" + +import ( + "fmt" + "sync" + "unsafe" +) + +// Decoder selects the model head passed to the C-API. +const ( + DecoderDefault = 0 // by arch: transducer for tdt/rnnt, CTC for ctc + DecoderCTC = 1 // force the CTC head + DecoderTDT = 2 // force the transducer (TDT/RNN-T) head +) + +// Available reports whether local Parakeet transcription is compiled in. +func Available() bool { return true } + +// Ctx wraps one loaded GGUF model. 
Transcribe is serialised by an internal +// mutex (push-to-talk is serial; the C ctx is not concurrency-safe), and Close +// waits for any in-flight Transcribe before freeing the model. +type Ctx struct { + mu sync.Mutex + ptr *C.parakeet_ctx +} + +// New loads a GGUF model from path. The returned Ctx must be Closed. +func New(path string) (*Ctx, error) { + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + + ptr := C.parakeet_capi_load(cPath) + if ptr == nil { + // load failures have no ctx, so pass NULL to last_error. + return nil, fmt.Errorf("parakeet: load %q: %s", path, C.GoString(C.parakeet_capi_last_error(nil))) + } + return &Ctx{ptr: ptr}, nil +} + +// Transcribe runs the model over mono 16 kHz float32 PCM and returns the +// transcript. decoder is one of the Decoder* constants. +func (c *Ctx) Transcribe(pcm []float32, decoder int) (string, error) { + if len(pcm) == 0 { + return "", nil + } + c.mu.Lock() + defer c.mu.Unlock() + if c.ptr == nil { + return "", fmt.Errorf("parakeet: transcribe on closed model") + } + + out := C.parakeet_capi_transcribe_pcm(c.ptr, + (*C.float)(unsafe.Pointer(&pcm[0])), C.int(len(pcm)), 16000, C.int(decoder)) + if out == nil { + return "", fmt.Errorf("parakeet: transcribe: %s", C.GoString(C.parakeet_capi_last_error(c.ptr))) + } + defer C.parakeet_capi_free_string(out) + return C.GoString(out), nil +} + +// Close frees the model. Safe to call more than once. +func (c *Ctx) Close() { + c.mu.Lock() + defer c.mu.Unlock() + if c.ptr != nil { + C.parakeet_capi_free(c.ptr) + c.ptr = nil + } +} diff --git a/internal/parakeet/parakeet_stub.go b/internal/parakeet/parakeet_stub.go new file mode 100644 index 0000000..5c25bd0 --- /dev/null +++ b/internal/parakeet/parakeet_stub.go @@ -0,0 +1,28 @@ +//go:build !darwin || !arm64 + +// Stub for platforms without the parakeet.cpp static libs (everything except +// darwin/arm64). 
Available() is false; nothing links any C dependency, so the +// universal-binary release pipeline and Linux/Intel builds are untouched. +package parakeet + +import "errors" + +const ( + DecoderDefault = 0 + DecoderCTC = 1 + DecoderTDT = 2 +) + +// Available reports whether local Parakeet transcription is compiled in. +func Available() bool { return false } + +// Ctx is an empty placeholder on unsupported platforms. +type Ctx struct{} + +var errUnavailable = errors.New("parakeet: local transcription is only available on Apple Silicon") + +func New(string) (*Ctx, error) { return nil, errUnavailable } + +func (c *Ctx) Transcribe([]float32, int) (string, error) { return "", errUnavailable } + +func (c *Ctx) Close() {} diff --git a/localmodel/diskspace_other.go b/localmodel/diskspace_other.go new file mode 100644 index 0000000..55c9bac --- /dev/null +++ b/localmodel/diskspace_other.go @@ -0,0 +1,7 @@ +//go:build !darwin && !linux + +package localmodel + +// checkDiskSpace is a no-op where we don't have statfs. Local models are an +// Apple Silicon feature; other platforms never reach the downloader in practice. +func checkDiskSpace(string, int64) error { return nil } diff --git a/localmodel/diskspace_unix.go b/localmodel/diskspace_unix.go new file mode 100644 index 0000000..83e6d22 --- /dev/null +++ b/localmodel/diskspace_unix.go @@ -0,0 +1,22 @@ +//go:build darwin || linux + +package localmodel + +import ( + "fmt" + "syscall" +) + +// checkDiskSpace fails if `dir`'s filesystem has less than need bytes free +// (plus a small margin), so a download aborts before filling the disk. 
+func checkDiskSpace(dir string, need int64) error { + var st syscall.Statfs_t + if err := syscall.Statfs(dir, &st); err != nil { + return nil // can't tell — don't block the download + } + free := int64(st.Bavail) * int64(st.Bsize) + if free < need+(64<<20) { // 64 MB margin + return fmt.Errorf("not enough disk space: need %d MB, %d MB free", need>>20, free>>20) + } + return nil +} diff --git a/localmodel/localmodel.go b/localmodel/localmodel.go new file mode 100644 index 0000000..c0a2a19 --- /dev/null +++ b/localmodel/localmodel.go @@ -0,0 +1,205 @@ +// Package localmodel is the single source of truth for the offline Parakeet +// GGUF models: their filenames, download URLs, checksums, sizes, and decoder +// head. It resolves where models live on disk and downloads missing ones +// atomically (tmp → verify sha256 → rename). +// +// Decoder values match internal/parakeet.Decoder* (0=default, 1=ctc, 2=tdt). +// Keeping them here as plain ints keeps this package free of the cgo engine so +// it stays cross-platform (the tray and installer reference it everywhere). +package localmodel + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "io" + "net/http" + "os" + "path/filepath" + "time" + + "zee/config" +) + +// Version is the pinned model-set version. It drives BOTH the download tag and +// the dev folder, so they never drift (decision #5: models are pinned to the +// binary, never "latest"). Bump it when the parakeet.cpp commit changes. +const Version = "v1" + +// baseURL hosts the immutable models- release assets. +const baseURL = "https://github.com/sumerc/zee/releases/download/models-" + Version + "/" + +// Model IDs (stable; persisted in config.json and shown in the tray). 
+const ( + ID110mEN = "parakeet-110m-en" // default, English-only, loaded at startup + IDV3Multi = "parakeet-v3-multi" // multilingual (25 lang), pre-fetched + IDV2Large = "parakeet-v2-en-large" // English long-form, opt-in download +) + +// Model is one downloadable GGUF plus everything zee needs to load, route to, +// and verify it. +type Model struct { + ID string + Label string + Filename string + SHA256 string + SizeBytes int64 + Decoder int // parakeet head: 0=default, 1=ctc, 2=tdt + Multilingual bool // true => non-English supported (v3); false => English-only + PreFetch bool // install.sh pre-fetches it (110m + v3); v2 never +} + +// URL is where the gguf is hosted under the pinned models tag. +func (m Model) URL() string { return baseURL + m.Filename } + +// models is ordered: default first, then the pre-fetched multilingual option, +// then the opt-in large English model. +var models = []Model{ + { + ID: ID110mEN, + Label: "Parakeet 110M (English)", + Filename: "tdt_ctc-110m-f16.gguf", + SHA256: "7f9a6376edde6a74592ace48b2ebdc27a1ac972d0be9dfcc29e668d99381faf1", + SizeBytes: 267452544, + Decoder: 2, // TDT head + PreFetch: true, + }, + { + ID: IDV3Multi, + Label: "Parakeet 0.6B v3 (multilingual)", + Filename: "tdt-0.6b-v3-q4_k.gguf", + SHA256: "993d73feb4206dadda865ab25bd64b50c48dc4d013c3bf6126a721f28b1d5ee8", + SizeBytes: 675200864, + Decoder: 0, // default head + Multilingual: true, + PreFetch: true, + }, + { + ID: IDV2Large, + Label: "Parakeet 0.6B v2 (English, large)", + Filename: "tdt-0.6b-v2-f16.gguf", + SHA256: "f8df7f5dc7b9ceb5cd0637a81194aab5d93022ace555ce81c8969c7a694b8f3d", + SizeBytes: 1404218656, + Decoder: 0, // default head + PreFetch: false, + }, +} + +// All returns the registry in display order. +func All() []Model { return models } + +// ByID looks up a model by its stable ID. 
+func ByID(id string) (Model, bool) { + for _, m := range models { + if m.ID == id { + return m, true + } + } + return Model{}, false +} + +// Default returns the model loaded at startup (110m English). +func Default() Model { m, _ := ByID(ID110mEN); return m } + +// Dir is where ggufs live: $ZEE_MODELS_DIR override, else /models +// for the installed app (the install/download location), else the versioned dev +// folder ./models/parakeet/. The app-vs-dev split uses the same +// executable-path signal as the login item (config.IsAppBundle), so prod is +// detected reliably regardless of working directory. +func Dir() string { + if d := os.Getenv("ZEE_MODELS_DIR"); d != "" { + return d + } + if config.IsAppBundle() { + return filepath.Join(config.Dir(), "models") + } + return filepath.Join("models", "parakeet", Version) +} + +// Path is the on-disk location of a model's gguf (whether or not it exists). +func Path(m Model) string { return filepath.Join(Dir(), m.Filename) } + +// Present reports whether the model's gguf exists on disk at the right size. +// (A size check is cheap and catches truncated/aborted downloads; the full +// sha256 is verified at download time, not on every startup.) +func Present(m Model) bool { + fi, err := os.Stat(Path(m)) + return err == nil && fi.Size() == m.SizeBytes +} + +// Download fetches a model to Dir() atomically: stream to a temp file, verify +// the sha256, then rename into place. progress (may be nil) is called with the +// fraction downloaded in [0,1]. A no-op if the model is already present. 
+func Download(m Model, progress func(fraction float64)) error { + if Present(m) { + return nil + } + dir := Dir() + if err := os.MkdirAll(dir, 0o755); err != nil { + return fmt.Errorf("create models dir: %w", err) + } + if err := checkDiskSpace(dir, m.SizeBytes); err != nil { + return err + } + + resp, err := http.Get(m.URL()) + if err != nil { + return fmt.Errorf("download %s: %w", m.Filename, err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("download %s: HTTP %d", m.Filename, resp.StatusCode) + } + + tmp, err := os.CreateTemp(dir, "."+m.Filename+".*.part") + if err != nil { + return fmt.Errorf("create temp: %w", err) + } + tmpPath := tmp.Name() + defer os.Remove(tmpPath) // no-op after a successful rename + + h := sha256.New() + pr := &progressReader{r: resp.Body, total: m.SizeBytes, cb: progress} + if _, err := io.Copy(io.MultiWriter(tmp, h), pr); err != nil { + tmp.Close() + return fmt.Errorf("download %s: %w", m.Filename, err) + } + if err := tmp.Close(); err != nil { + return fmt.Errorf("close temp: %w", err) + } + + if got := hex.EncodeToString(h.Sum(nil)); got != m.SHA256 { + return fmt.Errorf("checksum mismatch for %s (got %s, want %s)", m.Filename, got, m.SHA256) + } + + if err := os.Rename(tmpPath, Path(m)); err != nil { + return fmt.Errorf("install %s: %w", m.Filename, err) + } + return nil +} + +// progressReader reports download progress at most ~10x/sec via cb. 
+type progressReader struct { + r io.Reader + total int64 + read int64 + cb func(float64) + lastAt time.Time +} + +func (p *progressReader) Read(b []byte) (int, error) { + n, err := p.r.Read(b) + p.read += int64(n) + if p.cb != nil && p.total > 0 { + now := time.Now() + if err != nil || now.Sub(p.lastAt) > 100*time.Millisecond { + p.lastAt = now + frac := float64(p.read) / float64(p.total) + if frac > 1 { + frac = 1 + } + p.cb(frac) + } + } + return n, err +} diff --git a/log/log.go b/log/log.go index b6d165c..76b7737 100644 --- a/log/log.go +++ b/log/log.go @@ -12,14 +12,14 @@ import ( ) var ( - diagLog zerolog.Logger - diagFile *os.File - transcribeFile *os.File - logMu sync.Mutex - logReady atomic.Bool - transcribeOn bool - pid int - dir string + diagLog zerolog.Logger + diagFile *os.File + transcribeFile *os.File + logMu sync.Mutex + logReady atomic.Bool + transcribeOn bool + pid int + dir string ) type Metrics struct { @@ -32,8 +32,7 @@ type Metrics struct { TLSTimeMs float64 TTFBMs float64 TotalTimeMs float64 - MemoryAllocMB float64 - MemoryPeakMB float64 + ProcessRSSMB float64 InferenceMs float64 } @@ -197,8 +196,7 @@ func TranscriptionMetrics(m Metrics, mode, format, provider string, connReused b Float64("tls_ms", m.TLSTimeMs). Float64("ttfb_ms", m.TTFBMs). Float64("total_ms", m.TotalTimeMs). - Float64("mem_mb", m.MemoryAllocMB). - Float64("peak_mb", m.MemoryPeakMB) + Float64("rss_mb", m.ProcessRSSMB) if m.InferenceMs > 0 { ev = ev.Float64("inference_ms", m.InferenceMs) } @@ -235,6 +233,7 @@ type StreamMetricsData struct { RecvMessages int RecvFinal int CommitEvents int + ProcessRSSMB float64 } func StreamMetrics(m StreamMetricsData) { @@ -252,6 +251,7 @@ func StreamMetrics(m StreamMetricsData) { Int("recv_messages", m.RecvMessages). Int("recv_final", m.RecvFinal). Int("commit_events", m.CommitEvents). + Float64("rss_mb", m.ProcessRSSMB). 
Msg("stream_transcription") } diff --git a/login/login_darwin.go b/login/login_darwin.go index 10af335..bfdf651 100644 --- a/login/login_darwin.go +++ b/login/login_darwin.go @@ -9,12 +9,13 @@ import ( "os/exec" "path/filepath" "strings" + + "zee/config" ) const ( plistNameApp = "com.zee.app.plist" // installed /Applications/Zee.app plistNameDev = "com.zee.app.dev.plist" // local dev build - bundleSig = ".app/Contents/MacOS/" ) func xmlEscape(s string) string { @@ -23,20 +24,11 @@ func xmlEscape(s string) string { return b.String() } -// isRunningFromApp reports whether this binary is the installed Zee.app bundle -// rather than a local dev build. The login item (plist filename, launchd Label, -// and target binary) is keyed off this so a dev build never clobbers — or gets -// clobbered by — the installed app's entry. -func isRunningFromApp() bool { - exe, err := os.Executable() - if err != nil { - return false - } - return strings.Contains(exe, bundleSig) -} - +// plistName keys the login item (plist filename, launchd Label, target binary) +// off config.IsAppBundle so a dev build never clobbers — or gets clobbered by — +// the installed app's entry. 
func plistName() string { - if isRunningFromApp() { + if config.IsAppBundle() { return plistNameApp } return plistNameDev diff --git a/main.go b/main.go index a68f7b4..915c49e 100644 --- a/main.go +++ b/main.go @@ -18,9 +18,9 @@ import ( "zee/alert" "zee/audio" - "zee/config" "zee/beep" "zee/clipboard" + "zee/config" "zee/doctor" "zee/encoder" "zee/hotkey" @@ -62,6 +62,17 @@ var ( lastRec *savedRecording ) +func trayModelState(s transcriber.ModelStatus) tray.ModelState { + switch { + case s.Ready: + return tray.ModelReady + case s.Downloadable: + return tray.ModelNeedsDownload + default: + return tray.ModelUnavailable + } +} + func modelSupportsStream(tr transcriber.Transcriber) bool { id := tr.GetModel() for _, m := range tr.Models() { @@ -91,6 +102,12 @@ var configMu sync.Mutex var trayRecordChan = make(chan struct{}, 1) var isRecording atomic.Bool +// isTranscribing is true while a recording has stopped but its transcription is +// still running. isRecording stays true across this phase too (so a re-press is +// blocked); isTranscribing distinguishes "stop the live recording" from "denied, +// transcription in progress" for the hotkey feedback. +var isTranscribing atomic.Bool + var ( stopMu sync.Mutex stopCh chan struct{} // closed to stop the active recording @@ -170,7 +187,9 @@ func run() { logPathFlag := flag.String("logpath", "", "log directory path (default: OS-specific location, use ./ for current dir)") testFlag := flag.Bool("test", false, "Test mode (headless, stdin-driven)") hintsFlag := flag.String("hints", "", "Vocabulary hints for transcription (comma-separated)") - transcribeFlag := flag.String("transcribe", "", "Transcribe an audio file and exit") + transcribeFlag := flag.String("transcribe", "", "Transcribe audio file(s) and exit; extra files may follow as positional args (one transcript printed per line)") + providerFlag := flag.String("provider", "", "Transcription provider (e.g. 
parakeet, groq); overrides saved config") + modelFlag := flag.String("model", "", "Model ID for the selected provider; overrides saved config") flag.Parse() // Resolve log directory early @@ -245,19 +264,31 @@ func run() { fatal("Unknown format %q (use mp3@16, mp3@64, or flac)", *formatFlag) } + // CLI -provider/-model override the saved provider/model (also lets the + // integration test pick a specific local model). + if flagSet["provider"] { + cfg.Provider = *providerFlag + } + if flagSet["model"] { + cfg.Model = *modelFlag + } + // Restore saved provider/model or fall back to auto-detection if cfg.Provider != "" { for _, p := range transcriber.Providers() { - if p.Name == cfg.Provider { - if key := os.Getenv(p.EnvKey); key != "" { - activeTranscriber = p.NewFn(key) - if cfg.Model != "" { - activeTranscriber.SetModel(cfg.Model) - } + if p.Name == cfg.Provider && p.Available() { + activeTranscriber = p.New() + if cfg.Model != "" { + activeTranscriber.SetModel(cfg.Model) } break } } + // An explicit -provider that didn't resolve is a hard error (don't + // silently fall back to a different engine under the test's feet). + if activeTranscriber == nil && flagSet["provider"] { + fatal("Provider %q is not available", *providerFlag) + } } if activeTranscriber == nil { var initErr error @@ -307,7 +338,9 @@ func run() { } if *transcribeFlag != "" { - runTranscribeFile(*transcribeFlag) + // First file is the flag value; any remaining positionals are extra + // files transcribed in the same process (the model loads once). 
+ runTranscribeFiles(append([]string{*transcribeFlag}, flag.Args()...)) return } @@ -360,7 +393,12 @@ func run() { tray.OnCopyLast(clip.CopyLast) tray.OnRecord( - func() { select { case trayRecordChan <- struct{}{}: default: } }, + func() { + select { + case trayRecordChan <- struct{}{}: + default: + } + }, func() { requestStop() }, ) // preferredDevice remembers the user's choice so we can auto-reconnect @@ -374,7 +412,7 @@ func run() { for i := range devices { names[i] = devices[i].Name } - tray.SetDevices(names, preferredDevice, func(name string) { + tray.SetDevices(names, preferredDevice, func(name string) { preferredDevice = name config.Update(func(s *config.Settings) { s.Device = name }) if name == "" { @@ -389,51 +427,82 @@ func run() { var trayModels []tray.Model modelIndex := map[string]transcriber.ModelInfo{} for _, p := range transcriber.Providers() { - key := os.Getenv(p.EnvKey) for _, m := range p.Models { + st := p.Status(m.ID) trayModels = append(trayModels, tray.Model{ Provider: p.Name, ProviderLabel: p.Label, ModelID: m.ID, Label: m.Label, - HasKey: key != "", + State: trayModelState(st), + Detail: st.Detail, Active: activeTranscriber.Name() == p.Name && activeTranscriber.GetModel() == m.ID, }) modelIndex[p.Name+":"+m.ID] = m } } - tray.SetLanguages(transcriber.AllLanguages()) - - tray.SetModels(trayModels, func(provider, model string) { - configMu.Lock() - defer configMu.Unlock() - - currentLang := activeTranscriber.GetLanguage() + tray.SetLanguages(activeTranscriber.SupportedLanguages()) - var newTr transcriber.Transcriber + providerByName := func(name string) (transcriber.ProviderInfo, bool) { for _, p := range transcriber.Providers() { - if p.Name == provider { - if key := os.Getenv(p.EnvKey); key != "" { - newTr = p.NewFn(key) - } - break + if p.Name == name { + return p, true } } - if newTr == nil { - return - } - newTr.SetLanguage(currentLang) - newTr.SetModel(model) + return transcriber.ProviderInfo{}, false + } - activeTranscriber = newTr 
- streamEnabled = modelIndex[provider+":"+model].Stream + // switchModel makes (provider, model) active, reusing the current instance + // when the provider is unchanged so we don't reload a local model twice. + switchModel := func(p transcriber.ProviderInfo, model string) { + configMu.Lock() + if activeTranscriber.Name() != p.Name { + newTr := p.New() + newTr.SetLanguage(activeTranscriber.GetLanguage()) + activeTranscriber = newTr + } + activeTranscriber.SetModel(model) // local: blocks here during gguf load + streamEnabled = modelIndex[p.Name+":"+model].Stream if !streamEnabled { activeFormat = *formatFlag } + langs := activeTranscriber.SupportedLanguages() + local := transcriber.IsLocal(activeTranscriber) + configMu.Unlock() + + config.Update(func(s *config.Settings) { s.Provider = p.Name; s.Model = model }) + tray.SetLanguages(langs) + tray.SetHintsEnabled(!local) + tray.SetActiveModel(p.Name, model) + } - config.Update(func(s *config.Settings) { s.Provider = provider; s.Model = model }) - tray.SetLanguages(newTr.SupportedLanguages()) + tray.SetModels(trayModels, func(provider, model string) { + p, ok := providerByName(provider) + if !ok { + return + } + st := p.Status(model) + switch { + case st.Ready: + switchModel(p, model) + case st.Downloadable: + // Async: a model download takes minutes; show progress in the menu. 
+ go func() { + tray.UpdateModelState(provider, model, tray.ModelDownloading, "0%") + err := p.Download(model, func(f float64) { + tray.UpdateModelState(provider, model, tray.ModelDownloading, fmt.Sprintf("%.0f%%", f*100)) + }) + if err != nil { + log.Errorf("model download: %v", err) + tray.SetError("Download failed: " + err.Error()) + tray.UpdateModelState(provider, model, tray.ModelNeedsDownload, st.Detail) + return + } + tray.UpdateModelState(provider, model, tray.ModelReady, "") + switchModel(p, model) + }() + } }) tray.SetLanguage(*langFlag, func(code string) { @@ -442,6 +511,7 @@ func run() { configMu.Unlock() config.Update(func(s *config.Settings) { s.Language = code }) }) + tray.SetHintsEnabled(!transcriber.IsLocal(activeTranscriber)) tray.SetLogin(login.Enabled()) tray.SetVersion(version) tray.OnSaveAudio(saveLastRecording) @@ -544,10 +614,6 @@ func run() { } defer hk.Unregister() - logRecordDevice := func() { - log.Info("recording_device: " + captureDevice.DeviceName()) - } - sessions := make(chan recSession, 1) go listenHotkey(hk, longPressDuration(), sessions) @@ -557,20 +623,41 @@ func run() { } }() + recordSessions(captureDevice, sessions) +} + +// afterRecordCycle, when non-nil, is called by recordSessions at the end of each +// record+transcribe cycle. Test-only hook (lets the harness know a cycle ended). +var afterRecordCycle func() + +// recordSessions is the core record→transcribe loop, shared by the live app and +// tests. isRecording stays true for the WHOLE cycle — recording AND inference — +// so listenHotkey blocks a new recording while a transcription is still running +// (handleRecording returns a `done` channel that closes when inference ends). 
+func recordSessions(capture audio.CaptureDevice, sessions <-chan recSession) { for sess := range sessions { log.Info("recording_start") - logRecordDevice() + log.Info("recording_device: " + capture.DeviceName()) isRecording.Store(true) tray.SetRecording(true) go beep.PlayStart() - _, err := handleRecording(captureDevice, sess) - isRecording.Store(false) - tray.SetRecording(false) + done, err := handleRecording(capture, sess) if err != nil { log.Errorf("recording error: %v", err) tray.SetError(err.Error()) } + if done != nil { + isTranscribing.Store(true) + tray.SetTranscribing(true) // blue status dot while inference runs + <-done // hold isRecording too — blocks re-record + isTranscribing.Store(false) + } + isRecording.Store(false) + tray.SetRecording(false) + if afterRecordCycle != nil { + afterRecordCycle() + } } } @@ -599,7 +686,11 @@ func listenHotkey(hk hotkey.Hotkey, longPress time.Duration, sessions chan<- rec <-hk.Keydown() if isRecording.Load() { <-hk.Keyup() - requestStop() + if isTranscribing.Load() { + go beep.PlayDenied() // ignored: transcription still in progress + } else { + requestStop() + } continue } sc := &atomic.Bool{} @@ -611,7 +702,12 @@ func listenHotkey(hk hotkey.Hotkey, longPress time.Duration, sessions chan<- rec requestStop() st = idle case <-hk.Keyup(): - if !timer.Stop() { select { case <-timer.C: default: } } + if !timer.Stop() { + select { + case <-timer.C: + default: + } + } sc.Store(true) st = toggleRecording } @@ -768,8 +864,7 @@ func finishTranscription(sess transcriber.Session, clipCh chan string, updatesDo TLSTimeMs: bs.TLSTimeMs, TTFBMs: bs.TTFBMs, TotalTimeMs: bs.TotalTimeMs, - MemoryAllocMB: result.MemoryAllocMB, - MemoryPeakMB: result.MemoryPeakMB, + ProcessRSSMB: result.ProcessRSSMB, InferenceMs: bs.InferenceMs, } transcriptionsMu.Lock() @@ -792,6 +887,7 @@ func finishTranscription(sess transcriber.Session, clipCh chan string, updatesDo RecvMessages: ss.RecvMessages, RecvFinal: ss.RecvFinal, CommitEvents: 
ss.CommitEvents, + ProcessRSSMB: result.ProcessRSSMB, }) } @@ -856,10 +952,29 @@ func saveLastRecording() { alert.Info("Saved to " + dir) } -func runTranscribeFile(audioFile string) { +// directTranscriber is implemented by cloud providers that accept encoded audio +// bytes directly; local (Parakeet) instead routes WAV → PCM → Session. +type directTranscriber interface { + Transcribe(audio []byte, format, lang, hints string) (*transcriber.Result, error) +} + +// runTranscribeFiles transcribes one or more files with the already-loaded +// engine — the model is loaded once at startup and reused across files — and +// prints one transcript per line, in input order. +func runTranscribeFiles(files []string) { + for _, f := range files { + text, err := transcribeFile(f) + if err != nil { + fatal("%s: %v", f, err) + } + fmt.Println(text) + } +} + +func transcribeFile(audioFile string) (string, error) { data, err := os.ReadFile(audioFile) if err != nil { - fatal("Error reading file: %v", err) + return "", err } ext := filepath.Ext(audioFile) @@ -872,24 +987,36 @@ func runTranscribeFile(audioFile string) { case ".mp3": format = "mp3" default: - fatal("Unsupported audio format: %s", ext) + return "", fmt.Errorf("unsupported audio format %q", ext) } - type directTranscriber interface { - Transcribe(audio []byte, format, lang, hints string) (*transcriber.Result, error) + if dt, ok := activeTranscriber.(directTranscriber); ok { + result, err := dt.Transcribe(data, format, activeTranscriber.GetLanguage(), config.GetHints()) + if err != nil { + return "", err + } + return result.Text, nil } - dt, ok := activeTranscriber.(directTranscriber) - if !ok { - fatal("Provider %q does not support direct file transcription", activeTranscriber.Name()) + if format != "wav" { + return "", fmt.Errorf("local transcription supports WAV files only (got %s)", ext) } - - result, err := dt.Transcribe(data, format, activeTranscriber.GetLanguage(), config.GetHints()) + pcm, err := 
audio.WAVToPCM(data) if err != nil { - fatal("Transcription error: %v", err) + return "", fmt.Errorf("cannot read WAV: %w", err) } - - fmt.Println(result.Text) + sess, err := activeTranscriber.NewSession(context.Background(), transcriber.SessionConfig{ + Language: activeTranscriber.GetLanguage(), + }) + if err != nil { + return "", err + } + sess.Feed(pcm) + result, err := sess.Close() + if err != nil { + return "", err + } + return result.Text, nil } func runBenchmark(wavFile string, runs int) { diff --git a/main_test.go b/main_test.go index 9016188..8cd722f 100644 --- a/main_test.go +++ b/main_test.go @@ -1,11 +1,85 @@ package main import ( + "sync/atomic" "testing" "time" + + "zee/audio" + "zee/beep" + "zee/encoder" "zee/hotkey" + "zee/transcriber" ) +// TestRecordSessionsBlocksDuringInference verifies the guard's missing half: +// isRecording must stay true for the WHOLE record+transcribe cycle, not just +// while recording. It drives the real recordSessions loop with a fake capture +// and a fake transcriber whose "inference" takes 800ms, then checks isRecording +// is still set mid-inference. Combined with TestListenHotkey_StopsTrayRecording +// (a press while isRecording is true starts no new session), this proves a +// hotkey press during inference is blocked. 
+func TestRecordSessionsBlocksDuringInference(t *testing.T) { + beep.Disable() + isRecording.Store(false) + + fake := transcriber.NewFake("hello", nil) + fake.SetDelay(800 * time.Millisecond) // simulated inference window + activeTranscriber = fake + + ctx, err := audio.NewFakeContext("test/data/short.wav", false) + if err != nil { + t.Fatalf("fake audio context: %v", err) + } + capture, err := ctx.NewCapture(nil, audio.CaptureConfig{ + SampleRate: encoder.SampleRate, Channels: encoder.Channels, + }) + if err != nil { + t.Fatalf("fake capture: %v", err) + } + defer capture.Close() + + var cycles int32 + afterRecordCycle = func() { atomic.AddInt32(&cycles, 1) } + defer func() { afterRecordCycle = nil }() + + sessions := make(chan recSession, 1) + loopDone := make(chan struct{}) + go func() { recordSessions(capture, sessions); close(loopDone) }() + + // Start a recording, let it capture briefly, then stop it (as a keyup would) + // so the 800ms "inference" begins. + sessions <- recSession{Stop: resetStop(), SilenceClose: &atomic.Bool{}} + time.Sleep(150 * time.Millisecond) + requestStop() + + // Mid-inference: the guard must still be engaged, and isTranscribing (which + // drives the blue icon + denied beep) must be set. + time.Sleep(250 * time.Millisecond) + if !isRecording.Load() { + t.Fatal("isRecording cleared during inference — a re-record would NOT be blocked") + } + if !isTranscribing.Load() { + t.Fatal("isTranscribing not set during inference — no blue icon / denied beep") + } + + // Wait out the cycle, confirm both flags were released after inference. + deadline := time.Now().Add(3 * time.Second) + for atomic.LoadInt32(&cycles) < 1 { + if time.Now().After(deadline) { + t.Fatal("record cycle never completed") + } + time.Sleep(10 * time.Millisecond) + } + if isRecording.Load() || isTranscribing.Load() { + t.Fatal("isRecording/isTranscribing still set after inference completed") + } + + // Terminate the loop cleanly so it doesn't leak into other tests. 
+ close(sessions) + <-loopDone +} + func TestListenHotkey_TrayStopNoStaleSignal(t *testing.T) { hk := hotkey.NewFake() sessions := make(chan recSession, 3) diff --git a/packaging/mkdmg.sh b/packaging/mkdmg.sh index b850287..6b0dce3 100755 --- a/packaging/mkdmg.sh +++ b/packaging/mkdmg.sh @@ -27,6 +27,16 @@ else echo "warning: $SCRIPT_DIR/Zee.icns not found, DMG will have no app icon" >&2 fi +# Ship the license + third-party attribution with the app (MIT requires the +# notices in distributions; the models are CC-BY-4.0 and need attribution). +for doc in LICENSE THIRD_PARTY_LICENSES; do + if [ -f "$SCRIPT_DIR/../$doc" ]; then + cp "$SCRIPT_DIR/../$doc" "$APP/Contents/Resources/$doc" + else + echo "warning: $doc not found, not shipped in DMG" >&2 + fi +done + codesign --force --sign - --identifier com.zee.app "$APP" ln -s /Applications "$STAGING/Applications" diff --git a/test/data/en.wav b/test/data/en.wav new file mode 100644 index 0000000..77502a5 Binary files /dev/null and b/test/data/en.wav differ diff --git a/test/data/fr.wav b/test/data/fr.wav new file mode 100644 index 0000000..736bbc2 Binary files /dev/null and b/test/data/fr.wav differ diff --git a/test/data/ru.wav b/test/data/ru.wav new file mode 100644 index 0000000..bb456e1 Binary files /dev/null and b/test/data/ru.wav differ diff --git a/test/integration_test.go b/test/integration_test.go index b6cd8df..92bfd68 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -3,16 +3,20 @@ package test_test import ( + "bytes" "encoding/binary" "fmt" "os" "os/exec" "path/filepath" + "runtime" "strings" "testing" "time" + "unicode" "zee/clipboard" + "zee/localmodel" ) var testBinary string @@ -288,3 +292,160 @@ func TestClipboardRestoreOnError(t *testing.T) { t.Errorf("clipboard not restored on error: got %q, want %q", strings.TrimSpace(clip), sentinel) } } + +// --- Local model (Parakeet) tests --- +// +// End-to-end check that the on-device models transcribe their own languages: +// the default 
English 110m, and the multilingual v3 across English, French and +// Russian (auto-detect). Audio fixtures are committed WAVs synthesized with +// macOS `say` so the expected transcript is known. Each case self-skips when +// its gguf isn't downloaded (run `make download-models`), so the suite stays +// green on machines/CI without the local models. + +// localModelsDir is the dev gguf location relative to the test working dir +// (/test): models live at /models/parakeet/. +func localModelsDir(t *testing.T) string { + t.Helper() + dir, err := filepath.Abs(filepath.Join("..", "models", "parakeet", localmodel.Version)) + if err != nil { + t.Fatalf("resolve models dir: %v", err) + } + return dir +} + +// transcribeFiles runs `zee -transcribe` over one or more files in a SINGLE +// process (the model is loaded once and reused across files) and returns the +// per-file transcripts — one per stdout line, in input order. Skips if the +// model's gguf isn't present. +func transcribeFiles(t *testing.T, modelID, lang string, files ...string) []string { + t.Helper() + m, ok := localmodel.ByID(modelID) + if !ok { + t.Fatalf("unknown local model %q", modelID) + } + modelsDir := localModelsDir(t) + if fi, err := os.Stat(filepath.Join(modelsDir, m.Filename)); err != nil || fi.Size() != m.SizeBytes { + t.Skipf("model %q not downloaded (run: make download-models)", modelID) + } + + // Flags must precede the positional files: Go's flag parser stops at the + // first non-flag arg, so -transcribe and the files come last. + args := append([]string{"-logpath", t.TempDir(), "-provider", "parakeet", + "-model", modelID, "-lang", lang, "-transcribe"}, files...) + cmd := exec.Command(testBinary, args...) 
+ cmd.Env = append(os.Environ(), "ZEE_MODELS_DIR="+modelsDir) + var stdout, stderr bytes.Buffer + cmd.Stdout, cmd.Stderr = &stdout, &stderr + + start := time.Now() + if err := cmd.Run(); err != nil { + t.Fatalf("transcribe %v failed: %v\nstderr: %s", files, err, stderr.String()) + } + t.Logf("%s: %d file(s) in %s", modelID, len(files), time.Since(start).Round(time.Millisecond)) + + lines := strings.Split(strings.TrimRight(stdout.String(), "\n"), "\n") + if len(lines) != len(files) { + t.Fatalf("expected %d transcripts, got %d:\n%s", len(files), len(lines), stdout.String()) + } + return lines +} + +// assertTranscript checks the transcript matches the expected text by normalized +// token overlap (TTS+ASR drifts slightly, so we don't require an exact match). +func assertTranscript(t *testing.T, got, want string) { + t.Helper() + g, w := normalizeText(got), normalizeText(want) + if o := tokenOverlap(g, w); o < 0.8 { + t.Errorf("token overlap %.2f below 0.8\n got: %q\n want: %q", o, g, w) + } +} + +// normalizeText lowercases and collapses everything but letters/digits to single +// spaces, so punctuation and casing don't fail the comparison. +func normalizeText(s string) string { + var b strings.Builder + for _, r := range strings.ToLower(s) { + if unicode.IsLetter(r) || unicode.IsNumber(r) { + b.WriteRune(r) + } else { + b.WriteRune(' ') + } + } + return strings.Join(strings.Fields(b.String()), " ") +} + +// tokenOverlap is the fraction of want's tokens present in got (multiset). TTS+ +// ASR can drift slightly, so we assert a high overlap rather than exact match. 
+func tokenOverlap(got, want string) float64 { + w := strings.Fields(want) + if len(w) == 0 { + return 0 + } + have := map[string]int{} + for _, tok := range strings.Fields(got) { + have[tok]++ + } + hits := 0 + for _, tok := range w { + if have[tok] > 0 { + have[tok]-- + hits++ + } + } + return float64(hits) / float64(len(w)) +} + +func TestLocalParakeetModels(t *testing.T) { + if runtime.GOOS != "darwin" || runtime.GOARCH != "arm64" { + t.Skip("local Parakeet transcription is darwin/arm64 only") + } + + const wantEN = "The quick brown fox jumps." + + t.Run("english-110m", func(t *testing.T) { + got := transcribeFiles(t, "parakeet-110m-en", "en", "data/en.wav") + assertTranscript(t, got[0], wantEN) + }) + + // One process, one v3 load, three languages (auto-detect). + t.Run("v3-multilingual", func(t *testing.T) { + got := transcribeFiles(t, "parakeet-v3-multi", "", + "data/en.wav", "data/fr.wav", "data/ru.wav") + assertTranscript(t, got[0], wantEN) + assertTranscript(t, got[1], "Je m'appelle Thomas Dupont.") + assertTranscript(t, got[2], "Меня зовут Милена Иванова.") + }) +} + +// TestLocalModelDiagnostics checks that a local-model transcription emits the +// same diagnostics/metrics record as the cloud path. We assert on the presence +// of stable markers (provider, and a few metric keys) rather than parsing the +// line, so the log format can evolve without breaking the test. The recording +// path (-test) is what logs metrics; -transcribe is a quiet one-shot. 
+func TestLocalModelDiagnostics(t *testing.T) { + if runtime.GOOS != "darwin" || runtime.GOARCH != "arm64" { + t.Skip("local Parakeet transcription is darwin/arm64 only") + } + const modelID = "parakeet-v3-multi" + m, ok := localmodel.ByID(modelID) + if !ok { + t.Fatalf("unknown local model %q", modelID) + } + modelsDir := localModelsDir(t) + if fi, err := os.Stat(filepath.Join(modelsDir, m.Filename)); err != nil || fi.Size() != m.SizeBytes { + t.Skipf("model %q not downloaded (run: make download-models)", modelID) + } + + // Flags must precede the positional WAV: Go's flag parsing stops at the + // first non-flag argument. runZeeOpts already orders them correctly. + logDir := runZeeOpts(t, cmds("KEYDOWN", "KEYUP", "WAIT", "SLEEP 500", "QUIT"), + runOpts{env: []string{"ZEE_MODELS_DIR=" + modelsDir}}, + "-provider", "parakeet", "-model", modelID, "-lang", "", "-test", "data/fr.wav") + + diag := readLog(t, logDir, "diagnostics_log.txt") + for _, marker := range []string{"transcription", "provider=parakeet", "inference_ms", "rss_mb", "audio_s"} { + if !strings.Contains(diag, marker) { + t.Errorf("diagnostics missing %q marker\n--- diagnostics ---\n%s", marker, diag) + } + } +} diff --git a/third_party/parakeet.cpp b/third_party/parakeet.cpp new file mode 160000 index 0000000..b8012f1 --- /dev/null +++ b/third_party/parakeet.cpp @@ -0,0 +1 @@ +Subproject commit b8012f11e5269126eddb7f4fd02f891a2ccc29b0 diff --git a/transcriber/batch_session.go b/transcriber/batch_session.go index 325c5a8..8d555ea 100644 --- a/transcriber/batch_session.go +++ b/transcriber/batch_session.go @@ -133,7 +133,7 @@ func (bs *batchSession) Close() (SessionResult, error) { }, Metrics: bs.formatMetrics(rawSize, encodedSize, compressionPct, audioDuration, result), } - sr.captureMemStats() + sr.captureRSS() return sr, nil } diff --git a/transcriber/fake.go b/transcriber/fake.go index ec69b08..5a8ed1b 100644 --- a/transcriber/fake.go +++ b/transcriber/fake.go @@ -12,12 +12,21 @@ type FakeTranscriber 
struct { err error lang string stream bool + delay time.Duration // simulated inference time (Close blocks for this long) } func NewFake(text string, err error) *FakeTranscriber { - return &FakeTranscriber{text: text, err: err, stream: os.Getenv("ZEE_FAKE_STREAM") == "1"} + f := &FakeTranscriber{text: text, err: err, stream: os.Getenv("ZEE_FAKE_STREAM") == "1"} + if d := os.Getenv("ZEE_FAKE_DELAY"); d != "" { + f.delay, _ = time.ParseDuration(d) + } + return f } +// SetDelay makes Close block for d, simulating inference latency (for tests that +// need a window where transcription is in progress). +func (f *FakeTranscriber) SetDelay(d time.Duration) { f.delay = d } + func (f *FakeTranscriber) Name() string { return "fake" } func (f *FakeTranscriber) SupportedLanguages() []Language { return nil } func (f *FakeTranscriber) SetLanguage(lang string) { f.lang = lang } @@ -47,13 +56,14 @@ func (f *FakeTranscriber) NewSession(_ context.Context, cfg SessionConfig) (Sess } else { close(updates) } - return &fakeSession{text: f.text, err: f.err, updates: updates}, nil + return &fakeSession{text: f.text, err: f.err, updates: updates, delay: f.delay}, nil } type fakeSession struct { text string err error updates chan string + delay time.Duration } func (s *fakeSession) Feed([]byte) {} @@ -61,6 +71,9 @@ func (s *fakeSession) Feed([]byte) {} func (s *fakeSession) Updates() <-chan string { return s.updates } func (s *fakeSession) Close() (SessionResult, error) { + if s.delay > 0 { + time.Sleep(s.delay) + } if s.err != nil { return SessionResult{}, fmt.Errorf("fake transcriber error: %w", s.err) } @@ -73,6 +86,6 @@ func (s *fakeSession) Close() (SessionResult, error) { }, Metrics: []string{"total: 10ms (fake)"}, } - r.captureMemStats() + r.captureRSS() return r, nil } diff --git a/transcriber/parakeet.go b/transcriber/parakeet.go new file mode 100644 index 0000000..d96fa11 --- /dev/null +++ b/transcriber/parakeet.go @@ -0,0 +1,269 @@ +package transcriber + +import ( + "context" + 
"encoding/binary" + "fmt" + "strings" + "sync" + "time" + + "zee/audio" + "zee/encoder" + "zee/internal/parakeet" + "zee/localmodel" +) + +// Parakeet is the offline, on-device provider. It wraps one loaded GGUF model +// (decision #2: a single shared ctx; push-to-talk is serial) and swaps the +// gguf to change models (decision #3: 110m loaded at startup, switching freezes +// briefly). The C-API has no usable language parameter for these models, so +// language is model-driven (decision #1): English-only for 110m/v2, and the +// multilingual v3 auto-detects (it is not prompt-conditioned, so a target +// language cannot be forced). +type Parakeet struct { + mu sync.Mutex + modelID string + ctx *parakeet.Ctx + loadErr error + lang string +} + +// parakeetAvailable reports whether local transcription is compiled in and the +// default model is on disk — the gate for the no-key fallback. +func parakeetAvailable() bool { + return parakeet.Available() && localmodel.Present(localmodel.Default()) +} + +// parakeetProvider is the registry entry: availability is "default model on +// disk", status is per-gguf presence, and missing models are downloadable. 
+func parakeetProvider() ProviderInfo { + return ProviderInfo{ + Name: "parakeet", + Label: "Local (Parakeet)", + Models: ParakeetModels(), + Available: parakeetAvailable, + New: func() Transcriber { return NewParakeet() }, + Status: func(id string) ModelStatus { + m, ok := localmodel.ByID(id) + if !ok || !parakeet.Available() { + return ModelStatus{} // unknown, or not compiled in → Unavailable + } + if localmodel.Present(m) { + return ModelStatus{Ready: true} + } + return ModelStatus{Downloadable: true, Detail: humanSize(m.SizeBytes)} + }, + Download: func(id string, progress func(float64)) error { + m, ok := localmodel.ByID(id) + if !ok { + return fmt.Errorf("unknown local model %q", id) + } + return localmodel.Download(m, progress) + }, + } +} + +// ParakeetModels lists the on-device models as ModelInfo (for the tray) without +// needing a loaded provider instance. +func ParakeetModels() []ModelInfo { + out := make([]ModelInfo, 0, len(localmodel.All())) + for _, m := range localmodel.All() { + out = append(out, ModelInfo{ID: m.ID, Label: m.Label, Stream: false, Languages: parakeetLanguages(m)}) + } + return out +} + +func humanSize(b int64) string { + if b >= 1<<30 { + return fmt.Sprintf("%.1f GB", float64(b)/(1<<30)) + } + return fmt.Sprintf("%d MB", b>>20) +} + +// NewParakeet builds the provider and eagerly loads the default model (110m, +// ~55 ms) so the first recording is instant. A missing/failed model is deferred +// to NewSession as an error. +func NewParakeet() *Parakeet { + p := &Parakeet{modelID: localmodel.ID110mEN, lang: "en"} + p.mu.Lock() + p.load() + p.mu.Unlock() + return p +} + +// load (mu held) swaps the loaded ctx to p.modelID, freeing the previous one. 
+func (p *Parakeet) load() { + m, ok := localmodel.ByID(p.modelID) + if !ok { + p.loadErr = fmt.Errorf("unknown local model %q", p.modelID) + return + } + if !localmodel.Present(m) { + p.loadErr = fmt.Errorf("model %q not downloaded", m.Label) + return + } + if p.ctx != nil { + p.ctx.Close() + p.ctx = nil + } + ctx, err := parakeet.New(localmodel.Path(m)) + p.ctx, p.loadErr = ctx, err +} + +// IsLocal reports whether tr is the on-device provider. Local decode has no +// hint biasing, no streaming, and no audio encoding, so the UI greys those out. +func IsLocal(tr Transcriber) bool { + _, ok := tr.(*Parakeet) + return ok +} + +func (p *Parakeet) Name() string { return "parakeet" } + +func (p *Parakeet) Models() []ModelInfo { + out := make([]ModelInfo, 0, len(localmodel.All())) + for _, m := range localmodel.All() { + out = append(out, ModelInfo{ + ID: m.ID, + Label: m.Label, + Stream: false, + Languages: parakeetLanguages(m), + }) + } + return out +} + +func parakeetLanguages(m localmodel.Model) []Language { + if m.Multilingual { + // v3 auto-detects and is not prompt-conditioned, so a target language + // cannot be forced — Auto-detect is the only meaningful option. + return []Language{{Code: "", Label: "Auto-detect"}} + } + return []Language{{Code: "en", Label: "English"}} +} + +func (p *Parakeet) SupportedLanguages() []Language { + if m, ok := localmodel.ByID(p.GetModel()); ok { + return parakeetLanguages(m) + } + return []Language{{Code: "en", Label: "English"}} +} + +func (p *Parakeet) SetLanguage(lang string) { p.mu.Lock(); p.lang = lang; p.mu.Unlock() } +func (p *Parakeet) GetLanguage() string { p.mu.Lock(); defer p.mu.Unlock(); return p.lang } + +func (p *Parakeet) GetModel() string { p.mu.Lock(); defer p.mu.Unlock(); return p.modelID } + +// SetModel swaps the active model, loading its gguf eagerly. The caller (tray) +// is intentionally blocked during the load (decision #3); a load failure is +// surfaced at NewSession. 
+func (p *Parakeet) SetModel(id string) { + p.mu.Lock() + defer p.mu.Unlock() + if id == p.modelID && p.ctx != nil { + return + } + p.modelID = id + p.load() +} + +func (p *Parakeet) NewSession(_ context.Context, cfg SessionConfig) (Session, error) { + if cfg.Stream { + return nil, fmt.Errorf("parakeet does not support streaming") + } + p.mu.Lock() + if p.ctx == nil && p.loadErr == nil { + p.load() + } + ctx, err := p.ctx, p.loadErr + decoder := 0 + if m, ok := localmodel.ByID(p.modelID); ok { + decoder = m.Decoder + } + p.mu.Unlock() + if err != nil { + return nil, err + } + return &pcmSession{ctx: ctx, decoder: decoder, updates: make(chan string)}, nil +} + +// Close frees the loaded model. The provider is reusable afterwards (the next +// NewSession reloads lazily). +func (p *Parakeet) Close() { + p.mu.Lock() + defer p.mu.Unlock() + if p.ctx != nil { + p.ctx.Close() + p.ctx = nil + } +} + +// pcmSession buffers raw S16LE PCM during recording, then runs one batch +// inference on Close. Same Session interface as the cloud batch path, so the +// live hotkey and -transcribe share it — no encoder, no network. +type pcmSession struct { + ctx *parakeet.Ctx + decoder int + mu sync.Mutex + pcm []byte + updates chan string +} + +func (s *pcmSession) Feed(pcm []byte) { + s.mu.Lock() + s.pcm = append(s.pcm, pcm...) 
+ s.mu.Unlock() +} + +func (s *pcmSession) Updates() <-chan string { return s.updates } + +func (s *pcmSession) Close() (SessionResult, error) { + close(s.updates) + + s.mu.Lock() + raw := s.pcm + s.mu.Unlock() + + n := len(raw) / 2 + if n == 0 { + return SessionResult{NoSpeech: true}, nil + } + + f32 := make([]float32, n) + for i := 0; i < n; i++ { + f32[i] = float32(int16(binary.LittleEndian.Uint16(raw[i*2:]))) / 32768.0 + } + + start := time.Now() + text, err := s.ctx.Transcribe(f32, s.decoder) + if err != nil { + return SessionResult{}, err + } + inferenceMs := float64(time.Since(start).Microseconds()) / 1000 + + text = strings.TrimSpace(text) + noSpeech := text == "" + audioSec := float64(n) / float64(encoder.SampleRate) + rawKB := float64(len(raw)) / 1024 + + sr := SessionResult{ + Text: text, + HasText: !noSpeech, + NoSpeech: noSpeech, + AudioData: audio.PCMToWAV(raw), + AudioFormat: "wav", + Batch: &BatchStats{ + AudioLengthS: audioSec, + RawSizeKB: rawKB, + InferenceMs: inferenceMs, + TotalTimeMs: inferenceMs, + }, + Metrics: []string{ + fmt.Sprintf("audio: %.1fs | %.1f KB (raw PCM, no encoding)", audioSec, rawKB), + fmt.Sprintf("inference: %.0fms (local, CPU)", inferenceMs), + fmt.Sprintf("rtfx: %.1fx", audioSec/(inferenceMs/1000)), + }, + } + sr.captureRSS() + return sr, nil +} diff --git a/transcriber/session.go b/transcriber/session.go index 4edac56..ad3b150 100644 --- a/transcriber/session.go +++ b/transcriber/session.go @@ -1,12 +1,20 @@ package transcriber -import "runtime" +import ( + "os" -func (r *SessionResult) captureMemStats() { - var m runtime.MemStats - runtime.ReadMemStats(&m) - r.MemoryAllocMB = float64(m.Alloc) / 1024 / 1024 - r.MemoryPeakMB = float64(m.TotalAlloc) / 1024 / 1024 + "github.com/shirou/gopsutil/v4/process" +) + +// captureRSS records the process resident set size (from gopsutil, not Go's +// heap) — it includes cgo/mmap memory like the loaded model, so it's the one +// memory figure that's meaningful across every provider, 
local or remote. +func (r *SessionResult) captureRSS() { + if p, err := process.NewProcess(int32(os.Getpid())); err == nil { + if mi, err := p.MemoryInfo(); err == nil { + r.ProcessRSSMB = float64(mi.RSS) / 1024 / 1024 + } + } } type SessionConfig struct { @@ -46,17 +54,16 @@ type StreamStats struct { } type SessionResult struct { - Text string - HasText bool - NoSpeech bool - RateLimit string // "remaining/limit" or empty - MemoryAllocMB float64 - MemoryPeakMB float64 - Batch *BatchStats // non-nil for batch sessions - Stream *StreamStats // non-nil for stream sessions - Metrics []string // pre-formatted metric lines - AudioData []byte // exact bytes sent to the model - AudioFormat string // "mp3", "flac", or "wav" + Text string + HasText bool + NoSpeech bool + RateLimit string // "remaining/limit" or empty + ProcessRSSMB float64 // resident set size (incl. cgo/mmap), all providers + Batch *BatchStats // non-nil for batch sessions + Stream *StreamStats // non-nil for stream sessions + Metrics []string // pre-formatted metric lines + AudioData []byte // exact bytes sent to the model + AudioFormat string // "mp3", "flac", or "wav" } type Session interface { diff --git a/transcriber/stream_session.go b/transcriber/stream_session.go index 9acb4a1..e7ee92f 100644 --- a/transcriber/stream_session.go +++ b/transcriber/stream_session.go @@ -233,7 +233,7 @@ func (s *streamSession) Close() (SessionResult, error) { AudioS: audioDuration, }, } - sr.captureMemStats() + sr.captureRSS() return sr, sessionErr } diff --git a/transcriber/transcriber.go b/transcriber/transcriber.go index 44c0a0e..2821863 100644 --- a/transcriber/transcriber.go +++ b/transcriber/transcriber.go @@ -11,15 +11,15 @@ import ( ) type NetworkMetrics struct { - DNS time.Duration - ConnWait time.Duration - TCP time.Duration - TLS time.Duration - ReqHeaders time.Duration - ReqBody time.Duration - TTFB time.Duration - Download time.Duration - Total time.Duration + DNS time.Duration + ConnWait time.Duration + 
TCP time.Duration + TLS time.Duration + ReqHeaders time.Duration + ReqBody time.Duration + TTFB time.Duration + Download time.Duration + Total time.Duration ConnReused bool TLSProtocol string } @@ -122,6 +122,18 @@ func langsFromCodes(codes []string) []Language { return langs } +// AllLanguages returns every known language (Auto-detect first, then the rest +// sorted alphabetically). The tray uses this as the fixed universe of language +// menu items; SetLanguages then shows only the active model's subset. +func AllLanguages() []Language { + codes := make([]string, 0, len(langLabels)) + for c := range langLabels { + codes = append(codes, c) + } + sort.Strings(codes) + return langsFromCodes(codes) +} + type baseTranscriber struct { client *TracedClient apiURL string @@ -142,16 +154,6 @@ func (b *baseTranscriber) GetLanguage() string { return b.lang } -// AllLanguages returns every known language, sorted alphabetically. -func AllLanguages() []Language { - codes := make([]string, 0, len(langLabels)) - for c := range langLabels { - codes = append(codes, c) - } - sort.Strings(codes) - return langsFromCodes(codes) -} - func (b *baseTranscriber) Models() []ModelInfo { return nil } func (b *baseTranscriber) SetModel(m string) { b.model = m } func (b *baseTranscriber) GetModel() string { return b.model } @@ -165,24 +167,58 @@ func modelLanguages(models []ModelInfo, current string) []Language { return nil } +// ModelStatus describes whether one model is usable now, and if not whether the +// user can make it usable. Cloud providers are binary (Ready when the key is +// present, else not Downloadable → Unavailable); the local provider adds the +// downloadable middle ground. +type ModelStatus struct { + Ready bool + Downloadable bool // missing but the user can fetch it (local only) + Detail string // human-readable size when downloadable & missing +} + +// ProviderInfo is a uniform descriptor for every backend — cloud or local. 
No +// provider is special-cased: New() and the tray treat them all through these +// fields. Download is nil for providers that have nothing to fetch (cloud). type ProviderInfo struct { - Name string - Label string - EnvKey string - Models []ModelInfo - NewFn func(string) Transcriber + Name string + Label string + Models []ModelInfo + Available func() bool // at least one model usable right now + New func() Transcriber // keyless: closes over the key / model dir + Status func(modelID string) ModelStatus + Download func(modelID string, progress func(fraction float64)) error +} + +// cloudProvider builds a key-gated ProviderInfo. Availability is "key present"; +// every model shares that status and nothing is downloadable. +func cloudProvider(name, label, envKey string, models []ModelInfo, mk func(string) Transcriber) ProviderInfo { + hasKey := func() bool { return os.Getenv(envKey) != "" } + return ProviderInfo{ + Name: name, + Label: label, + Models: models, + Available: hasKey, + New: func() Transcriber { return mk(os.Getenv(envKey)) }, + Status: func(string) ModelStatus { return ModelStatus{Ready: hasKey()} }, + } } func Providers() []ProviderInfo { return []ProviderInfo{ - {"deepgram", "Deepgram", "DEEPGRAM_API_KEY", DeepgramModels, func(k string) Transcriber { return NewDeepgram(k) }}, - {"openai", "OpenAI", "OPENAI_API_KEY", OpenAIModels, func(k string) Transcriber { return NewOpenAI(k) }}, - {"groq", "Groq", "GROQ_API_KEY", GroqModels, func(k string) Transcriber { return NewGroq(k) }}, - {"mistral", "Mistral", "MISTRAL_API_KEY", MistralModels, func(k string) Transcriber { return NewMistral(k) }}, - {"elevenlabs", "ElevenLabs", "ELEVENLABS_API_KEY", ElevenLabsModels, func(k string) Transcriber { return NewElevenLabs(k) }}, + // Local is first so it's the default on a fresh machine even when cloud + // keys are set; cloud is opt-in via the tray (the choice persists). 
+ parakeetProvider(), + cloudProvider("deepgram", "Deepgram", "DEEPGRAM_API_KEY", DeepgramModels, func(k string) Transcriber { return NewDeepgram(k) }), + cloudProvider("openai", "OpenAI", "OPENAI_API_KEY", OpenAIModels, func(k string) Transcriber { return NewOpenAI(k) }), + cloudProvider("groq", "Groq", "GROQ_API_KEY", GroqModels, func(k string) Transcriber { return NewGroq(k) }), + cloudProvider("mistral", "Mistral", "MISTRAL_API_KEY", MistralModels, func(k string) Transcriber { return NewMistral(k) }), + cloudProvider("elevenlabs", "ElevenLabs", "ELEVENLABS_API_KEY", ElevenLabsModels, func(k string) Transcriber { return NewElevenLabs(k) }), } } +// New returns the default transcriber: the local model when available (even if +// cloud keys are set), else the first cloud provider with a key. func New() (Transcriber, error) { if fakeText, ok := os.LookupEnv("ZEE_FAKE_TEXT"); ok { var fakeErr error @@ -193,10 +229,10 @@ func New() (Transcriber, error) { } for _, p := range Providers() { - if key := os.Getenv(p.EnvKey); key != "" { - return p.NewFn(key), nil + if p.Available() { + return p.New(), nil } } - return nil, fmt.Errorf("set DEEPGRAM_API_KEY, OPENAI_API_KEY, GROQ_API_KEY, MISTRAL_API_KEY, or ELEVENLABS_API_KEY environment variable") + return nil, fmt.Errorf("set DEEPGRAM_API_KEY, OPENAI_API_KEY, GROQ_API_KEY, MISTRAL_API_KEY, or ELEVENLABS_API_KEY (or install on Apple Silicon to run offline)") } diff --git a/tray/icon_busy.png b/tray/icon_busy.png new file mode 100644 index 0000000..d0fc056 Binary files /dev/null and b/tray/icon_busy.png differ diff --git a/tray/icon_idle.png b/tray/icon_idle.png new file mode 100644 index 0000000..0392266 Binary files /dev/null and b/tray/icon_idle.png differ diff --git a/tray/icon_rec.png b/tray/icon_rec.png new file mode 100644 index 0000000..04200a0 Binary files /dev/null and b/tray/icon_rec.png differ diff --git a/tray/icon_warn.png b/tray/icon_warn.png new file mode 100644 index 0000000..d59a723 Binary files /dev/null 
and b/tray/icon_warn.png differ diff --git a/tray/icons.go b/tray/icons.go index 2ab3b8b..14bef5f 100644 --- a/tray/icons.go +++ b/tray/icons.go @@ -14,4 +14,7 @@ var ( //go:embed icon_warn.png iconWarnHi []byte + + //go:embed icon_busy.png + iconBusyHi []byte ) diff --git a/tray/tray.go b/tray/tray.go index cacf79f..ad2d0f5 100644 --- a/tray/tray.go +++ b/tray/tray.go @@ -7,12 +7,23 @@ import ( "zee/transcriber" ) +// ModelState drives how a model entry renders in the menu. +type ModelState int + +const ( + ModelReady ModelState = iota // selectable now + ModelNeedsDownload // missing, user can fetch it (local) + ModelDownloading // fetch in progress (shows %) + ModelUnavailable // can't be used (e.g. cloud, no key) +) + type Model struct { - Provider string // e.g. "groq", "openai", "deepgram" + Provider string // e.g. "groq", "openai", "parakeet" ProviderLabel string // e.g. "Groq" ModelID string // e.g. "whisper-large-v3-turbo" Label string // model display name - HasKey bool + State ModelState + Detail string // size when NeedsDownload, percent when Downloading Active bool } @@ -38,9 +49,9 @@ var ( loginOn bool loginCb func(bool) error - modelMu sync.Mutex - models []Model - modelCb func(provider, model string) + modelMu sync.Mutex + models []Model + modelCb func(provider, model string) isBTFn func(string) bool @@ -48,19 +59,19 @@ var ( langCb func(string) appVersion string - checkUpdateCb func() - saveAudioCb func() - editHintsCb func() + checkUpdateCb func() + saveAudioCb func() + editHintsCb func() ) var languages []transcriber.Language // set via SetLanguages -func OnCopyLast(fn func()) { copyLastFn = fn } -func OnRecord(start, stop func()) { recordFn = start; stopFn = stop } -func SetAutoPaste(on bool) { autoPasteOn = on } -func OnAutoPaste(fn func(bool)) { autoPasteCb = fn } -func SetLogin(on bool) { loginOn = on } -func OnLogin(fn func(bool) error) { loginCb = fn } +func OnCopyLast(fn func()) { copyLastFn = fn } +func OnRecord(start, stop func()) { 
recordFn = start; stopFn = stop } +func SetAutoPaste(on bool) { autoPasteOn = on } +func OnAutoPaste(fn func(bool)) { autoPasteCb = fn } +func SetLogin(on bool) { loginOn = on } +func OnLogin(fn func(bool) error) { loginCb = fn } func SetRecording(rec bool) { recording = rec @@ -83,6 +94,12 @@ func SetWarning(on bool) { updateWarningIcon(on) } +// SetTranscribing shows the "transcription in progress" icon (a blue status +// dot). The icon returns to idle on the next SetRecording(false). +func SetTranscribing(on bool) { + updateTranscribingIcon(on) +} + func SetError(msg string) { updateTooltip("zee – " + msg) go func() { @@ -112,14 +129,82 @@ func SetModels(m []Model, onSwitch func(provider, model string)) { modelMu.Unlock() } +// UpdateModelState re-renders a single model entry (used while a local model +// downloads, and when it becomes ready). Safe to call from any goroutine. +func UpdateModelState(provider, modelID string, state ModelState, detail string) { + modelMu.Lock() + idx := -1 + for i := range models { + if models[i].Provider == provider && models[i].ModelID == modelID { + models[i].State = state + models[i].Detail = detail + idx = i + break + } + } + modelMu.Unlock() + if idx >= 0 { + updateModelItem(idx) + } +} + +// SetActiveModel marks one model active (checked) and the rest inactive, +// re-rendering only the entries that changed. Called by the app after a +// successful model switch. +func SetActiveModel(provider, modelID string) { + modelMu.Lock() + var changed []int + for i := range models { + want := models[i].Provider == provider && models[i].ModelID == modelID + if models[i].Active != want { + models[i].Active = want + changed = append(changed, i) + } + } + modelMu.Unlock() + for _, i := range changed { + updateModelItem(i) + } + updateStatus() +} + +// modelTitle is the menu label for a model given its state. 
+func modelTitle(m Model) string { + switch m.State { + case ModelNeedsDownload: + if m.Detail != "" { + return m.Label + " — download " + m.Detail + } + return m.Label + " — download" + case ModelDownloading: + if m.Detail != "" { + return m.Label + " — downloading " + m.Detail + } + return m.Label + " — downloading…" + default: + return m.Label + } +} + func SetLastRecording(dur time.Duration, totalMs float64) { updateCopyLastTitle(fmt.Sprintf("Copy Last Recorded Text (%.1fs | %dms)", dur.Seconds(), int(totalMs))) } +// hintsEnabled gates the "Edit Hints…" item: local providers ignore hints +// (greedy decode has no biasing), so the item is greyed out when local is active. +var hintsEnabled = true + +// SetHintsEnabled greys out / restores the "Edit Hints…" menu item. Safe to +// call before Init (the state is applied when the menu is built). +func SetHintsEnabled(on bool) { + hintsEnabled = on + setHintsEnabled(on) +} + func SetVersion(v string) { appVersion = v } func OnCheckUpdate(fn func()) { checkUpdateCb = fn } -func OnSaveAudio(fn func()) { saveAudioCb = fn } -func OnEditHints(fn func()) { editHintsCb = fn } +func OnSaveAudio(fn func()) { saveAudioCb = fn } +func OnEditHints(fn func()) { editHintsCb = fn } func SetLanguage(code string, onSwitch func(string)) { langCode = code diff --git a/tray/tray_darwin.go b/tray/tray_darwin.go index 1c0e8b6..4d00d21 100644 --- a/tray/tray_darwin.go +++ b/tray/tray_darwin.go @@ -7,22 +7,25 @@ import ( "github.com/energye/systray" "golang.design/x/hotkey/mainthread" + + "zee/transcriber" ) var ( - mStatus *systray.MenuItem - mRecord *systray.MenuItem - mCopy *systray.MenuItem + mStatus *systray.MenuItem + mRecord *systray.MenuItem + mCopy *systray.MenuItem mDevices *systray.MenuItem mDefaultDevice *systray.MenuItem deviceItems []*systray.MenuItem deviceReady chan struct{} - mSettings *systray.MenuItem - mAutoPaste *systray.MenuItem - mLogin *systray.MenuItem - mBackend *systray.MenuItem - mLanguage *systray.MenuItem + 
mSettings *systray.MenuItem + mAutoPaste *systray.MenuItem + mLogin *systray.MenuItem + mEditHints *systray.MenuItem + mBackend *systray.MenuItem + mLanguage *systray.MenuItem langEntries []struct { item *systray.MenuItem code string @@ -51,7 +54,7 @@ func updateRecordingIcon(rec bool) { mRecord.SetTitle("● Stop Recording (Shift+Control+Space)") } } else { - systray.SetIcon(iconIdleHi) + systray.SetTemplateIcon(iconIdleHi, iconIdleHi) if mRecord != nil { mRecord.SetTitle("○ Start Recording (Shift+Control+Space)") } @@ -78,6 +81,14 @@ func updateWarningIcon(on bool) { } } +func updateTranscribingIcon(on bool) { + if on { + systray.SetIcon(iconBusyHi) + } else { + systray.SetTemplateIcon(iconIdleHi, iconIdleHi) + } +} + func updateTooltip(msg string) { systray.SetTooltip(msg) } @@ -155,7 +166,7 @@ func RefreshDevices(names []string, selected string) { } func onReady() { - systray.SetIcon(iconIdleHi) + systray.SetTemplateIcon(iconIdleHi, iconIdleHi) systray.SetTooltip("zee – push to talk") mStatus = systray.AddMenuItem(statusText(), "") @@ -222,12 +233,15 @@ func onReady() { } }) - mEditHints := mSettings.AddSubMenuItem("Edit Hints…", "Edit vocabulary hints file") + mEditHints = mSettings.AddSubMenuItem("Edit Hints…", "Edit vocabulary hints file") mEditHints.Click(func() { if editHintsCb != nil { go editHintsCb() } }) + if !hintsEnabled { + mEditHints.Disable() + } sep := mSettings.AddSubMenuItem("─────────", "") sep.Disable() @@ -260,52 +274,55 @@ func onReady() { modelMu.Lock() if len(models) > 0 { mBackend = mSettings.AddSubMenuItem("Model", "Select transcription model") - modelItems = make([]*systray.MenuItem, 0, len(models)) - var curProvider string - var provMenu *systray.MenuItem - for i, m := range models { - if m.Provider != curProvider { - curProvider = m.Provider - label := m.ProviderLabel - if !m.HasKey { - label += " (no API key)" - } - provMenu = mBackend.AddSubMenuItem(label, label) - if !m.HasKey { - provMenu.Disable() + modelItems = 
make([]*systray.MenuItem, len(models)) + // models are grouped by provider (contiguous); one submenu per provider. + for i := 0; i < len(models); { + prov := models[i].Provider + j, anyUsable := i, false + for j < len(models) && models[j].Provider == prov { + if models[j].State != ModelUnavailable { + anyUsable = true } + j++ + } + label := models[i].ProviderLabel + if !anyUsable { + label += " (no API key)" } - idx := i - item := provMenu.AddSubMenuItemCheckbox(m.Label, m.Label, m.Active) - item.Click(func() { - modelMu.Lock() - mm := models[idx] - cb := modelCb - modelMu.Unlock() - if !mm.HasKey || cb == nil { - return + provMenu := mBackend.AddSubMenuItem(label, label) + if !anyUsable { + provMenu.Disable() + } + for k := i; k < j; k++ { + idx := k + m := models[k] + item := provMenu.AddSubMenuItemCheckbox(modelTitle(m), m.Label, m.Active && m.State == ModelReady) + if m.State == ModelUnavailable || m.State == ModelDownloading { + item.Disable() } - cb(mm.Provider, mm.ModelID) - modelMu.Lock() - for j, it := range modelItems { - if j == idx { - it.Check() - models[j].Active = true - } else { - it.Uncheck() - models[j].Active = false + item.Click(func() { + modelMu.Lock() + mm := models[idx] + cb := modelCb + modelMu.Unlock() + // Ready → switch; NeedsDownload → fetch. The handler (main) + // dispatches and drives checkmarks via SetActiveModel. + if cb == nil || (mm.State != ModelReady && mm.State != ModelNeedsDownload) { + return } - } - modelMu.Unlock() - updateStatus() - }) - modelItems = append(modelItems, item) + cb(mm.Provider, mm.ModelID) + }) + modelItems[idx] = item + } + i = j } } modelMu.Unlock() + // Build a fixed item per known language (systray can't add items after + // CreateMenu). refreshLanguageMenu then shows only the active model's set. 
mLanguage = mSettings.AddSubMenuItem("Language", "Select transcription language") - for _, lang := range languages { + for _, lang := range transcriber.AllLanguages() { addLangEntry(lang.Code, lang.Label) } @@ -322,6 +339,8 @@ func onReady() { mQuit.Click(func() { Quit() }) systray.CreateMenu() + refreshLanguageMenu() // constrain the freshly-built menu to the active model + close(deviceReady) } @@ -332,6 +351,32 @@ func updateCopyLastTitle(title string) { } } +// updateModelItem re-renders one model entry (title, checkmark, enabled) from +// its current state. Called on download progress and on model switch. +func updateModelItem(idx int) { + modelMu.Lock() + if idx < 0 || idx >= len(modelItems) || idx >= len(models) { + modelMu.Unlock() + return + } + m := models[idx] + it := modelItems[idx] + modelMu.Unlock() + if it == nil { + return + } + it.SetTitle(modelTitle(m)) + if m.Active && m.State == ModelReady { + it.Check() + } else { + it.Uncheck() + } + if m.State == ModelReady || m.State == ModelNeedsDownload { + it.Enable() + } else { + it.Disable() + } +} func addLangEntry(code, label string) { idx := len(langEntries) @@ -361,28 +406,31 @@ func refreshLanguageMenu() { for _, l := range languages { want[l.Code] = true } - langValid := false + // If the active model no longer offers the current language, fall back to + // its first offered one (Auto-detect for multilingual models, English for + // the English-only ones). 
+ if !want[langCode] { + langCode = "" + if len(languages) > 0 { + langCode = languages[0].Code + } + if langCb != nil { + langCb(langCode) + } + } for _, e := range langEntries { - if e.code == "" || want[e.code] { + if want[e.code] { e.item.Show() if e.code == langCode { - langValid = true e.item.Check() + } else { + e.item.Uncheck() } } else { e.item.Hide() e.item.Uncheck() } } - if !langValid { - langCode = "" - if len(langEntries) > 0 { - langEntries[0].item.Check() - } - if langCb != nil { - langCb("") - } - } updateStatus() } @@ -392,6 +440,17 @@ func updateStatusItem(text string) { } } +func setHintsEnabled(on bool) { + if mEditHints == nil { + return + } + if on { + mEditHints.Enable() + } else { + mEditHints.Disable() + } +} + func disableBackend() { if mBackend != nil { mBackend.Disable() diff --git a/tray/tray_other.go b/tray/tray_other.go index 2289aa0..8990eec 100644 --- a/tray/tray_other.go +++ b/tray/tray_other.go @@ -2,16 +2,19 @@ package tray -func Init() <-chan struct{} { return make(chan struct{}) } -func RefreshDevices(names []string, selected string) {} -func refreshLanguageMenu() {} -func updateRecordingIcon(bool) {} -func updateWarningIcon(bool) {} -func updateTooltip(string) {} -func updateCopyLastTitle(string) {} -func addUpdateMenuItem(string) {} -func disableDevices() {} -func enableDevices() {} -func disableBackend() {} -func enableBackend() {} -func updateStatusItem(string) {} +func Init() <-chan struct{} { return make(chan struct{}) } +func RefreshDevices(names []string, selected string) {} +func refreshLanguageMenu() {} +func updateRecordingIcon(bool) {} +func updateWarningIcon(bool) {} +func updateTranscribingIcon(bool) {} +func updateTooltip(string) {} +func updateCopyLastTitle(string) {} +func addUpdateMenuItem(string) {} +func disableDevices() {} +func enableDevices() {} +func disableBackend() {} +func enableBackend() {} +func updateStatusItem(string) {} +func updateModelItem(int) {} +func setHintsEnabled(bool) {}