perf: default back to small model; show per-command STT latency

medium added ~3s/command lag (measured ~1.2s small vs ~3s medium on a 7950X3D), so default model -> small; lean on initial_prompt + lenient wake matching for the coined word. every heard line now shows STT latency as (<stt>ms/<audio>s) — always on, not only when print_heard is enabled — so a model change's cost is visible. snappier vad (silence_ms 500) from the prior commit stands. Signed-off-by: disqualifier <dev@disqualifier.me>
2026-06-26 02:57:52 -04:00 · 2026-06-26 02:57:52 -04:00 · 4357b14fad
commit 4357b14fad
parent 8e20b7eb0b
4 changed files with 26 additions and 12 deletions
--- a/README.md
+++ b/README.md
@@ -189,10 +189,12 @@ If Claude Code changes its prompt UI, re-confirm against a live session and upda
 Everything tunable lives in [`config.toml`](config.toml): wake phrases, mode + PTT
 key, Whisper model/language/device, `[vad]` endpointing, and `[behavior]`
 (`type_autosend`, fuzzy thresholds, `filler_words`, `auto_target`, `print_heard`).
-The default model is **`medium`** (best accuracy for the coined wake word on a strong
-CPU); `small` is faster/less accurate, `large-v3` most accurate. `claudedo -c <path>
-...` points at a specific config; otherwise it searches `$CLAUDEDO_CONFIG`,
-`~/.config/claudedo/config.toml`, then `./config.toml`.
+The default model is **`small`** (~1s/command on a strong CPU — snappy, and good
+enough with initial_prompt biasing); `medium` is more accurate on the coined wake
+word but ~3× slower (noticeable lag), `large-v3` most accurate/slowest. Every `heard`
+line shows the STT latency as `(<stt>ms/<audio>s)` so you can see what a model change
+costs. `claudedo -c <path> ...` points at a specific config; otherwise it searches
+`$CLAUDEDO_CONFIG`, `~/.config/claudedo/config.toml`, then `./config.toml`.

 - **STT biasing.** The transcriber is seeded with an `initial_prompt` built from the
  configured wake phrases + command vocabulary (one source — `grammar.vocabulary()`),
--- a/config.toml
+++ b/config.toml
@@ -21,10 +21,11 @@ mode = "listen"
 ptt_key = "space"

 [stt]
-# faster-whisper model size. "medium" is the default — biggest accuracy gain for the
-# coined wake word ("claudedo" / "claude do") and fine on a strong cpu. "small" is
-# faster but less accurate; "large-v3" is most accurate if medium still struggles.
-model = "medium"
+# faster-whisper model size. "small" is the default — snappy (~1s/command on a strong
+# cpu) and good enough with the initial_prompt biasing + lenient wake matching.
+# "medium" is more accurate on the coined wake word but ~3x slower (noticeable lag);
+# "large-v3" is most accurate and slowest. bump only if recognition is poor.
+model = "small"
 language = "en"
 # mic device: "auto", or a sounddevice device index (integer) / substring of a
 # device name. run `claudedo test-audio` to list devices.
--- a/src/claudedo/config.py
+++ b/src/claudedo/config.py
@@ -99,7 +99,7 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config:
    if mode not in _VALID_MODES:
        raise ConfigError(f"[input].mode must be one of {_VALID_MODES}, got {mode!r}")

-    model = _require(raw, "stt", "model", (str,), "medium")
+    model = _require(raw, "stt", "model", (str,), "small")
    if model not in _VALID_MODELS:
        log.warning("unknown stt model %r — passing through to faster-whisper", model)

--- a/src/claudedo/daemon.py
+++ b/src/claudedo/daemon.py
@@ -117,6 +117,8 @@ class Daemon:
        self._ptt = _PTTKey()
        self._pending: dict[str, int] = {}
        self._console = Console()
+        self._last_stt_ms = 0.0
+        self._last_audio_s = 0.0

    def _install_signals(self) -> None:
        signal.signal(signal.SIGTERM, self._on_signal)
@@ -167,12 +169,14 @@ class Daemon:
        parsed = grammar.parse(transcript, cfg.wake_phrases, cfg.wake_fuzzy_threshold,
                               cfg.command_fuzzy_threshold, require_wake, filler=cfg.filler_words)
        if parsed is None or parsed.action is None:
-            self._console.emit(VOICE, f'heard "{transcript}" -> no command matched', "yellow")
+            self._console.emit(VOICE, f'heard "{transcript}" -> no command matched {self._timing()}',
+                               "yellow")
            return
        action = parsed.action

        # a command was recognized — echo what we heard (green) before acting.
-        self._console.emit(VOICE, f'heard "{transcript}" -> {self._describe(action)}', "green")
+        self._console.emit(VOICE, f'heard "{transcript}" -> {self._describe(action)} {self._timing()}',
+                           "green")

        if action.name == "mode":
            new_mode = str(action.arg)
@@ -261,6 +265,10 @@ class Daemon:
            self._pending[session] = 0
        self._console.emit(session, f"{reason} -> injected {self._describe(action)}", "green")

+    def _timing(self) -> str:
+        """Return the STT latency suffix for heard lines, `(<transcribe>ms/<audio>s)`: last transcribe time in whole ms over the clip length in seconds (one decimal)."""
+        return f"({self._last_stt_ms:.0f}ms/{self._last_audio_s:.1f}s)"
+
    @staticmethod
    def _describe(action) -> str:
        if action.arg is None:
@@ -304,12 +312,15 @@ class Daemon:
                    break
                if audio_chunk is None:
                    continue
+                t0 = time.monotonic()
                transcript = self._transcriber.transcribe(audio_chunk, self.config.samplerate)
+                self._last_stt_ms = (time.monotonic() - t0) * 1000.0
+                self._last_audio_s = audio_chunk.size / self.config.samplerate
                if not transcript:
                    continue
                if self.mode == "listen" and not self._has_wake(transcript):
                    if self.config.print_heard:
-                        self._console.emit(VOICE, f'heard (dropped) "{transcript}"', "red")
+                        self._console.emit(VOICE, f'heard (dropped) "{transcript}" {self._timing()}', "red")
                    else:
                        self._console.emit(VOICE, "dropped: non-wake speech (not recorded)", "dim")
                    continue