perf: default back to small model; show per-command STT latency

medium added ~3s/command lag (measured ~1.2s small vs ~3s medium on a 7950X3D), so
default model -> small; lean on initial_prompt + lenient wake for the coined word.
every heard line now shows STT latency as (<ms>/<audio>s) — always on, not just
print_heard — so a model change's cost is visible. snappier vad (silence_ms 500)
from the prior commit stands.

Signed-off-by: disqualifier <dev@disqualifier.me>
This commit is contained in:
disqualifier 2026-06-26 02:57:52 -04:00
parent 8e20b7eb0b
commit 4357b14fad
4 changed files with 26 additions and 12 deletions

View File

@ -189,10 +189,12 @@ If Claude Code changes its prompt UI, re-confirm against a live session and upda
Everything tunable lives in [`config.toml`](config.toml): wake phrases, mode + PTT
key, Whisper model/language/device, `[vad]` endpointing, and `[behavior]`
(`type_autosend`, fuzzy thresholds, `filler_words`, `auto_target`, `print_heard`).
The default model is **`medium`** (best accuracy for the coined wake word on a strong
CPU); `small` is faster/less accurate, `large-v3` most accurate. `claudedo -c <path>
...` points at a specific config; otherwise it searches `$CLAUDEDO_CONFIG`,
`~/.config/claudedo/config.toml`, then `./config.toml`.
The default model is **`small`** (~1s/command on a strong CPU — snappy, and good
enough with `initial_prompt` biasing); `medium` is more accurate on the coined wake
word but ~3× slower (noticeable lag), `large-v3` most accurate/slowest. Every `heard`
line shows the STT latency as `(<ms>/<audio>s)` so you can see what a model change
costs. `claudedo -c <path> ...` points at a specific config; otherwise it searches
`$CLAUDEDO_CONFIG`, `~/.config/claudedo/config.toml`, then `./config.toml`.
- **STT biasing.** The transcriber is seeded with an `initial_prompt` built from the
configured wake phrases + command vocabulary (one source — `grammar.vocabulary()`),

View File

@ -21,10 +21,11 @@ mode = "listen"
ptt_key = "space"
[stt]
# faster-whisper model size. "medium" is the default — biggest accuracy gain for the
# coined wake word ("claudedo" / "claude do") and fine on a strong cpu. "small" is
# faster but less accurate; "large-v3" is most accurate if medium still struggles.
model = "medium"
# faster-whisper model size. "small" is the default — snappy (~1s/command on a strong
# cpu) and good enough with the initial_prompt biasing + lenient wake matching.
# "medium" is more accurate on the coined wake word but ~3x slower (noticeable lag);
# "large-v3" is most accurate and slowest. bump only if recognition is poor.
model = "small"
language = "en"
# mic device: "auto", or a sounddevice device index (integer) / substring of a
# device name. run `claudedo test-audio` to list devices.

View File

@ -99,7 +99,7 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config:
if mode not in _VALID_MODES:
raise ConfigError(f"[input].mode must be one of {_VALID_MODES}, got {mode!r}")
model = _require(raw, "stt", "model", (str,), "medium")
model = _require(raw, "stt", "model", (str,), "small")
if model not in _VALID_MODELS:
log.warning("unknown stt model %r — passing through to faster-whisper", model)

View File

@ -117,6 +117,8 @@ class Daemon:
self._ptt = _PTTKey()
self._pending: dict[str, int] = {}
self._console = Console()
self._last_stt_ms = 0.0
self._last_audio_s = 0.0
def _install_signals(self) -> None:
signal.signal(signal.SIGTERM, self._on_signal)
@ -167,12 +169,14 @@ class Daemon:
parsed = grammar.parse(transcript, cfg.wake_phrases, cfg.wake_fuzzy_threshold,
cfg.command_fuzzy_threshold, require_wake, filler=cfg.filler_words)
if parsed is None or parsed.action is None:
self._console.emit(VOICE, f'heard "{transcript}" -> no command matched', "yellow")
self._console.emit(VOICE, f'heard "{transcript}" -> no command matched {self._timing()}',
"yellow")
return
action = parsed.action
# a command was recognized — echo what we heard (green) before acting.
self._console.emit(VOICE, f'heard "{transcript}" -> {self._describe(action)}', "green")
self._console.emit(VOICE, f'heard "{transcript}" -> {self._describe(action)} {self._timing()}',
"green")
if action.name == "mode":
new_mode = str(action.arg)
@ -261,6 +265,10 @@ class Daemon:
self._pending[session] = 0
self._console.emit(session, f"{reason} -> injected {self._describe(action)}", "green")
def _timing(self) -> str:
    """latency tag appended to heard lines.

    renders the most recent transcribe duration (whole milliseconds) and the
    length of the audio it ran on (seconds, one decimal) as ``(<ms>ms/<audio>s)``.
    """
    stt_ms = format(self._last_stt_ms, ".0f")
    audio_s = format(self._last_audio_s, ".1f")
    return "(" + stt_ms + "ms/" + audio_s + "s)"
@staticmethod
def _describe(action) -> str:
if action.arg is None:
@ -304,12 +312,15 @@ class Daemon:
break
if audio_chunk is None:
continue
t0 = time.monotonic()
transcript = self._transcriber.transcribe(audio_chunk, self.config.samplerate)
self._last_stt_ms = (time.monotonic() - t0) * 1000.0
self._last_audio_s = audio_chunk.size / self.config.samplerate
if not transcript:
continue
if self.mode == "listen" and not self._has_wake(transcript):
if self.config.print_heard:
self._console.emit(VOICE, f'heard (dropped) "{transcript}"', "red")
self._console.emit(VOICE, f'heard (dropped) "{transcript}" {self._timing()}', "red")
else:
self._console.emit(VOICE, "dropped: non-wake speech (not recorded)", "dim")
continue