perf: default back to small model; show per-command STT latency
medium took ~3s/command (measured ~1.2s small vs ~3s medium on a 7950X3D), a noticeable lag, so default model -> small; lean on initial_prompt + lenient wake for the coined word. every heard line now shows STT latency as (<ms>ms/<audio>s) — always on, not just print_heard — so a model change's cost is visible. snappier vad (silence_ms 500) from the prior commit stands. Signed-off-by: disqualifier <dev@disqualifier.me>
This commit is contained in:
parent
8e20b7eb0b
commit
4357b14fad
10
README.md
10
README.md
@ -189,10 +189,12 @@ If Claude Code changes its prompt UI, re-confirm against a live session and upda
|
||||
Everything tunable lives in [`config.toml`](config.toml): wake phrases, mode + PTT
|
||||
key, Whisper model/language/device, `[vad]` endpointing, and `[behavior]`
|
||||
(`type_autosend`, fuzzy thresholds, `filler_words`, `auto_target`, `print_heard`).
|
||||
The default model is **`medium`** (best accuracy for the coined wake word on a strong
|
||||
CPU); `small` is faster/less accurate, `large-v3` most accurate. `claudedo -c <path>
|
||||
...` points at a specific config; otherwise it searches `$CLAUDEDO_CONFIG`,
|
||||
`~/.config/claudedo/config.toml`, then `./config.toml`.
|
||||
The default model is **`small`** (~1s/command on a strong CPU — snappy, and good
|
||||
enough with `initial_prompt` biasing); `medium` is more accurate on the coined wake
|
||||
word but ~3× slower (noticeable lag), `large-v3` most accurate/slowest. Every `heard`
|
||||
line shows the STT latency as `(<ms>ms/<audio>s)` so you can see what a model change
|
||||
costs. `claudedo -c <path> ...` points at a specific config; otherwise it searches
|
||||
`$CLAUDEDO_CONFIG`, `~/.config/claudedo/config.toml`, then `./config.toml`.
|
||||
|
||||
- **STT biasing.** The transcriber is seeded with an `initial_prompt` built from the
|
||||
configured wake phrases + command vocabulary (one source — `grammar.vocabulary()`),
|
||||
|
||||
@ -21,10 +21,11 @@ mode = "listen"
|
||||
ptt_key = "space"
|
||||
|
||||
[stt]
|
||||
# faster-whisper model size. "medium" is the default — biggest accuracy gain for the
|
||||
# coined wake word ("claudedo" / "claude do") and fine on a strong cpu. "small" is
|
||||
# faster but less accurate; "large-v3" is most accurate if medium still struggles.
|
||||
model = "medium"
|
||||
# faster-whisper model size. "small" is the default — snappy (~1s/command on a strong
|
||||
# cpu) and good enough with the initial_prompt biasing + lenient wake matching.
|
||||
# "medium" is more accurate on the coined wake word but ~3x slower (noticeable lag);
|
||||
# "large-v3" is most accurate and slowest. bump only if recognition is poor.
|
||||
model = "small"
|
||||
language = "en"
|
||||
# mic device: "auto", or a sounddevice device index (integer) / substring of a
|
||||
# device name. run `claudedo test-audio` to list devices.
|
||||
|
||||
@ -99,7 +99,7 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config:
|
||||
if mode not in _VALID_MODES:
|
||||
raise ConfigError(f"[input].mode must be one of {_VALID_MODES}, got {mode!r}")
|
||||
|
||||
model = _require(raw, "stt", "model", (str,), "medium")
|
||||
model = _require(raw, "stt", "model", (str,), "small")
|
||||
if model not in _VALID_MODELS:
|
||||
log.warning("unknown stt model %r — passing through to faster-whisper", model)
|
||||
|
||||
|
||||
@ -117,6 +117,8 @@ class Daemon:
|
||||
self._ptt = _PTTKey()
|
||||
self._pending: dict[str, int] = {}
|
||||
self._console = Console()
|
||||
self._last_stt_ms = 0.0
|
||||
self._last_audio_s = 0.0
|
||||
|
||||
def _install_signals(self) -> None:
|
||||
signal.signal(signal.SIGTERM, self._on_signal)
|
||||
@ -167,12 +169,14 @@ class Daemon:
|
||||
parsed = grammar.parse(transcript, cfg.wake_phrases, cfg.wake_fuzzy_threshold,
|
||||
cfg.command_fuzzy_threshold, require_wake, filler=cfg.filler_words)
|
||||
if parsed is None or parsed.action is None:
|
||||
self._console.emit(VOICE, f'heard "{transcript}" -> no command matched', "yellow")
|
||||
self._console.emit(VOICE, f'heard "{transcript}" -> no command matched {self._timing()}',
|
||||
"yellow")
|
||||
return
|
||||
action = parsed.action
|
||||
|
||||
# a command was recognized — echo what we heard (green) before acting.
|
||||
self._console.emit(VOICE, f'heard "{transcript}" -> {self._describe(action)}', "green")
|
||||
self._console.emit(VOICE, f'heard "{transcript}" -> {self._describe(action)} {self._timing()}',
|
||||
"green")
|
||||
|
||||
if action.name == "mode":
|
||||
new_mode = str(action.arg)
|
||||
@ -261,6 +265,10 @@ class Daemon:
|
||||
self._pending[session] = 0
|
||||
self._console.emit(session, f"{reason} -> injected {self._describe(action)}", "green")
|
||||
|
||||
def _timing(self) -> str:
|
||||
"""compact STT latency suffix for heard lines (transcribe ms on audio secs)"""
|
||||
return f"({self._last_stt_ms:.0f}ms/{self._last_audio_s:.1f}s)"
|
||||
|
||||
@staticmethod
|
||||
def _describe(action) -> str:
|
||||
if action.arg is None:
|
||||
@ -304,12 +312,15 @@ class Daemon:
|
||||
break
|
||||
if audio_chunk is None:
|
||||
continue
|
||||
t0 = time.monotonic()
|
||||
transcript = self._transcriber.transcribe(audio_chunk, self.config.samplerate)
|
||||
self._last_stt_ms = (time.monotonic() - t0) * 1000.0
|
||||
self._last_audio_s = audio_chunk.size / self.config.samplerate
|
||||
if not transcript:
|
||||
continue
|
||||
if self.mode == "listen" and not self._has_wake(transcript):
|
||||
if self.config.print_heard:
|
||||
self._console.emit(VOICE, f'heard (dropped) "{transcript}"', "red")
|
||||
self._console.emit(VOICE, f'heard (dropped) "{transcript}" {self._timing()}', "red")
|
||||
else:
|
||||
self._console.emit(VOICE, "dropped: non-wake speech (not recorded)", "dim")
|
||||
continue
|
||||
|
||||
Loading…
Reference in New Issue
Block a user