perf: default back to small model; show per-command STT latency
medium added ~3s/command lag (measured ~1.2s small vs ~3s medium on a 7950X3D), so default model -> small; lean on initial_prompt + lenient wake for the coined word. every heard line now shows STT latency as (<ms>ms/<audio>s) — always on, not just print_heard — so a model change's cost is visible. snappier vad (silence_ms 500) from the prior commit stands. Signed-off-by: disqualifier <dev@disqualifier.me>
This commit is contained in:
parent
8e20b7eb0b
commit
4357b14fad
10
README.md
10
README.md
@ -189,10 +189,12 @@ If Claude Code changes its prompt UI, re-confirm against a live session and upda
|
|||||||
Everything tunable lives in [`config.toml`](config.toml): wake phrases, mode + PTT
|
Everything tunable lives in [`config.toml`](config.toml): wake phrases, mode + PTT
|
||||||
key, Whisper model/language/device, `[vad]` endpointing, and `[behavior]`
|
key, Whisper model/language/device, `[vad]` endpointing, and `[behavior]`
|
||||||
(`type_autosend`, fuzzy thresholds, `filler_words`, `auto_target`, `print_heard`).
|
(`type_autosend`, fuzzy thresholds, `filler_words`, `auto_target`, `print_heard`).
|
||||||
The default model is **`medium`** (best accuracy for the coined wake word on a strong
|
The default model is **`small`** (~1s/command on a strong CPU — snappy, and good
|
||||||
CPU); `small` is faster/less accurate, `large-v3` most accurate. `claudedo -c <path>
|
enough with `initial_prompt` biasing); `medium` is more accurate on the coined wake
|
||||||
...` points at a specific config; otherwise it searches `$CLAUDEDO_CONFIG`,
|
word but ~3× slower (noticeable lag); `large-v3` is most accurate/slowest. Every `heard`
|
||||||
`~/.config/claudedo/config.toml`, then `./config.toml`.
|
line shows the STT latency as `(<ms>ms/<audio>s)` so you can see what a model change
|
||||||
|
costs. `claudedo -c <path> ...` points at a specific config; otherwise it searches
|
||||||
|
`$CLAUDEDO_CONFIG`, `~/.config/claudedo/config.toml`, then `./config.toml`.
|
||||||
|
|
||||||
- **STT biasing.** The transcriber is seeded with an `initial_prompt` built from the
|
- **STT biasing.** The transcriber is seeded with an `initial_prompt` built from the
|
||||||
configured wake phrases + command vocabulary (one source — `grammar.vocabulary()`),
|
configured wake phrases + command vocabulary (one source — `grammar.vocabulary()`),
|
||||||
|
|||||||
@ -21,10 +21,11 @@ mode = "listen"
|
|||||||
ptt_key = "space"
|
ptt_key = "space"
|
||||||
|
|
||||||
[stt]
|
[stt]
|
||||||
# faster-whisper model size. "medium" is the default — biggest accuracy gain for the
|
# faster-whisper model size. "small" is the default — snappy (~1s/command on a strong
|
||||||
# coined wake word ("claudedo" / "claude do") and fine on a strong cpu. "small" is
|
# cpu) and good enough with the initial_prompt biasing + lenient wake matching.
|
||||||
# faster but less accurate; "large-v3" is most accurate if medium still struggles.
|
# "medium" is more accurate on the coined wake word but ~3x slower (noticeable lag);
|
||||||
model = "medium"
|
# "large-v3" is most accurate and slowest. bump only if recognition is poor.
|
||||||
|
model = "small"
|
||||||
language = "en"
|
language = "en"
|
||||||
# mic device: "auto", or a sounddevice device index (integer) / substring of a
|
# mic device: "auto", or a sounddevice device index (integer) / substring of a
|
||||||
# device name. run `claudedo test-audio` to list devices.
|
# device name. run `claudedo test-audio` to list devices.
|
||||||
|
|||||||
@ -99,7 +99,7 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config:
|
|||||||
if mode not in _VALID_MODES:
|
if mode not in _VALID_MODES:
|
||||||
raise ConfigError(f"[input].mode must be one of {_VALID_MODES}, got {mode!r}")
|
raise ConfigError(f"[input].mode must be one of {_VALID_MODES}, got {mode!r}")
|
||||||
|
|
||||||
model = _require(raw, "stt", "model", (str,), "medium")
|
model = _require(raw, "stt", "model", (str,), "small")
|
||||||
if model not in _VALID_MODELS:
|
if model not in _VALID_MODELS:
|
||||||
log.warning("unknown stt model %r — passing through to faster-whisper", model)
|
log.warning("unknown stt model %r — passing through to faster-whisper", model)
|
||||||
|
|
||||||
|
|||||||
@ -117,6 +117,8 @@ class Daemon:
|
|||||||
self._ptt = _PTTKey()
|
self._ptt = _PTTKey()
|
||||||
self._pending: dict[str, int] = {}
|
self._pending: dict[str, int] = {}
|
||||||
self._console = Console()
|
self._console = Console()
|
||||||
|
self._last_stt_ms = 0.0
|
||||||
|
self._last_audio_s = 0.0
|
||||||
|
|
||||||
def _install_signals(self) -> None:
|
def _install_signals(self) -> None:
|
||||||
signal.signal(signal.SIGTERM, self._on_signal)
|
signal.signal(signal.SIGTERM, self._on_signal)
|
||||||
@ -167,12 +169,14 @@ class Daemon:
|
|||||||
parsed = grammar.parse(transcript, cfg.wake_phrases, cfg.wake_fuzzy_threshold,
|
parsed = grammar.parse(transcript, cfg.wake_phrases, cfg.wake_fuzzy_threshold,
|
||||||
cfg.command_fuzzy_threshold, require_wake, filler=cfg.filler_words)
|
cfg.command_fuzzy_threshold, require_wake, filler=cfg.filler_words)
|
||||||
if parsed is None or parsed.action is None:
|
if parsed is None or parsed.action is None:
|
||||||
self._console.emit(VOICE, f'heard "{transcript}" -> no command matched', "yellow")
|
self._console.emit(VOICE, f'heard "{transcript}" -> no command matched {self._timing()}',
|
||||||
|
"yellow")
|
||||||
return
|
return
|
||||||
action = parsed.action
|
action = parsed.action
|
||||||
|
|
||||||
# a command was recognized — echo what we heard (green) before acting.
|
# a command was recognized — echo what we heard (green) before acting.
|
||||||
self._console.emit(VOICE, f'heard "{transcript}" -> {self._describe(action)}', "green")
|
self._console.emit(VOICE, f'heard "{transcript}" -> {self._describe(action)} {self._timing()}',
|
||||||
|
"green")
|
||||||
|
|
||||||
if action.name == "mode":
|
if action.name == "mode":
|
||||||
new_mode = str(action.arg)
|
new_mode = str(action.arg)
|
||||||
@ -261,6 +265,10 @@ class Daemon:
|
|||||||
self._pending[session] = 0
|
self._pending[session] = 0
|
||||||
self._console.emit(session, f"{reason} -> injected {self._describe(action)}", "green")
|
self._console.emit(session, f"{reason} -> injected {self._describe(action)}", "green")
|
||||||
|
|
||||||
|
def _timing(self) -> str:
|
||||||
|
"""compact STT latency suffix for heard lines (transcribe ms on audio secs)"""
|
||||||
|
return f"({self._last_stt_ms:.0f}ms/{self._last_audio_s:.1f}s)"
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _describe(action) -> str:
|
def _describe(action) -> str:
|
||||||
if action.arg is None:
|
if action.arg is None:
|
||||||
@ -304,12 +312,15 @@ class Daemon:
|
|||||||
break
|
break
|
||||||
if audio_chunk is None:
|
if audio_chunk is None:
|
||||||
continue
|
continue
|
||||||
|
t0 = time.monotonic()
|
||||||
transcript = self._transcriber.transcribe(audio_chunk, self.config.samplerate)
|
transcript = self._transcriber.transcribe(audio_chunk, self.config.samplerate)
|
||||||
|
self._last_stt_ms = (time.monotonic() - t0) * 1000.0
|
||||||
|
self._last_audio_s = audio_chunk.size / self.config.samplerate
|
||||||
if not transcript:
|
if not transcript:
|
||||||
continue
|
continue
|
||||||
if self.mode == "listen" and not self._has_wake(transcript):
|
if self.mode == "listen" and not self._has_wake(transcript):
|
||||||
if self.config.print_heard:
|
if self.config.print_heard:
|
||||||
self._console.emit(VOICE, f'heard (dropped) "{transcript}"', "red")
|
self._console.emit(VOICE, f'heard (dropped) "{transcript}" {self._timing()}', "red")
|
||||||
else:
|
else:
|
||||||
self._console.emit(VOICE, "dropped: non-wake speech (not recorded)", "dim")
|
self._console.emit(VOICE, "dropped: non-wake speech (not recorded)", "dim")
|
||||||
continue
|
continue
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user