diff --git a/config.toml b/config.toml index 524fbb2..5c8a041 100644 --- a/config.toml +++ b/config.toml @@ -21,11 +21,12 @@ mode = "listen" ptt_key = "space" [stt] -# faster-whisper model size. "small" is the default — snappy (~1s/command on a strong -# cpu) and good enough with the initial_prompt biasing + lenient wake matching. -# "medium" is more accurate on the coined wake word but ~3x slower (noticeable lag); -# "large-v3" is most accurate and slowest. bump only if recognition is poor. -model = "small" +# faster-whisper model size. "small.en" is the default — the English-only small model +# (~1s/command on a strong cpu, more accurate on english than multilingual "small" at +# the same speed). "medium"/"medium.en" are more accurate but ~3x slower (noticeable +# lag); "large-v3" is most accurate and slowest. drop to "base.en" for max snappiness +# (less accurate). go up a size only if recognition is poor. +model = "small.en" language = "en" # mic device: "auto", or a sounddevice device index (integer) / substring of a # device name. run `claudedo test-audio` to list devices. @@ -47,7 +48,7 @@ min_utterance = 0.3 # onset and ends after this much trailing silence — the natural end of an utterance. # a real pause both ends the command AND separates it from following chatter (the # chatter becomes a separate capture that the wake gate then discards). -silence_ms = 500 +silence_ms = 700 # hard cap so continuous noise can't record forever. 
max_seconds = 10.0 diff --git a/src/claudedo/config.py b/src/claudedo/config.py index 654343b..3cbc8e0 100644 --- a/src/claudedo/config.py +++ b/src/claudedo/config.py @@ -17,7 +17,10 @@ except ModuleNotFoundError: log = logging.getLogger(__name__) _VALID_MODES = ("listen", "ptt") -_VALID_MODELS = ("tiny", "base", "small", "medium", "large-v2", "large-v3") +_VALID_MODELS = ( + "tiny", "base", "small", "medium", "large-v1", "large-v2", "large-v3", + "tiny.en", "base.en", "small.en", "medium.en", +) DEFAULT_CONFIG_PATHS = ( Path(os.environ.get("CLAUDEDO_CONFIG", "")) if os.environ.get("CLAUDEDO_CONFIG") else None, @@ -99,7 +102,7 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config: if mode not in _VALID_MODES: raise ConfigError(f"[input].mode must be one of {_VALID_MODES}, got {mode!r}") - model = _require(raw, "stt", "model", (str,), "small") + model = _require(raw, "stt", "model", (str,), "small.en") if model not in _VALID_MODELS: log.warning("unknown stt model %r — passing through to faster-whisper", model) @@ -114,7 +117,7 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config: samplerate=int(_require(raw, "audio", "samplerate", (int,), 16000)), channels=int(_require(raw, "audio", "channels", (int,), 1)), silence_threshold=float(_require(raw, "audio", "silence_threshold", (int, float), 0.012)), - vad_silence_ms=int(_require(raw, "vad", "silence_ms", (int,), 500)), + vad_silence_ms=int(_require(raw, "vad", "silence_ms", (int,), 700)), vad_max_seconds=float(_require(raw, "vad", "max_seconds", (int, float), 10.0)), min_utterance=float(_require(raw, "audio", "min_utterance", (int, float), 0.3)), type_autosend=bool(_require(raw, "behavior", "type_autosend", (bool,), False)), diff --git a/src/claudedo/console.py b/src/claudedo/console.py index 2a70987..616b090 100644 --- a/src/claudedo/console.py +++ b/src/claudedo/console.py @@ -19,6 +19,7 @@ _COLORS = { "yellow": "\033[33m", "cyan": "\033[36m", "blue": "\033[34m", + "brightblue": 
"\033[94m", "dim": "\033[2m", "bold": "\033[1m", } diff --git a/src/claudedo/daemon.py b/src/claudedo/daemon.py index 8450156..cdbf1bb 100644 --- a/src/claudedo/daemon.py +++ b/src/claudedo/daemon.py @@ -179,7 +179,7 @@ class Daemon: "green") def blue(s): - return self._console.paint(s, "blue") + return self._console.paint(s, "brightblue") if action.name == "mode": new_mode = str(action.arg) if new_mode != self.mode: @@ -219,14 +219,14 @@ class Daemon: self._console.emit(VOICE, f'heard "{transcript}" -> {reason} -> ' f'{self._describe(action)} did nothing', "red") return - self._inject(session, transcript, reason, action) + self._inject(session, action) - def _inject(self, session: str, transcript: str, reason: str, action) -> None: + def _inject(self, session: str, action) -> None: """run a resolved command against `session`, tracking the uncommitted-input buffer so backspace/erase delete only back to the last submit boundary. - the 'heard ...' echo is already printed by _handle; these lines report the - target (reason) and the keystrokes actually injected. + the 'heard ...' echo is already printed by _handle and the [session] prefix + names the target, so these lines just report the keystrokes injected. 
""" name = action.name @@ -237,34 +237,34 @@ class Daemon: if self.config.type_autosend: inject.send_named(session, inject.keys.SUBMIT) self._pending[session] = 0 - self._console.emit(session, f"{reason} -> typed {text!r}" + self._console.emit(session, f"typed {text!r}" + (" + send" if self.config.type_autosend else ""), "green") return if name == "space": n = int(action.arg) inject.perform(session, action) self._pending[session] = self._pending.get(session, 0) + n - self._console.emit(session, f"{reason} -> space x{n}", "green") + self._console.emit(session, f"space x{n}", "green") return if name == "backspace": n = int(action.arg) if n: inject.perform(session, action) self._pending[session] = max(0, self._pending.get(session, 0) - n) - self._console.emit(session, f"{reason} -> backspace x{n}", "green") + self._console.emit(session, f"backspace x{n}", "green") return if name == "erase": n = self._pending.get(session, 0) if n: inject.perform(session, grammar.Action("erase", n)) self._pending[session] = 0 - self._console.emit(session, f"{reason} -> erase x{n} (to last boundary)", "green") + self._console.emit(session, f"erase x{n} (to last boundary)", "green") return inject.perform(session, action) if name == "submit": self._pending[session] = 0 - self._console.emit(session, f"{reason} -> injected {self._describe(action)}", "green") + self._console.emit(session, f"injected {self._describe(action)}", "green") def _timing(self) -> str: """compact STT latency suffix for heard lines (transcribe ms on audio secs)"""