diff --git a/README.md b/README.md index e55757f..3d957f2 100644 --- a/README.md +++ b/README.md @@ -183,19 +183,29 @@ If Claude Code changes its prompt UI, re-confirm against a live session and upda ## Config Everything tunable lives in [`config.toml`](config.toml): wake phrases, mode + PTT -key, Whisper model/language/device, audio segmentation thresholds, and `[behavior]` -(`type_autosend`, `filler_words`, `auto_target`, `print_heard`). The default model is -`small`; bump to `medium` if the coined wake word is recognized poorly. `claudedo -c - ...` points at a specific config; otherwise it searches `$CLAUDEDO_CONFIG`, +key, Whisper model/language/device, `[vad]` endpointing, and `[behavior]` +(`type_autosend`, fuzzy thresholds, `filler_words`, `auto_target`, `print_heard`). +The default model is **`medium`** (best accuracy/speed trade-off for the coined wake word on a strong +CPU); `small` is faster but less accurate, `large-v3` is the most accurate but slowest. `claudedo -c +...` points at a specific config; otherwise it searches `$CLAUDEDO_CONFIG`, `~/.config/claudedo/config.toml`, then `./config.toml`. -- **`auto_target`** (default `false`): with no sticky target set and exactly one - `claude-*` session running, `false` makes a bare command do nothing and ask you to - `set` one; `true` auto-targets that single session. -- **`print_heard`** (default `false`, debug): prints non-wake transcripts to the - console so you can see how Whisper renders your wake word. Turn it on to debug - detection, then off. Whisper has no token for "claudedo" — it commonly emits - "claude do", which is in the default wake list. +- **STT biasing.** The transcriber is seeded with an `initial_prompt` built from the + configured wake phrases + command vocabulary (one source — `grammar.vocabulary()`), + so Whisper is conditioned to expect "claudedo" and the command words. +- **Split fuzzy thresholds.** `wake_fuzzy_threshold` (default `0.6`, lenient) vs + `command_fuzzy_threshold` (default `0.8`, tight). 
The asymmetry is deliberate: a + false *wake* is cheap (it wakes, finds no command, does nothing), but a false + *command* fires the wrong action. Prefer expanding command synonyms over loosening + the command threshold. +- **`[vad]` endpointing.** Capture starts on speech and ends after `silence_ms` + (default 800) of trailing silence — Alexa-style record-until-pause — capped at + `max_seconds` (default 10). The pause both ends a command and separates it from + following chatter (the chatter is a separate capture the wake gate discards). +- **`auto_target`** (default `false`): with no sticky target and one session running, + `false` does nothing and asks you to `set`; `true` auto-uses that session. +- **`print_heard`** (default `false`, debug): prints non-wake transcripts so you can + see how Whisper renders your wake word, then tune the wake list/threshold. ## Requirements diff --git a/config.toml b/config.toml index 0d5c99d..2e64914 100644 --- a/config.toml +++ b/config.toml @@ -21,10 +21,10 @@ mode = "listen" ptt_key = "space" [stt] -# faster-whisper model size. "small" is a good accuracy/latency balance for the -# short command grammar (~sub-second per chunk on a strong cpu). if the coined wake -# word "claudedo" is recognized poorly, bump to "medium" (slower per chunk). -model = "small" +# faster-whisper model size. "medium" is the default — biggest accuracy gain for the +# coined wake word ("claudedo" / "claude do") and fine on a strong cpu. "small" is +# faster but less accurate; "large-v3" is most accurate if medium still struggles. +model = "medium" language = "en" # mic device: "auto", or a sounddevice device index (integer) / substring of a # device name. run `claudedo test-audio` to list devices. @@ -36,21 +36,30 @@ compute = "auto" # capture parameters. 16 kHz mono is what whisper expects. samplerate = 16000 channels = 1 -# listen-mode silence segmentation: an utterance ends after this many seconds below -# the rms threshold. 
keeps latency low without streaming. +# rms energy below this counts as silence (the VAD onset/endpoint floor). silence_threshold = 0.012 -silence_duration = 0.8 # ignore utterances shorter than this (clicks, coughs). min_utterance = 0.3 -# hard cap on a single utterance so a stuck stream can't grow unbounded. -max_utterance = 15.0 + +[vad] +# Alexa-style record-until-pause endpointing (listen mode). capture starts on speech +# onset and ends after this much trailing silence — the natural end of an utterance. +# a real pause both ends the command AND separates it from following chatter (the +# chatter becomes a separate capture that the wake gate then discards). +silence_ms = 800 +# hard cap so continuous noise can't record forever. also caps a held ptt capture. +max_seconds = 10.0 [behavior] # dictation never auto-submits: "type " inserts literal text only; you say # "send" separately to submit (read-before-send). type_autosend = false -# fuzzy match ratio (0..1) required to accept a wake phrase / command token. -match_threshold = 0.8 +# fuzzy match ratios (0..1). the asymmetry is deliberate: a false WAKE is cheap (it +# wakes, finds no command, does nothing), so wake is lenient; a false COMMAND fires +# the WRONG action, so commands stay tight. lower = more lenient = more matches. +# prefer expanding command synonyms over loosening command_fuzzy_threshold. +wake_fuzzy_threshold = 0.6 +command_fuzzy_threshold = 0.8 # optional filler words that may precede a command and are ignored for matching: # "select yes" / "use yes" behave like "yes". (a filler word followed by a digit is # the select command, e.g. "select 1", and is not dropped.) 
diff --git a/pyproject.toml b/pyproject.toml index a7ce5e6..e6e8b95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "claudedo" -version = "0.1.2" +version = "0.1.3" description = "voice-control daemon for claude code (local STT -> tmux send-keys)" readme = "README.md" requires-python = ">=3.10" diff --git a/src/claudedo/__init__.py b/src/claudedo/__init__.py index c404ff3..aeb1223 100644 --- a/src/claudedo/__init__.py +++ b/src/claudedo/__init__.py @@ -1,3 +1,3 @@ """claudedo — voice-control daemon for claude code (local STT -> tmux send-keys)""" -__version__ = "0.1.2" +__version__ = "0.1.3" diff --git a/src/claudedo/config.py b/src/claudedo/config.py index 45f4b23..9b2e299 100644 --- a/src/claudedo/config.py +++ b/src/claudedo/config.py @@ -44,11 +44,12 @@ class Config: samplerate: int channels: int silence_threshold: float - silence_duration: float + vad_silence_ms: int + vad_max_seconds: float min_utterance: float - max_utterance: float type_autosend: bool - match_threshold: float + wake_fuzzy_threshold: float + command_fuzzy_threshold: float filler_words: tuple[str, ...] 
auto_target: bool print_heard: bool @@ -98,7 +99,7 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config: if mode not in _VALID_MODES: raise ConfigError(f"[input].mode must be one of {_VALID_MODES}, got {mode!r}") - model = _require(raw, "stt", "model", (str,), "small") + model = _require(raw, "stt", "model", (str,), "medium") if model not in _VALID_MODELS: log.warning("unknown stt model %r — passing through to faster-whisper", model) @@ -113,19 +114,25 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config: samplerate=int(_require(raw, "audio", "samplerate", (int,), 16000)), channels=int(_require(raw, "audio", "channels", (int,), 1)), silence_threshold=float(_require(raw, "audio", "silence_threshold", (int, float), 0.012)), - silence_duration=float(_require(raw, "audio", "silence_duration", (int, float), 0.8)), + vad_silence_ms=int(_require(raw, "vad", "silence_ms", (int,), 800)), + vad_max_seconds=float(_require(raw, "vad", "max_seconds", (int, float), 10.0)), min_utterance=float(_require(raw, "audio", "min_utterance", (int, float), 0.3)), - max_utterance=float(_require(raw, "audio", "max_utterance", (int, float), 15.0)), type_autosend=bool(_require(raw, "behavior", "type_autosend", (bool,), False)), - match_threshold=float(_require(raw, "behavior", "match_threshold", (int, float), 0.8)), + wake_fuzzy_threshold=float(_require(raw, "behavior", "wake_fuzzy_threshold", (int, float), 0.6)), + command_fuzzy_threshold=float(_require(raw, "behavior", "command_fuzzy_threshold", + (int, float), 0.8)), filler_words=tuple(_require(raw, "behavior", "filler_words", (list,), ["select", "use", "choose"])), auto_target=bool(_require(raw, "behavior", "auto_target", (bool,), False)), print_heard=bool(_require(raw, "behavior", "print_heard", (bool,), False)), source_path=path, ) - if not 0.0 < cfg.match_threshold <= 1.0: - raise ConfigError("[behavior].match_threshold must be in (0, 1]") + for label, val in (("wake_fuzzy_threshold", 
cfg.wake_fuzzy_threshold), + ("command_fuzzy_threshold", cfg.command_fuzzy_threshold)): + if not 0.0 < val <= 1.0: + raise ConfigError(f"[behavior].{label} must be in (0, 1]") + if cfg.vad_silence_ms <= 0 or cfg.vad_max_seconds <= 0: + raise ConfigError("[vad].silence_ms and max_seconds must be positive") if cfg.samplerate <= 0 or cfg.channels <= 0: raise ConfigError("[audio].samplerate and channels must be positive") return cfg diff --git a/src/claudedo/daemon.py b/src/claudedo/daemon.py index fb019b7..c75c86b 100644 --- a/src/claudedo/daemon.py +++ b/src/claudedo/daemon.py @@ -136,6 +136,7 @@ class Daemon: model=cfg.stt_model, language=cfg.stt_language, device=cfg.stt_compute if cfg.stt_compute in ("cpu", "cuda") else "auto", compute_type="auto", + initial_prompt=grammar.initial_prompt(cfg.wake_phrases), ) if audio.warm_up(cfg.samplerate, cfg.channels, self._device): log.info("mic warmed up (source live)") @@ -151,20 +152,20 @@ class Daemon: return audio.record_while( cfg.samplerate, cfg.channels, self._device, held=lambda: not self._ptt.wait_press(self.stopped), - max_utterance=cfg.max_utterance, min_utterance=cfg.min_utterance, + max_utterance=cfg.vad_max_seconds, min_utterance=cfg.min_utterance, ) return audio.record_until_silence( cfg.samplerate, cfg.channels, self._device, - silence_threshold=cfg.silence_threshold, silence_duration=cfg.silence_duration, - min_utterance=cfg.min_utterance, max_utterance=cfg.max_utterance, + silence_threshold=cfg.silence_threshold, silence_duration=cfg.vad_silence_ms / 1000.0, + min_utterance=cfg.min_utterance, max_utterance=cfg.vad_max_seconds, stop=self.stopped, ) def _handle(self, transcript: str) -> None: cfg = self.config require_wake = self.mode == "listen" - parsed = grammar.parse(transcript, cfg.wake_phrases, cfg.match_threshold, require_wake, - filler=cfg.filler_words) + parsed = grammar.parse(transcript, cfg.wake_phrases, cfg.wake_fuzzy_threshold, + cfg.command_fuzzy_threshold, require_wake, filler=cfg.filler_words) 
if parsed is None or parsed.action is None: self._console.emit(VOICE, f'heard "{transcript}" -> no command matched', "yellow") return @@ -257,7 +258,8 @@ class Daemon: invariant: non-command speech is discarded, never recorded. """ cfg = self.config - return grammar.strip_wake(transcript, cfg.wake_phrases, cfg.match_threshold, True) is not None + return grammar.strip_wake(transcript, cfg.wake_phrases, + cfg.wake_fuzzy_threshold, True) is not None def _print_startup(self) -> None: cfg = self.config diff --git a/src/claudedo/grammar.py b/src/claudedo/grammar.py index 4fcce1e..da27e2a 100644 --- a/src/claudedo/grammar.py +++ b/src/claudedo/grammar.py @@ -33,11 +33,32 @@ _COUNT_WORDS = { "sixteen": 16, "seventeen": 17, "eighteen": 18, "nineteen": 19, "twenty": 20, } +_YES_VERBS = ("yes", "yeah", "yep", "yup") +_NO_VERBS = ("no", "nope", "nah") +_APPROVE_VERBS = ("approve", "allow") +_DENY_VERBS = ("deny", "reject") +_SUBMIT_VERBS = ("send", "enter", "submit") +_CANCEL_VERBS = ("cancel", "escape") +_TYPE_VERBS = ("type", "dictate", "write") +_BACKSPACE_VERBS = ("backspace", "delete") +_SPACE_VERBS = ("space", "spacebar") +_ADD_VERBS = ("add", "insert") +_ERASE_VERBS = ("erase", "clear", "wipe") +_MODE_VERBS = ("mode",) _STICKY_VERBS = ("set", "sticky", "switch") _ONESHOT_VERBS = ("target",) _UNSET_VERBS = ("unset", "unsticky") _LIST_VERBS = ("list", "sessions") _SELECT_VERBS = ("select", "option", "choose", "number") + +# every command/synonym word, for biasing the STT toward the vocabulary we expect. 
+_COMMAND_WORDS = ( + _YES_VERBS + _NO_VERBS + _APPROVE_VERBS + _DENY_VERBS + _SUBMIT_VERBS + + _CANCEL_VERBS + _TYPE_VERBS + _BACKSPACE_VERBS + _SPACE_VERBS + _ADD_VERBS + + _ERASE_VERBS + _MODE_VERBS + _STICKY_VERBS + _ONESHOT_VERBS + _UNSET_VERBS + + _LIST_VERBS + _SELECT_VERBS + ("ptt", "listen") + + ("one", "two", "three", "four") +) DEFAULT_FILLER = ("select", "use", "choose") @@ -79,6 +100,26 @@ def normalize(text: str) -> str: return " ".join(tokens) +def vocabulary(wake_phrases: list[str]) -> list[str]: + """the wake + command vocabulary, deduped in first-seen order. + + single source for biasing the STT: the same wake phrases the matcher uses plus + every command/synonym word in _COMMAND_WORDS. no separate hardcoded copy. + """ + seen: dict[str, None] = {} + for word in list(wake_phrases) + list(_COMMAND_WORDS): + key = word.strip() + if key and key not in seen: + seen[key] = None + return list(seen) + + +def initial_prompt(wake_phrases: list[str]) -> str: + """a comma-joined vocabulary string to pass faster-whisper as initial_prompt, + conditioning transcription toward the words we expect (esp. 
the coined wake)""" + return ", ".join(vocabulary(wake_phrases)) + + def _ratio(a: str, b: str) -> float: return SequenceMatcher(None, a, b).ratio() @@ -163,40 +204,40 @@ def match_command(remainder: str, threshold: float) -> Action | None: if head in _INDEX_WORDS: return Action("select", _INDEX_WORDS[head]) - if _fuzzy_in(head, ("yes", "yeah", "yep", "yup"), threshold): + if _fuzzy_in(head, _YES_VERBS, threshold): return Action("yes") - if _fuzzy_in(head, ("no", "nope", "nah"), threshold): + if _fuzzy_in(head, _NO_VERBS, threshold): return Action("no") - if _fuzzy_in(head, ("approve", "allow"), threshold): + if _fuzzy_in(head, _APPROVE_VERBS, threshold): return Action("approve") - if _fuzzy_in(head, ("deny", "reject"), threshold): + if _fuzzy_in(head, _DENY_VERBS, threshold): return Action("deny") - if _fuzzy_in(head, ("send", "enter", "submit"), threshold): + if _fuzzy_in(head, _SUBMIT_VERBS, threshold): return Action("submit") - if _fuzzy_in(head, ("cancel", "escape"), threshold): + if _fuzzy_in(head, _CANCEL_VERBS, threshold): return Action("cancel") if _fuzzy_in(head, _SELECT_VERBS, threshold) and rest and rest[0] in _INDEX_WORDS: return Action("select", _INDEX_WORDS[rest[0]]) - if _fuzzy_in(head, ("type", "dictate", "write"), threshold): + if _fuzzy_in(head, _TYPE_VERBS, threshold): text = " ".join(rest).strip() return Action("type", text) if text else None - if _fuzzy_in(head, ("backspace", "delete"), threshold): + if _fuzzy_in(head, _BACKSPACE_VERBS, threshold): return Action("backspace", _leading_count(rest, default=1)) - if _fuzzy_in(head, ("space", "spacebar"), threshold): + if _fuzzy_in(head, _SPACE_VERBS, threshold): return Action("space", _leading_count(rest, default=1)) - if _fuzzy_in(head, ("add", "insert"), threshold) and rest: + if _fuzzy_in(head, _ADD_VERBS, threshold) and rest: tail = [t for t in rest if t not in ("a", "an")] if any(_fuzzy_in(t, ("space", "spaces"), threshold) for t in tail): count = next((int(t) for t in tail if t.isdigit()), 
next((_COUNT_WORDS[t] for t in tail if t in _COUNT_WORDS), 1)) return Action("space", count) - if _fuzzy_in(head, ("erase", "clear", "wipe"), threshold): + if _fuzzy_in(head, _ERASE_VERBS, threshold): return Action("erase") - if _fuzzy_in(head, ("mode",), threshold) and rest: + if _fuzzy_in(head, _MODE_VERBS, threshold) and rest: if _fuzzy_in(rest[0], ("ptt",), threshold) or "push" in rest[0]: return Action("mode", "ptt") if _fuzzy_in(rest[0], ("listen",), threshold): @@ -227,24 +268,28 @@ def _strip_filler(tokens: list[str], filler: tuple[str, ...], threshold: float) return tokens -def parse(transcript: str, wake_phrases: list[str], threshold: float, - require_wake: bool, filler: tuple[str, ...] = DEFAULT_FILLER) -> ParsedCommand | None: +def parse(transcript: str, wake_phrases: list[str], wake_threshold: float, + command_threshold: float, require_wake: bool, + filler: tuple[str, ...] = DEFAULT_FILLER) -> ParsedCommand | None: """full parse: wake gate -> optional one-shot target -> filler -> command. - returns a ParsedCommand (one_shot, action), or None if the wake gate dropped the - utterance (listen mode, no wake phrase). a ParsedCommand with action=None means a - wake phrase was present but no command matched. + wake_threshold gates the wake phrase (lenient — a false wake is cheap, it just + finds no command); command_threshold gates the command words (stricter — a false + command fires the wrong action). returns a ParsedCommand (one_shot, action), or + None if the wake gate dropped the utterance (listen mode, no wake phrase). a + ParsedCommand with action=None means a wake phrase was present but no command + matched. 
""" - remainder = strip_wake(transcript, wake_phrases, threshold, require_wake) + remainder = strip_wake(transcript, wake_phrases, wake_threshold, require_wake) if remainder is None: return None tokens = remainder.split(" ") if remainder else [] one_shot: str | None = None - if tokens and _fuzzy_in(tokens[0], _ONESHOT_VERBS, threshold) and len(tokens) >= 2: + if tokens and _fuzzy_in(tokens[0], _ONESHOT_VERBS, command_threshold) and len(tokens) >= 2: one_shot = tokens[1] tokens = tokens[2:] - tokens = _strip_filler(tokens, filler, threshold) - action = match_command(" ".join(tokens), threshold) + tokens = _strip_filler(tokens, filler, command_threshold) + action = match_command(" ".join(tokens), command_threshold) return ParsedCommand(one_shot=one_shot, action=action) diff --git a/src/claudedo/stt.py b/src/claudedo/stt.py index c85530e..624e08a 100644 --- a/src/claudedo/stt.py +++ b/src/claudedo/stt.py @@ -82,8 +82,9 @@ class Transcriber: """a loaded faster-whisper model that transcribes float32 mono audio chunks""" def __init__(self, model: str = "small", language: str = "en", device: str = "auto", - compute_type: str = "auto") -> None: + compute_type: str = "auto", initial_prompt: str | None = None) -> None: self.language = language + self.initial_prompt = initial_prompt self._model = self._load(model, device, compute_type) self._warm() @@ -120,6 +121,7 @@ class Transcriber: beam_size=1, vad_filter=True, condition_on_previous_text=False, + initial_prompt=self.initial_prompt, ) text = " ".join(seg.text for seg in segments).strip() return text