diff --git a/README.md b/README.md index a38d30b..482ff8f 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,9 @@ Wake phrases (listen mode), fuzzy-matched: **"claudedo"**, **"hey claude"**. | `approve` / `deny` | allow / deny a permission prompt | | `send` / `enter` | submit (Enter) | | `type <text>` | insert literal text, **no** submit (read-before-send; say "send") | +| `space [<n>]` | insert n spaces (default 1) | +| `backspace [<n>]` (alias `delete`) | delete n chars (default 1), capped at the last submit boundary | +| `erase` (alias `clear`/`wipe`) | delete everything typed since the last submit/boundary | | `mode ptt` / `mode listen` | switch input mode | | `set <name>` (alias `sticky`/`switch`) | set the **sticky** target → `claude-<name>` (persists) | | `target <name> <command>` | **one-shot** override: run that command on `claude-<name>` for this utterance only; sticky default unchanged | @@ -171,11 +174,19 @@ If Claude Code changes its prompt UI, re-confirm against a live session and upda ## Config Everything tunable lives in [`config.toml`](config.toml): wake phrases, mode + PTT -key, Whisper model/language/device, audio segmentation thresholds, and -`type_autosend = false`. The default model is `small`; bump to `medium` if the coined -wake word is recognized poorly. `claudedo -c <path> ...` points at a specific config; -otherwise it searches `$CLAUDEDO_CONFIG`, `~/.config/claudedo/config.toml`, then -`./config.toml`. +key, Whisper model/language/device, audio segmentation thresholds, and `[behavior]` +(`type_autosend`, `filler_words`, `auto_target`, `print_heard`). The default model is +`small`; bump to `medium` if the coined wake word is recognized poorly. `claudedo -c +<path> ...` points at a specific config; otherwise it searches `$CLAUDEDO_CONFIG`, +`~/.config/claudedo/config.toml`, then `./config.toml`. 
+ +- **`auto_target`** (default `false`): with no sticky target set and exactly one + `claude-*` session running, `false` makes a bare command do nothing and ask you to + `set` one; `true` auto-targets that single session. +- **`print_heard`** (default `false`, debug): prints non-wake transcripts to the + console so you can see how Whisper renders your wake word. Turn it on to debug + detection, then off. Whisper has no token for "claudedo" — it commonly emits + "claude do" or "claude due", both of which are in the default wake list. ## Requirements diff --git a/pyproject.toml b/pyproject.toml index 4d6a72b..a7ce5e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "claudedo" -version = "0.1.1" +version = "0.1.2" description = "voice-control daemon for claude code (local STT -> tmux send-keys)" readme = "README.md" requires-python = ">=3.10" diff --git a/src/claudedo/__init__.py b/src/claudedo/__init__.py index 1997cee..c404ff3 100644 --- a/src/claudedo/__init__.py +++ b/src/claudedo/__init__.py @@ -1,3 +1,3 @@ """claudedo — voice-control daemon for claude code (local STT -> tmux send-keys)""" -__version__ = "0.1.1" +__version__ = "0.1.2" diff --git a/src/claudedo/console.py b/src/claudedo/console.py new file mode 100644 index 0000000..e13d155 --- /dev/null +++ b/src/claudedo/console.py @@ -0,0 +1,51 @@ +"""colored, prefixed console output for the daemon's recognition/action feed. + +every line is ``HH:MM:SS [PREFIX] message``. prefixes group the source: a session +name (e.g. ``[claude-libs]``) for anything injected into a tmux session, ``[SYSTEM]`` +for daemon-control/state lines, and ``[VOICE]`` for STT/recognition lines. color is +opt-in via tty detection (or forced): green for successful injections, red for +drops/errors, dim for routine. falls back to plain text when stdout is not a tty. 
+""" + +from __future__ import annotations + +import sys +import time + +RESET = "\033[0m" +_COLORS = { + "green": "\033[32m", + "red": "\033[31m", + "yellow": "\033[33m", + "cyan": "\033[36m", + "dim": "\033[2m", + "bold": "\033[1m", +} + +SYSTEM = "SYSTEM" +VOICE = "VOICE" + + +class Console: + """formats and prints daemon log lines with timestamp, prefix, and color""" + + def __init__(self, color: bool | None = None, stream=None, clock=None) -> None: + self.stream = stream if stream is not None else sys.stdout + self._clock = clock or time.localtime + if color is None: + color = hasattr(self.stream, "isatty") and self.stream.isatty() + self.color = bool(color) + + def _stamp(self) -> str: + t = self._clock() + return f"{t.tm_hour:02d}:{t.tm_min:02d}:{t.tm_sec:02d}" + + def _paint(self, text: str, color: str | None) -> str: + if not self.color or not color or color not in _COLORS: + return text + return f"{_COLORS[color]}{text}{RESET}" + + def emit(self, prefix: str, message: str, color: str | None = None) -> None: + """print one line: ``HH:MM:SS [prefix] message`` (message optionally colored)""" + line = f"{self._stamp()} {self._paint(f'[{prefix}]', 'dim')} {self._paint(message, color)}" + print(line, file=self.stream, flush=True) diff --git a/src/claudedo/daemon.py b/src/claudedo/daemon.py index f457373..fb019b7 100644 --- a/src/claudedo/daemon.py +++ b/src/claudedo/daemon.py @@ -18,6 +18,7 @@ from pathlib import Path from . 
import audio, grammar, inject, target from .config import Config +from .console import SYSTEM, VOICE, Console from .stt import Transcriber log = logging.getLogger(__name__) @@ -114,6 +115,8 @@ class Daemon: self._transcriber: Transcriber | None = None self._device: int | None = None self._ptt = _PTTKey() + self._pending: dict[str, int] = {} + self._console = Console() def _install_signals(self) -> None: signal.signal(signal.SIGTERM, self._on_signal) @@ -163,7 +166,7 @@ class Daemon: parsed = grammar.parse(transcript, cfg.wake_phrases, cfg.match_threshold, require_wake, filler=cfg.filler_words) if parsed is None or parsed.action is None: - self._emit(f'heard: "{transcript}" -> no command matched') + self._console.emit(VOICE, f'heard "{transcript}" -> no command matched', "yellow") return action = parsed.action @@ -171,36 +174,75 @@ class Daemon: new_mode = str(action.arg) if new_mode != self.mode: self.mode = new_mode - self._emit(f"mode -> {new_mode}") + self._console.emit(SYSTEM, f"mode -> {new_mode}", "cyan") self._refresh_state() return if action.name == "set": session = target.set_target(str(action.arg)) - self._emit(f"set sticky -> {session}") + self._pending.pop(session, None) + self._console.emit(SYSTEM, f"set sticky -> {session}", "cyan") self._refresh_state() return if action.name == "unset": target.unset_target() - self._emit("unset (cleared)") + self._console.emit(SYSTEM, "unset (cleared)", "cyan") self._refresh_state() return if action.name == "list": sessions = target.list_sessions() - self._emit("list -> " + (", ".join(sessions) if sessions else "(none running)")) + self._console.emit(SYSTEM, "list -> " + (", ".join(sessions) if sessions else "(none running)")) return - session, reason = target.resolve(parsed.one_shot) + session, reason = target.resolve(parsed.one_shot, auto_target=cfg.auto_target) if session is None: - self._emit(f'heard: "{transcript}" -> {reason} -> matched {self._describe(action)} ' - f'-> did nothing') + self._console.emit(VOICE, 
f'heard "{transcript}" -> {reason} -> ' + f'{self._describe(action)} did nothing', "red") return - prefix = f'heard: "{transcript}" -> {reason} -> matched {self._describe(action)}' - if action.name == "type" and not cfg.type_autosend: - inject.send_literal(session, str(action.arg)) - self._emit(f"{prefix} -> injected literal {str(action.arg)!r} -> {session}") + self._inject(session, transcript, reason, action) + + def _inject(self, session: str, transcript: str, reason: str, action) -> None: + """run a resolved command against `session`, tracking the uncommitted-input + buffer so backspace/erase delete only back to the last submit boundary""" + heard = f'heard "{transcript}" ({reason})' + name = action.name + + if name == "type": + text = str(action.arg) + inject.send_literal(session, text) + self._pending[session] = self._pending.get(session, 0) + len(text) + if self.config.type_autosend: + inject.send_named(session, inject.keys.SUBMIT) + self._pending[session] = 0 + self._console.emit(session, f"{heard} -> typed {text!r}" + + (" + send" if self.config.type_autosend else ""), "green") return + if name == "space": + n = int(action.arg) + inject.perform(session, action) + self._pending[session] = self._pending.get(session, 0) + n + self._console.emit(session, f"{heard} -> space x{n}", "green") + return + if name == "backspace": + have = self._pending.get(session, 0) + n = min(int(action.arg), have) + if n: + inject.perform(session, grammar.Action("backspace", n)) + self._pending[session] = have - n + self._console.emit(session, f"{heard} -> backspace x{n}" + + ("" if n == int(action.arg) else " (capped at boundary)"), "green") + return + if name == "erase": + n = self._pending.get(session, 0) + if n: + inject.perform(session, grammar.Action("erase", n)) + self._pending[session] = 0 + self._console.emit(session, f"{heard} -> erase x{n} (to last boundary)", "green") + return + inject.perform(session, action) - self._emit(f"{prefix} -> injected {self._describe(action)} 
-> {session}") + if name == "submit": + self._pending[session] = 0 + self._console.emit(session, f"{heard} -> {self._describe(action)}", "green") @staticmethod def _describe(action) -> str: @@ -208,11 +250,6 @@ class Daemon: return action.name.upper() return f"{action.name.upper()}({action.arg})" - @staticmethod - def _emit(line: str) -> None: - """print a recognition/action line to the watched terminal""" - print(line, flush=True) - def _has_wake(self, transcript: str) -> bool: """true if the utterance starts with a wake phrase (listen-mode gate). @@ -225,15 +262,11 @@ class Daemon: def _print_startup(self) -> None: cfg = self.config dev = cfg.stt_device if cfg.stt_device != "auto" else "default" - target_now = target.read_active() or "(none — run cc to attach)" - self._emit("── claudedo ─────────────────────────────────") - self._emit(f" model: {cfg.stt_model} ({cfg.stt_language})") - self._emit(f" mic: {dev}") - self._emit(f" mode: {self.mode}") - self._emit(f" target: {target_now}") - self._emit(f" wake: {', '.join(cfg.wake_phrases)}") - self._emit(" Ctrl-C to stop") - self._emit("─────────────────────────────────────────────") + target_now = target.read_active() or "(none — run cc / set )" + self._console.emit(SYSTEM, f"claudedo {self.mode} mode — Ctrl-C to stop", "bold") + self._console.emit(SYSTEM, f"model {cfg.stt_model} ({cfg.stt_language}) · mic {dev} · " + f"target {target_now}") + self._console.emit(SYSTEM, "wake: " + ", ".join(cfg.wake_phrases)) def _refresh_state(self) -> None: write_state(os.getpid(), self.mode, target.read_active()) @@ -257,7 +290,10 @@ class Daemon: if not transcript: continue if self.mode == "listen" and not self._has_wake(transcript): - self._emit("dropped: non-wake speech (not recorded)") + if self.config.print_heard: + self._console.emit(VOICE, f'heard (dropped) "{transcript}"', "red") + else: + self._console.emit(VOICE, "dropped: non-wake speech (not recorded)", "dim") continue self._handle(transcript) finally: diff --git 
a/src/claudedo/grammar.py b/src/claudedo/grammar.py index f44cbd7..7c69551 100644 --- a/src/claudedo/grammar.py +++ b/src/claudedo/grammar.py @@ -27,6 +27,12 @@ _NUMBER_WORDS = { _INDEX_WORDS = {"1": 1, "2": 2, "3": 3, "4": 4} +_COUNT_WORDS = { + "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10, + "eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14, "fifteen": 15, + "sixteen": 16, "seventeen": 17, "eighteen": 18, "nineteen": 19, "twenty": 20, +} + _STICKY_VERBS = ("set", "sticky", "switch") _ONESHOT_VERBS = ("target",) _UNSET_VERBS = ("unset", "unsticky") @@ -39,9 +45,10 @@ DEFAULT_FILLER = ("select", "use", "choose") class Action: """a matched command: a name plus an optional argument. - names: yes, no, select, approve, deny, submit, type, cancel, mode, set, unset, - list. arg carries the select index (int), the literal text for ``type``, the mode - for ``mode``, or the session short-name for ``set``. + names: yes, no, select, approve, deny, submit, type, space, backspace, erase, + cancel, mode, set, unset, list. arg carries the select index (int), the literal + text for ``type``, the count for ``space``/``backspace`` (int), the mode for + ``mode``, or the session short-name for ``set``. """ name: str @@ -123,6 +130,22 @@ def _fuzzy_in(token: str, options: tuple[str, ...], threshold: float) -> bool: return any(_ratio(token, opt) >= threshold for opt in options) +def _leading_count(rest: list[str], default: int = 1) -> int: + """read a count from the first token (digit or number word), else the default. + + 'backspace 3' -> 3, 'backspace ten' -> 10 (normalize maps small words to digits; + larger words come from _COUNT_WORDS), 'backspace' -> default. 
+ """ + if not rest: + return default + tok = rest[0] + if tok.isdigit(): + return max(0, int(tok)) + if tok in _COUNT_WORDS: + return _COUNT_WORDS[tok] + return default + + def match_command(remainder: str, threshold: float) -> Action | None: """map a normalized command remainder to an Action, or None if unrecognized. @@ -160,6 +183,13 @@ def match_command(remainder: str, threshold: float) -> Action | None: text = " ".join(rest).strip() return Action("type", text) if text else None + if _fuzzy_in(head, ("backspace", "delete"), threshold): + return Action("backspace", _leading_count(rest, default=1)) + if _fuzzy_in(head, ("space",), threshold): + return Action("space", _leading_count(rest, default=1)) + if _fuzzy_in(head, ("erase", "clear", "wipe"), threshold): + return Action("erase") + if _fuzzy_in(head, ("mode",), threshold) and rest: if _fuzzy_in(rest[0], ("ptt",), threshold) or "push" in rest[0]: return Action("mode", "ptt") diff --git a/src/claudedo/inject.py b/src/claudedo/inject.py index 64b1a46..555bb66 100644 --- a/src/claudedo/inject.py +++ b/src/claudedo/inject.py @@ -45,11 +45,18 @@ class OutputHandler(ABC): def send_literal(self, session: str, text: str) -> None: """emit literal text into the input box without submitting (``type``)""" + def send_repeat(self, session: str, token: str, count: int) -> None: + """emit a named key `count` times (e.g. BSpace x n). default impl loops.""" + if count <= 0: + return + self.send_named(session, [token] * count) + def perform(self, session: str, action) -> bool: """resolve a grammar.Action to keystrokes and emit them. returns acted?. - ``switch`` and ``mode`` are handled by the daemon (they change daemon state, - not the claude session), so they are ignored here. + ``switch``/``set``/``mode`` etc. are handled by the daemon (they change daemon + state, not the claude session), so they are ignored here. ``erase`` arrives + with action.arg already set to the count the daemon wants backspaced. 
""" name = action.name if name == "yes": @@ -72,6 +79,10 @@ class OutputHandler(ABC): self.send_named(session, seq) elif name == "type": self.send_literal(session, str(action.arg)) + elif name == "space": + self.send_literal(session, " " * int(action.arg)) + elif name in ("backspace", "erase"): + self.send_repeat(session, keys.BACKSPACE[0], int(action.arg)) else: return False return True diff --git a/src/claudedo/keys.py b/src/claudedo/keys.py index 679ad95..b18ca5d 100644 --- a/src/claudedo/keys.py +++ b/src/claudedo/keys.py @@ -37,6 +37,12 @@ DENY = ["3"] SUBMIT = ["Enter"] CANCEL = ["Escape"] +# BACKSPACE deletes one char left; SPACE inserts one literal space. both are emitted +# repeatedly for `backspace ` / `space ` and for `erase` (n = the daemon's +# tracked uncommitted-input count). BSpace is tmux's name for the backspace key. +BACKSPACE = ["BSpace"] +SPACE = [" "] + SELECT_BY_INDEX = { 1: SELECT_1, 2: SELECT_2,