claudedo/config.toml
disqualifier 1a593b95fa v0.2.1: earcons — audio feedback tones (eyes-free confirmation)
short confirmation tones on daemon events so the user gets eyes-free "did it hear me?"
feedback without watching the terminal. NOT TTS — short pre-generated .wav beeps.

- audio_out.py — reusable audio-OUT module (the reverse of audio.py's capture, the
  less-tested WSLg direction). three-tier player: paplay-first (a SEPARATE process, so
  it doesn't contend with the sounddevice mic stream on the duplex-flaky WSLg bridge),
  then in-process sounddevice, then powershell.exe SoundPlayer. best-effort per-backend
  volume. plays a wav path and knows nothing about events — v0.3 TTS reuses it.
- sound.py — Earcons: the single event->tone map (wake/accept/no_match/submit) gated by
  [sound] config (master enabled + per-event flags). daemon._handle wiring: an injected
  command plays accept (submit plays submit); no-match / target-missing / unknown-context
  plays no_match; pure daemon-control commands (list/version/…) play nothing.
- sounds/ — committed earcon wavs + generate.py (regen-only). committed (not generated
  at install) so the package is self-contained and a bundled tone can never be missing.
  packaged via pyproject [tool.setuptools.package-data].
- [sound] config: enabled (master, on), on_wake (OFF by default — bleed/chatty),
  on_accept/on_no_match/on_submit (on), volume (0-1 best-effort), [sound.files] overrides.
- claudedo test-tone — plays each tone, the audio-OUT gate (mirrors test-audio).
- install.sh now also checks RDPSink (audio-out) alongside RDPSource.

INVARIANT: earcons are fire-and-forget on a worker thread and NEVER block or break the
inject path. a missing tone file or dead speaker logs once and is swallowed, never
raised — a broken speaker must never stop "claudedo yes" from injecting.

de-risks the WSLg audio-OUT path that v0.3 TTS-readback will reuse.

Signed-off-by: disqualifier <dev@disqualifier.me>
2026-06-27 18:32:34 -04:00

115 lines
5.9 KiB
TOML

# claudedo configuration. everything tunable lives here — no hardcoded paths or
# secrets in code. loaded and validated by config.py with clear errors.
[wake]
# phrases that wake the daemon in listen mode. matching is fuzzy: it ignores case and
# spacing, and is forgiving about the coined word "claudedo", which whisper renders
# inconsistently. number words are normalized to digits before command matching.
phrases = [
    "claudedo",
    "claude do",
    "hey claude",
    "ok claude",
    "okay claude",
]
[input]
# capture mode — one of:
#   "listen" (default): continuous capture. an utterance is acted on only when it
#     starts with a wake phrase; all other speech is transcribed locally and discarded
#     immediately. this is the hands-free path — it works while another window (a
#     game) is focused, because the trigger is your voice over the mic bridge, not a
#     Windows keyboard hook. no system-wide hotkey is installed, by design.
#   "ptt": push-to-talk, DESK-ONLY. audio is captured only while ptt_key is held AND
#     the daemon's own terminal window is focused. there is deliberately no global
#     hotkey — a system-wide keyboard hook is the keylogger/cheat silhouette we refuse
#     to build. use "listen" for hands-free-while-gaming.
mode = "listen"
# key that must be held to capture when mode = "ptt".
ptt_key = "space"
[stt]
# faster-whisper model size, from snappiest to most accurate:
#   "base.en"              max snappiness, less accurate.
#   "small.en" (default)   the English-only small model: ~1s/command on a strong cpu,
#                          and more accurate on english than multilingual "small" at
#                          the same speed.
#   "medium" / "medium.en" more accurate but ~3x slower (noticeable lag).
#   "large-v3"             most accurate and slowest.
# bump only if recognition is poor.
model = "small.en"
language = "en"
# compute device used by faster-whisper: "auto" (cpu here), "cpu", or "cuda".
compute = "auto"
# microphone to capture from: "auto", a sounddevice device index (integer), or a
# substring of a device name. `claudedo test-audio` lists the devices.
device = "auto"
[audio]
# capture parameters. 16 kHz mono is what whisper expects, so samplerate is in Hz and
# channels = 1 means mono.
samplerate = 16000
channels = 1
# rms energy below this counts as silence (the VAD onset/endpoint floor).
silence_threshold = 0.012
# ignore utterances shorter than this (clicks, coughs).
# NOTE(review): the unit is presumably seconds, like [vad].max_seconds — confirm
# against config.py.
min_utterance = 0.3
[vad]
# Alexa-style record-until-pause endpointing (listen mode). capture starts on speech
# onset and ends after this much trailing silence, in milliseconds — the natural end
# of an utterance. a real pause both ends the command AND separates it from following
# chatter (the chatter becomes a separate capture that the wake gate then discards).
silence_ms = 700
# hard cap, in seconds, so continuous noise can't record forever. this is also the
# ceiling for a long dictated `type` phrase.
max_seconds = 15.0
[behavior]
# dictation never auto-submits: "type <phrase>" inserts literal text only; you say
# "send" separately to submit (read-before-send).
type_autosend = false
# fuzzy match ratios (0..1); lower = more lenient = more matches. the asymmetry between
# the two is deliberate:
#   - a false WAKE is cheap (it wakes, finds no command, does nothing), so wake is
#     lenient;
#   - a false COMMAND fires the WRONG action, so commands stay tight.
# prefer expanding command synonyms over loosening command_fuzzy_threshold.
wake_fuzzy_threshold = 0.65
command_fuzzy_threshold = 0.8
# optional filler words that may precede a command and are ignored for matching:
# "select yes" / "use yes" behave like "yes". exception: a filler word followed by a
# digit is the select command (e.g. "select 1") and is not dropped.
filler_words = ["select", "use", "choose"]
# what to do when no sticky target is set and exactly ONE claude-* session is running:
#   false (default) -> require an explicit `set <name>` or one-shot `target <name>`;
#                      a bare command does nothing and tells you to set one.
#   true            -> auto-target that single session (convenience).
auto_target = false
# DEBUG ONLY — relaxes the privacy invariant. when true, the daemon console prints
# the raw transcript of EVERY utterance, including non-wake speech it would otherwise
# drop silently (shown as `heard (dropped): "<transcript>"`). use it to see exactly
# how Whisper renders your wake word, then turn it OFF. default false: non-wake speech
# is discarded without ever printing the transcript.
print_heard = false
# how the `context <name> <dictation>` command assembles the blurb + instruction.
#   true (default): blurb, a soft newline (Shift+Enter — needs the extended-keys tmux
#     settings install.sh appends), then the instruction.
#   false: flatten onto one line, with context_separator between blurb and instruction.
# if Shift+Enter is at all flaky in your terminal (it submits or does nothing), set
# false — the blank line is cosmetic, not worth a submit risk. either way the assembled
# text is NEVER auto-submitted.
context_multiline = true
# separator inserted between blurb and instruction when context_multiline = false.
# the surrounding spaces are part of the value.
context_separator = " — "
[sound]
# earcons — short confirmation tones on daemon events so you get eyes-free feedback
# ("did it hear me?") without watching the terminal. tones are SHORT (<300ms) and quiet;
# they play OUT through WSLg's PulseAudio sink (paplay-first, sounddevice fallback, then
# powershell.exe). additive to the console feed — mute these and read at the desk, or
# hear them eyes-free. a dead speaker never blocks/breaks a command (fire-and-forget).
#
# master switch for all tones; the on_* flags below are per-event switches.
enabled = true
# blip when a wake phrase is recognized. OFF by default: a blip right before you speak
# the command can bleed into its capture, and it's chatty. turn on only if you want it.
on_wake = false
# positive blip when a command is recognized/injected.
on_accept = true
# distinct lower buzz when nothing matched or the target was missing (did nothing).
on_no_match = true
# rising chime when a send/submit is injected.
on_submit = true
# playback volume, 0.0-1.0, best-effort per backend: scaled for sounddevice, passed as
# --volume to paplay, and ignored by the powershell fallback (it has no volume control).
volume = 0.5
# optional per-event overrides to swap in your own .wav files. add keys UNDER the
# [sound.files] header below: it is already live, and TOML rejects a table that is
# declared twice, so do not add a second [sound.files] header.
# NOTE(review): keys are presumably the event names (wake, accept, no_match, submit) —
# confirm against sound.py.
[sound.files]
# accept = "~/sounds/my_accept.wav"