tune: small.en default, vad 700ms, lighter command color, lean inject lines
default model -> small.en (the english-only small model: better english accuracy at the same ~1s latency; the .en variants and large-v1 are added to the validator's known-model list). raise [vad].silence_ms 500 -> 700 (500 cut utterances off too early). command words are now brightblue (lighter, cyan-ish) instead of dark blue. drop the redundant target from injection lines — the [session] prefix already names it, so e.g. '[claude-testing] typed ...' rather than '... sticky claude-testing -> typed ...'. Signed-off-by: disqualifier <dev@disqualifier.me>
This commit is contained in:
parent
2cbbabfaa1
commit
e84ef91e7b
13
config.toml
13
config.toml
@ -21,11 +21,12 @@ mode = "listen"
|
|||||||
ptt_key = "space"
|
ptt_key = "space"
|
||||||
|
|
||||||
[stt]
|
[stt]
|
||||||
# faster-whisper model size. "small" is the default — snappy (~1s/command on a strong
|
# faster-whisper model size. "small.en" is the default — the English-only small model
|
||||||
# cpu) and good enough with the initial_prompt biasing + lenient wake matching.
|
# (~1s/command on a strong cpu, more accurate on english than multilingual "small" at
|
||||||
# "medium" is more accurate on the coined wake word but ~3x slower (noticeable lag);
|
# the same speed). "medium"/"medium.en" are more accurate but ~3x slower (noticeable
|
||||||
# "large-v3" is most accurate and slowest. bump only if recognition is poor.
|
# lag); "large-v3" is most accurate and slowest. drop to "base.en" for max snappiness
|
||||||
model = "small"
|
# (less accurate). bump to a larger model only if recognition is poor.
|
||||||
|
model = "small.en"
|
||||||
language = "en"
|
language = "en"
|
||||||
# mic device: "auto", or a sounddevice device index (integer) / substring of a
|
# mic device: "auto", or a sounddevice device index (integer) / substring of a
|
||||||
# device name. run `claudedo test-audio` to list devices.
|
# device name. run `claudedo test-audio` to list devices.
|
||||||
@ -47,7 +48,7 @@ min_utterance = 0.3
|
|||||||
# onset and ends after this much trailing silence — the natural end of an utterance.
|
# onset and ends after this much trailing silence — the natural end of an utterance.
|
||||||
# a real pause both ends the command AND separates it from following chatter (the
|
# a real pause both ends the command AND separates it from following chatter (the
|
||||||
# chatter becomes a separate capture that the wake gate then discards).
|
# chatter becomes a separate capture that the wake gate then discards).
|
||||||
silence_ms = 500
|
silence_ms = 700
|
||||||
# hard cap so continuous noise can't record forever.
|
# hard cap so continuous noise can't record forever.
|
||||||
max_seconds = 10.0
|
max_seconds = 10.0
|
||||||
|
|
||||||
|
|||||||
@ -17,7 +17,10 @@ except ModuleNotFoundError:
|
|||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
_VALID_MODES = ("listen", "ptt")
|
_VALID_MODES = ("listen", "ptt")
|
||||||
_VALID_MODELS = ("tiny", "base", "small", "medium", "large-v2", "large-v3")
|
_VALID_MODELS = (
|
||||||
|
"tiny", "base", "small", "medium", "large-v1", "large-v2", "large-v3",
|
||||||
|
"tiny.en", "base.en", "small.en", "medium.en",
|
||||||
|
)
|
||||||
|
|
||||||
DEFAULT_CONFIG_PATHS = (
|
DEFAULT_CONFIG_PATHS = (
|
||||||
Path(os.environ.get("CLAUDEDO_CONFIG", "")) if os.environ.get("CLAUDEDO_CONFIG") else None,
|
Path(os.environ.get("CLAUDEDO_CONFIG", "")) if os.environ.get("CLAUDEDO_CONFIG") else None,
|
||||||
@ -99,7 +102,7 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config:
|
|||||||
if mode not in _VALID_MODES:
|
if mode not in _VALID_MODES:
|
||||||
raise ConfigError(f"[input].mode must be one of {_VALID_MODES}, got {mode!r}")
|
raise ConfigError(f"[input].mode must be one of {_VALID_MODES}, got {mode!r}")
|
||||||
|
|
||||||
model = _require(raw, "stt", "model", (str,), "small")
|
model = _require(raw, "stt", "model", (str,), "small.en")
|
||||||
if model not in _VALID_MODELS:
|
if model not in _VALID_MODELS:
|
||||||
log.warning("unknown stt model %r — passing through to faster-whisper", model)
|
log.warning("unknown stt model %r — passing through to faster-whisper", model)
|
||||||
|
|
||||||
@ -114,7 +117,7 @@ def load_config(explicit: str | os.PathLike | None = None) -> Config:
|
|||||||
samplerate=int(_require(raw, "audio", "samplerate", (int,), 16000)),
|
samplerate=int(_require(raw, "audio", "samplerate", (int,), 16000)),
|
||||||
channels=int(_require(raw, "audio", "channels", (int,), 1)),
|
channels=int(_require(raw, "audio", "channels", (int,), 1)),
|
||||||
silence_threshold=float(_require(raw, "audio", "silence_threshold", (int, float), 0.012)),
|
silence_threshold=float(_require(raw, "audio", "silence_threshold", (int, float), 0.012)),
|
||||||
vad_silence_ms=int(_require(raw, "vad", "silence_ms", (int,), 500)),
|
vad_silence_ms=int(_require(raw, "vad", "silence_ms", (int,), 700)),
|
||||||
vad_max_seconds=float(_require(raw, "vad", "max_seconds", (int, float), 10.0)),
|
vad_max_seconds=float(_require(raw, "vad", "max_seconds", (int, float), 10.0)),
|
||||||
min_utterance=float(_require(raw, "audio", "min_utterance", (int, float), 0.3)),
|
min_utterance=float(_require(raw, "audio", "min_utterance", (int, float), 0.3)),
|
||||||
type_autosend=bool(_require(raw, "behavior", "type_autosend", (bool,), False)),
|
type_autosend=bool(_require(raw, "behavior", "type_autosend", (bool,), False)),
|
||||||
|
|||||||
@ -19,6 +19,7 @@ _COLORS = {
|
|||||||
"yellow": "\033[33m",
|
"yellow": "\033[33m",
|
||||||
"cyan": "\033[36m",
|
"cyan": "\033[36m",
|
||||||
"blue": "\033[34m",
|
"blue": "\033[34m",
|
||||||
|
"brightblue": "\033[94m",
|
||||||
"dim": "\033[2m",
|
"dim": "\033[2m",
|
||||||
"bold": "\033[1m",
|
"bold": "\033[1m",
|
||||||
}
|
}
|
||||||
|
|||||||
@ -179,7 +179,7 @@ class Daemon:
|
|||||||
"green")
|
"green")
|
||||||
|
|
||||||
def blue(s):
|
def blue(s):
|
||||||
return self._console.paint(s, "blue")
|
return self._console.paint(s, "brightblue")
|
||||||
if action.name == "mode":
|
if action.name == "mode":
|
||||||
new_mode = str(action.arg)
|
new_mode = str(action.arg)
|
||||||
if new_mode != self.mode:
|
if new_mode != self.mode:
|
||||||
@ -219,14 +219,14 @@ class Daemon:
|
|||||||
self._console.emit(VOICE, f'heard "{transcript}" -> {reason} -> '
|
self._console.emit(VOICE, f'heard "{transcript}" -> {reason} -> '
|
||||||
f'{self._describe(action)} did nothing', "red")
|
f'{self._describe(action)} did nothing', "red")
|
||||||
return
|
return
|
||||||
self._inject(session, transcript, reason, action)
|
self._inject(session, action)
|
||||||
|
|
||||||
def _inject(self, session: str, transcript: str, reason: str, action) -> None:
|
def _inject(self, session: str, action) -> None:
|
||||||
"""run a resolved command against `session`, tracking the uncommitted-input
|
"""run a resolved command against `session`, tracking the uncommitted-input
|
||||||
buffer so backspace/erase delete only back to the last submit boundary.
|
buffer so backspace/erase delete only back to the last submit boundary.
|
||||||
|
|
||||||
the 'heard ...' echo is already printed by _handle; these lines report the
|
the 'heard ...' echo is already printed by _handle and the [session] prefix
|
||||||
target (reason) and the keystrokes actually injected.
|
names the target, so these lines just report the keystrokes injected.
|
||||||
"""
|
"""
|
||||||
name = action.name
|
name = action.name
|
||||||
|
|
||||||
@ -237,34 +237,34 @@ class Daemon:
|
|||||||
if self.config.type_autosend:
|
if self.config.type_autosend:
|
||||||
inject.send_named(session, inject.keys.SUBMIT)
|
inject.send_named(session, inject.keys.SUBMIT)
|
||||||
self._pending[session] = 0
|
self._pending[session] = 0
|
||||||
self._console.emit(session, f"{reason} -> typed {text!r}"
|
self._console.emit(session, f"typed {text!r}"
|
||||||
+ (" + send" if self.config.type_autosend else ""), "green")
|
+ (" + send" if self.config.type_autosend else ""), "green")
|
||||||
return
|
return
|
||||||
if name == "space":
|
if name == "space":
|
||||||
n = int(action.arg)
|
n = int(action.arg)
|
||||||
inject.perform(session, action)
|
inject.perform(session, action)
|
||||||
self._pending[session] = self._pending.get(session, 0) + n
|
self._pending[session] = self._pending.get(session, 0) + n
|
||||||
self._console.emit(session, f"{reason} -> space x{n}", "green")
|
self._console.emit(session, f"space x{n}", "green")
|
||||||
return
|
return
|
||||||
if name == "backspace":
|
if name == "backspace":
|
||||||
n = int(action.arg)
|
n = int(action.arg)
|
||||||
if n:
|
if n:
|
||||||
inject.perform(session, action)
|
inject.perform(session, action)
|
||||||
self._pending[session] = max(0, self._pending.get(session, 0) - n)
|
self._pending[session] = max(0, self._pending.get(session, 0) - n)
|
||||||
self._console.emit(session, f"{reason} -> backspace x{n}", "green")
|
self._console.emit(session, f"backspace x{n}", "green")
|
||||||
return
|
return
|
||||||
if name == "erase":
|
if name == "erase":
|
||||||
n = self._pending.get(session, 0)
|
n = self._pending.get(session, 0)
|
||||||
if n:
|
if n:
|
||||||
inject.perform(session, grammar.Action("erase", n))
|
inject.perform(session, grammar.Action("erase", n))
|
||||||
self._pending[session] = 0
|
self._pending[session] = 0
|
||||||
self._console.emit(session, f"{reason} -> erase x{n} (to last boundary)", "green")
|
self._console.emit(session, f"erase x{n} (to last boundary)", "green")
|
||||||
return
|
return
|
||||||
|
|
||||||
inject.perform(session, action)
|
inject.perform(session, action)
|
||||||
if name == "submit":
|
if name == "submit":
|
||||||
self._pending[session] = 0
|
self._pending[session] = 0
|
||||||
self._console.emit(session, f"{reason} -> injected {self._describe(action)}", "green")
|
self._console.emit(session, f"injected {self._describe(action)}", "green")
|
||||||
|
|
||||||
def _timing(self) -> str:
|
def _timing(self) -> str:
|
||||||
"""compact STT latency suffix for heard lines (transcribe ms on audio secs)"""
|
"""compact STT latency suffix for heard lines (transcribe ms on audio secs)"""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user