diff --git a/src/claudedo/stt.py b/src/claudedo/stt.py
index 35e94e8..c85530e 100644
--- a/src/claudedo/stt.py
+++ b/src/claudedo/stt.py
@@ -6,12 +6,77 @@ short in-memory chunk; nothing is written to disk or sent anywhere.
 
 from __future__ import annotations
 
+import contextlib
 import logging
+import os
+import re
+import sys
 
 import numpy as np
 
 log = logging.getLogger(__name__)
 
+_NOISE = re.compile(r"GPU device discovery failed|device_discovery\.cc|DiscoverDevicesForPlatform")
+
+
+def _quiet_backends() -> None:
+    """quiet onnxruntime/ctranslate2 chatter and the faster_whisper INFO log.
+
+    faster-whisper's VAD loads an onnx model whose device discovery prints a noisy
+    'GPU device discovery failed' warning on headless/WSL hosts with no GPU sysfs;
+    that one comes from C++ init and is left to _filter_stderr(). the env var and
+    logger severity stop most other onnx logging. OMP_NUM_THREADS defaults to 4.
+    """
+    os.environ.setdefault("ORT_LOGGING_LEVEL", "3")
+    os.environ.setdefault("OMP_NUM_THREADS", "4")  # an existing user value always wins
+    logging.getLogger("faster_whisper").setLevel(logging.WARNING)
+    try:
+        import onnxruntime
+        onnxruntime.set_default_logger_severity(3)
+    except Exception:  # best-effort: quieting logs must never block model loading
+        log.debug("could not lower onnxruntime log severity", exc_info=True)
+
+
+@contextlib.contextmanager
+def _filter_stderr():
+    """drop onnxruntime's GPU-discovery warning lines from stderr for this block.
+
+    a pipe temporarily replaces the stderr fd; a pump thread forwards every line to
+    the real stderr EXCEPT the known GPU-discovery noise, so real errors still surface.
+    the original fd is always restored on exit; no-op when stderr has no usable fd.
+    """
+    import threading
+
+    try:
+        stderr_fd = sys.stderr.fileno()
+        sys.stderr.flush()  # text buffered before the block must not enter the pipe
+    except (AttributeError, OSError, ValueError):  # None, fd-less or closed stderr
+        yield
+        return
+
+    saved_fd = os.dup(stderr_fd)  # used only to restore stderr on exit
+    out_fd = os.dup(stderr_fd)  # owned by the pump, so it can outlive saved_fd
+    read_fd, write_fd = os.pipe()
+    os.dup2(write_fd, stderr_fd)
+    os.close(write_fd)  # stderr_fd is now this process's only write end of the pipe
+
+    def pump():
+        with os.fdopen(read_fd, "rb") as reader, os.fdopen(out_fd, "wb") as out:
+            for line in reader:
+                if not _NOISE.search(line.decode("utf-8", "replace")):
+                    out.write(line)
+                    out.flush()
+
+    thread = threading.Thread(target=pump, daemon=True)
+    thread.start()
+    try:
+        yield
+    finally:
+        # restoring stderr closes the pipe's write end, so the pump drains and hits EOF
+        os.dup2(saved_fd, stderr_fd)
+        os.close(saved_fd)
+        thread.join(timeout=1.0)
+
 
 class Transcriber:
     """a loaded faster-whisper model that transcribes float32 mono audio chunks"""
@@ -20,17 +85,25 @@ class Transcriber:
                  compute_type: str = "auto") -> None:
         self.language = language
         self._model = self._load(model, device, compute_type)
+        self._warm()
 
     @staticmethod
     def _load(model: str, device: str, compute_type: str):
-        from faster_whisper import WhisperModel
-
         if device == "auto":
             device = "cpu"
         if compute_type == "auto":
             compute_type = "int8" if device == "cpu" else "float16"
         log.info("loading faster-whisper model=%s device=%s compute=%s", model, device, compute_type)
-        return WhisperModel(model, device=device, compute_type=compute_type)
+        with _filter_stderr():  # keep GPU-discovery noise off stderr during import and load
+            _quiet_backends()  # set env/log levels before faster_whisper is imported
+            from faster_whisper import WhisperModel
+            return WhisperModel(model, device=device, compute_type=compute_type)
+
+    def _warm(self) -> None:
+        """prime the VAD onnx session via one discarded transcribe, under the stderr filter"""
+        silence = np.zeros(1600, dtype=np.float32)
+        with _filter_stderr():
+            list(self._model.transcribe(silence, vad_filter=True)[0])
 
     def transcribe(self, audio: np.ndarray, samplerate: int = 16000) -> str:
         """transcribe a mono float32 numpy array to a stripped text string.