Upload main file

2026-04-05 16:56:06 +00:00
parent 1b90d53c10
commit 03df49c921
1 changed files with 189 additions and 0 deletions
--- a/voice_cloak.py
+++ b/voice_cloak.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+voice_cloak.py — Real-time mic → Whisper → TTS → virtual sink
+Speaks each phrase as it's transcribed, not after a long silence.
+
+Requirements:
+    pip install faster-whisper sounddevice numpy edge-tts
+    pactl and paplay (PulseAudio, or PipeWire's PulseAudio compatibility layer)
+
+First run: select "VoiceCloakMic" as your mic in Discord/whatever.
+"""
+
+import asyncio
+import subprocess
+import tempfile
+import threading
+import queue
+import sys
+import time
+import os
+import numpy as np
+import sounddevice as sd
+from faster_whisper import WhisperModel
+
+# ── Config ────────────────────────────────────────────────────────────────────
+SAMPLE_RATE       = 16000        # capture sample rate in Hz (passed to sd.InputStream)
+CHANNELS          = 1            # number of input channels requested from the mic
+BLOCK_SECONDS     = 0.3          # smaller = more responsive
+SILENCE_THRESH    = 0.012        # RMS threshold for silence
+SILENCE_CHUNKS    = 5            # silence chunks before flushing (~1.5s)
+MAX_BUFFER_CHUNKS = 60           # hard cap ~18s
+WHISPER_MODEL     = "base.en"    # tiny.en / base.en / small.en
+TTS_VOICE         = "en-US-RogerNeural"  # voice name handed to edge_tts.Communicate
+SINK_NAME         = "VoiceCloak"         # null sink that synthesized speech is played into
+VIRTUAL_MIC       = "VoiceCloakMic"      # virtual source fed from SINK_NAME's monitor
+# ─────────────────────────────────────────────────────────────────────────────
+
+# Captured audio blocks: filled by audio_callback, drained by transcribe_loop.
+audio_queue: queue.Queue = queue.Queue()
+# Transcribed phrases: filled by transcribe_loop, drained by tts_worker.
+tts_queue: queue.Queue = queue.Queue()
+# Set on shutdown; the capture, transcribe and TTS loops all poll it.
+stop_event = threading.Event()
+
+
+# ── Virtual sink ──────────────────────────────────────────────────────────────
+
+def run(cmd: str) -> str:
+    """Run a command line and return its stripped standard output.
+
+    The command is split with ``shlex`` and executed directly rather than
+    through a shell, so shell metacharacters in *cmd* reach the program as
+    literal arguments instead of being interpreted.
+
+    Args:
+        cmd: Command line, e.g. ``"pactl list short sinks"``.
+
+    Returns:
+        The command's stdout with surrounding whitespace removed. The exit
+        status is not checked. An empty string is returned when *cmd* is
+        empty or the program cannot be started.
+    """
+    import shlex  # local import: only this helper needs it
+
+    argv = shlex.split(cmd)
+    if not argv:
+        return ""
+    try:
+        proc = subprocess.run(argv, capture_output=True, text=True)
+    except OSError:
+        # Missing or non-executable binary. Callers treat empty output as
+        # failure, which is also what the shell-based call produced.
+        return ""
+    return proc.stdout.strip()
+
+def setup_virtual_sink():
+    """Ensure both the null sink and the virtual microphone exist.
+
+    Existing sinks and sources are looked up by exact name via
+    ``pactl list short`` and only the missing modules are loaded:
+    ``module-null-sink`` for SINK_NAME and ``module-virtual-source`` (fed by
+    the sink's monitor) for VIRTUAL_MIC. If a module fails to load, every
+    module loaded by this call is unloaded again so a later run does not find
+    a half-configured setup, and the process exits with status 1.
+    """
+    def device_names(kind: str) -> set:
+        # `pactl list short` prints one device per line; the device name is
+        # the second whitespace-separated column.
+        listing = run(f"pactl list short {kind}")
+        rows = (line.split() for line in listing.splitlines())
+        return {cols[1] for cols in rows if len(cols) > 1}
+
+    have_sink = SINK_NAME in device_names("sinks")
+    have_mic = VIRTUAL_MIC in device_names("sources")
+    if have_sink and have_mic:
+        print(f"[sink] {SINK_NAME} already exists")
+        return
+
+    print(f"[sink] Creating virtual sink '{SINK_NAME}'...")
+    pending = []
+    if not have_sink:
+        pending.append(
+            f"pactl load-module module-null-sink sink_name={SINK_NAME} sink_properties=device.description={SINK_NAME}"
+        )
+    if not have_mic:
+        pending.append(
+            f"pactl load-module module-virtual-source source_name={VIRTUAL_MIC} master={SINK_NAME}.monitor"
+        )
+
+    loaded = []  # module ids printed by the successful load-module calls
+    for cmd in pending:
+        module_id = run(cmd)
+        if not module_id:
+            # Roll back so the next run does not see a sink without its mic.
+            for earlier in reversed(loaded):
+                run(f"pactl unload-module {earlier}")
+            print("[sink] ✗ Failed. Running PulseAudio/Pipewire?")
+            sys.exit(1)
+        loaded.append(module_id)
+
+    print(f"[sink] ✓ Ready. Set '{VIRTUAL_MIC}' as your mic in apps.")
+
+
+# ── Audio capture ─────────────────────────────────────────────────────────────
+
+def audio_callback(indata, frames, time_info, status):
+    """Input-stream callback: report status flags and enqueue the block.
+
+    A copy of *indata* is queued, so the array handed in by the stream is
+    not retained after the callback returns. *frames* and *time_info* are
+    accepted for the callback signature but not used.
+    """
+    if status:
+        print(f"[audio] {status}", file=sys.stderr)
+    block = indata.copy()
+    audio_queue.put(block)
+
+def capture_thread():
+    """Keep the microphone input stream open until shutdown is requested.
+
+    The stream delivers float32 blocks of BLOCK_SECONDS each to
+    ``audio_callback``. If the stream cannot be opened or started, the error
+    is reported and ``stop_event`` is set so the other loops stop instead of
+    waiting forever for audio that will never arrive.
+    """
+    try:
+        with sd.InputStream(
+            samplerate=SAMPLE_RATE,
+            channels=CHANNELS,
+            dtype="float32",
+            blocksize=int(SAMPLE_RATE * BLOCK_SECONDS),
+            callback=audio_callback,
+        ):
+            print("[mic] Listening... (Ctrl+C to stop)")
+            # Block until shutdown; the stream stays open while we wait.
+            stop_event.wait()
+    except sd.PortAudioError as err:
+        print(f"[mic] Could not open input stream: {err}", file=sys.stderr)
+        stop_event.set()
+
+
+# ── Transcribe loop ───────────────────────────────────────────────────────────
+
+def transcribe_loop(model: WhisperModel):
+    """Group captured audio into utterances and queue their transcriptions.
+
+    Chunks are read from ``audio_queue`` and classified by RMS level. An
+    utterance starts with the first non-silent chunk and is flushed once
+    SILENCE_CHUNKS consecutive silent chunks follow it, or once it reaches
+    MAX_BUFFER_CHUNKS chunks. Silent chunks inside an utterance are kept so
+    that words on either side of a short pause are not spliced together;
+    only the trailing run of silence is dropped before transcription.
+    Each non-empty segment text is printed and put on ``tts_queue``.
+
+    Args:
+        model: Whisper model used to transcribe each flushed utterance.
+
+    Runs until ``stop_event`` is set.
+    """
+    buffer = []        # chunks of the utterance in progress
+    silent_count = 0   # consecutive silent chunks at the tail of `buffer`
+
+    while not stop_event.is_set():
+        try:
+            chunk = audio_queue.get(timeout=0.5)
+        except queue.Empty:
+            continue
+
+        rms = float(np.sqrt(np.mean(chunk ** 2)))
+
+        if rms < SILENCE_THRESH:
+            if not buffer:
+                # Silence with no utterance in progress: nothing to keep.
+                continue
+            silent_count += 1
+        else:
+            silent_count = 0
+        buffer.append(chunk)
+
+        if silent_count < SILENCE_CHUNKS and len(buffer) < MAX_BUFFER_CHUNKS:
+            continue
+
+        # `buffer` always starts with a voiced chunk and every silent chunk
+        # counted in `silent_count` was appended, so this slice is non-empty
+        # and removes exactly the trailing silence.
+        voiced = buffer[:len(buffer) - silent_count]
+        audio_np = np.concatenate(voiced, axis=0).flatten()
+        buffer = []
+        silent_count = 0
+
+        # Stream segments as Whisper produces them — fires per phrase
+        segments, _ = model.transcribe(
+            audio_np,
+            language="en",
+            vad_filter=True,
+            vad_parameters={"min_silence_duration_ms": 200},
+        )
+
+        for segment in segments:
+            text = segment.text.strip()
+            if text:
+                print(f"[transcribed] {text}")
+                tts_queue.put(text)
+
+
+# ── TTS worker (serial so segments don't overlap) ─────────────────────────────
+
+def tts_worker():
+    """Speak queued phrases one at a time until shutdown is requested.
+
+    The thread owns a private asyncio event loop and runs ``speak`` to
+    completion for each phrase taken from ``tts_queue``, so phrases never
+    overlap. The loop is closed when the worker exits, whether it stops
+    normally or because ``speak`` raised.
+    """
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        while not stop_event.is_set():
+            try:
+                text = tts_queue.get(timeout=0.5)
+            except queue.Empty:
+                continue
+            loop.run_until_complete(speak(text))
+    finally:
+        loop.close()
+
+
+async def speak(text: str):
+    """Synthesize *text* with edge-tts and play it on two outputs.
+
+    The audio is saved to a temporary ``.mp3`` file and played with
+    ``paplay`` to SINK_NAME and to the default output at the same time.
+    Every player that was started is waited for, and a non-zero exit status
+    is reported. Synthesis and playback errors are printed rather than
+    raised. The temporary file is always removed.
+
+    Args:
+        text: Phrase to speak.
+    """
+    import edge_tts
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+        tmp_path = f.name
+    try:
+        communicate = edge_tts.Communicate(text, TTS_VOICE)
+        await communicate.save(tmp_path)
+        # Play to both simultaneously
+        targets = [
+            (SINK_NAME, ["paplay", "--device", SINK_NAME, tmp_path]),
+            ("default output", ["paplay", tmp_path]),  # default output = your speakers
+        ]
+        players = []
+        try:
+            for label, argv in targets:
+                players.append((label, subprocess.Popen(argv)))
+        finally:
+            # Reap whatever was started, even if a later launch failed, and
+            # before the temporary file is deleted below.
+            for label, proc in players:
+                status = proc.wait()
+                if status != 0:
+                    print(f"[tts] paplay ({label}) exited with status {status}")
+    except Exception as e:
+        print(f"[tts] Error: {e}")
+    finally:
+        os.unlink(tmp_path)
+
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+
+def main():
+    """Set up the virtual devices, load Whisper and run the pipeline.
+
+    The capture and TTS workers run as daemon threads while the transcribe
+    loop runs in the calling thread. However the transcribe loop ends
+    (Ctrl+C, ``stop_event`` being set elsewhere, or an exception), the
+    workers are told to stop and are given up to two seconds each to finish.
+    """
+    print("=" * 55)
+    print("  voice_cloak — streaming phrase-by-phrase")
+    print("=" * 55)
+
+    setup_virtual_sink()
+
+    print(f"[whisper] Loading '{WHISPER_MODEL}'...")
+    model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
+    print("[whisper] ✓ Ready")
+
+    threads = [
+        threading.Thread(target=capture_thread, daemon=True),
+        threading.Thread(target=tts_worker, daemon=True),
+    ]
+    for t in threads:
+        t.start()
+
+    try:
+        transcribe_loop(model)
+    except KeyboardInterrupt:
+        print("\n[main] Stopping...")
+    finally:
+        # Runs on every exit path, not only Ctrl+C.
+        stop_event.set()
+        for t in threads:
+            t.join(timeout=2)
+        print("[main] Done.")
+
+
+if __name__ == "__main__":
+    main()