Upload main file

2026-04-05 16:56:06 +00:00
parent 1b90d53c10
commit 03df49c921
1 changed files with 189 additions and 0 deletions
--- a/voice_cloak.py
+++ b/voice_cloak.py
@@ -0,0 +1,189 @@
 #!/usr/bin/env python3
 """
 voice_cloak.py — Real-time mic → Whisper → TTS → virtual sink
 Speaks each phrase as it's transcribed, not after a long silence.
 Requirements:
    pip install faster-whisper sounddevice numpy edge-tts
    pactl / pw-cli / paplay (PulseAudio/PipeWire)
 First run: select "VoiceCloakMic" as your mic in Discord/whatever.
 """
 import asyncio
 import subprocess
 import tempfile
 import threading
 import queue
 import sys
 import time
 import os
 import numpy as np
 import sounddevice as sd
 from faster_whisper import WhisperModel
 # ── Config ────────────────────────────────────────────────────────────────────
 # Capture format: passed to sd.InputStream as samplerate / channels.
 SAMPLE_RATE       = 16000
 CHANNELS          = 1
 # Length of one captured block in seconds; blocksize = SAMPLE_RATE * BLOCK_SECONDS frames.
 BLOCK_SECONDS     = 0.3          # smaller = more responsive
 # A block whose RMS amplitude is below this value is counted as silent.
 SILENCE_THRESH    = 0.012        # RMS threshold for silence
 # Consecutive silent blocks that trigger a flush to Whisper (5 x 0.3 s = 1.5 s).
 SILENCE_CHUNKS    = 5            # silence chunks before flushing (~1.5s)
 # Flush regardless of silence once this many blocks are buffered (60 x 0.3 s = 18 s).
 MAX_BUFFER_CHUNKS = 60           # hard cap ~18s
 # Model name handed to faster_whisper.WhisperModel.
 WHISPER_MODEL     = "base.en"    # tiny.en / base.en / small.en
 # Voice name handed to edge_tts.Communicate.
 TTS_VOICE         = "en-US-RogerNeural"
 # Name of the null sink that synthesized speech is played into.
 SINK_NAME         = "VoiceCloak"
 # Name of the virtual source built on SINK_NAME's monitor; apps select this as their mic.
 VIRTUAL_MIC       = "VoiceCloakMic"
 # ─────────────────────────────────────────────────────────────────────────────
 # Captured audio blocks: filled by audio_callback, drained by transcribe_loop.
 audio_queue: queue.Queue = queue.Queue()
 # Transcribed phrases: filled by transcribe_loop, drained by tts_worker.
 tts_queue: queue.Queue = queue.Queue()
 # Set by main() on Ctrl+C; every worker loop checks it to know when to exit.
 stop_event = threading.Event()
 # ── Virtual sink ──────────────────────────────────────────────────────────────
 def run(cmd: str) -> str:
    """Execute *cmd* through the shell and return its stdout, stripped.

    stderr is captured and discarded, and the exit status is not inspected,
    so a failing command simply yields whatever it wrote to stdout
    (usually an empty string).
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        capture_output=True,
        text=True,
    )
    output = completed.stdout
    return output.strip()
 def _pactl_device_names(kind: str) -> set:
    """Return the device names listed by ``pactl list short <kind>``.

    *kind* is "sinks" or "sources". Each output line is split on whitespace
    and the second column is taken as the name; lines with fewer than two
    columns are ignored. An empty set is returned when pactl prints nothing.
    """
    names = set()
    for line in run(f"pactl list short {kind}").splitlines():
        columns = line.split()
        if len(columns) >= 2:
            names.add(columns[1])
    return names
 def setup_virtual_sink():
    """Make sure both the null sink and the virtual microphone source exist.

    The sink and the source are checked separately and by exact name, so a
    previous run that created the sink but failed to create the source is
    repaired instead of being reported as "already exists". If the sink is
    created here and the source then fails to load, the sink module is
    unloaded again so no half-configured state is left behind.

    Exits the process with status 1 when a module cannot be loaded.
    """
    sink_present = SINK_NAME in _pactl_device_names("sinks")
    mic_present = VIRTUAL_MIC in _pactl_device_names("sources")
    if sink_present and mic_present:
        print(f"[sink] {SINK_NAME} already exists")
        return
    sink_module = ""
    if not sink_present:
        print(f"[sink] Creating virtual sink '{SINK_NAME}'...")
        # pactl prints the index of the loaded module on success, nothing on failure.
        sink_module = run(f"pactl load-module module-null-sink sink_name={SINK_NAME} sink_properties=device.description={SINK_NAME}")
        if not sink_module:
            print("[sink] ✗ Failed. Running PulseAudio/Pipewire?")
            sys.exit(1)
    if not mic_present:
        if sink_present:
            print(f"[sink] {SINK_NAME} exists but '{VIRTUAL_MIC}' is missing; creating it...")
        mic_module = run(f"pactl load-module module-virtual-source source_name={VIRTUAL_MIC} master={SINK_NAME}.monitor")
        if not mic_module:
            if sink_module:
                # Roll back the sink we just created so the next run starts clean.
                run(f"pactl unload-module {sink_module}")
            print("[sink] ✗ Failed. Running PulseAudio/Pipewire?")
            sys.exit(1)
    print(f"[sink] ✓ Ready. Set '{VIRTUAL_MIC}' as your mic in apps.")
 # ── Audio capture ─────────────────────────────────────────────────────────────
 def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: report any status flag, then queue the block.

    *frames* and *time_info* are accepted to satisfy the callback signature
    and are not used. The block is copied before being queued so the queued
    array does not alias *indata*.
    """
    if status:
        print(f"[audio] {status}", file=sys.stderr)
    snapshot = indata.copy()
    audio_queue.put(snapshot)
 def capture_thread():
    """Keep a microphone input stream open until shutdown is requested.

    Blocks delivered by the stream are handed to ``audio_callback``. If the
    stream cannot be opened or fails while running, the error is reported on
    stderr and ``stop_event`` is set so the rest of the program shuts down
    instead of waiting forever for audio that will never arrive.
    """
    try:
        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype="float32",
            blocksize=int(SAMPLE_RATE * BLOCK_SECONDS),
            callback=audio_callback,
        ):
            print("[mic] Listening... (Ctrl+C to stop)")
            # Sleep until stop_event is set; the stream runs via its callback meanwhile.
            stop_event.wait()
    except Exception as exc:  # thread boundary: report and trigger shutdown
        print(f"[mic] Capture failed: {exc}", file=sys.stderr)
        stop_event.set()
 # ── Transcribe loop ───────────────────────────────────────────────────────────
 def transcribe_loop(model: WhisperModel):
    """Group captured blocks into utterances and queue their transcriptions.

    Blocks are read from ``audio_queue`` until ``stop_event`` is set. An
    utterance starts with the first block whose RMS reaches SILENCE_THRESH.
    From then on every block is buffered, including quiet ones, so pauses and
    soft syllables inside the utterance reach Whisper intact rather than
    being cut out and spliced. The buffer is flushed after SILENCE_CHUNKS
    consecutive quiet blocks or once it holds MAX_BUFFER_CHUNKS blocks.

    Each non-empty segment text returned by ``model.transcribe`` is printed
    and put on ``tts_queue``.
    """
    buffer = []
    silent_count = 0
    while not stop_event.is_set():
        try:
            chunk = audio_queue.get(timeout=0.5)
        except queue.Empty:
            continue
        rms = float(np.sqrt(np.mean(chunk ** 2)))
        if rms < SILENCE_THRESH:
            silent_count += 1
            # Keep quiet blocks only once an utterance has started; leading
            # silence before any speech is still discarded.
            if buffer:
                buffer.append(chunk)
        else:
            silent_count = 0
            buffer.append(chunk)
        flush = bool(buffer) and (
            silent_count >= SILENCE_CHUNKS
            or len(buffer) >= MAX_BUFFER_CHUNKS
        )
        if not flush:
            continue
        audio_np = np.concatenate(buffer, axis=0).flatten()
        buffer.clear()
        silent_count = 0
        # Stream segments as Whisper produces them — fires per phrase
        segments, _ = model.transcribe(
            audio_np,
            language="en",
            vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 200},
        )
        for segment in segments:
            text = segment.text.strip()
            if text:
                print(f"[transcribed] {text}")
                tts_queue.put(text)
 # ── TTS worker (serial so segments don't overlap) ─────────────────────────────
 def tts_worker():
    """Speak queued phrases one at a time until shutdown is requested.

    A private asyncio event loop is created for this thread because
    ``speak`` is a coroutine. Phrases are taken from ``tts_queue`` and each
    is run to completion before the next, so playback never overlaps. The
    loop is closed when the worker exits, whether normally or by exception.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        while not stop_event.is_set():
            try:
                text = tts_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            loop.run_until_complete(speak(text))
    finally:
        # Release the loop's resources instead of leaving it open.
        loop.close()
 async def speak(text: str):
    """Synthesize *text* with edge-tts and play it to the sink and the default output.

    The audio is saved to a temporary .mp3 file, which two ``paplay``
    processes play at the same time: one into SINK_NAME, one to the default
    output device. Every player that was started is waited on before the
    temp file is removed, even if starting a later one fails. A non-zero
    ``paplay`` exit status is reported instead of being ignored.

    Synthesis and launch errors are printed rather than raised, so one bad
    phrase does not stop the TTS worker.
    """
    import edge_tts
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        tmp_path = f.name
    players = []
    try:
        communicate = edge_tts.Communicate(text, TTS_VOICE)
        await communicate.save(tmp_path)
        # Play to both simultaneously
        players.append(subprocess.Popen(["paplay", "--device", SINK_NAME, tmp_path]))
        players.append(subprocess.Popen(["paplay", tmp_path]))  # default output = your speakers
    except Exception as e:
        print(f"[tts] Error: {e}")
    finally:
        # Reap every started player before deleting the file it is reading.
        for player in players:
            status = player.wait()
            if status != 0:
                print(f"[tts] Error: paplay exited with status {status}")
        os.unlink(tmp_path)
 # ── Main ──────────────────────────────────────────────────────────────────────
 def main():
    """Set up the sink, load Whisper, start the workers and run the transcriber.

    Capture and TTS run in daemon threads; transcription runs in the calling
    thread. On Ctrl+C the stop event is set and each worker is given up to
    two seconds to finish.
    """
    rule = "=" * 55
    print(rule)
    print("  voice_cloak — streaming phrase-by-phrase")
    print(rule)
    setup_virtual_sink()
    print(f"[whisper] Loading '{WHISPER_MODEL}'...")
    model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
    print("[whisper] ✓ Ready")
    workers = [
        threading.Thread(target=entry, daemon=True)
        for entry in (capture_thread, tts_worker)
    ]
    for worker in workers:
        worker.start()
    try:
        transcribe_loop(model)
    except KeyboardInterrupt:
        print("\n[main] Stopping...")
        stop_event.set()
        for worker in workers:
            worker.join(timeout=2)
        print("[main] Done.")
 # Script entry point: start the pipeline only when run directly, not on import.
 if __name__ == "__main__":
    main()