# voice_cloak.py

#!/usr/bin/env python3
"""
voice_cloak.py — Real-time mic → Whisper → TTS → virtual sink
Speaks each phrase as it's transcribed, not after a long silence.

Requirements:
    pip install faster-whisper sounddevice numpy edge-tts
    pactl / pw-cli (PulseAudio/Pipewire)

First run: select "VoiceCloakMic" as your mic in Discord/whatever.
"""

import asyncio
import subprocess
import tempfile
import threading
import queue
import sys
import time
import os
import numpy as np
import sounddevice as sd
from faster_whisper import WhisperModel

# ── Config ────────────────────────────────────────────────────────────────────
SAMPLE_RATE       = 16000        # capture sample rate (Hz) requested from the input stream
CHANNELS          = 1            # number of input channels requested from the input stream
BLOCK_SECONDS     = 0.3          # length of one captured block; smaller = more responsive
SILENCE_THRESH    = 0.012        # a block whose RMS is below this counts as silence
SILENCE_CHUNKS    = 5            # consecutive silent blocks before flushing (5 x 0.3s ~ 1.5s)
MAX_BUFFER_CHUNKS = 60           # flush once this many blocks are buffered (60 x 0.3s ~ 18s)
WHISPER_MODEL     = "base.en"    # tiny.en / base.en / small.en
TTS_VOICE         = "en-US-RogerNeural"  # voice name handed to edge_tts.Communicate
SINK_NAME         = "VoiceCloak"         # name of the null sink that TTS audio is played into
VIRTUAL_MIC       = "VoiceCloakMic"      # name of the virtual source apps select as their mic
# ─────────────────────────────────────────────────────────────────────────────

# Captured audio blocks: filled by audio_callback, drained by transcribe_loop.
audio_queue: queue.Queue = queue.Queue()
# Transcribed phrases: filled by transcribe_loop, drained by tts_worker.
tts_queue: queue.Queue = queue.Queue()
# Set to request shutdown; the worker loops poll it to know when to stop.
stop_event = threading.Event()


# ── Virtual sink ──────────────────────────────────────────────────────────────

def run(cmd: str) -> str:
    """Execute *cmd* through the shell and return its stdout, stripped.

    stderr is captured and discarded and the exit status is not checked, so a
    failing command simply yields whatever it wrote to stdout (often "").
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        capture_output=True,
        text=True,
    )
    output = completed.stdout
    return output.strip()

def setup_virtual_sink():
    """Make sure the null sink and its virtual-microphone source are loaded.

    If a sink named exactly SINK_NAME is already listed by
    ``pactl list short sinks`` nothing is loaded. Otherwise ``module-null-sink``
    is loaded, followed by ``module-virtual-source`` fed from the sink's monitor.

    Exits the process with status 1 if either module fails to load. When the
    sink loads but the virtual source does not, the sink module is unloaded
    again so that a later run does not mistake the half-finished setup for a
    complete one and skip creating the microphone.
    """
    # Rows of the short listing are "<index> <name> <driver> ...". Compare the
    # name column exactly so that e.g. "VoiceCloak2" is not taken for our sink.
    sink_names = set()
    for row in run("pactl list short sinks").splitlines():
        columns = row.split()
        if len(columns) >= 2:
            sink_names.add(columns[1])

    if SINK_NAME in sink_names:
        print(f"[sink] {SINK_NAME} already exists")
        return

    print(f"[sink] Creating virtual sink '{SINK_NAME}'...")
    # `pactl load-module` prints the new module's index on stdout; run() returns
    # an empty string when the command produced no stdout (i.e. it failed).
    sink_module = run(f"pactl load-module module-null-sink sink_name={SINK_NAME} sink_properties=device.description={SINK_NAME}")
    mic_module = ""
    if sink_module:
        # Only attempt the source once its master sink actually exists.
        mic_module = run(f"pactl load-module module-virtual-source source_name={VIRTUAL_MIC} master={SINK_NAME}.monitor")
        if not mic_module:
            # Roll back the sink so the next start retries the whole setup.
            run(f"pactl unload-module {sink_module}")

    if sink_module and mic_module:
        print(f"[sink] ✓ Ready. Set '{VIRTUAL_MIC}' as your mic in apps.")
    else:
        print("[sink] ✗ Failed. Running PulseAudio/Pipewire?")
        sys.exit(1)


# ── Audio capture ─────────────────────────────────────────────────────────────

def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: report status flags and enqueue the block.

    Registered as the ``callback`` of the stream opened in ``capture_thread``.
    ``frames`` and ``time_info`` are accepted but not used.
    """
    if status:
        print(f"[audio] {status}", file=sys.stderr)
    # Queue a copy so the consumer's data does not alias the callback's buffer.
    block = indata.copy()
    audio_queue.put(block)

def capture_thread():
    """Hold the microphone input stream open until shutdown is requested.

    Opens a float32 ``sd.InputStream`` with CHANNELS channels at SAMPLE_RATE,
    delivering blocks of BLOCK_SECONDS to ``audio_callback``, and keeps it open
    until ``stop_event`` is set.

    If the stream cannot be opened (or raises while closing), the error is
    reported on stderr and ``stop_event`` is set, so the other loops exit
    instead of waiting forever for audio that will never arrive.
    """
    try:
        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype="float32",
            blocksize=int(SAMPLE_RATE * BLOCK_SECONDS),
            callback=audio_callback,
        ):
            print("[mic] Listening... (Ctrl+C to stop)")
            # Block until shutdown is requested; no need to poll with sleep().
            stop_event.wait()
    except Exception as exc:  # thread boundary: nothing above us can handle it
        print(f"[mic] Capture failed: {exc}", file=sys.stderr)
        stop_event.set()


# ── Transcribe loop ───────────────────────────────────────────────────────────

def transcribe_loop(model: WhisperModel):
    """Group captured blocks into utterances, transcribe them, and queue the text.

    Blocks are read from ``audio_queue`` until ``stop_event`` is set. Leading
    silence is dropped; once a non-silent block arrives, every following block
    (silent or not) is buffered so the audio handed to Whisper stays contiguous
    and its VAD can still see the pauses between phrases. The buffer is flushed
    after SILENCE_CHUNKS consecutive silent blocks or when it reaches
    MAX_BUFFER_CHUNKS blocks.

    Each non-empty transcribed segment is printed and put on ``tts_queue``.

    Args:
        model: the loaded WhisperModel used for transcription.
    """
    buffer = []        # blocks of the utterance currently being collected
    silent_count = 0   # consecutive silent blocks at the tail of `buffer`

    while not stop_event.is_set():
        try:
            chunk = audio_queue.get(timeout=0.5)
        except queue.Empty:
            # Timeout only exists so the stop flag is re-checked regularly.
            continue

        rms = float(np.sqrt(np.mean(chunk ** 2)))
        is_silent = rms < SILENCE_THRESH

        if is_silent and not buffer:
            # Nothing has been said yet: do not start an utterance with silence.
            continue

        # Inside an utterance, keep pauses too; dropping them would splice
        # separate words together and hide phrase boundaries from the VAD.
        buffer.append(chunk)
        silent_count = silent_count + 1 if is_silent else 0

        utterance_ended = silent_count >= SILENCE_CHUNKS
        buffer_full = len(buffer) >= MAX_BUFFER_CHUNKS
        if not (utterance_ended or buffer_full):
            continue

        # Blocks are concatenated along the frame axis and flattened to 1-D.
        audio_np = np.concatenate(buffer, axis=0).flatten()
        buffer.clear()
        silent_count = 0

        # Stream segments as Whisper produces them — fires per phrase
        segments, _ = model.transcribe(
            audio_np,
            language="en",
            vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 200},
        )

        for segment in segments:
            text = segment.text.strip()
            if text:
                print(f"[transcribed] {text}")
                tts_queue.put(text)


# ── TTS worker (serial so segments don't overlap) ─────────────────────────────

def tts_worker():
    """Speak queued phrases one at a time until shutdown is requested.

    Creates an event loop for this thread, then repeatedly takes text from
    ``tts_queue`` and runs ``speak`` to completion on it, so phrases never
    overlap. The loop is closed when the worker exits, whether because
    ``stop_event`` was set or because ``speak`` raised.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        while not stop_event.is_set():
            try:
                text = tts_queue.get(timeout=0.5)
            except queue.Empty:
                # Timeout only exists so the stop flag is re-checked regularly.
                continue
            loop.run_until_complete(speak(text))
    finally:
        # Release the loop's resources instead of leaking them on exit.
        loop.close()


async def speak(text: str):
    """Synthesize *text* with edge-tts and play it on the sink and the default output.

    The audio is saved to a temporary .mp3 file, played by two ``paplay``
    processes started back to back (one targeting SINK_NAME, one the default
    device), and the file is removed afterwards. Errors during synthesis or
    playback are printed rather than raised.
    """
    import edge_tts

    # Reserve a path only; the handle is closed before anything writes to it.
    handle = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    with handle:
        tmp_path = handle.name

    try:
        await edge_tts.Communicate(text, TTS_VOICE).save(tmp_path)
        # Play to both simultaneously
        players = [
            subprocess.Popen(["paplay", "--device", SINK_NAME, tmp_path]),
            subprocess.Popen(["paplay", tmp_path]),  # default output = your speakers
        ]
        for player in players:
            player.wait()
    except Exception as e:
        print(f"[tts] Error: {e}")
    finally:
        os.unlink(tmp_path)


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """Set up the sink, load Whisper, start the workers and run the transcriber.

    The capture and TTS workers run as daemon threads while transcription runs
    in the calling thread. On Ctrl+C the stop flag is set and each worker is
    given up to two seconds to finish.
    """
    banner = "=" * 55
    print(banner)
    print("  voice_cloak — streaming phrase-by-phrase")
    print(banner)

    setup_virtual_sink()

    print(f"[whisper] Loading '{WHISPER_MODEL}'...")
    model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
    print("[whisper] ✓ Ready")

    workers = [
        threading.Thread(target=target, daemon=True)
        for target in (capture_thread, tts_worker)
    ]
    for worker in workers:
        worker.start()

    try:
        transcribe_loop(model)
    except KeyboardInterrupt:
        print("\n[main] Stopping...")
        stop_event.set()
        for worker in workers:
            worker.join(timeout=2)
        print("[main] Done.")


# Start the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Upload main file 2026-04-05 16:56:06 +00:00			`#!/usr/bin/env python3`
			`"""`
			`voice_cloak.py — Real-time mic → Whisper → TTS → virtual sink`
			`Speaks each phrase as it's transcribed, not after a long silence.`

			`Requirements:`
			`pip install faster-whisper sounddevice numpy edge-tts`
			`pactl / pw-cli (PulseAudio/Pipewire)`

			`First run: select "VoiceCloakMic" as your mic in Discord/whatever.`
			`"""`

			`import asyncio`
			`import subprocess`
			`import tempfile`
			`import threading`
			`import queue`
			`import sys`
			`import time`
			`import os`
			`import numpy as np`
			`import sounddevice as sd`
			`from faster_whisper import WhisperModel`

			`# ── Config ────────────────────────────────────────────────────────────────────`
			`SAMPLE_RATE = 16000`
			`CHANNELS = 1`
			`BLOCK_SECONDS = 0.3 # smaller = more responsive`
			`SILENCE_THRESH = 0.012 # RMS threshold for silence`
			`SILENCE_CHUNKS = 5 # silence chunks before flushing (~1.5s)`
			`MAX_BUFFER_CHUNKS = 60 # hard cap ~18s`
			`WHISPER_MODEL = "base.en" # tiny.en / base.en / small.en`
			`TTS_VOICE = "en-US-RogerNeural"`
			`SINK_NAME = "VoiceCloak"`
			`VIRTUAL_MIC = "VoiceCloakMic"`
			`# ─────────────────────────────────────────────────────────────────────────────`

			`audio_queue: queue.Queue = queue.Queue()`
			`tts_queue: queue.Queue = queue.Queue()`
			`stop_event = threading.Event()`


			`# ── Virtual sink ──────────────────────────────────────────────────────────────`

			`def run(cmd: str) -> str:`
			`return subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout.strip()`

			`def setup_virtual_sink():`
			`if SINK_NAME in run("pactl list short sinks"):`
			`print(f"[sink] {SINK_NAME} already exists")`
			`return`
			`print(f"[sink] Creating virtual sink '{SINK_NAME}'...")`
			`mod1 = run(f"pactl load-module module-null-sink sink_name={SINK_NAME} sink_properties=device.description={SINK_NAME}")`
			`mod2 = run(f"pactl load-module module-virtual-source source_name={VIRTUAL_MIC} master={SINK_NAME}.monitor")`
			`if mod1 and mod2:`
			`print(f"[sink] ✓ Ready. Set '{VIRTUAL_MIC}' as your mic in apps.")`
			`else:`
			`print("[sink] ✗ Failed. Running PulseAudio/Pipewire?")`
			`sys.exit(1)`


			`# ── Audio capture ─────────────────────────────────────────────────────────────`

			`def audio_callback(indata, frames, time_info, status):`
			`if status:`
			`print(f"[audio] {status}", file=sys.stderr)`
			`audio_queue.put(indata.copy())`

			`def capture_thread():`
			`with sd.InputStream(`
			`samplerate=SAMPLE_RATE,`
			`channels=CHANNELS,`
			`dtype="float32",`
			`blocksize=int(SAMPLE_RATE * BLOCK_SECONDS),`
			`callback=audio_callback,`
			`):`
			`print("[mic] Listening... (Ctrl+C to stop)")`
			`while not stop_event.is_set():`
			`time.sleep(0.1)`


			`# ── Transcribe loop ───────────────────────────────────────────────────────────`

			`def transcribe_loop(model: WhisperModel):`
			`buffer = []`
			`silent_count = 0`

			`while not stop_event.is_set():`
			`try:`
			`chunk = audio_queue.get(timeout=0.5)`
			`except queue.Empty:`
			`continue`

			`rms = float(np.sqrt(np.mean(chunk ** 2)))`

			`if rms < SILENCE_THRESH:`
			`silent_count += 1`
			`else:`
			`silent_count = 0`
			`buffer.append(chunk)`

			`flush = buffer and (`
			`silent_count >= SILENCE_CHUNKS or`
			`len(buffer) >= MAX_BUFFER_CHUNKS`
			`)`

			`if flush:`
			`audio_np = np.concatenate(buffer, axis=0).flatten()`
			`buffer.clear()`
			`silent_count = 0`

			`# Stream segments as Whisper produces them — fires per phrase`
			`segments, _ = model.transcribe(`
			`audio_np,`
			`language="en",`
			`vad_filter=True,`
			`vad_parameters={"min_silence_duration_ms": 200},`
			`)`

			`for segment in segments:`
			`text = segment.text.strip()`
			`if text:`
			`print(f"[transcribed] {text}")`
			`tts_queue.put(text)`


			`# ── TTS worker (serial so segments don't overlap) ─────────────────────────────`

			`def tts_worker():`
			`loop = asyncio.new_event_loop()`
			`asyncio.set_event_loop(loop)`
			`while not stop_event.is_set():`
			`try:`
			`text = tts_queue.get(timeout=0.5)`
			`except queue.Empty:`
			`continue`
			`loop.run_until_complete(speak(text))`


			`async def speak(text: str):`
			`import edge_tts`
			`with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:`
			`tmp_path = f.name`
			`try:`
			`communicate = edge_tts.Communicate(text, TTS_VOICE)`
			`await communicate.save(tmp_path)`
			`# Play to both simultaneously`
			`p1 = subprocess.Popen(["paplay", "--device", SINK_NAME, tmp_path])`
			`p2 = subprocess.Popen(["paplay", tmp_path]) # default output = your speakers`
			`p1.wait()`
			`p2.wait()`
			`except Exception as e:`
			`print(f"[tts] Error: {e}")`
			`finally:`
			`os.unlink(tmp_path)`


			`# ── Main ──────────────────────────────────────────────────────────────────────`

			`def main():`
			`print("=" * 55)`
			`print(" voice_cloak — streaming phrase-by-phrase")`
			`print("=" * 55)`

			`setup_virtual_sink()`

			`print(f"[whisper] Loading '{WHISPER_MODEL}'...")`
			`model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")`
			`print("[whisper] ✓ Ready")`

			`threads = [`
			`threading.Thread(target=capture_thread, daemon=True),`
			`threading.Thread(target=tts_worker, daemon=True),`
			`]`
			`for t in threads:`
			`t.start()`

			`try:`
			`transcribe_loop(model)`
			`except KeyboardInterrupt:`
			`print("\n[main] Stopping...")`
			`stop_event.set()`
			`for t in threads:`
			`t.join(timeout=2)`
			`print("[main] Done.")`


			`if __name__ == "__main__":`
			`main()`