#!/usr/bin/env python3
# Provenance: contents of voice_cloak.py as added by git patch
# 03df49c921672ae3eefd870d90d9e612fdbd50fa ("Upload main file").
"""
voice_cloak.py — Real-time mic → Whisper → TTS → virtual sink
Speaks each phrase as it's transcribed, not after a long silence.

Requirements:
    pip install faster-whisper sounddevice numpy edge-tts
    pactl + paplay  (PulseAudio/Pipewire)

First run: select "VoiceCloakMic" as your mic in Discord/whatever.
"""

import asyncio
import os
import queue
import shlex
import subprocess
import sys
import tempfile
import threading
import time

import numpy as np
import sounddevice as sd
from faster_whisper import WhisperModel

# ── Config ────────────────────────────────────────────────────────────────────
SAMPLE_RATE       = 16000
CHANNELS          = 1
BLOCK_SECONDS     = 0.3      # smaller = more responsive
SILENCE_THRESH    = 0.012    # RMS threshold for silence
SILENCE_CHUNKS    = 5        # silence chunks before flushing (~1.5s)
MAX_BUFFER_CHUNKS = 60       # hard cap ~18s
WHISPER_MODEL     = "base.en"  # tiny.en / base.en / small.en
TTS_VOICE         = "en-US-RogerNeural"
SINK_NAME         = "VoiceCloak"
VIRTUAL_MIC       = "VoiceCloakMic"
# ─────────────────────────────────────────────────────────────────────────────

# Captured microphone blocks, produced by audio_callback.
audio_queue: queue.Queue = queue.Queue()
# Transcribed phrases waiting to be spoken.
tts_queue: queue.Queue = queue.Queue()
# Set to ask every loop in this module to exit.
stop_event = threading.Event()


# ── Virtual sink ──────────────────────────────────────────────────────────────

def run(cmd: str) -> str:
    """Run *cmd* and return its stripped stdout.

    The command string is split into an argument vector with ``shlex`` and
    executed without a shell, so shell metacharacters in *cmd* are not
    interpreted. Returns ``""`` when the command is empty, malformed, or the
    executable cannot be started; the exit status is not inspected.
    """
    try:
        argv = shlex.split(cmd)
        if not argv:
            return ""
        proc = subprocess.run(argv, capture_output=True, text=True)
    except (OSError, ValueError) as exc:
        # Missing binary or unbalanced quoting: report it and behave like a
        # command that produced no output so callers take their failure path.
        print(f"[run] {exc}", file=sys.stderr)
        return ""
    return proc.stdout.strip()


def setup_virtual_sink():
    """Ensure the null sink and the virtual mic source both exist.

    Each of the two devices is looked up separately, so a sink left behind
    without its virtual source is repaired instead of being reported as ready.
    Exits the process with status 1 when a required module cannot be loaded.
    """
    have_sink = SINK_NAME in run("pactl list short sinks")
    have_mic = VIRTUAL_MIC in run("pactl list short sources")
    if have_sink and have_mic:
        print(f"[sink] {SINK_NAME} already exists")
        return

    loaded = True
    if not have_sink:
        print(f"[sink] Creating virtual sink '{SINK_NAME}'...")
        # pactl prints the new module index on success, nothing on failure.
        loaded = bool(run(
            f"pactl load-module module-null-sink sink_name={SINK_NAME} "
            f"sink_properties=device.description={SINK_NAME}"
        ))
    if loaded and not have_mic:
        print(f"[sink] Creating virtual mic '{VIRTUAL_MIC}'...")
        loaded = bool(run(
            f"pactl load-module module-virtual-source source_name={VIRTUAL_MIC} "
            f"master={SINK_NAME}.monitor"
        ))

    if loaded:
        print(f"[sink] ✓ Ready. Set '{VIRTUAL_MIC}' as your mic in apps.")
    else:
        print("[sink] ✗ Failed. Running PulseAudio/Pipewire?")
        sys.exit(1)


# ── Audio capture ─────────────────────────────────────────────────────────────

def audio_callback(indata, frames, time_info, status):
    """Stream callback: log any status flag and queue a copy of the block."""
    if status:
        print(f"[audio] {status}", file=sys.stderr)
    # Copy before queueing — presumably the stream reuses `indata` after the
    # callback returns (TODO confirm against the sounddevice docs).
    audio_queue.put(indata.copy())


def capture_thread():
    """Keep the microphone stream open until ``stop_event`` is set.

    If the stream cannot be opened or fails, the error is reported and
    ``stop_event`` is set so the rest of the program shuts down instead of
    waiting forever for audio that will never arrive.
    """
    try:
        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype="float32",
            blocksize=int(SAMPLE_RATE * BLOCK_SECONDS),
            callback=audio_callback,
        ):
            print("[mic] Listening... (Ctrl+C to stop)")
            stop_event.wait()
    except Exception as exc:  # thread boundary: report, then stop everything
        print(f"[mic] Capture failed: {exc}", file=sys.stderr)
        stop_event.set()


# ── Transcribe loop ───────────────────────────────────────────────────────────

def transcribe_loop(model: WhisperModel):
    """Group captured blocks into utterances and queue their transcriptions.

    Blocks whose RMS is below ``SILENCE_THRESH`` are counted but not stored.
    The stored blocks are transcribed once ``SILENCE_CHUNKS`` quiet blocks
    arrive in a row or ``MAX_BUFFER_CHUNKS`` blocks have accumulated. Every
    non-empty segment text is printed and put on ``tts_queue``. Runs until
    ``stop_event`` is set.
    """
    buffer = []
    silent_count = 0

    while not stop_event.is_set():
        try:
            chunk = audio_queue.get(timeout=0.5)
        except queue.Empty:
            # Timed out: loop around so stop_event is re-checked regularly.
            continue

        rms = float(np.sqrt(np.mean(chunk ** 2)))

        if rms < SILENCE_THRESH:
            silent_count += 1
        else:
            silent_count = 0
            buffer.append(chunk)

        flush = bool(buffer) and (
            silent_count >= SILENCE_CHUNKS
            or len(buffer) >= MAX_BUFFER_CHUNKS
        )
        if not flush:
            continue

        # Join the stored blocks into one 1-D sample array for Whisper.
        audio_np = np.concatenate(buffer, axis=0).flatten()
        buffer.clear()
        silent_count = 0

        # Stream segments as Whisper produces them — fires per phrase
        segments, _ = model.transcribe(
            audio_np,
            language="en",
            vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 200},
        )

        for segment in segments:
            text = segment.text.strip()
            if text:
                print(f"[transcribed] {text}")
                tts_queue.put(text)


# ── TTS worker (serial so segments don't overlap)
# ─────────────────────────────

def tts_worker():
    """Speak queued phrases one at a time until ``stop_event`` is set.

    Runs a private asyncio event loop in this thread and drives ``speak`` to
    completion for each phrase, so phrases never overlap. The loop is closed
    when the worker exits.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        while not stop_event.is_set():
            try:
                text = tts_queue.get(timeout=0.5)
            except queue.Empty:
                # Timed out: loop around so stop_event is re-checked regularly.
                continue
            loop.run_until_complete(speak(text))
    finally:
        loop.close()


async def speak(text: str):
    """Synthesize *text* with edge-tts and play it on the sink and speakers.

    The audio is saved to a temporary ``.mp3`` and played by two ``paplay``
    processes at once: one targeting ``SINK_NAME``, one on the default output.
    Blank text is ignored. Any failure (missing ``edge_tts``, synthesis error,
    missing ``paplay``) is reported and swallowed so the worker keeps running;
    started players are always waited for and the temp file always removed.
    """
    if not text or not text.strip():
        return

    tmp_path = None
    players = []
    try:
        # Imported lazily and inside the try so a missing package is reported
        # like any other TTS error instead of killing the worker thread.
        import edge_tts

        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
            tmp_path = f.name
        communicate = edge_tts.Communicate(text, TTS_VOICE)
        await communicate.save(tmp_path)
        # Play to both simultaneously
        players.append(subprocess.Popen(["paplay", "--device", SINK_NAME, tmp_path]))
        players.append(subprocess.Popen(["paplay", tmp_path]))  # default output = your speakers
    except Exception as e:  # best-effort boundary: one bad phrase must not stop TTS
        print(f"[tts] Error: {e}")
    finally:
        # Wait for whatever was started before deleting the file it is reading.
        for player in players:
            player.wait()
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # already gone; nothing left to clean up


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """Set up the virtual devices, load Whisper, and run the pipeline.

    Starts the capture and TTS worker threads, then runs the transcription
    loop in the calling thread. However that loop ends, ``stop_event`` is set
    and the workers are given up to two seconds each to finish.
    """
    print("=" * 55)
    print("  voice_cloak — streaming phrase-by-phrase")
    print("=" * 55)

    setup_virtual_sink()

    print(f"[whisper] Loading '{WHISPER_MODEL}'...")
    model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
    print("[whisper] ✓ Ready")

    threads = [
        threading.Thread(target=capture_thread, daemon=True),
        threading.Thread(target=tts_worker, daemon=True),
    ]
    for t in threads:
        t.start()

    try:
        transcribe_loop(model)
    except KeyboardInterrupt:
        print("\n[main] Stopping...")
    finally:
        # Signal the workers on every exit path, not only on Ctrl+C.
        stop_event.set()
        for t in threads:
            t.join(timeout=2)
    print("[main] Done.")


if __name__ == "__main__":
    main()