Upload main file
This commit is contained in:
189
voice_cloak.py
Normal file
189
voice_cloak.py
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
voice_cloak.py — Real-time mic → Whisper → TTS → virtual sink
|
||||||
|
Speaks each phrase as it's transcribed, not after a long silence.
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
pip install faster-whisper sounddevice numpy edge-tts
|
||||||
|
pactl and paplay (PulseAudio, or PipeWire with its PulseAudio-compatible server)
|
||||||
|
|
||||||
|
First run: select "VoiceCloakMic" as your mic in Discord/whatever.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import threading
|
||||||
|
import queue
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import sounddevice as sd
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
|
# ── Config ────────────────────────────────────────────────────────────────────
|
||||||
|
SAMPLE_RATE = 16000  # sample rate passed to sd.InputStream
CHANNELS = 1  # number of input channels requested from the microphone
BLOCK_SECONDS = 0.3  # length of one captured block; smaller = more responsive
SILENCE_THRESH = 0.012  # RMS threshold for silence
SILENCE_CHUNKS = 5  # silence chunks before flushing (~1.5s)
MAX_BUFFER_CHUNKS = 60  # hard cap ~18s
WHISPER_MODEL = "base.en"  # tiny.en / base.en / small.en
TTS_VOICE = "en-US-RogerNeural"  # voice name handed to edge_tts.Communicate
SINK_NAME = "VoiceCloak"  # null sink that the synthesized speech is played into
VIRTUAL_MIC = "VoiceCloakMic"  # source name given to the virtual microphone
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Captured audio blocks: filled by audio_callback, drained by transcribe_loop.
audio_queue: queue.Queue = queue.Queue()
# Transcribed phrases: filled by transcribe_loop, drained by tts_worker.
tts_queue: queue.Queue = queue.Queue()
# Shutdown flag: once set, the capture, transcribe and TTS loops exit.
stop_event = threading.Event()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Virtual sink ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def run(cmd: str) -> str:
    """Execute *cmd* through the shell and return its stripped standard output.

    Standard error is captured and discarded, and the exit status is not
    inspected, so a failing command simply yields whatever it wrote to stdout
    (usually an empty string).
    """
    completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return completed.stdout.strip()
|
||||||
|
|
||||||
|
def setup_virtual_sink():
    """Make sure the null sink and its virtual microphone source exist.

    Returns immediately when a sink named exactly ``SINK_NAME`` is already
    listed by ``pactl list short sinks``. Otherwise loads ``module-null-sink``
    and then ``module-virtual-source`` on the sink's monitor.

    If either module fails to load, any module that did load is unloaded
    again and the process exits with status 1, so a later run does not find a
    half-configured sink and skip creating the microphone.
    """
    # `pactl list short sinks` prints one tab-separated row per sink with the
    # sink name in the second column. Compare that column exactly so a sink
    # whose name merely contains SINK_NAME is not mistaken for ours.
    rows = (line.split("\t") for line in run("pactl list short sinks").splitlines())
    existing_sinks = {cols[1] for cols in rows if len(cols) > 1}
    if SINK_NAME in existing_sinks:
        print(f"[sink] {SINK_NAME} already exists")
        return

    print(f"[sink] Creating virtual sink '{SINK_NAME}'...")
    # On success `pactl load-module` prints the new module's index; an empty
    # string therefore means the load failed.
    mod1 = run(f"pactl load-module module-null-sink sink_name={SINK_NAME} sink_properties=device.description={SINK_NAME}")
    # The virtual source needs the sink's monitor, so skip it if the sink failed.
    mod2 = run(f"pactl load-module module-virtual-source source_name={VIRTUAL_MIC} master={SINK_NAME}.monitor") if mod1 else ""

    if mod1 and mod2:
        print(f"[sink] ✓ Ready. Set '{VIRTUAL_MIC}' as your mic in apps.")
        return

    if mod1:
        # Roll back the sink so the next start retries the full setup.
        run(f"pactl unload-module {mod1}")
    print("[sink] ✗ Failed. Running PulseAudio/Pipewire?")
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Audio capture ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: report any status flag and enqueue the block.

    Args:
        indata: Audio block delivered by the stream; a copy of it is queued.
        frames: Unused.
        time_info: Unused.
        status: Stream status; when truthy it is printed to stderr.
    """
    if status:
        print(f"[audio] {status}", file=sys.stderr)
    # Queue a copy, not indata itself, so the queued block is independent of
    # whatever the caller does with its buffer afterwards.
    captured = indata.copy()
    audio_queue.put(captured)
|
||||||
|
|
||||||
|
def capture_thread():
    """Keep the microphone input stream open until ``stop_event`` is set.

    The stream delivers blocks of ``BLOCK_SECONDS`` of audio to
    ``audio_callback``. If the stream cannot be opened or fails while
    running, the error is printed and ``stop_event`` is set so the rest of
    the program shuts down instead of waiting forever for audio that will
    never arrive.
    """
    try:
        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype="float32",
            blocksize=int(SAMPLE_RATE * BLOCK_SECONDS),
            callback=audio_callback,
        ):
            print("[mic] Listening... (Ctrl+C to stop)")
            # Sleep until shutdown is requested; the callback does the work.
            stop_event.wait()
    except Exception as exc:  # thread boundary: report and stop the program
        print(f"[mic] Error: {exc}", file=sys.stderr)
        stop_event.set()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Transcribe loop ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def transcribe_loop(model: WhisperModel):
    """Buffer microphone chunks and send each flushed buffer to Whisper.

    Pulls chunks from ``audio_queue`` until ``stop_event`` is set and counts
    consecutive chunks whose RMS is below ``SILENCE_THRESH``. The buffer is
    flushed when that count reaches ``SILENCE_CHUNKS`` or the buffer holds
    ``MAX_BUFFER_CHUNKS`` chunks. Each non-empty segment text returned for a
    flushed buffer is printed and put on ``tts_queue``.

    Args:
        model: Whisper model whose ``transcribe`` method is called on every
            flushed buffer.
    """
    buffer = []
    silent_count = 0

    while not stop_event.is_set():
        try:
            # Time out so the stop_event check above runs at least every 0.5 s.
            chunk = audio_queue.get(timeout=0.5)
        except queue.Empty:
            continue

        # Root-mean-square level of the chunk, used as a loudness measure.
        rms = float(np.sqrt(np.mean(chunk ** 2)))

        if rms < SILENCE_THRESH:
            silent_count += 1
        else:
            # Any loud chunk restarts the run of consecutive quiet chunks.
            silent_count = 0
        # NOTE(review): indentation was lost in the copy this was reviewed
        # from; the append is assumed to run for every chunk (loud or quiet)
        # rather than only in the `else` branch — confirm against the original.
        buffer.append(chunk)

        # Truthy only when there is buffered audio AND either enough trailing
        # quiet chunks were seen or the buffer reached its size cap.
        flush = buffer and (
            silent_count >= SILENCE_CHUNKS or
            len(buffer) >= MAX_BUFFER_CHUNKS
        )

        if flush:
            # Join the buffered chunks into one 1-D array and reset the state.
            audio_np = np.concatenate(buffer, axis=0).flatten()
            buffer.clear()
            silent_count = 0

            # Stream segments as Whisper produces them — fires per phrase
            segments, _ = model.transcribe(
                audio_np,
                language="en",
                vad_filter=True,
                vad_parameters={"min_silence_duration_ms": 200},
            )

            for segment in segments:
                text = segment.text.strip()
                # Skip segments that are empty after stripping whitespace.
                if text:
                    print(f"[transcribed] {text}")
                    tts_queue.put(text)
|
||||||
|
|
||||||
|
|
||||||
|
# ── TTS worker (serial so segments don't overlap) ─────────────────────────────
|
||||||
|
|
||||||
|
def tts_worker():
    """Speak queued phrases one at a time until ``stop_event`` is set.

    Runs a private asyncio event loop in this thread and drives ``speak`` to
    completion for each text taken from ``tts_queue``, so phrases never
    overlap. The loop is closed when the worker exits, whether it stops
    normally or because of an exception.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        while not stop_event.is_set():
            try:
                # Time out so the stop_event check runs at least every 0.5 s.
                text = tts_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            loop.run_until_complete(speak(text))
    finally:
        # Release the loop's resources instead of leaving it open.
        loop.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def speak(text: str):
    """Synthesize *text* with edge-tts and play it on the sink and the speakers.

    The audio is saved to a temporary ``.mp3`` file and played by two
    ``paplay`` processes at once: one targeting ``SINK_NAME`` and one using
    the default output. The file is removed afterwards.

    Errors from synthesis or from starting a player are printed rather than
    raised. A player that exits with a non-zero status is reported too, so a
    playback failure is not silent.

    Args:
        text: Phrase to speak.
    """
    import edge_tts

    # Only the path is needed; the handle is closed as soon as the block ends.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        tmp_path = f.name

    players = []
    try:
        communicate = edge_tts.Communicate(text, TTS_VOICE)
        await communicate.save(tmp_path)
        # Play to both simultaneously
        players.append(subprocess.Popen(["paplay", "--device", SINK_NAME, tmp_path]))
        players.append(subprocess.Popen(["paplay", tmp_path]))  # default output = your speakers
    except Exception as e:
        print(f"[tts] Error: {e}")
    finally:
        # Wait for every player that actually started, even if a later step
        # failed, so the file is never removed while it is still being read.
        for player in players:
            status = player.wait()
            if status != 0:
                print(f"[tts] Error: {' '.join(player.args)} exited with status {status}")
        try:
            os.unlink(tmp_path)
        except OSError as e:
            # A cleanup failure must not take down the TTS worker thread.
            print(f"[tts] Error: could not remove {tmp_path}: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main():
    """Set up the virtual sink, load Whisper, and run the pipeline until Ctrl+C.

    Capture and TTS run in daemon threads; transcription runs in the calling
    thread. On KeyboardInterrupt the stop flag is set and each worker is given
    up to two seconds to finish.
    """
    rule = "=" * 55
    print(rule)
    print(" voice_cloak — streaming phrase-by-phrase")
    print(rule)

    setup_virtual_sink()

    print(f"[whisper] Loading '{WHISPER_MODEL}'...")
    model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
    print("[whisper] ✓ Ready")

    # One daemon thread per background stage, in the original start order.
    workers = [
        threading.Thread(target=stage, daemon=True)
        for stage in (capture_thread, tts_worker)
    ]
    for worker in workers:
        worker.start()

    try:
        transcribe_loop(model)
    except KeyboardInterrupt:
        print("\n[main] Stopping...")
        stop_event.set()

    for worker in workers:
        worker.join(timeout=2)
    print("[main] Done.")
|
||||||
|
|
||||||
|
|
||||||
|
# Start the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user