#!/usr/bin/env python3
"""
voice_cloak.py — Real-time mic → Whisper → TTS → virtual sink
Speaks each phrase as it's transcribed, not after a long silence.

Requirements:
    pip install faster-whisper sounddevice numpy edge-tts
    pactl and paplay on PATH (PulseAudio, or PipeWire with pipewire-pulse)

First run: select "VoiceCloakMic" as your mic in Discord/whatever.
"""

import asyncio
import subprocess
import tempfile
import threading
import queue
import sys
import time
import os
import numpy as np
import sounddevice as sd
from faster_whisper import WhisperModel

# ── Config ────────────────────────────────────────────────────────────────────
# Capture settings are passed straight to the sounddevice input stream; one
# "chunk" below is one captured block of BLOCK_SECONDS.
SAMPLE_RATE       = 16000        # capture sample rate in Hz
CHANNELS          = 1            # number of input channels (mono)
BLOCK_SECONDS     = 0.3          # length of one captured block; smaller = more responsive
SILENCE_THRESH    = 0.012        # a block whose RMS is below this counts as silence
SILENCE_CHUNKS    = 5            # consecutive silent blocks before flushing (5 × 0.3s ≈ 1.5s)
MAX_BUFFER_CHUNKS = 60           # hard cap on buffered blocks (60 × 0.3s ≈ 18s of audio)
WHISPER_MODEL     = "base.en"    # tiny.en / base.en / small.en
TTS_VOICE         = "en-US-RogerNeural"   # voice name handed to edge-tts
SINK_NAME         = "VoiceCloak"          # null sink the synthesized speech is played into
VIRTUAL_MIC       = "VoiceCloakMic"       # virtual source created on that sink's monitor
# ─────────────────────────────────────────────────────────────────────────────

# Captured audio blocks, produced by audio_callback and consumed by transcribe_loop.
audio_queue: queue.Queue = queue.Queue()
# Transcribed phrases, produced by transcribe_loop and consumed by tts_worker.
tts_queue: queue.Queue = queue.Queue()
# Set by main() on shutdown; every worker loop polls it to know when to exit.
stop_event = threading.Event()


# ── Virtual sink ──────────────────────────────────────────────────────────────

def run(cmd: str) -> str:
    """Run *cmd* through the shell and return its stdout with whitespace stripped.

    stderr is captured and discarded, and the exit status is not checked, so
    a failing command simply yields whatever it wrote to stdout (often "").
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    output = completed.stdout
    return output.strip()

def setup_virtual_sink():
    """Ensure the null sink and the virtual microphone fed by it exist.

    Does nothing when a sink named exactly SINK_NAME is already listed by
    ``pactl list short sinks``. Otherwise loads ``module-null-sink`` and then
    ``module-virtual-source`` on the sink's monitor.

    Exits the process with status 1 if either module fails to load. A null
    sink loaded by this call is unloaded first, so a half-finished setup is
    not mistaken for a working one on the next run.
    """
    # Compare whole sink names (second column of the short listing) instead of
    # substring-matching the raw output, so a sink such as "VoiceCloak2" or a
    # driver/state column containing the text is not treated as a hit.
    existing_sinks = set()
    for line in run("pactl list short sinks").splitlines():
        fields = line.split()
        if len(fields) > 1:
            existing_sinks.add(fields[1])

    if SINK_NAME in existing_sinks:
        print(f"[sink] {SINK_NAME} already exists")
        return

    print(f"[sink] Creating virtual sink '{SINK_NAME}'...")
    # `pactl load-module` prints the new module's index on success and
    # nothing on stdout on failure, so an empty string means "failed".
    mod1 = run(f"pactl load-module module-null-sink sink_name={SINK_NAME} sink_properties=device.description={SINK_NAME}")
    # The virtual source needs the sink's monitor; skip it if the sink failed.
    mod2 = ""
    if mod1:
        mod2 = run(f"pactl load-module module-virtual-source source_name={VIRTUAL_MIC} master={SINK_NAME}.monitor")

    if mod1 and mod2:
        print(f"[sink] ✓ Ready. Set '{VIRTUAL_MIC}' as your mic in apps.")
        return

    # Roll back the sink if only the source failed; otherwise the next run
    # would see the sink, report "already exists", and leave no virtual mic.
    if mod1.isdigit():
        run(f"pactl unload-module {mod1}")
    print("[sink] ✗ Failed. Running PulseAudio/Pipewire?")
    sys.exit(1)


# ── Audio capture ─────────────────────────────────────────────────────────────

def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: report stream status and queue the captured block.

    ``frames`` and ``time_info`` are accepted because the callback signature
    requires them; they are not used.
    """
    if status:
        print(f"[audio] {status}", file=sys.stderr)
    # Queue a private copy so the consumer never aliases the buffer that was
    # handed to this callback.
    block = indata.copy()
    audio_queue.put(block)

def capture_thread():
    """Keep the microphone input stream open until ``stop_event`` is set.

    Audio is delivered to ``audio_callback`` in blocks of
    ``SAMPLE_RATE * BLOCK_SECONDS`` frames; this function only holds the
    stream open and polls for shutdown.
    """
    frames_per_block = int(SAMPLE_RATE * BLOCK_SECONDS)
    stream = sd.InputStream(
        samplerate=SAMPLE_RATE,
        channels=CHANNELS,
        dtype="float32",
        blocksize=frames_per_block,
        callback=audio_callback,
    )
    with stream:
        print("[mic] Listening... (Ctrl+C to stop)")
        # Poll for shutdown; leaving the `with` block closes the stream.
        while True:
            if stop_event.is_set():
                break
            time.sleep(0.1)


# ── Transcribe loop ───────────────────────────────────────────────────────────

def transcribe_loop(model: WhisperModel):
    """Group captured blocks into utterances, transcribe them, and queue the text.

    Pulls blocks from ``audio_queue`` until ``stop_event`` is set. A block is
    silent when its RMS is below ``SILENCE_THRESH``. An utterance starts with
    the first non-silent block and is flushed to Whisper after
    ``SILENCE_CHUNKS`` consecutive silent blocks, or once
    ``MAX_BUFFER_CHUNKS`` blocks have been buffered. Each non-empty segment
    Whisper yields is printed and put on ``tts_queue``.

    Args:
        model: the loaded faster-whisper model used for transcription.
    """
    buffer = []
    silent_count = 0

    while not stop_event.is_set():
        try:
            # Time out periodically so stop_event is re-checked.
            chunk = audio_queue.get(timeout=0.5)
        except queue.Empty:
            continue

        rms = float(np.sqrt(np.mean(chunk ** 2)))

        if rms < SILENCE_THRESH:
            silent_count += 1
            # Keep pauses that fall inside an utterance. Dropping them splices
            # words together and leaves Whisper's VAD no silence to split
            # phrases on. Leading silence (empty buffer) is still discarded.
            if buffer:
                buffer.append(chunk)
        else:
            silent_count = 0
            buffer.append(chunk)

        flush = bool(buffer) and (
            silent_count >= SILENCE_CHUNKS or
            len(buffer) >= MAX_BUFFER_CHUNKS
        )
        if not flush:
            continue

        # Join the blocks along the frame axis and flatten to the 1-D array
        # passed to model.transcribe.
        audio_np = np.concatenate(buffer, axis=0).flatten()
        buffer.clear()
        silent_count = 0

        # Stream segments as Whisper produces them — fires per phrase.
        # The VAD filter also trims the trailing silence kept above.
        segments, _ = model.transcribe(
            audio_np,
            language="en",
            vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 200},
        )

        for segment in segments:
            text = segment.text.strip()
            if text:
                print(f"[transcribed] {text}")
                tts_queue.put(text)


# ── TTS worker (serial so segments don't overlap) ─────────────────────────────

def tts_worker():
    """Speak queued phrases one at a time until ``stop_event`` is set.

    Runs in its own thread with a private asyncio event loop. Phrases from
    ``tts_queue`` are passed to ``speak`` serially, so segments never overlap.
    A failure while speaking one phrase is logged and the worker moves on to
    the next phrase instead of dying.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        while not stop_event.is_set():
            try:
                # Time out periodically so stop_event is re-checked.
                text = tts_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            try:
                loop.run_until_complete(speak(text))
            except Exception as e:
                # This is the only TTS thread; an uncaught error here would
                # silently end all further speech output.
                print(f"[tts] Error: {e}")
    finally:
        # Release the loop's resources when the worker exits.
        loop.close()


async def speak(text: str):
    """Synthesize *text* with edge-tts and play it to the sink and the speakers.

    The audio is saved to a temporary MP3, played with two ``paplay``
    processes at once (one into ``SINK_NAME``, one to the default output),
    and the file is removed afterwards. Synthesis and launch errors are
    logged, not raised. A non-zero ``paplay`` exit status is logged as well.

    Args:
        text: the phrase to speak.
    """
    import edge_tts

    # Reserve a path only; edge-tts writes the file after the handle is closed.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        tmp_path = f.name

    players = []
    try:
        communicate = edge_tts.Communicate(text, TTS_VOICE)
        await communicate.save(tmp_path)
        # Play to both simultaneously
        # NOTE(review): paplay must be able to decode MP3 on the target
        # system — confirm with the installed PulseAudio/libsndfile versions.
        players.append(subprocess.Popen(["paplay", "--device", SINK_NAME, tmp_path]))
        players.append(subprocess.Popen(["paplay", tmp_path]))  # default output = your speakers
    except Exception as e:
        print(f"[tts] Error: {e}")
    finally:
        # Wait for every player that did start, even if a later launch failed,
        # and report playback failures instead of ignoring the exit status.
        for player in players:
            if player.wait() != 0:
                print(f"[tts] Error: paplay exited with status {player.returncode}")
        # The file may already be gone; that must not raise out of cleanup.
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            pass


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """Set up the virtual sink, load Whisper, and run the pipeline until stopped.

    Starts the capture and TTS worker threads as daemons and runs
    ``transcribe_loop`` in the calling thread. Ctrl+C stops cleanly. On any
    exit from the loop, ``stop_event`` is set and the workers get up to two
    seconds each to finish.
    """
    print("=" * 55)
    print("  voice_cloak — streaming phrase-by-phrase")
    print("=" * 55)

    setup_virtual_sink()

    print(f"[whisper] Loading '{WHISPER_MODEL}'...")
    model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
    print("[whisper] ✓ Ready")

    threads = [
        threading.Thread(target=capture_thread, daemon=True),
        threading.Thread(target=tts_worker, daemon=True),
    ]
    for t in threads:
        t.start()

    try:
        transcribe_loop(model)
    except KeyboardInterrupt:
        print("\n[main] Stopping...")
    finally:
        # Signal the workers on every exit path, not only Ctrl+C, so a
        # transcription error still shuts the mic stream and TTS loop down.
        stop_event.set()
        for t in threads:
            t.join(timeout=2)
        print("[main] Done.")


if __name__ == "__main__":
    main()