"""
response_iq_tts.py
──────────────────────────────────────────────────────────────────────────────
Text-to-Speech module for Response IQ powered by Microsoft Edge Neural TTS.

Architecture
────────────
  • 100% Serverless/Cloud-based CPU engine (0 VRAM used).
  • Zero local transformers models required.
  • Spawns a dedicated ThreadPoolExecutor background pipeline to prevent
    FastAPI uvloop event-loop blocking.
  • Decodes cloud-generated MP3 streams to native PCM WAV formats via
    PyTorch audio decoders seamlessly.

Supported languages (15)
  english · hinglish · hindi · marathi · punjabi · tamil · telugu · kannada
  gujarati · bengali · odia · assamese · malayalam · urdu · maithili

Version: 5.0.0 (Edge Neural Migration)
──────────────────────────────────────────────────────────────────────────────
"""

from __future__ import annotations

import asyncio
import base64
import concurrent.futures
import io
import logging
import re
import unicodedata
from typing import Dict, List, Tuple

import numpy as np
import soundfile as sf
import torchaudio

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# NOTE(review): basicConfig configures the *root* logger as an import-time
# side effect; in a library module this affects the whole application's
# logging — consider leaving configuration to the app entry point.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("response_iq_tts")

# ===========================================================================
# Audio & Format settings & System Constants
# ===========================================================================
# Sample rate (Hz) of the final WAV payload returned by generate_tts.
TARGET_SAMPLE_RATE = 22050
# NOTE(review): OUTPUT_FORMAT and TTS_MODEL_ID are not referenced elsewhere
# in this file — presumably read by importers; verify before removing.
OUTPUT_FORMAT      = "WAV"
TTS_MODEL_ID       = "Microsoft Edge Azure Neural TTS"

# ===========================================================================
# Microsoft Edge TTS Language -> Neural Voice Configuration mapping
# ===========================================================================
# Maps the public language key (lowercase, as validated by generate_tts) to:
#   display_name   — human-readable label used in log messages
#   bcp47_lang_tag — BCP-47 locale tag for the language
#   voice          — Edge TTS neural voice short name; this is the only field
#                    the synthesis path in this module actually sends to the
#                    cloud engine (see _synthesise_edge_tts / generate_tts)
TTS_LANGUAGE_CONFIG: Dict[str, Dict[str, str]] = {
    "english":   {"display_name": "English",   "bcp47_lang_tag": "en-IN",  "voice": "en-IN-NeerjaNeural"},
    "hinglish":  {"display_name": "Hinglish",  "bcp47_lang_tag": "hi-IN",  "voice": "hi-IN-SwaraNeural"},
    "hindi":     {"display_name": "Hindi",     "bcp47_lang_tag": "hi-IN",  "voice": "hi-IN-SwaraNeural"},
    "bengali":   {"display_name": "Bengali",   "bcp47_lang_tag": "bn-IN",  "voice": "bn-IN-TanishaaNeural"},
    "urdu":      {"display_name": "Urdu",      "bcp47_lang_tag": "ur-IN",  "voice": "ur-IN-GulNeural"},
    "marathi":   {"display_name": "Marathi",   "bcp47_lang_tag": "mr-IN",  "voice": "mr-IN-AarohiNeural"},
    "gujarati":  {"display_name": "Gujarati",  "bcp47_lang_tag": "gu-IN",  "voice": "gu-IN-DhwaniNeural"},
    "punjabi":   {"display_name": "Punjabi",   "bcp47_lang_tag": "pa-IN",  "voice": "pa-IN-OjasNeural"},
    "tamil":     {"display_name": "Tamil",     "bcp47_lang_tag": "ta-IN",  "voice": "ta-IN-PallaviNeural"},
    "telugu":    {"display_name": "Telugu",    "bcp47_lang_tag": "te-IN",  "voice": "te-IN-ShrutiNeural"},
    "kannada":   {"display_name": "Kannada",   "bcp47_lang_tag": "kn-IN",  "voice": "kn-IN-SapnaNeural"},
    "malayalam": {"display_name": "Malayalam", "bcp47_lang_tag": "ml-IN",  "voice": "ml-IN-SobhanaNeural"},
    # NOTE(review): the next three entries deliberately fall back to the Hindi
    # voice (hi-IN-SwaraNeural) despite their distinct bcp47 tags — presumably
    # because no dedicated Edge neural voice was available; confirm intent and
    # check whether native voices (e.g. for or-IN / as-IN) exist now.
    "odia":      {"display_name": "Odia",      "bcp47_lang_tag": "or-IN",  "voice": "hi-IN-SwaraNeural"},
    "assamese":  {"display_name": "Assamese",  "bcp47_lang_tag": "as-IN",  "voice": "hi-IN-SwaraNeural"},
    "maithili":  {"display_name": "Maithili",  "bcp47_lang_tag": "mai-IN", "voice": "hi-IN-SwaraNeural"},
}

# Alphabetical list of accepted language keys (used in error messages).
TTS_SUPPORTED_LANGUAGES: List[str] = sorted(TTS_LANGUAGE_CONFIG.keys())
TTS_DEFAULT_LANGUAGE: str          = "english"


# ===========================================================================
# Text pre-processing
# ===========================================================================
def _normalise_text(text: str) -> str:
    """NFC normalisation and removal of invisible control characters."""
    text = unicodedata.normalize("NFC", text)
    text = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", "", text)
    text = re.sub(r"[ \t]+", " ", text).strip()
    return text


# ===========================================================================
# Edge TTS Core Synthesis
# ===========================================================================
def _synthesise_edge_tts(text: str, voice: str) -> Tuple[np.ndarray, int]:
    """
    Synthesise *text* with a Microsoft Edge neural *voice*; return PCM audio.

    Runs the edge-tts async client on a fresh event loop inside a dedicated
    worker thread so it never touches (or blocks) the caller's running event
    loop (e.g. FastAPI's uvloop), then decodes the returned MP3 stream to a
    1-D float numpy array via torchaudio.

    Parameters
    ----------
    text  : pre-normalised text to speak.
    voice : Edge TTS neural voice short name, e.g. "hi-IN-SwaraNeural".

    Returns
    -------
    (audio, sample_rate): 1-D sample array and the decoder-reported rate.

    Raises
    ------
    RuntimeError
        If the edge-tts package is missing, cloud generation fails or times
        out (60 s), the stream yields no audio, or MP3 decoding fails.
    """
    try:
        import edge_tts
    except ImportError as exc:
        raise RuntimeError(
            "The 'edge-tts' package is missing. "
            "Please install it: pip install edge-tts"
        ) from exc

    def _run_async_tts() -> bytes:
        async def _generate() -> bytes:
            communicate = edge_tts.Communicate(text, voice)
            audio_data = bytearray()
            # The stream interleaves "audio" and metadata chunks; keep audio only.
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    audio_data.extend(chunk["data"])
            return bytes(audio_data)

        # asyncio.run spawns (and tears down) a pristine event loop in this
        # worker thread, leaving any caller-side loop untouched.
        return asyncio.run(_generate())

    try:
        # Background worker isolation: the blocking .result() wait happens
        # off any event-loop thread the caller may be running on.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            mp3_bytes = executor.submit(_run_async_tts).result(timeout=60)
    except Exception as exc:
        # `from exc` preserves the original traceback for debugging.
        raise RuntimeError(f"Edge TTS Cloud Engine generation failed: {exc}") from exc

    if not mp3_bytes:
        # Surface a clear error instead of letting the MP3 decoder fail
        # cryptically on an empty byte string.
        raise RuntimeError("Edge TTS Cloud Engine generation failed: empty audio stream")

    try:
        # Decode the raw MP3 payload into a float waveform tensor
        # (torchaudio returns shape (channels, samples)).
        waveform, sample_rate = torchaudio.load(io.BytesIO(mp3_bytes), format="mp3")
        # Collapse to mono explicitly: average channels when there are
        # several; index out the single channel otherwise.  The previous
        # bare .squeeze() left multi-channel audio 2-D and could squeeze a
        # length-1 sample axis down to a 0-D scalar.
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0) if waveform.size(0) > 1 else waveform[0]
        audio_np = waveform.cpu().numpy()
        return audio_np, sample_rate
    except Exception as exc:
        raise RuntimeError(f"Failed to decode MP3 stream from cloud: {exc}") from exc


def _resample_if_needed(audio: np.ndarray, src_rate: int, target_rate: int) -> np.ndarray:
    if src_rate == target_rate:
        return audio
    try:
        import librosa
        return librosa.resample(audio, orig_sr=src_rate, target_sr=target_rate)
    except ImportError:
        logger.warning("librosa not installed — skipping fallback resample.")
        return audio


def _audio_to_base64_wav(audio: np.ndarray, sample_rate: int) -> str:
    """Serialise a PCM sample array as a 16-bit WAV file, Base64-encoded.

    Writes the samples through soundfile into an in-memory buffer
    (format WAV, subtype PCM_16), then Base64-encodes the raw WAV bytes
    into a UTF-8 string suitable for embedding in a JSON response.
    """
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, audio, sample_rate, format="WAV", subtype="PCM_16")
    wav_bytes = wav_buffer.getvalue()
    return base64.b64encode(wav_bytes).decode("utf-8")


# ===========================================================================
# Public API
# ===========================================================================
def generate_tts(text: str, language: str = TTS_DEFAULT_LANGUAGE) -> Dict:
    """
    Convert text to human-like speech using Microsoft Edge Neural Cloud APIs.

    Parameters
    ──────────
    text     : text to synthesise; must be non-empty after stripping.
    language : one of TTS_SUPPORTED_LANGUAGES (case-insensitive); falsy
               values fall back to TTS_DEFAULT_LANGUAGE.

    Returns (success)
    ─────────────────
    {
        "audio_tts"   : "<Base64-encoded WAV string>",
        "language"    : "hindi",
        "sample_rate" : 22050,
        "duration_sec": 4.83,
        "char_count"  : 232,
    }

    Returns (failure)
    ─────────────────
    {"error": "<human-readable reason>"} — this function never raises.
    """

    # 1. Validation — language checked first (preserving the original
    # contract: a bad language is reported even when the text is also empty).
    language = (language or TTS_DEFAULT_LANGUAGE).strip().lower()
    if language not in TTS_LANGUAGE_CONFIG:
        return {
            "error": (
                f"Unsupported language '{language}'. "
                f"Valid values: {TTS_SUPPORTED_LANGUAGES}"
            )
        }

    if not text or not text.strip():
        return {"error": "Text payload is empty. Provide a non-empty 'predefinedAnswer'."}

    # 2. Pre-processing
    text = _normalise_text(text)
    lang_cfg = TTS_LANGUAGE_CONFIG[language]
    voice = lang_cfg["voice"]

    # Lazy %-style args: formatting only happens when INFO is enabled.
    logger.info(
        "TTS Cloud Request | language=%s (%s) | voice=%s | chars=%d",
        language, lang_cfg["display_name"], voice, len(text),
    )

    # 3. Core synthesis + 4. conversion pipeline
    try:
        audio_np, raw_sample_rate = _synthesise_edge_tts(text, voice)

        final_audio  = _resample_if_needed(audio_np, raw_sample_rate, TARGET_SAMPLE_RATE)
        audio_b64    = _audio_to_base64_wav(final_audio, TARGET_SAMPLE_RATE)
        # Duration is derived from the resampled length, so it assumes the
        # audio really is at TARGET_SAMPLE_RATE by this point.
        duration_sec = round(len(final_audio) / TARGET_SAMPLE_RATE, 2)

        logger.info(
            "TTS Cloud Output complete | duration=%ss | base64_len=%d",
            duration_sec, len(audio_b64),
        )

        return {
            "audio_tts":    audio_b64,
            "language":     language,
            "sample_rate":  TARGET_SAMPLE_RATE,
            "duration_sec": duration_sec,
            "char_count":   len(text),
        }

    except Exception as exc:
        # logger.exception keeps the traceback; logger.error was dropping it.
        logger.exception("TTS Cloud Pipeline failed: %s", exc)
        return {"error": str(exc)}