import os
import requests
import json
import logging
import tempfile
import librosa
import numpy as np
import base64
from typing import Dict, Any, Optional
from urllib.parse import urlparse

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("response_iq_analysis")

OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "llama3.1:8b"
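# These defaults assume a local Ollama server with the model already pulled
# (e.g. `ollama pull llama3.1:8b`); adjust OLLAMA_API_URL / OLLAMA_MODEL to
# match your setup.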


def download_or_save_audio(voice_input: str) -> str:
    """Handles both URL and Base64 audio inputs, saving to a temporary file."""
    try:
        if not voice_input:
            return ""

        # Check if it's a URL
        if voice_input.startswith("http://") or voice_input.startswith("https://"):
            response = requests.get(voice_input, stream=True)
            response.raise_for_status()
            suffix = ".mpeg"
            if "." in voice_input.split("/")[-1]:
                suffix = "." + voice_input.split("/")[-1].split(".")[-1]
            
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                for chunk in response.iter_content(chunk_size=8192):
                    tmp_file.write(chunk)
                return tmp_file.name

        # Otherwise, assume it's Base64
        else:
            # Strip a data-URI prefix (e.g. "data:audio/wav;base64,") if present
            if "," in voice_input:
                voice_input = voice_input.split(",", 1)[1]

            audio_data = base64.b64decode(voice_input)
            # The decoded bytes may be in any container format; librosa (via
            # soundfile/audioread) sniffs the content on load, so the .wav
            # suffix here is only a placeholder.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                tmp_file.write(audio_data)
                return tmp_file.name

    except Exception as e:
        logger.error(f"Failed to process audio input: {e}")
        return ""

def analyze_audio_features(file_path: str) -> Dict[str, Any]:
    """
    Extracts advanced acoustic features using librosa for improved tonal analysis.
    """
    try:
        y, sr = librosa.load(file_path, sr=None)
        
        # 1. Pitch / Fundamental Frequency
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
        # Keep only strong, voiced frames: drop low-magnitude bins, then zero
        # pitches, which would otherwise drag the pitch statistics down.
        pitches_masked = pitches[magnitudes > np.median(magnitudes)]
        pitches_masked = pitches_masked[pitches_masked > 0]
        avg_pitch = np.mean(pitches_masked) if len(pitches_masked) > 0 else 0
        pitch_std = np.std(pitches_masked) if len(pitches_masked) > 0 else 0
        
        # 2. Tempo / Speaking Rate (beat-tracking tempo as a rough proxy)
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        # Note: librosa.beat.tempo was moved to librosa.feature.rhythm.tempo
        # in librosa >= 0.10; the old name still works but warns on newer releases.
        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
        avg_tempo = tempo[0] if len(tempo) > 0 else 0
        
        # 3. Energy / Loudness & Stability
        rms = librosa.feature.rms(y=y)[0]
        avg_energy = np.mean(rms)
        energy_std = np.std(rms) # Variation in energy (shakiness vs steady)
        
        # 4. Spectral Features (Advanced)
        # Spectral Centroid: "brightness" of the sound
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        avg_centroid = np.mean(spectral_centroid)
        
        # Spectral Flatness: noisy/breathiness vs tonal
        flatness = librosa.feature.spectral_flatness(y=y)[0]
        avg_flatness = np.mean(flatness)
        
        # Zero Crossing Rate: how often the signal changes sign (indicator of percussiveness/consonants)
        zcr = librosa.feature.zero_crossing_rate(y=y)[0]
        avg_zcr = np.mean(zcr)

        # 5. MFCCs (Mel-frequency cepstral coefficients) - Captures speech texture
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfcc_means = np.mean(mfcc, axis=1)
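        # NOTE: mfcc_means is computed but not currently folded into the
        # returned stats below; it is available if texture-based scoring is added.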

        description = []
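        # The thresholds below are heuristic, hand-tuned values for speech;
        # they are not calibrated per speaker or recording condition.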
        # Pace
        if avg_tempo > 130:
            description.append("Fast speaking pace (high energy or nervous).")
        elif avg_tempo < 80:
            description.append("Slow speaking pace (measured or low energy).")
        else:
            description.append("Normal/Moderate speaking pace.")

        # Pitch variation
        if pitch_std > 60:
            description.append("High pitch variation (expressive/excited).")
        elif pitch_std < 20:
            description.append("Low pitch variation (monotone/robotic).")

        # Energy/Confidence (coefficient of variation of RMS energy)
        if energy_std / (avg_energy + 1e-6) > 0.5:
            description.append("Significant energy fluctuations (might indicate hesitation or shaky voice).")
        else:
            description.append("Stable vocal energy (suggesting confidence).")

        # Texture/Flatness
        if avg_flatness > 0.05:
            description.append("Breathy or noisy vocal texture.")
        
        return {
            "summary": " ".join(description),
            "stats": {
                "avg_pitch_hz": float(avg_pitch),
                "pitch_std": float(pitch_std),
                "tempo_bpm": float(avg_tempo),
                "avg_energy": float(avg_energy),
                "energy_fluctuation": float(energy_std),
                "avg_centroid": float(avg_centroid),
                "avg_flatness": float(avg_flatness),
                "avg_zcr": float(avg_zcr)
            }
        }
    except Exception as e:
        logger.error(f"Audio analysis failed: {e}")
        return {"summary": "Technical audio extraction failed.", "stats": {}}

def analyze_response(
    question: str, 
    predefined_answer: str, 
    user_answer: str, 
    user_voice_url: Optional[str] = None
) -> Dict[str, Any]:
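    """
    Evaluates the user's answer against the predefined answer via a local
    Ollama model, optionally enriching the prompt with acoustic features
    extracted from a voice recording (URL or Base64 string).
    """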
    
    audio_analysis_text = ""
    audio_file_path = None
    
    # 1. Process Audio if present
    audio_features = None
    if user_voice_url:
        logger.info(f"Processing audio input...")
        audio_file_path = download_or_save_audio(user_voice_url)
        if audio_file_path:
            logger.info("Extracting advanced audio features...")
            audio_features = analyze_audio_features(audio_file_path)
            # Clean up the temporary file once features are extracted
            try:
                os.remove(audio_file_path)
            except OSError:
                pass

    # 2. Construct Prompt
    system_prompt = (
        "You are a world-class communication expert, behavioral psychologist, and tonal analyst. "
        "Your goal is to provide a deep, professional evaluation of a candidate's response. "
        "Analyze the text for semantic accuracy AND the voice for emotional nuance. "
        "Return the result ONLY in strict JSON format."
    )
    
    audio_context = ""
    if audio_features:
        audio_context = f"""
        Voice Tonal Metadata: {audio_features['summary']}
        Detailed Acoustic Stats: {json.dumps(audio_features['stats'])}
        (Interpreting stats: Low pitch_std = monotone, High energy_fluctuation = shaky/hesitant, 
        High avg_flatness = breathy/unclear, Moderate tempo = confident/composed).
        """

    prompt = f"""
    Evaluate the following interview response with high precision.

    INPUT DATA:
    - Question: "{question}"
    - Ideal/Predefined Answer: "{predefined_answer}"
    - User's Actual Answer: "{user_answer}"
    {audio_context}
    
    REQUIRED ANALYSIS:
    1. Match Score (0-100): Quantify how well the user's answer covers the core concepts of the ideal answer. Be strict but fair.
    2. Comparison: A professional summary of why the score was given.
    3. Behavioral Analysis: Analyze the user's communication style. Are they assertive, analytical, empathetic, or vague? Check for logical flow and vocabulary choice.
    4. Tonal Analysis: 
       - If voice metadata is present: Provide a detailed breakdown of the user's delivery (confidence, pace, emotional stability). 
       - If voice metadata is NOT present: Base this on the 'tone' of the written text (formal, casual, apologetic).
    5. Visuals: Exactly 3 professional keywords summarizing the vibe (e.g., "Precise", "Authoritative", "Strategic").
    
    OUTPUT FORMAT (Strict JSON):
    {{
        "matchScore": <int>,
        "comparison": "<str>",
        "behavioralAnalysis": "<str>",
        "tonalAnalysis": "<str>",
        "visuals": ["<kw1>", "<kw2>", "<kw3>"]
    }}
    """

    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "system": system_prompt,
        "stream": False,
        "format": "json",
        "options": {
            "temperature": 0.2,
            "num_ctx": 4096,
            "num_predict": 512
        }
    }

    llm_response_text = ""  # defined up front so the JSON-error handler can reference it
    try:
        logger.info(f"Sending request to Ollama using model {OLLAMA_MODEL}...")
        response = requests.post(OLLAMA_API_URL, json=payload, timeout=180)
        
        if response.status_code == 404:
            logger.error(f"Model ID '{OLLAMA_MODEL}' not found (404).")
            return {
                "error": f"Model '{OLLAMA_MODEL}' not found via Ollama. Please pull it using 'ollama pull {OLLAMA_MODEL}'"
            }
            
        response.raise_for_status()
        result = response.json()
        
        # Parse output
        llm_response_text = result.get("response", "")
        # Sometimes small models might add markdown code blocks
        if "```json" in llm_response_text:
            llm_response_text = llm_response_text.split("```json")[1].split("```")[0].strip()
        elif "```" in llm_response_text:
            llm_response_text = llm_response_text.split("```")[1].split("```")[0].strip()
            
        parsed_json = json.loads(llm_response_text)
        return parsed_json
        
    except json.JSONDecodeError:
        logger.error("Failed to parse JSON from LLM")
        return {
            "error": "LLM returned invalid JSON",
            "raw_response": llm_response_text
        }

    except requests.RequestException as e:
        logger.error(f"Ollama connection error: {e}")
        return {
            "error": f"Could not connect to LLM: {str(e)}"
        }

if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Run Response IQ Analysis from Terminal")
    parser.add_argument("--question", type=str, help="The interview question")
    parser.add_argument("--predefined", type=str, help="The correct/ideal answer")
    parser.add_argument("--user", type=str, help="The user's answer")
    parser.add_argument("--voice", type=str, help="URL to user's voice recording", default=None)
    parser.add_argument("--json_input", type=str, help="JSON string with keys: question, predefinedAnswer, userAnswer, userVoice")

    args = parser.parse_args()

    q = args.question
    pa = args.predefined
    ua = args.user
    uv = args.voice

    # Support passing a single JSON string/file for convenience
    if args.json_input:
        try:
            # If it looks like a file, read it
            if os.path.isfile(args.json_input):
                with open(args.json_input, 'r') as f:
                    data = json.load(f)
            else:
                data = json.loads(args.json_input)
            
            q = data.get("question")
            pa = data.get("predefinedAnswer")
            ua = data.get("userAnswer")
            uv = data.get("userVoice")
        except Exception as e:
            print(f"Error parsing JSON input: {e}")
            sys.exit(1)

    # Default values for quick test if nothing provided
    if not q or not pa or not ua:
        print("No input provided. Running with DEFAULT DEMO values...")
        q = "this is the question"
        pa = "this is the predefined answer"
        ua = "this is the user answer"
        uv = None # "https://api.edurigo1.com/response_iq/3_223_oeficcrc_1864074.mpeg"

    print("\n--- Starting Analysis ---")
    print(f"Question: {q}")
    print(f"User Answer: {ua}")
    if uv:
        print(f"Voice URL: {uv}")
    
    result = analyze_response(q, pa, ua, uv)
    
    print("\n--- Analysis Result ---")
    print(json.dumps(result, indent=2))
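
    # Example invocations (script name and file names are illustrative):
    #   python response_iq_analysis.py --question "..." --predefined "..." --user "..."
    #   python response_iq_analysis.py --json_input payload.json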
