import re
import requests

class Translator:
    def __init__(self, ollama_url: str = "http://localhost:11434", model: str = "gemma3:12b"):
        self.ollama_url = ollama_url
        self.model = model
        self.api_endpoint = f"{ollama_url}/api/generate"
    
    def translate_text(self, text: str, target_language: str = "Hindi") -> tuple[str, dict]:
        """Translate a single string, returning (translated_text, token_usage)."""
        try:
            # Empty input: return it unchanged with zero token usage
            if not text.strip():
                return text, {"prompt_eval_count": 0, "eval_count": 0}
            
            # Simple, direct prompt to avoid instruction bleeding
            prompt = f"""Translate this text to {target_language}. Provide only the translated text, no explanations or additional content. Keep all HTML tags unchanged:

{text}"""
            
            payload = {
                "model": self.model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.2,
                    "top_p": 0.9,
                    "max_tokens": 2000,
                    "stop": ["IMPORTANT", "Instructions:", "Text to translate:"]
                }
            }
            
            response = requests.post(
                self.api_endpoint,
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=60
            )
            
            if response.status_code == 200:
                result = response.json()
                translated_text = result.get('response', '').strip()
                
                # Clean up any instruction bleeding
                translated_text = self._clean_response(translated_text)
                
                # Extract token usage from Ollama response
                token_usage = {
                    "prompt_eval_count": result.get('prompt_eval_count', 0),
                    "eval_count": result.get('eval_count', 0)
                }
                
                return translated_text, token_usage
            else:
                # Non-200 status: fall back to the untranslated input
                return text, {"prompt_eval_count": 0, "eval_count": 0}

        except (requests.RequestException, ValueError):
            # Network error or malformed JSON: fall back to the untranslated input
            return text, {"prompt_eval_count": 0, "eval_count": 0}
    
    def _clean_response(self, text: str) -> str:
        """Clean up LLM response to remove instruction bleeding and artifacts"""
        # Remove common instruction artifacts; the generic "<language> translation:"
        # pattern also covers the Hindi-specific case
        cleanup_patterns = [
            r'IMPORTANT INSTRUCTIONS?:.*?(?=\n|$)',
            r'Instructions?:.*?(?=\n|$)',
            r'Text to translate:.*?(?=\n|$)',
            r'[A-Za-z]+ translation:.*?(?=\n|$)',
            r'- Keep all HTML.*?(?=\n|$)',
            r'- Only translate.*?(?=\n|$)',
            r'- Preserve.*?(?=\n|$)',
            r'- If there are.*?(?=\n|$)',
        ]
        
        for pattern in cleanup_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)
        
        # Fix common HTML tag issues
        text = re.sub(r'<\s+([^>]+)\s*>', r'<\1>', text)  # Remove spaces in tags
        text = re.sub(r'<([^>]+)\s+>', r'<\1>', text)     # Remove trailing spaces in tags
        
        # Clean up multiple newlines and extra whitespace
        text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
        text = text.strip()
        
        return text
    
    def count_tokens(self, text: str) -> int:
        """
        Estimate the token count for the given text.

        Rough heuristic: ~4 characters per token for English.
        """
        if not text:
            return 0
        return len(text) // 4
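    # Illustrative check of the heuristic: count_tokens("Translate this sentence.")
    # returns 6 (24 characters // 4); the model's true tokenizer counts, reported
    # as prompt_eval_count/eval_count, will differ.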
    
    def translate_array(self, sentences: list[str], target_language: str = "Hindi") -> tuple[list[str], dict]:
        """Translate each sentence in order, accumulating total token usage."""
        translated_sentences = []
        total_prompt_tokens = 0
        total_completion_tokens = 0
        
        for sentence in sentences:
            translated, token_usage = self.translate_text(sentence, target_language)
            translated_sentences.append(translated)
            total_prompt_tokens += token_usage.get('prompt_eval_count', 0)
            total_completion_tokens += token_usage.get('eval_count', 0)
        
        token_summary = {
            "input_tokens": total_prompt_tokens,
            "output_tokens": total_completion_tokens,
            "total_tokens": total_prompt_tokens + total_completion_tokens
        }
        
        return translated_sentences, token_summary
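

# Minimal usage sketch, assuming an Ollama server is running locally with the
# gemma3:12b model already pulled (`ollama pull gemma3:12b`); adjust the URL
# and model name to match your setup.
if __name__ == "__main__":
    translator = Translator()
    samples = ["Hello, world!", "<p>How are you today?</p>"]
    translated, usage = translator.translate_array(samples, target_language="Hindi")
    for source, result in zip(samples, translated):
        print(f"{source} -> {result}")
    print(f"Tokens: {usage}")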