import whisper
import json
from datetime import timedelta

class TranscriptionProcessor:
    """Transcribe audio with a Whisper model and post-process the result.

    The instance holds one loaded model (``self.model``). The helper methods
    operate on the dictionary returned by the model's ``transcribe`` call and
    only rely on its ``'segments'`` list, where each segment provides
    ``'start'``, ``'end'`` (seconds) and ``'text'``.
    """

    def __init__(self, model_size='large'):
        """Load the Whisper model identified by ``model_size``.

        Args:
            model_size: Model name handed to ``whisper.load_model``.
        """
        print(f"Loading Whisper model: {model_size}")
        self.model = whisper.load_model(model_size)

    def transcribe_with_timestamps(self, audio_file, language='en'):
        """Transcribe ``audio_file`` and return the model's raw result.

        Args:
            audio_file: Location of the audio; converted with ``str()`` before
                being passed to the model, so path-like objects are accepted.
            language: Language code forwarded to the model. Defaults to
                ``'en'``, which was previously hard-coded.
                NOTE(review): Whisper documents ``None`` as "auto-detect" --
                confirm against the installed version before relying on it.

        Returns:
            Whatever ``self.model.transcribe`` returns. The other methods of
            this class read its ``'segments'`` entry.
        """
        return self.model.transcribe(
            str(audio_file),
            language=language,
            word_timestamps=True,
            verbose=False,
        )

    def create_srt_file(self, transcript_result, output_file):
        """Write the transcript's segments to ``output_file`` in SRT format.

        Segments whose text is empty after stripping whitespace are skipped,
        because a cue without text is not valid SRT; the remaining cues are
        numbered consecutively from 1.

        Args:
            transcript_result: Mapping with a ``'segments'`` list; each
                segment needs ``'start'``, ``'end'`` and ``'text'``.
            output_file: Destination path; written as UTF-8 and overwritten
                if it already exists.

        Raises:
            KeyError: If ``'segments'`` or a required segment key is missing.
            OSError: If the output file cannot be opened or written.
        """
        srt_lines = []
        cue_number = 0

        for segment in transcript_result['segments']:
            text = segment['text'].strip()
            if not text:
                # An empty cue body would produce a malformed SRT entry.
                continue

            cue_number += 1
            start_time = self.format_timestamp(segment['start'])
            end_time = self.format_timestamp(segment['end'])
            # Each cue is: index, time range, text, blank separator line.
            srt_lines.extend(
                (str(cue_number), f"{start_time} --> {end_time}", text, "")
            )

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(srt_lines))

    def format_timestamp(self, seconds):
        """Convert a duration in seconds to an SRT timestamp.

        The value is converted to whole milliseconds first and then split
        with integer arithmetic. Splitting the float directly loses a
        millisecond on values such as ``2.3`` (which would otherwise render
        as ``...,299``).

        Args:
            seconds: Offset in seconds (int or float). Negative values are
                clamped to zero because SRT cannot express them.

        Returns:
            The timestamp as ``HH:MM:SS,mmm``; sub-millisecond precision is
            truncated.
        """
        # timedelta normalises the float to integer microseconds, so the
        # floor division below is exact integer arithmetic.
        total_ms = timedelta(seconds=seconds) // timedelta(milliseconds=1)
        total_ms = max(0, total_ms)

        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def extract_technical_terms(self, transcript_result):
        """Collect capitalised words as candidate technical terms.

        This is a simple heuristic: every word that starts with an uppercase
        ASCII letter and consists only of ASCII letters is collected, so
        ordinary sentence-initial words are included as well.

        Args:
            transcript_result: Mapping with a ``'segments'`` list whose items
                have a ``'text'`` string.

        Returns:
            The unique matches as a sorted list, so the output is stable
            between runs.
        """
        import re

        # Compiled once instead of being looked up for every segment.
        pattern = re.compile(r'\b[A-Z][a-z]*(?:[A-Z][a-z]*)*\b')

        technical_terms = set()
        for segment in transcript_result['segments']:
            technical_terms.update(pattern.findall(segment['text']))

        return sorted(technical_terms)