import os
import time
import random
import re
import json
import heapq
from jsonschema import validate, ValidationError
from langchain_core.messages import AIMessage
import requests
import numpy as np
#from langchain_community.embeddings import OllamaEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from crawl4ai import AsyncWebCrawler
import asyncio
from langchain.output_parsers import OutputFixingParser
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader, UnstructuredPowerPointLoader
from PyPDF2 import PdfReader
from langchain_experimental.text_splitter import SemanticChunker
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Union
from urllib.parse import urlparse
from langchain_core.runnables import RunnableLambda
import shutil
from pptx import Presentation
from langchain_ollama import OllamaLLM
from langchain_ollama import ChatOllama
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from pathlib import Path
from langchain_text_splitters import MarkdownHeaderTextSplitter
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
import xml.etree.ElementTree

GROQ_API_KEY = "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"
OLLAMA_MODEL = "nomic-embed-text"
PIXABAY_API_KEY = "44622834-f22df6f12cf45558ee180dd8d"



class SlideContent(BaseModel):
    #heading: str = Field(..., description="The main heading of the slide")
    type: str = Field("flash")
    subheading: Optional[str] = Field(None, description="An optional subheading for the slide")
    paragraphs: List[str] = Field(..., description="List of paragraphs for the slide content")  
    visualization_suggestion: str = Field(..., description="A specific and concise suggestion for a relevant visualization or image (max 5 words)")
    image: Optional[str] = Field(None, description="URL of the image for the slide")
    #subheading = heading

class MCQContent(BaseModel):
    type: str = Field("Question")
    question: str = Field(..., description="The multiple-choice question")
    options: List[str] = Field(..., description="A list of 4 answer options")
    correct_answer: str = Field(..., description="The correct answer (e.g., 'a', 'b', 'c', or 'd')")
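
# Example MCQContent instance (hypothetical values):
#   MCQContent(
#       question="Which library builds the vector index here?",
#       options=["FAISS", "NumPy", "Requests", "PyPDF2"],
#       correct_answer="a",
#   )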

class StorigoContent(BaseModel):
    slides: Dict[str, SlideContent] = Field(..., description="Dictionary of slide contents with slide numbers as keys")
    #mcqs: Dict[str, MCQContent] = Field(..., description="Dictionary of MCQs with identifiers like 'mcq_1' as keys")
    #token_count: int = Field(..., description="Total token count for all the generated content")
    token_count: int = 0


class StorigoContentMCQ(BaseModel):
    slides: Dict[str, SlideContent] = Field(..., description="Dictionary of slide contents with slide numbers as keys")
    mcqs: Dict[str, MCQContent] = Field(..., description="Dictionary of MCQs with identifiers like 'mcq_1' as keys")
    token_count: int = Field(..., description="Total token count for all the generated content")



class StorigoContentMCQMid(BaseModel):
    slides: Dict[str, Union[SlideContent, MCQContent]] = Field(..., description="Dictionary of slide contents with slide numbers as keys and MCQs with MCQ numbers as keys")
    #mcqs: Dict[str, MCQContent] = Field(..., description="Dictionary of MCQs with identifiers like 'mcq_1' as keys")
    token_count: int = Field(..., description="Total token count for all the generated content")

class CustomMCQParser(PydanticOutputParser):
    def parse_result(self, result):
        # Normalize to a single string; the LLM may return a list of chunks.
        if isinstance(result, list):
            result = " ".join(result)

        # Heuristic cleanup: convert single quotes to double quotes so the
        # output parses as JSON. Note this can corrupt apostrophes inside
        # option text (e.g., "don't"), so it is best-effort only.
        result = result.replace("'", "\"")

        # Strip boilerplate the model sometimes prepends.
        result = result.replace("Here is the MCQ:", "").strip()

        try:
            # Parse the cleaned string and validate it against the Pydantic model.
            json_object = json.loads(result)
            return self.pydantic_object.model_validate(json_object)
        except json.JSONDecodeError as e:
            raise ValueError(f"Error decoding JSON: {e}")
        except Exception as e:
            raise ValueError(f"Error parsing result: {e}")
        
        
def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages.
            text += page.extract_text() or ""
        return text
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {str(e)}")

# def create_embeddings(text, client_id):
#     try:
#         embeddings = OllamaEmbeddings(model=OLLAMA_MODEL)
#         vectors = FAISS.from_texts([text], embeddings)
        
#         client_dir = f"my_embeddings/{client_id}"
#         os.makedirs(client_dir, exist_ok=True)
#         vectors.save_local(client_dir)
        
#         return vectors
#     except Exception as e:
#         raise Exception(f"Error creating embeddings: {str(e)}")

def generate_search_query(visualization_suggestion, slide_content):
    """Build a short image-search query from a SlideContent model plus context keywords."""
    context_keywords = extract_context_keywords(slide_content)
    
    # Combine visualization suggestion with context keywords
    combined_query = f"{visualization_suggestion} {' '.join(context_keywords)}"
    
    # Extract key words from the combined query
    words = re.findall(r'\w+', combined_query.lower())
    
    # Remove common words
    common_words = set(['and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
    filtered_words = [word for word in words if word not in common_words]
    
    # Prioritize words from the visualization suggestion
    suggestion_words = visualization_suggestion.lower().split()
    prioritized_words = suggestion_words + [word for word in filtered_words if word not in suggestion_words]
    
    # Take the first 5 words
    query_words = prioritized_words[:5]
    
    return " ".join(query_words)

def generate_search_query_new(visualization_suggestion, slide_content):
    """Variant of generate_search_query where slide_content is a plain dict rather than a SlideContent model."""
    context_keywords = extract_context_keywords_new(slide_content)
    
    # Combine visualization suggestion with context keywords
    combined_query = f"{visualization_suggestion} {' '.join(context_keywords)}"
    
    # Extract key words from the combined query
    words = re.findall(r'\w+', combined_query.lower())
    
    # Remove common words
    common_words = set(['and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
    filtered_words = [word for word in words if word not in common_words]
    
    # Prioritize words from the visualization suggestion
    suggestion_words = visualization_suggestion.lower().split()
    prioritized_words = suggestion_words + [word for word in filtered_words if word not in suggestion_words]
    
    # Take the first 5 words
    query_words = prioritized_words[:5]
    
    return " ".join(query_words)

def extract_context_keywords(slide_content):
    # Extract keywords from slide content to provide context
    text = f"{slide_content.subheading or ''} {' '.join(slide_content.paragraphs)}"
    words = re.findall(r'\w+', text.lower())
    common_words = set(['and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
    keywords = [word for word in words if word not in common_words]
    # set() deduplicates but does not preserve order, so the chosen keywords can vary between runs.
    return list(set(keywords))[:3]  # Return up to 3 unique keywords


def extract_context_keywords_new(slide_content):
    # Extract keywords from dict-based slide content to provide context
    text = f"{slide_content.get('subheading', '')} {' '.join(slide_content.get('paragraphs', []))}"    
    words = re.findall(r'\w+', text.lower())
    common_words = set(['and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
    keywords = [word for word in words if word not in common_words]
    return list(set(keywords))[:3]  # Return up to 3 unique keywords
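
# Illustrative sketch (hypothetical values) of how the query helpers compose:
#
#   slide = {"subheading": "Neural Networks", "paragraphs": ["Layers learn features."]}
#   generate_search_query_new("brain network diagram", slide)
#   # -> e.g. "brain network diagram neural networks" (trailing keywords may vary,
#   #    since extract_context_keywords_new uses an unordered set)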

def fetch_pixabay_image(query):
    url = "https://pixabay.com/api/"
    params = {
        "key": PIXABAY_API_KEY,
        "q": query,
        "image_type": "photo",
        "orientation": "horizontal",
        "per_page": 5,  # Fetch top 5 images
        "safesearch": "true",
        "order": "relevance"
    }
    
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        
        if data["hits"]:
            # Sort images by relevance score (you may need to adjust this based on Pixabay's API)
            sorted_hits = sorted(data["hits"], key=lambda x: x["likes"] + x["downloads"], reverse=True)
            return sorted_hits[0]["webformatURL"]  # Return the most relevant image
        else:
            print(f"No image found for query: {query}")
            return None
    except requests.RequestException as e:
        print(f"Error fetching image from Pixabay: {str(e)}")
        return None
    except Exception as e:
        print(f"Unexpected error in fetch_pixabay_image: {str(e)}")
        return None
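
# Usage sketch (requires a valid PIXABAY_API_KEY in the environment):
#
#   url = fetch_pixabay_image("python programming logo")
#   if url:
#       print("Top image:", url)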

def get_valid_image(visualization_suggestion, slide_content, max_attempts=3):
    if not visualization_suggestion:
        print("No visualization suggestion provided.")
        return None

    for attempt in range(max_attempts):
        try:
            query = generate_search_query(visualization_suggestion, slide_content)
            print(f"Attempt {attempt + 1} to fetch image for query: {query}")
            image_url = fetch_pixabay_image(query)
            
            if image_url:
                print(f"Valid image found: {image_url}")
                return image_url
            else:
                print(f"No image URL returned for query: {query}")
            
            time.sleep(1)
        except Exception as e:
            print(f"Error in get_valid_image (attempt {attempt + 1}): {str(e)}")
    
    print(f"No valid image found after {max_attempts} attempts")
    return None

def get_valid_image_new(visualization_suggestion, slide_content, max_attempts=3):
    if not visualization_suggestion:
        print("No visualization suggestion provided.")
        return None

    for attempt in range(max_attempts):
        try:
            query = generate_search_query_new(visualization_suggestion, slide_content)
            print(f"Attempt {attempt + 1} to fetch image for query: {query}")
            image_url = fetch_pixabay_image(query)
            
            if image_url:
                print(f"Valid image found: {image_url}")
                return image_url
            else:
                print(f"No image URL returned for query: {query}")
            
            time.sleep(1)
        except Exception as e:
            print(f"Error in get_valid_image (attempt {attempt + 1}): {str(e)}")
    
    print(f"No valid image found after {max_attempts} attempts")
    return None

def count_tokens(text):
    # Approximate token count by counting word-like sequences. This is a rough
    # proxy: real LLM tokenizers will generally report more tokens.
    tokens = re.findall(r'\w+', text)
    return len(tokens)
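
# Example: count_tokens("Hello, world! This is a test.") returns 6,
# since punctuation is ignored and only \w+ runs are counted.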

def generate_slide_content_old(vectors, client_id, num_slides, num_mcqs, is_image, is_question, question_position, isGPU):
    try:
        #llm = ChatGroq(model_name='llama-3.3-70b-versatile', groq_api_key=GROQ_API_KEY)
        if isGPU == 0:
            llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        else:
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                # model="llama3:8b",
                model="gemma3:12b",
                # model="deepseek-r1:8b"
            )

        # Prompt template
        slide_content_template = """
        Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.
        
        Each slide must include:
        - A clear and concise **sub-heading**
        - **Paragraphs** that effectively communicate the key ideas and insights
        - and a specific, concise **visualization suggestion**

        **Context**: {context}

        Focus on creating content that is both informative and engaging. Ensure each slide:
        - Has a well-structured sub-heading that captures the main point
        - Uses clear and concise paragraphs to communicate important information

        Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
        - **Thought-provoking questions** to encourage reflection
        - **Relevant statistics** or data points that add credibility
        - **Industry insights** or emerging trends to demonstrate expertise
        - **Practical examples** or case studies to illustrate key concepts
        - **Calls to action** to guide the audience toward specific actions or takeaways

        For the visualization suggestion:
        - Provide a clear and specific description of an image that would be relevant to the slide content.
        - Keep it very concise, using a maximum of 5 words.
        - Focus on concrete objects, scenes, or concepts that can be easily visualized.
        - Avoid abstract or overly complex ideas.
        - Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

        Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external
        information not found in the source material, and avoid filler phrases such as "In this video".

        {format_instructions}

        The final output **must** be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}", in **strict sequential order**.

        Each slide object should contain the following fields:

        - "subheading" (if applicable): The title of the slide.
        - "paragraphs": A list of concise paragraphs that communicate the main points of the slide.
        - "visualization_suggestion": A specific, context-aware suggestion of no more than 5 words.
        
        Please ensure the slides are generated in the correct order as defined in the embeddings or the document content.
        """
        # - "heading": The main title of the slide.
        # Using Pydantic parser for output formatting
        parser = PydanticOutputParser(pydantic_object=StorigoContent)
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)
        
        # Creating the chain
        slide_content_chain = (
            {
                "context": lambda x: vectors.similarity_search(x["query"], k=3),
                "num_slides": lambda x: x["num_slides"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_content_prompt
            | llm
            | parser
        )

        # Invoke the chain with the query and number of slides
        result = slide_content_chain.invoke({"query": "", "num_slides": num_slides})

        # Sort result slides to ensure they are in the correct order (by slide number)
        #ordered_slides = dict(sorted(result.slides.items(), key=lambda item: item[0]))
        def custom_slide_sort_key(item_key: str):
            """
            Sort keys so that:
            - 'slide_1', 'slide_2', ..., 'slide_10' are in numeric order
            - 'mcq_1', 'mcq_2', ... come after slides in numeric order
            """
            prefix, num_str = item_key.split('_', 1)
            if prefix == "slide":
                group = 0
            elif prefix == "mcq":
                group = 1
            else:
                group = 999  # Fallback
            number = int(num_str) if num_str.isdigit() else 999
            return (group, number)
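
        # Example ordering under this key (hypothetical keys):
        #   ["slide_1", "slide_2", "slide_10", "mcq_1", "mcq_2"]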
        print("result")
        print(result)
        # Sort the slides using the custom key
        ordered_slides = dict(
            sorted(result.slides.items(), key=lambda kv: custom_slide_sort_key(kv[0]))
        )
        print("ordered_slides")
        print(ordered_slides)

        # Image handling logic
        if is_image:
            for slide_key, slide_content in ordered_slides.items():
                if slide_content.visualization_suggestion:
                    image_url = get_valid_image(slide_content.visualization_suggestion, slide_content)
                    if image_url:
                        slide_content.image = image_url
                    else:
                        print(f"Warning: No suitable image found for slide {slide_key} after multiple attempts.")
                        slide_content.image = None
                else:
                    print(f"Warning: No visualization suggestion for slide {slide_key}.")
                    slide_content.image = None
        else:
            # If not an image slide, ensure the 'image' field is None
            for slide_content in ordered_slides.values():
                slide_content.image = None
        
        #token_count = 0
        token_count_text = 0
        for slide in ordered_slides.values():
            text_content = f"{slide.subheading} {' '.join(slide.paragraphs)} {slide.visualization_suggestion}"
            #token_count += count_tokens(text_content)
            token_count_text += count_tokens(text_content)
        
        
        # Calculate the total token count
        #mcqs = {}
        if is_question:
            mcq_template = """
            Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

            **Context**: {context}

            The MCQ must include:
            - A **question** related to the context  
            - Exactly **4 answer options**  
            - A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

            ⚠️ **Critical Requirements**:
            - ✅ Return **only valid JSON** — no explanations, headers, or extra text.
            - ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
            - ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
            - Do not prepend phrases like "Here is the MCQ".
            - Follow the exact format given below.

            The final output **must strictly follow** this format:
            ```json
            {{
                "question": "<The MCQ question>",
                "options": [
                    "<Option 1>",
                    "<Option 2>",
                    "<Option 3>",
                    "<Option 4>"
                ],
                "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"

                Always give "question","options","correct_answer" these labels in double quotes only
            }}






            """


            def is_valid_json(response):
                try:
                    json.loads(response)
                    return True
                except json.JSONDecodeError:
                    return False

            #mcq_parser = PydanticOutputParser(pydantic_object=MCQContent)
            mcq_prompt = ChatPromptTemplate.from_template(mcq_template)
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                model="llama3:8b")
            
            mcqs = {}
            slide_keys = list(ordered_slides.keys())
            
            for i in range(0, len(slide_keys), 2):
                if len(mcqs) < num_mcqs:
                    context_slides = []
                    for j in range(i, min(i + 2, len(slide_keys))):
                        slide = ordered_slides[slide_keys[j]]
                        context_slides.append(f"{slide.subheading}: {' '.join(slide.paragraphs)}")

                    context_text = "\n".join(context_slides)
                    print("Context")
                    print(context_text)
                    try:
                        mcq_result = (
                            RunnableLambda(lambda x: {"context": context_text})
                            | mcq_prompt
                            | llm
                        ).invoke({})

                        print("qwer")
                        print(mcq_result)
                        print("END")
                        #tokens1 = mcq_result['content']
                        #tokens1 = mcq_result['usage_metadata']['total_tokens']
                        print("tokens1")
                        #print(tokens1)

                        # Extract the model's text content from the response.
                        if hasattr(mcq_result, 'content'):
                            content = mcq_result.content
                        else:
                            content = str(mcq_result)

                        # Pull out the first balanced {...} block, since the model may
                        # wrap the JSON in extra text or code fences.
                        content_only = None
                        json_object = None
                        start = content.find('{')
                        if start != -1:
                            open_braces = 0
                            end = -1
                            # Use a distinct loop variable here: reusing `i` would
                            # clobber the outer slide-pair index.
                            for pos, char in enumerate(content[start:], start=start):
                                if char == '{':
                                    open_braces += 1
                                elif char == '}':
                                    open_braces -= 1
                                    if open_braces == 0:
                                        end = pos
                                        break
                            if end != -1:
                                content_only = content[start:end + 1]
                                json_object = json.loads(content_only)
                            else:
                                print("No matching closing brace found.")
                        else:
                            print("No opening brace found.")

                        # Keep the raw content for error reporting.
                        mcq_content = content
                        print("Extracted JSON candidate:")
                        print(content_only)

                        if not content_only or not content_only.strip():
                            raise ValueError("Empty or invalid response from Ollama")

                        try:
                            formatted_mcq = {
                                "question": json_object.get("question", ""),
                                "options": json_object.get("options", []),
                                "correct_answer": json_object.get("correct_answer", "")
                            }

                            print("Formatted MCQ:")
                            print(json.dumps(formatted_mcq, indent=4))

                            if formatted_mcq["question"] and len(formatted_mcq["options"]) == 4 and formatted_mcq["correct_answer"]:
                                mcq_key = f"mcq_{len(mcqs) + 1}"
                                mcqs[mcq_key] = formatted_mcq
                                
                                print(f"✅ MCQ {mcq_key} generated and saved!")
                            else:
                                raise ValueError("Incomplete MCQ data")
            
                            
                        except json.JSONDecodeError as e:
                            print(f"JSON Decode Error: {e}")
                            print("Raw Response:", mcq_content)
                    except Exception as e:
                        print(f"Error generating MCQ: {e}")
            
            # Token count for the generated MCQs
            token_count = 0
            for mcq in mcqs.values():
                text_content = f"{mcq['question']} {' '.join(mcq['options'])} {mcq['correct_answer']}"
                token_count += count_tokens(text_content)

            interleaved_content = {}
            mcq_counter = 0
            total_slides = num_slides
            total_mcqs = num_mcqs
            
            if question_position == 1:
                interval = max(1, total_slides // total_mcqs) if total_mcqs > 0 else total_slides  # avoid modulo-by-zero when mcqs > slides
                
                for idx, slide_key in enumerate(slide_keys):
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                    
                    if (idx + 1) % interval == 0 and mcq_counter < total_mcqs:
                        mcq_key = f"mcq_{mcq_counter + 1}"
                        print(mcq_key)
                        interleaved_content[mcq_key] = mcqs[mcq_key]
                        mcq_counter += 1
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count_text+token_count)
                return storigo_content
            
            else:
                for slide_key in slide_keys:
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                
                for mcq_counter in range(total_mcqs):
                    mcq_key = f"mcq_{mcq_counter + 1}"
                    interleaved_content[mcq_key] = mcqs[mcq_key]
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count_text+token_count)
                return storigo_content
        else:
            storigo_content_without_mc = StorigoContent(slides=ordered_slides, token_count=token_count_text)
            return storigo_content_without_mc

    except Exception as e:
        raise Exception(f"Error generating slide content: {str(e)}")

def generate_slide_content(vectors, num_slides, num_mcqs, is_image, is_question, question_position, GPU):
    try:
        #llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        if GPU == 0:
            llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        else:
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                # model="llama3:8b",
                model="gemma3:12b",
                # model="deepseek-r1:8b"
            )

        # Load all chunks from vector store
        all_chunks = list(vectors.docstore._dict.values())
        print("YOYO")
        print(all_chunks)
        total_chunks = len(all_chunks)

        # Step 1: Estimate total possible slides
        def estimate_slides(text, words_per_slide=50):
            return max(1, round(len(text.split()) / words_per_slide))
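        # e.g. a 500-word chunk -> round(500 / 50) = 10 estimated slides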
        

        total_possible = sum(estimate_slides(doc.page_content) for doc in all_chunks)
        print(f"🧠 Estimated possible slides from document: {total_possible}")

        # Step 2: Select chunks for context
        selected_chunks = []

        if num_slides <= total_chunks:
            print("📊 Scenario 1: Fewer slides than chunks — sampling evenly.")
            indices = np.linspace(0, total_chunks - 1, num=num_slides, dtype=int)
            selected_chunks = [all_chunks[i] for i in indices]
        else:
            print("📊 Scenario 2: More slides than chunks — distributing proportionally.")
            word_counts = [len(c.page_content.split()) for c in all_chunks]
            total_words = sum(word_counts)
            raw_alloc = [w / total_words * num_slides for w in word_counts]
            alloc = [max(1, round(x)) for x in raw_alloc]

            # Adjust total to exactly num_slides
            diff = num_slides - sum(alloc)
            i = 0
            while diff != 0:
                if diff > 0:
                    alloc[i] += 1
                    diff -= 1
                elif diff < 0 and alloc[i] > 1:
                    alloc[i] -= 1
                    diff += 1
                i = (i + 1) % total_chunks

            # Duplicate chunks proportionally
            for chunk, count in zip(all_chunks, alloc):
                selected_chunks.extend([chunk] * count)
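
            # Worked example (hypothetical): 3 chunks of 100/300/100 words with
            # num_slides=10 give raw_alloc = [2.0, 6.0, 2.0] -> alloc = [2, 6, 2],
            # so the middle chunk appears six times in selected_chunks.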

        context_text = "\n\n".join(doc.page_content for doc in selected_chunks[:num_slides])
        print("Context text is ")
        print(context_text)
        # Prompt template
        slide_content_template = """
Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

STRICT RULES:
- ❗ ONLY use the content provided in the 'context' below.
- ❌ DO NOT introduce any external knowledge, definitions, or examples not present in the context.
- ⚠️ Do not assume common sense or use general facts. Stick to the exact information given.
- ⚠️ Avoid generic phrases like “as we know”, “in general”, or “in this video”.

Each slide must include:
- A clear and concise **sub-heading**
- **Exactly 2–4 concise paragraphs** derived solely from the context
- A **visualization suggestion** (max 5 words, specific to the content)

Important: Only output the final JSON object. No additional text, markdown, or explanation should be included.

Context:
{context}

{format_instructions}

The final output must be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}".
Each slide must contain:
- "subheading"
- "paragraphs"
- "visualization_suggestion"
"""

        # - "heading": The main title of the slide.
        # Using Pydantic parser for output formatting
        raw_parser = PydanticOutputParser(pydantic_object=StorigoContent)
        parser = OutputFixingParser.from_llm(parser=raw_parser, llm=llm)    
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)
        
        # Creating the chain
        slide_content_chain = (
            {
                #"context": lambda x: vectors.similarity_search(x["query"], k=3),
                #"context": lambda x: "\n\n".join([doc.page_content for doc in vectors.docstore._dict.values()]),
                "context": lambda x: context_text,
                "num_slides": lambda x: x["num_slides"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_content_prompt
            | llm
            | parser
        )

        # Invoke the chain with the query and number of slides
        result = slide_content_chain.invoke({"query": "", "num_slides": num_slides})

        # Sort result slides to ensure they are in the correct order (by slide number)
        #ordered_slides = dict(sorted(result.slides.items(), key=lambda item: item[0]))
        def custom_slide_sort_key(item_key: str):
            """
            Sort keys so that:
            - 'slide_1', 'slide_2', ..., 'slide_10' are in numeric order
            - 'mcq_1', 'mcq_2', ... come after slides in numeric order
            """
            prefix, num_str = item_key.split('_', 1)
            if prefix == "slide":
                group = 0
            elif prefix == "mcq":
                group = 1
            else:
                group = 999  # Fallback
            number = int(num_str) if num_str.isdigit() else 999
            return (group, number)

        # Sort the slides using the custom key
        ordered_slides = dict(
            sorted(result.slides.items(), key=lambda kv: custom_slide_sort_key(kv[0]))
        )


        # Image handling logic
        if is_image:
            for slide_key, slide_content in ordered_slides.items():
                if slide_content.visualization_suggestion:
                    image_url = get_valid_image(slide_content.visualization_suggestion, slide_content)
                    if image_url:
                        slide_content.image = image_url
                    else:
                        print(f"Warning: No suitable image found for slide {slide_key} after multiple attempts.")
                        slide_content.image = None
                else:
                    print(f"Warning: No visualization suggestion for slide {slide_key}.")
                    slide_content.image = None
        else:
            # If not an image slide, ensure the 'image' field is None
            for slide_content in ordered_slides.values():
                slide_content.image = None
        
        token_count = 0
        for slide in ordered_slides.values():
            text_content = f"{slide.subheading} {' '.join(slide.paragraphs)} {slide.visualization_suggestion}"
            token_count += count_tokens(text_content)
        
        
        # Calculate the total token count
        #mcqs = {}
        if is_question:
            mcq_template = """
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

**Context**: {context}

The MCQ must include:
- A **question** related to the context  
- Exactly **4 answer options**  
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text.
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
- Do not prepend phrases like "Here is the MCQ".
- Follow the exact format given below.

The final output **must strictly follow** this format:
```json
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>",
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"

    Always give "question","options","correct_answer" these labels in double quotes only
}}






"""


            def is_valid_json(response):
                try:
                    json.loads(response)
                    return True
                except json.JSONDecodeError:
                    return False

            #mcq_parser = PydanticOutputParser(pydantic_object=MCQContent)
            mcq_prompt = ChatPromptTemplate.from_template(mcq_template)
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                # model="llama3:8b"
                model="gemma3:12b")
            
            mcqs = {}
            slide_keys = list(ordered_slides.keys())
            
            for i in range(0, len(slide_keys), 2):
                if len(mcqs) < num_mcqs:
                    context_slides = []
                    for j in range(i, min(i + 2, len(slide_keys))):
                        slide = ordered_slides[slide_keys[j]]
                        context_slides.append(f"{slide.subheading}: {' '.join(slide.paragraphs)}")

                    context_text = "\n".join(context_slides)
                    print("Context")
                    print(context_text)
                    try:
                        mcq_result = (
                            RunnableLambda(lambda x: {"context": context_text})
                            | mcq_prompt
                            | llm
                        ).invoke({})

                        print("qwer")
                        print(mcq_result)
                        print("END")
                        #tokens1 = mcq_result['content']
                        #tokens1 = mcq_result['usage_metadata']['total_tokens']
                        print("tokens1")
                        #print(tokens1)

                        # Extract the model's text content from the response.
                        if hasattr(mcq_result, 'content'):
                            content = mcq_result.content
                        else:
                            content = str(mcq_result)

                        # Pull out the first balanced {...} block, since the model may
                        # wrap the JSON in extra text or code fences.
                        content_only = None
                        json_object = None
                        start = content.find('{')
                        if start != -1:
                            open_braces = 0
                            end = -1
                            # Use a distinct loop variable here: reusing `i` would
                            # clobber the outer slide-pair index.
                            for pos, char in enumerate(content[start:], start=start):
                                if char == '{':
                                    open_braces += 1
                                elif char == '}':
                                    open_braces -= 1
                                    if open_braces == 0:
                                        end = pos
                                        break
                            if end != -1:
                                content_only = content[start:end + 1]
                                json_object = json.loads(content_only)
                            else:
                                print("No matching closing brace found.")
                        else:
                            print("No opening brace found.")

                        # Keep the raw content for error reporting.
                        mcq_content = content
                        print("Extracted JSON candidate:")
                        print(content_only)

                        if not content_only or not content_only.strip():
                            raise ValueError("Empty or invalid response from Ollama")

                        try:
                            formatted_mcq = {
                                "question": json_object.get("question", ""),
                                "options": json_object.get("options", []),
                                "correct_answer": json_object.get("correct_answer", "")
                            }

                            print("Formatted MCQ:")
                            print(json.dumps(formatted_mcq, indent=4))

                            if formatted_mcq["question"] and len(formatted_mcq["options"]) == 4 and formatted_mcq["correct_answer"]:
                                mcq_key = f"mcq_{len(mcqs) + 1}"
                                mcqs[mcq_key] = formatted_mcq
                                
                                print(f"✅ MCQ {mcq_key} generated and saved!")
                            else:
                                raise ValueError("Incomplete MCQ data")
            
                            
                        except json.JSONDecodeError as e:
                            print(f"JSON Decode Error: {e}")
                            print("Raw Response:", mcq_content)
                    except Exception as e:
                        print(f"Error generating MCQ: {e}")
            
            # Add MCQ tokens to the running token count
            for mcq in mcqs.values():
                text_content = f"{mcq['question']} {' '.join(mcq['options'])} {mcq['correct_answer']}"
                token_count += count_tokens(text_content)

            interleaved_content = {}
            mcq_counter = 0
            total_slides = num_slides
            total_mcqs = num_mcqs
            
            if question_position == 1:
                interval = max(1, total_slides // total_mcqs) if total_mcqs > 0 else total_slides  # avoid modulo-by-zero when mcqs > slides
                
                for idx, slide_key in enumerate(slide_keys):
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                    
                    if (idx + 1) % interval == 0 and mcq_counter < total_mcqs:
                        mcq_key = f"mcq_{mcq_counter + 1}"
                        print(mcq_key)
                        interleaved_content[mcq_key] = mcqs[mcq_key]
                        mcq_counter += 1
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
            
            else:
                for slide_key in slide_keys:
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                
                for mcq_counter in range(total_mcqs):
                    mcq_key = f"mcq_{mcq_counter + 1}"
                    interleaved_content[mcq_key] = mcqs[mcq_key]
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
        else:
            storigo_content_without_mc = StorigoContent(slides=ordered_slides, token_count=token_count)
            return storigo_content_without_mc

    except Exception as e:
        raise Exception(f"Error generating slide content: {str(e)}")



def generate_slide_content_new(vectors, client_id, num_slides, num_mcqs, is_image, is_question, question_position, GPU):
    try:
        #llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        if GPU == 0:
            llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        else:
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                model="llama3:8b",
                # model="gemma3:12b"
                # model="deepseek-r1:8b"
            )

    
        slide_content_template = """
Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

STRICT RULES:
- ❗ ONLY use the content provided in the 'context' below.
- ❌ DO NOT introduce any external knowledge, definitions, or examples not present in the context.
- ⚠️ Do not assume common sense or use general facts. Stick to the exact information given.
- ⚠️ Avoid generic phrases like “as we know”, “in general”, or “in this video”.

Each slide must include:
- A clear and concise **sub-heading**
- **Exactly 2–4 concise paragraphs** derived solely from the context
- A **visualization suggestion** (max 5 words, specific to the content)

Important: Only output the final JSON object. No additional text, markdown, or explanation should be included.

Context:
{context}

{format_instructions}

The final output must be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}".
Each slide must contain:
- "subheading"
- "paragraphs"
- "visualization_suggestion"
"""

        # - "heading": The main title of the slide.
        # Using Pydantic parser for output formatting
        raw_parser = PydanticOutputParser(pydantic_object=StorigoContent)
        parser = OutputFixingParser.from_llm(parser=raw_parser, llm=llm)    
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)
        
        # Creating the chain
        slide_content_chain = (
            {
                "context": lambda x: vectors.similarity_search(x["query"], k=3),
                #"context": lambda x: "\n\n".join([doc.page_content for doc in vectors.docstore._dict.values()]),
                #"context": lambda x: context_text,
                "num_slides": lambda x: x["num_slides"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_content_prompt
            | llm
            | parser
        )

        # Invoke the chain with the query and number of slides
        result = slide_content_chain.invoke({"query": "", "num_slides": num_slides})

        # Sort result slides to ensure they are in the correct order (by slide number)
        #ordered_slides = dict(sorted(result.slides.items(), key=lambda item: item[0]))
        def custom_slide_sort_key(item_key: str):
            """
            Sort keys so that:
            - 'slide_1', 'slide_2', ..., 'slide_10' are in numeric order
            - 'mcq_1', 'mcq_2', ... come after slides in numeric order
            """
            prefix, num_str = item_key.split('_', 1)
            if prefix == "slide":
                group = 0
            elif prefix == "mcq":
                group = 1
            else:
                group = 999  # Fallback
            number = int(num_str) if num_str.isdigit() else 999
            return (group, number)

        # Sort the slides using the custom key
        ordered_slides = dict(
            sorted(result.slides.items(), key=lambda kv: custom_slide_sort_key(kv[0]))
        )


        # Image handling logic
        if is_image:
            for slide_key, slide_content in ordered_slides.items():
                if slide_content.visualization_suggestion:
                    image_url = get_valid_image(slide_content.visualization_suggestion, slide_content)
                    if image_url:
                        slide_content.image = image_url
                    else:
                        print(f"Warning: No suitable image found for slide {slide_key} after multiple attempts.")
                        slide_content.image = None
                else:
                    print(f"Warning: No visualization suggestion for slide {slide_key}.")
                    slide_content.image = None
        else:
            # If not an image slide, ensure the 'image' field is None
            for slide_content in ordered_slides.values():
                slide_content.image = None
        
        token_count = 0
        for slide in ordered_slides.values():
            text_content = f"{slide.subheading} {' '.join(slide.paragraphs)} {slide.visualization_suggestion}"
            token_count += count_tokens(text_content)
        
        
        # Calculate the total token count
        #mcqs = {}
        if is_question:
            mcq_template = """
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

**Context**: {context}

The MCQ must include:
- A **question** related to the context  
- Exactly **4 answer options**  
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text.
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
- Do not prepend phrases like "Here is the MCQ".
- Follow the exact format given below.

The final output **must strictly follow** this format:
```json
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>",
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"

    Always give "question","options","correct_answer" these labels in double quotes only
}}






"""


            def is_valid_json(response):
                try:
                    json.loads(response)
                    return True
                except json.JSONDecodeError:
                    return False

            #mcq_parser = PydanticOutputParser(pydantic_object=MCQContent)
            mcq_prompt = ChatPromptTemplate.from_template(mcq_template)
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                # model="llama3:8b"
                model="gemma3:12b")
            
            mcqs = {}
            slide_keys = list(ordered_slides.keys())
            
            for i in range(0, len(slide_keys), 2):
                if len(mcqs) < num_mcqs:
                    context_slides = []
                    for j in range(i, min(i + 2, len(slide_keys))):
                        slide = ordered_slides[slide_keys[j]]
                        context_slides.append(f"{slide.subheading}: {' '.join(slide.paragraphs)}")

                    context_text = "\n".join(context_slides)
                    print("Context")
                    print(context_text)
                    try:
                        mcq_result = (
                            RunnableLambda(lambda x: {"context": context_text})
                            | mcq_prompt
                            | llm
                        ).invoke({})

                        print("qwer")
                        print(mcq_result)
                        print("END")
                        #tokens1 = mcq_result['content']
                        #tokens1 = mcq_result['usage_metadata']['total_tokens']
                        print("tokens1")
                        #print(tokens1)

                        # Extract the model's text content from the response.
                        if hasattr(mcq_result, 'content'):
                            content = mcq_result.content
                        else:
                            content = str(mcq_result)

                        # Pull out the first balanced {...} block, since the model may
                        # wrap the JSON in extra text or code fences.
                        content_only = None
                        json_object = None
                        start = content.find('{')
                        if start != -1:
                            open_braces = 0
                            end = -1
                            # Use a distinct loop variable here: reusing `i` would
                            # clobber the outer slide-pair index.
                            for pos, char in enumerate(content[start:], start=start):
                                if char == '{':
                                    open_braces += 1
                                elif char == '}':
                                    open_braces -= 1
                                    if open_braces == 0:
                                        end = pos
                                        break
                            if end != -1:
                                content_only = content[start:end + 1]
                                json_object = json.loads(content_only)
                            else:
                                print("No matching closing brace found.")
                        else:
                            print("No opening brace found.")

                        # Keep the raw content for error reporting.
                        mcq_content = content
                        print("Extracted JSON candidate:")
                        print(content_only)

                        if not content_only or not content_only.strip():
                            raise ValueError("Empty or invalid response from Ollama")

                        try:
                            formatted_mcq = {
                                "question": json_object.get("question", ""),
                                "options": json_object.get("options", []),
                                "correct_answer": json_object.get("correct_answer", "")
                            }

                            print("Formatted MCQ:")
                            print(json.dumps(formatted_mcq, indent=4))

                            if formatted_mcq["question"] and len(formatted_mcq["options"]) == 4 and formatted_mcq["correct_answer"]:
                                mcq_key = f"mcq_{len(mcqs) + 1}"
                                mcqs[mcq_key] = formatted_mcq
                                
                                print(f"✅ MCQ {mcq_key} generated and saved!")
                            else:
                                raise ValueError("Incomplete MCQ data")
            
                            
                        except json.JSONDecodeError as e:
                            print(f"JSON Decode Error: {e}")
                            print("Raw Response:", mcq_content)
                        # Validate the JSON structure
                        # if not is_valid_json(mcq_json):
                        #     raise ValueError("Invalid JSON structure for MCQ")

                        # # Save the valid MCQ
                        # mcqs[f"mcq_{len(mcqs) + 1}"] = mcq_json
                        # # if not mcq_result or not is_valid_json(mcq_result):
                        # #     raise ValueError("Invalid or empty response from Ollama")
                        # if not mcq_content or not is_valid_json(mcq_content):
                        #     raise ValueError("Invalid or empty response from Ollama")
                        
                        # #mcqs[f"mcq_{len(mcqs) + 1}"] = mcq_result
                        # #mcqs[f"mcq_{len(mcqs) + 1}"] = mcq_result.model_dump()
                        # mcqs[f"mcq_{len(mcqs) + 1}"] = json.loads(mcq_content)
                        
                    # except Exception as e:
                    #     print(f"Error generating MCQ: {e}")
                    #     mcqs[f"mcq_{len(mcqs) + 1}"] = {
                    #         "question": "Could not generate question",
                    #         "options": ["N/A", "N/A", "N/A", "N/A"],
                    #         "correct_answer": "a"
                    #     }
                    except Exception as e:
                        print(f"Error generating MCQ: {e}")
            
            #Calculate token count
            #token_count=0
            for mcq in mcqs.values():
                text_content = f"{mcq['question']} {' '.join(mcq['options'])} {mcq['correct_answer']}"
                token_count += count_tokens(text_content)

            interleaved_content = {}
            mcq_counter = 0
            total_slides = num_slides
            total_mcqs = num_mcqs
            
            if question_position == 1:
                interval = total_slides // total_mcqs if total_mcqs > 0 else total_slides
                
                for idx, slide_key in enumerate(slide_keys):
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                    
                    if (idx + 1) % interval == 0 and mcq_counter < total_mcqs:
                        mcq_key = f"mcq_{mcq_counter + 1}"
                        if mcq_key in mcqs:
                            interleaved_content[mcq_key] = mcqs[mcq_key]
                        else:
                            print(f"[Warning] Skipping missing MCQ: {mcq_key}")
                        mcq_counter += 1
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
            
            else:
                for slide_key in slide_keys:
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                
                for mcq_counter in range(total_mcqs):
                    mcq_key = f"mcq_{mcq_counter + 1}"
                    if mcq_key in mcqs:
                        interleaved_content[mcq_key] = mcqs[mcq_key]
                    else:
                        print(f"[Warning] Skipping missing MCQ: {mcq_key}")
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
        else:
            storigo_content_without_mc = StorigoContent(slides=ordered_slides, token_count=token_count)
            return storigo_content_without_mc

    except Exception as e:
        raise Exception(f"Error generating slide content: {str(e)}")

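# A standalone sketch of the interleaving rule used above: with
# question_position == 1, one MCQ is emitted after every
# (total_slides // total_mcqs)-th slide. The function below is illustrative
# only and is not wired into the pipeline.
def _sketch_interleave(slides: dict, mcqs: dict) -> dict:
    interleaved = {}
    # Guard against more MCQs than slides (interval would otherwise be 0)
    interval = max(1, len(slides) // len(mcqs)) if mcqs else len(slides)
    placed = 0
    for idx, key in enumerate(slides):
        interleaved[key] = slides[key]
        if (idx + 1) % interval == 0 and placed < len(mcqs):
            placed += 1
            mcq_key = f"mcq_{placed}"
            if mcq_key in mcqs:
                interleaved[mcq_key] = mcqs[mcq_key]
    return interleaved

# Example: two slides and one MCQ come out as slide_1, slide_2, mcq_1
# in insertion order.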

def generate_slide_content_seperate(full_transcript, num_slides, num_mcqs, is_image, is_question, question_position, GPU):
    """Generate slides (and optionally MCQs) directly from a full transcript,
    without a vector store. Falls back to sentence-based slides when the LLM
    response cannot be parsed as JSON."""
    try:
        # Initialize LLM based on GPU preference
        if GPU == 0:
            llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        else:
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                model="gemma3:12b"
            )

        print(f"📝 Processing transcript of {len(full_transcript)} characters")
        print(f"🎯 Generating {num_slides} slides")

        # Direct context approach - use the full transcript
        context_text = full_transcript
        
        # Enhanced prompt template for better slide generation
        slide_content_template = """
You are creating presentation slides from a transcript. Extract {num_slides} distinct topics directly from the content below.

ABSOLUTE RULES:
❌ NEVER use phrases like: "Here's a breakdown", "I'm structuring", "Overall theme", "According to", "This is a lot of information"
❌ NEVER use generic titles like "Topic 1", "Core Strategies", "Detailed Breakdown"
❌ NEVER include meta-commentary about organizing or structuring content
❌ NEVER use bullet points or numbered lists in subheadings (no "**1.", "**2.", etc.)

✅ EXTRACT real topics/concepts directly from the transcript
✅ Use specific, descriptive titles that reflect actual content
✅ Present information as if from the original speaker/content
✅ Focus on actionable insights and key points

TRANSCRIPT CONTENT:
{context}

Create exactly {num_slides} slides where each slide covers a specific concept, strategy, or insight from the transcript.

REQUIRED JSON FORMAT (no additional text):
{{
    "slides": {{
        "slide_1": {{
            "type": "flash",
            "subheading": "Specific topic from transcript (no meta language)",
            "paragraphs": ["Direct content from transcript", "More direct content"],
            "visualization_suggestion": "relevant visual",
            "image": null
        }}
    }}
}}

Focus on practical insights, strategies, skills, and actionable advice from the transcript content.
"""
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)
        
        # Creating the chain with direct context
        slide_content_chain = (
            {
                "context": lambda x: context_text,
                "num_slides": lambda x: x["num_slides"]
            }
            | slide_content_prompt
            | llm
        )

        # Invoke the chain with the number of slides
        print("🚀 Generating slides from full transcript...")
        result = slide_content_chain.invoke({"num_slides": num_slides})
        
        # Extract and parse the response
        response_content = result.content if hasattr(result, 'content') else str(result)
        print(f"📝 Raw LLM response length: {len(response_content)} characters")
        
        # Clean and extract JSON from response
        try:
            # Find JSON content within curly braces
            start = response_content.find('{')
            if start == -1:
                raise ValueError("No JSON object found in response")
                
            # Find the matching closing brace
            open_braces = 0
            end = -1
            for i, char in enumerate(response_content[start:], start=start):
                if char == '{':
                    open_braces += 1
                elif char == '}':
                    open_braces -= 1
                    if open_braces == 0:
                        end = i
                        break
            
            if end == -1:
                raise ValueError("No matching closing brace found")
                
            json_content = response_content[start:end+1]
            print(f"📄 Extracted JSON length: {len(json_content)} characters")
            
            # Parse the JSON
            parsed_data = json.loads(json_content)
            
            # Convert to the expected format
            if "slides" in parsed_data:
                slides_data = parsed_data["slides"]
            else:
                slides_data = parsed_data
                
            # Create SlideContent objects using your Pydantic model
            ordered_slides = {}
            for slide_key, slide_content in slides_data.items():
                slide_obj = SlideContent(
                    type=slide_content.get('type', 'flash'),
                    subheading=slide_content.get('subheading', ''),
                    paragraphs=slide_content.get('paragraphs', []),
                    visualization_suggestion=slide_content.get('visualization_suggestion', ''),
                    image=slide_content.get('image', None)
                )
                ordered_slides[slide_key] = slide_obj
                
        except (json.JSONDecodeError, ValueError) as e:
            print(f"❌ JSON parsing failed: {e}")
            print(f"🔍 Raw response: {response_content[:500]}...")
            
            # Create fallback slides using your Pydantic model
            print("🔄 Creating fallback slides...")
            ordered_slides = {}
            
            # Split transcript into meaningful chunks instead of random word counts
            sentences = context_text.replace('\n', ' ').split('. ')
            sentences_per_slide = max(1, len(sentences) // num_slides)
            
            for i in range(num_slides):
                start_idx = i * sentences_per_slide
                end_idx = (i + 1) * sentences_per_slide if i < num_slides - 1 else len(sentences)
                
                slide_sentences = sentences[start_idx:end_idx]
                slide_content = '. '.join(slide_sentences)
                
                # Extract a meaningful title from the content
                if slide_sentences:
                    # Look for actual topics, skip meta-commentary
                    meaningful_sentence = ""
                    for sentence in slide_sentences:
                        # Skip sentences with meta-commentary
                        if not any(phrase in sentence.lower() for phrase in [
                            "here's a breakdown", "i'm structuring", "overall theme", 
                            "according to", "this is a lot", "let me", "i've organized"
                        ]):
                            meaningful_sentence = sentence
                            break
                    
                    if meaningful_sentence:
                        # Extract first meaningful phrase (up to first punctuation)
                        title = meaningful_sentence.split('.')[0].split(',')[0].split(':')[0]
                        title = title.strip()
                        # Remove formatting markers
                        title = title.replace('**', '').replace('*', '').replace('#', '')
                        # Limit length
                        if len(title) > 60:
                            title = title[:60] + "..."
                    else:
                        title = f"Key Insight {i + 1}"
                else:
                    title = f"Key Point {i + 1}"
                
                # Clean up paragraph content to remove meta-commentary
                clean_paragraphs = []
                for sentence in slide_sentences:
                    # Skip meta-commentary sentences
                    if not any(phrase in sentence.lower() for phrase in [
                        "here's a breakdown", "i'm structuring", "overall theme",
                        "according to", "this is a lot", "let me organize", "i've organized"
                    ]):
                        clean_paragraphs.append(sentence.strip())
                
                # Combine clean sentences into paragraphs
                if clean_paragraphs:
                    # Group sentences into paragraphs (max 2 sentences per paragraph)
                    final_paragraphs = []
                    current_para = []
                    
                    for sentence in clean_paragraphs:
                        current_para.append(sentence)
                        if len(current_para) >= 2:  # Max 2 sentences per paragraph
                            final_paragraphs.append('. '.join(current_para) + '.')
                            current_para = []
                    
                    if current_para:  # Add remaining sentences
                        final_paragraphs.append('. '.join(current_para) + '.')
                    
                    paragraphs = final_paragraphs[:4]  # Max 4 paragraphs
                else:
                    # Fallback if no clean content found
                    paragraphs = [slide_content[:300] + "..." if len(slide_content) > 300 else slide_content]
                
                # Create fallback slide using your Pydantic model
                slide_obj = SlideContent(
                    type="flash",
                    subheading=title,
                    paragraphs=paragraphs[:4],  # Max 4 paragraphs
                    visualization_suggestion="diagram",
                    image=None
                )
                ordered_slides[f"slide_{i + 1}"] = slide_obj

        print(f"✅ Successfully generated {len(ordered_slides)} slides")

        # Image handling logic
        if is_image:
            print("🖼️ Processing images for slides...")
            for slide_key, slide_content in ordered_slides.items():
                if slide_content.visualization_suggestion:
                    # Call your get_valid_image function here
                    image_url = get_valid_image(slide_content.visualization_suggestion, slide_content)
                    if image_url:
                        slide_content.image = image_url
                    else:
                        print(f"⚠️ No suitable image found for slide {slide_key}")
                        slide_content.image = None
                else:
                    print(f"⚠️ No visualization suggestion for slide {slide_key}")
                    slide_content.image = None
        else:
            for slide_content in ordered_slides.values():
                slide_content.image = None
        
        # Calculate token count
        token_count = 0
        for slide in ordered_slides.values():
            text_content = f"{slide.subheading} {' '.join(slide.paragraphs)} {slide.visualization_suggestion}"
            token_count += count_tokens(text_content)
        
        # MCQ Generation (if requested)
        if is_question:
            print(f"❓ Generating {num_mcqs} MCQs...")
            mcqs = generate_mcqs_from_slides(ordered_slides, num_mcqs, llm)
            
            # Add MCQ tokens to count
            for mcq in mcqs.values():
                text_content = f"{mcq['question']} {' '.join(mcq['options'])} {mcq['correct_answer']}"
                token_count += count_tokens(text_content)

            # Interleave slides and MCQs based on position preference
            interleaved_content = {}
            slide_keys = list(ordered_slides.keys())
            
            if question_position == 1:  # MCQs distributed throughout
                interval = len(slide_keys) // num_mcqs if num_mcqs > 0 else len(slide_keys)
                mcq_counter = 0
                
                for idx, slide_key in enumerate(slide_keys):
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                    
                    if (idx + 1) % interval == 0 and mcq_counter < num_mcqs:
                        mcq_key = f"mcq_{mcq_counter + 1}"
                        if mcq_key in mcqs:
                            # Convert MCQ dict to MCQContent object
                            mcq_obj = MCQContent(
                                type="Question",
                                question=mcqs[mcq_key]['question'],
                                options=mcqs[mcq_key]['options'],
                                correct_answer=mcqs[mcq_key]['correct_answer']
                            )
                            interleaved_content[mcq_key] = mcq_obj
                            mcq_counter += 1
                
            else:  # MCQs at the end
                for slide_key in slide_keys:
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                
                for mcq_counter in range(num_mcqs):
                    mcq_key = f"mcq_{mcq_counter + 1}"
                    if mcq_key in mcqs:
                        # Convert MCQ dict to MCQContent object
                        mcq_obj = MCQContent(
                            type="Question",
                            question=mcqs[mcq_key]['question'],
                            options=mcqs[mcq_key]['options'],
                            correct_answer=mcqs[mcq_key]['correct_answer']
                        )
                        interleaved_content[mcq_key] = mcq_obj
            
            # Return with MCQs using your exact model
            return StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
        
        else:
            # Return using your exact Pydantic model
            return StorigoContent(slides=ordered_slides, token_count=token_count)

    except Exception as e:
        raise Exception(f"Error generating slide content: {str(e)}")

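# Minimal driver sketch for the transcript path (hypothetical file name;
# GPU=1 assumes a local Ollama server with the gemma3:12b model pulled).
# The transcript file would typically come from transcribe_with_retry below.
def _demo_transcript_to_slides(transcript_path="dQw4w9WgXcQ_transcript.txt"):
    transcript = read_file_url(transcript_path)
    return generate_slide_content_seperate(
        transcript, num_slides=5, num_mcqs=2,
        is_image=False, is_question=True, question_position=1, GPU=1,
    )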

def generate_mcqs_from_slides(ordered_slides, num_mcqs, llm):
    """
    Generate MCQs from slides using the direct context approach
    """
    mcq_template = """
Based on the following slide content, generate one multiple-choice question (MCQ) that tests comprehension of the key concepts.

**Slide Content**: {context}

The MCQ must include:
- A **question** related to the slide content  
- Exactly **4 answer options**  
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)** 
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text
- ✅ Base the question strictly on the provided slide content

The final output **must strictly follow** this format:
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>", 
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"
}}
"""

    mcq_prompt = ChatPromptTemplate.from_template(mcq_template)
    mcqs = {}
    slide_keys = list(ordered_slides.keys())
    
    # Generate MCQs from evenly distributed slides
    if num_mcqs > 0:
        interval = max(1, len(slide_keys) // num_mcqs)
        selected_slide_indices = [i * interval for i in range(num_mcqs)]
        
        for i, slide_idx in enumerate(selected_slide_indices):
            if slide_idx < len(slide_keys) and len(mcqs) < num_mcqs:
                slide_key = slide_keys[slide_idx]
                slide = ordered_slides[slide_key]
                
                # Create context from slide content
                context_text = f"{slide.subheading}: {' '.join(slide.paragraphs)}"
                
                try:
                    mcq_result = (
                        RunnableLambda(lambda x: {"context": context_text})
                        | mcq_prompt
                        | llm
                    ).invoke({})
                    
                    # Extract and parse JSON response
                    content = mcq_result.content if hasattr(mcq_result, 'content') else str(mcq_result)
                    
                    # Find JSON content within curly braces
                    start = content.find('{')
                    if start != -1:
                        open_braces = 0
                        end = -1
                        for j, char in enumerate(content[start:], start=start):
                            if char == '{':
                                open_braces += 1
                            elif char == '}':
                                open_braces -= 1
                                if open_braces == 0:
                                    end = j
                                    break
                        
                        if end != -1:
                            json_content = content[start:end+1]
                            mcq_data = json.loads(json_content)
                            
                            # Validate MCQ structure
                            if (mcq_data.get("question") and 
                                len(mcq_data.get("options", [])) == 4 and 
                                mcq_data.get("correct_answer")):
                                
                                mcq_key = f"mcq_{len(mcqs) + 1}"
                                mcqs[mcq_key] = mcq_data
                                print(f"✅ Generated {mcq_key}")
                            else:
                                print(f"⚠️ Invalid MCQ structure for slide {slide_key}")
                    
                except Exception as e:
                    print(f"❌ Error generating MCQ for slide {slide_key}: {e}")
                    # Create fallback MCQ
                    mcq_key = f"mcq_{len(mcqs) + 1}"
                    mcqs[mcq_key] = {
                        "question": f"What is the main topic of: {slide.subheading}?",
                        "options": ["Topic A", "Topic B", "Topic C", "Topic D"],
                        "correct_answer": "a"
                    }
    
    return mcqs

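# The balanced-brace scan above reappears in several functions in this file;
# a shared helper could look like the sketch below (not currently wired in).
def extract_first_json_object(text: str):
    """Return the first balanced {...} object parsed from text, or None."""
    start = text.find('{')
    if start == -1:
        return None
    depth = 0
    for pos in range(start, len(text)):
        if text[pos] == '{':
            depth += 1
        elif text[pos] == '}':
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(text[start:pos + 1])
                except json.JSONDecodeError:
                    return None
    return None

# Example: extract_first_json_object('noise {"a": 1} trailing') -> {"a": 1}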
def load_pdf(file_path):
    """Load a PDF and return its pages as documents."""
    loader = PyPDFLoader(file_path)
    return loader.load()

def count_total_words(docs):
    """Count the total number of words across the documents."""
    return sum(
        len((doc.page_content if hasattr(doc, "page_content") else doc["page_content"]).split())
        for doc in docs
    )

def split_text_with_semantic_chunker(docs, embeddings):
    """Splits the text into semantic chunks using the given embeddings."""
    text_splitter = SemanticChunker(
        embeddings, breakpoint_threshold_type="percentile"  # Can be changed to "standard_deviation" or "interquartile"
    )
    documents = text_splitter.create_documents([doc.page_content for doc in docs])
    print("Documents split into semantic chunks.")
    return documents

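# A minimal usage sketch for the loader + chunker pair above ("report.pdf"
# is a hypothetical path; assumes a local Ollama server with the
# nomic-embed-text model pulled):
def _demo_semantic_chunking(pdf_path="report.pdf"):
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    docs = load_pdf(pdf_path)
    return split_text_with_semantic_chunker(docs, embeddings)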
async def crawlerrr(file):
    # Create an instance of AsyncWebCrawler
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Run the crawler on a URL
        result = await crawler.arun(url=file)

        # Print the extracted content
        print(result.markdown)

        # Extract a safe filename from the URL
        parsed_url = urlparse(file)
        filename = parsed_url.netloc + parsed_url.path.replace('/', '_')
        if not filename.endswith('.txt'):
            filename += '.txt'

        # Save the result to a text file named after the URL
        with open(filename, 'w', encoding='utf-8') as txt_file:
            txt_file.write(result.markdown)
        print(f"Saved crawled content to {filename}")
        return filename

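# crawlerrr is a coroutine, so it must be driven by an event loop; a minimal
# sketch with a hypothetical URL:
def _demo_crawl(url="https://example.com/about"):
    return asyncio.run(crawlerrr(url))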
def read_file_url(file_path):
    """Read a text file and return its contents, or an empty string on failure."""
    try:
        # Check that the path points to an existing file
        if not file_path or not os.path.isfile(file_path):
            print(f"Error: The file '{file_path}' does not exist or the path is incorrect.")
            return ""

        # Read the entire content of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        return content if content else ""

    except Exception as e:
        print(f"Error reading the file: {e}")
        return ""

def clean_using_llm(content):
    # Define the prompt template for meaningful content extraction
    prompt_template = """
    Extract only the meaningful content from the text below. Focus on descriptions, value propositions, mission statements, 
    features, and anything that provides valuable information about the company, products, or services. Ignore any URLs, 
    navigation links, contact forms, or irrelevant sections.

    Here is the content to process:

    {context}
    """

    # Initialize the LLM (Groq alternative kept for reference)
    #llm = ChatGroq(model_name='llama-3.3-70b-versatile', groq_api_key=GROQ_API_KEY)
    llm = ChatOllama(
        base_url='http://127.0.0.1:11434',
        model="gemma3:12b"  # alternatives: "llama3:8b", "deepseek-r1:8b"
    )
    # Create the PromptTemplate object
    prompt = PromptTemplate(input_variables=["context"], template=prompt_template)

    # Create the LLMChain to pass the prompt and run the model
    runnable = prompt | llm

    # Run the sequence and return only the text of the response
    filtered_content = runnable.invoke({"context": content})
    return filtered_content.content

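# End-to-end sketch of the URL path: crawl a page to a .txt file, read it
# back, and strip navigation boilerplate with the LLM. The URL is
# hypothetical and a local Ollama server is assumed.
def _demo_url_pipeline(url="https://example.com/about"):
    txt_file = asyncio.run(crawlerrr(url))
    raw = read_file_url(txt_file)
    return clean_using_llm(raw)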
def split_text_with_semantic_chunker_for_url(docs, embeddings):
    """Splits the text into semantic chunks using the given embeddings."""
    text_splitter = SemanticChunker(
        embeddings, breakpoint_threshold_type="percentile"
    )

    # Check if docs is a string instead of a list
    if isinstance(docs, str):
        # Convert the string to a list with one item
        docs = [docs]

    # Debugging: Print the type of items in docs
    print(f"Type of docs after conversion: {type(docs)}")
    print(f"First item in docs: {docs[0] if docs else 'Empty list'}")

    # Convert strings to dictionaries with 'page_content' if needed
    if isinstance(docs[0], str):
        docs = [{'page_content': doc} for doc in docs]

    # Ensure all docs have the correct structure
    if not all(isinstance(doc, dict) and 'page_content' in doc for doc in docs):
        print("Error: Invalid document structure.")
        return []

    # Create semantic chunks
    documents = text_splitter.create_documents([doc['page_content'] for doc in docs])
    print("Documents split into semantic chunks.")
    return documents

def save_documents_to_txt(documents, output_dir):
    """Saves each document in the documents list as a separate .txt file."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)  # Create the output directory if it doesn't exist
    
    for i, document in enumerate(documents):
        file_name = f"document_part_{i+1}.txt"
        file_path = os.path.join(output_dir, file_name)
        
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(document.page_content)  # Assuming each document object has a 'page_content' attribute
        
        print(f"Saved: {file_path}")


def create_and_save_embeddings(split_documents, client_id):
    client_id = str(client_id)
    # Base folder structure: my_embeddings/{client_id}
    embedding_folder = os.path.join("my_embeddings", client_id)

    # Make sure the embedding folder exists
    os.makedirs(embedding_folder, exist_ok=True)

    # Initialize the embedding model
    embeddings = OllamaEmbeddings(model='nomic-embed-text')

    # Iterate over each document chunk and generate embeddings
    for idx, doc in enumerate(split_documents, start=1):
        # Create a FAISS index for this chunk
        temp_db = FAISS.from_documents([doc], embedding=embeddings)

        # Save the FAISS index for this chunk with an incremental filename
        embedding_file_path = os.path.join(embedding_folder, f"faiss_index{idx}")
        temp_db.save_local(embedding_file_path)

        print(f"Saved FAISS embedding for document part {idx} as faiss_index{idx} in {embedding_folder}")

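# The per-chunk indexes saved above can be loaded back individually; a
# minimal sketch (allow_dangerous_deserialization is required by recent
# LangChain versions for pickle-backed FAISS stores, matching its use in
# merge_all_faiss below):
def _demo_load_chunk_index(client_id, idx=1):
    embeddings = OllamaEmbeddings(model='nomic-embed-text')
    path = os.path.join("my_embeddings", str(client_id), f"faiss_index{idx}")
    return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)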
def create_embeddings(split_documents, client_id):
    client_id = str(client_id)
    
    # Initialize the embedding model
    embeddings = OllamaEmbeddings(model='nomic-embed-text')

    # Create a FAISS index from all chunks and persist it locally
    vectorstore = FAISS.from_documents(split_documents, embedding=embeddings)
    faiss_index_path = "faiss_supplier_index"
    vectorstore.save_local(faiss_index_path)

    print(f"✅ Created FAISS vectorstore for client {client_id} and saved to {faiss_index_path}")
    
    return vectorstore
import pickle

def save_faiss_per_chunk(documents, base_path="faiss_chunks", embedding_model=None, api_key=None):
    """
    Save each chunk in its own FAISS vector store directory.

    Args:
        documents (List[Document]): List of LangChain Document objects.
        base_path (str): Base directory to store all FAISS chunks.
        embedding_model: Optional embedding model instance.
        api_key (str): Unused; kept for backward compatibility.

    Returns:
        List[str]: List of FAISS chunk folder paths.
    """
    if embedding_model is None:
        embedding_model = OllamaEmbeddings(model='nomic-embed-text')

    os.makedirs(base_path, exist_ok=True)
    chunk_paths = []

    for i, doc in enumerate(documents):
        chunk_dir = os.path.join(base_path, f"chunk_{i}")
        os.makedirs(chunk_dir, exist_ok=True)

        # Each FAISS requires at least 1 document
        vector_store = FAISS.from_documents([doc], embedding_model)

        # Save the FAISS index
        vector_store.save_local(chunk_dir)
        chunk_paths.append(chunk_dir)

        # Save the original document for reference
        with open(os.path.join(chunk_dir, "doc_metadata.pkl"), "wb") as f:
            pickle.dump(doc, f)

    print(f"✅ Saved {len(documents)} chunks as individual FAISS indexes in '{base_path}'")
    return chunk_paths

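# Round-trip sketch for a chunk saved above: load the FAISS index and the
# pickled source document side by side ("faiss_chunks/chunk_0" is the
# default layout produced by save_faiss_per_chunk):
def _demo_load_faiss_chunk(chunk_dir="faiss_chunks/chunk_0"):
    embeddings = OllamaEmbeddings(model='nomic-embed-text')
    store = FAISS.load_local(chunk_dir, embeddings, allow_dangerous_deserialization=True)
    with open(os.path.join(chunk_dir, "doc_metadata.pkl"), "rb") as f:
        original_doc = pickle.load(f)
    return store, original_doc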
def create_and_save_embeddings_new(split_documents, client_id):
    """Identical to create_and_save_embeddings; kept for backward compatibility."""
    return create_and_save_embeddings(split_documents, client_id)



def merge_all_faiss1(client_id, base_path='my_embeddings'):
    embeddings=OllamaEmbeddings(model="nomic-embed-text") 
    # Initialize an empty FAISS vectorstore for merging
    merged_faiss = None
    
    # Construct the base folder path
    folder_path = f'{base_path}/{client_id}'
    
    # List all folders that match the pattern 'faiss_index{i}'
    faiss_folders = [
        folder for folder in os.listdir(folder_path) 
        if folder.startswith('faiss_index') and folder[len('faiss_index'):].isdigit()
    ]
    
    # Sort folders by the index number extracted from 'faiss_index{i}'
    sorted_folders = sorted(faiss_folders, key=lambda x: int(x.replace('faiss_index', '')))
    
    # Loop through the sorted folders and merge FAISS stores
    for folder in sorted_folders:
        faiss_path = os.path.join(folder_path, folder)
        print(f"Loading FAISS index from: {faiss_path}")  # Debugging: See the order of loading
        current_faiss = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)
        
        # If it's the first FAISS store, initialize merged_faiss
        if merged_faiss is None:
            merged_faiss = current_faiss
        else:
            # Merge current FAISS store into merged_faiss
            merged_faiss.merge_from(current_faiss)
    
    # Save the merged FAISS index to a new folder
    if merged_faiss is not None:
        merged_faiss.save_local(f'{folder_path}/merged_faiss')
        print("Merged FAISS index saved as merged_faiss")

    # Delete individual FAISS index folders, except for the 'merged_faiss'
    for folder in sorted_folders:
        faiss_path = os.path.join(folder_path, folder)
        try:
            # Delete the entire directory for the FAISS index (e.g., faiss_index3)
            shutil.rmtree(faiss_path)
            print(f"Deleted FAISS index folder: {faiss_path}")
        except FileNotFoundError:
            print(f"Folder not found: {faiss_path}")
        except OSError as e:
            print(f"Error deleting {faiss_path}: {e}")
    return merged_faiss

def merge_all_faiss(client_id, base_path='my_embeddings'):
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    merged_faiss = None
    
    folder_path = f'{base_path}/{client_id}'
    faiss_files = [
        folder for folder in os.listdir(folder_path) 
        if folder.startswith('faiss_index') and folder[len('faiss_index'):].isdigit()
    ]
    
    sorted_files = sorted(faiss_files, key=lambda x: int(x.replace('faiss_index', '')))

    for file in sorted_files:
        faiss_path = os.path.join(folder_path, file)
        print(f"Loading FAISS index from: {faiss_path}")
        current_faiss = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)

        # Extract document content
        current_texts = [current_faiss.docstore.search(doc_id).page_content 
                         for doc_id in current_faiss.index_to_docstore_id.values()]

        if merged_faiss is None:
            merged_faiss = current_faiss
        else:
            # Only add texts — this adds both vectors and metadata
            merged_faiss.add_texts(current_texts)

    if merged_faiss is not None:
        merged_faiss.save_local(f'{folder_path}/merged_faiss')
        print(f"Merged FAISS index saved as merged_faiss")

    # Clean up individual indexes
    for file in sorted_files:
        faiss_path = os.path.join(folder_path, file)
        try:
            shutil.rmtree(faiss_path)
            print(f"Deleted FAISS index folder: {faiss_path}")
        except FileNotFoundError:
            print(f"Folder not found: {faiss_path}")
        except OSError as e:
            print(f"Error deleting {faiss_path}: {e}")

    return merged_faiss

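# Typical flow for a client's corpus (hypothetical client id and query):
# merge the per-chunk indexes, then search the merged store.
def _demo_merge_and_search(client_id="42", query="refund policy"):
    merged = merge_all_faiss(client_id)
    if merged is None:
        return []
    return merged.similarity_search(query, k=3)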



# YouTube transcript helpers
from youtube_transcript_api import YouTubeTranscriptApi

def transcribe(youtube_video_url):
    """Fetch a YouTube transcript and save it to a file named after the video ID."""
    video_id = youtube_video_url.split("=")[1]
    print(video_id)
    transcript_text = YouTubeTranscriptApi.get_transcript(video_id)
    transcript = ""

    for entry in transcript_text:
        transcript += " " + entry["text"]

    with open(video_id, "w", encoding="utf-8") as f:
        f.write(transcript)

    print(f"Transcript saved to {video_id}")
    return video_id

def extract_video_id(youtube_url):
    """Extract video ID from various YouTube URL formats"""
    patterns = [
        r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
        r'(?:embed\/)([0-9A-Za-z_-]{11})',
        r'(?:v\/)([0-9A-Za-z_-]{11})'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, youtube_url)
        if match:
            return match.group(1)
    
    return None

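# Quick sanity checks for the URL patterns above (hypothetical video ID):
def _demo_extract_video_id():
    assert extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"
    assert extract_video_id("https://youtu.be/dQw4w9WgXcQ") == "dQw4w9WgXcQ"
    assert extract_video_id("https://example.com/page") is None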
def transcribe_with_retry(youtube_video_url, max_retries=3, delay_range=(1, 3)):
    """Transcribe with retry logic for handling intermittent failures"""
    video_id = extract_video_id(youtube_video_url)
    
    if not video_id:
        print("Error: Could not extract video ID from URL")
        return None
    
    print(f"Video ID: {video_id}")
    
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}/{max_retries}")
            
            # Try to get transcript
            transcript_text = YouTubeTranscriptApi.get_transcript(video_id)
            
            # Combine all transcript text
            transcript = ""
            for entry in transcript_text:
                transcript += " " + entry["text"]
            
            # Save to file
            filename = f"{video_id}_transcript.txt"
            with open(filename, "w", encoding="utf-8") as f:
                f.write(transcript.strip())
            
            print(f"Transcript saved to {filename}")
            return filename
            
        except TranscriptsDisabled:
            print(f"Error: Transcripts are disabled for video {video_id}")
            return None
            
        except NoTranscriptFound:
            print(f"Error: No transcript found for video {video_id}")
            return try_auto_generated_transcript(video_id, attempt, max_retries, delay_range)
            
        except VideoUnavailable:
            print(f"Error: Video {video_id} is unavailable")
            return None
            
        except Exception as e:  # includes xml.etree.ElementTree.ParseError
            error_msg = str(e)
            print(f"Attempt {attempt + 1} failed with error: {error_msg}")
            
            # Check if it's the XML parsing error we're trying to handle
            if "no element found" in error_msg or "ParseError" in str(type(e)):
                if attempt < max_retries - 1:
                    wait_time = random.uniform(*delay_range)
                    print(f"XML parsing error detected. Retrying in {wait_time:.1f}s...")
                    time.sleep(wait_time)
                    continue
                else:
                    print(f"All {max_retries} attempts failed due to XML parsing errors")
                    return None
            else:
                # For other unexpected errors, don't retry
                print(f"Unexpected error (not retrying): {error_msg}")
                return None
    
    return None  

def load_txt(file_path):
    """Load a text file and return documents."""
    loader = TextLoader(file_path)
    return loader.load()


def parsing(file_path):
    """Convert a PDF to markdown text using marker's PdfConverter."""
    converter = PdfConverter(artifact_dict=create_model_dict())
    rendered = converter(file_path)
    text, _, images = text_from_rendered(rendered)
    return text

def marks_splitter(headers_to_split_on, content):
    """Split markdown content on the given header levels."""
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(content)
    return md_header_splits

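# Typical wiring of the two helpers above (hypothetical PDF path;
# MarkdownHeaderTextSplitter expects (marker, metadata_key) tuples):
def _demo_pdf_to_sections(pdf_path="lecture.pdf"):
    headers = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    md_text = parsing(pdf_path)
    return marks_splitter(headers, md_text)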
def allocate_slides(chunks, total_slides, min_chars=60):
    """
    Allocate slides based on chunk size (works on Document-like objects).

    Args:
        chunks: List of Document (or string) objects
        total_slides: Total number of slides to generate
        min_chars: Minimum character count to consider a chunk valid

    Returns:
        Dict mapping original chunk index to number of slides to generate
    """
    # Step 0: Extract text from each chunk
    extracted = []
    for i, chunk in enumerate(chunks):
        if hasattr(chunk, "page_content"):
            text = chunk.page_content
        elif hasattr(chunk, "content"):
            text = chunk.content
        elif isinstance(chunk, str):
            text = chunk
        else:
            text = str(chunk)
        extracted.append((i, text))

    # Step 1: Filter out too-short chunks
    valid = [(i, txt) for i, txt in extracted if len(txt) >= min_chars]
    if not valid:
        return {}

    # Step 2: Total characters across valid chunks
    total_chars = sum(len(txt) for _, txt in valid)

    # Step 3: Initial proportional allocation
    allocations = {}
    remaining = total_slides
    for idx, txt in valid:
        prop = len(txt) / total_chars
        cnt = max(1, round(prop * total_slides))
        allocations[idx] = cnt
        remaining -= cnt

    # Step 4a: If we’ve overshot, remove from the largest buckets
    while remaining < 0:
        # pick the chunk with max slides
        max_idx = max(allocations, key=allocations.get)
        if allocations[max_idx] > 1:
            allocations[max_idx] -= 1
            remaining += 1
        else:
            break

    # Step 4b: If we’ve undershot, add to the largest chunks by text size
    while remaining > 0:
        # sort valid chunks by length descending
        sorted_by_size = sorted(valid, key=lambda x: len(x[1]), reverse=True)
        for i in range(min(remaining, len(sorted_by_size))):
            idx = sorted_by_size[i][0]
            allocations[idx] = allocations.get(idx, 0) + 1
            remaining -= 1

    return allocations

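# Worked example for the allocator above (hypothetical chunk sizes): three
# chunks of 1000, 500, and 100 characters with total_slides=4 allocate
# {0: 2, 1: 1, 2: 1}: roughly proportional counts, with every valid chunk
# getting at least one slide.
def _demo_allocate_slides():
    return allocate_slides(["a" * 1000, "b" * 500, "c" * 100], total_slides=4)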


class SlideCollection:
    """Container class for slides with dict-like behavior"""
    def __init__(self):
        self.slides = {}
    
    def add_slide(self, key, content):
        self.slides[key] = content

    # Dict-style helpers so the collection can be iterated like a mapping:
    def keys(self):
        return self.slides.keys()

    def __iter__(self):
        return iter(self.slides)

    def __getitem__(self, key):
        return self.slides[key]
    
    def items(self):
        return self.slides.items()
    
    def values(self):
        return self.slides.values()
    
    def __repr__(self):
        return repr(self.slides)
    
def quick_json_fix(ai_message) -> str:
    # Extract text content first
    if hasattr(ai_message, 'content'):
        text = ai_message.content
    else:
        text = str(ai_message)
        
    text = text.strip()
    
    # Remove explanatory text
    if "Here's another attempt" in text or "I apologize" in text:
        start = text.find('{')
        end = text.rfind('}') + 1
        if start != -1 and end != 0:
            text = text[start:end]
    
    # Fix quote issues
    text = re.sub(r"'(\w+)\":", r'"\1":', text)
    
    # Remove "properties" wrapper
    try:
        parsed = json.loads(text)
        if "properties" in parsed:
            return json.dumps(parsed["properties"])
    except:
        pass
    
    return text
    
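# Hypothetical example of what quick_json_fix repairs: a response that wraps
# the JSON in an apology is reduced to the bare object string.
def _demo_quick_json_fix():
    msg = AIMessage(content='I apologize. {"question": "Q?", "correct_answer": "a"}')
    return quick_json_fix(msg)  # -> '{"question": "Q?", "correct_answer": "a"}'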
def generate_slide_content_alloc1(chunks, allocations, num_slides, num_mcqs, is_image, is_question, question_position, GPU):
    """
    Generate slide JSON for each chunk according to allocations, then merge.

    Args:
        chunks: list of text chunks (strings or Document-like).
        allocations: dict mapping chunk_index -> num_slides.
        num_slides, num_mcqs: totals used when interleaving MCQs.
        is_image, is_question, question_position, GPU: pipeline flags.

    Returns:
        A StorigoContent / StorigoContentMCQMid with all slides in order.
    """
    try:
        #llm = ChatGroq(model_name='llama-3.3-70b-versatile', groq_api_key=GROQ_API_KEY)
        # 1. Build the LangChain prompt/chain once
        slide_content_template_old = """
        Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

        Each slide must include:
        - A clear and concise **sub-heading**
        - **Paragraphs** that effectively communicate the key ideas and insights
        - A specific, concise **visualization suggestion**

        **Context**: {query}

        Focus on creating content that is both informative and engaging. Ensure each slide:
        - Has a well-structured sub-heading that captures the main point
        - Uses clear and concise paragraphs to communicate important information

        Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
        - **Thought-provoking questions** to encourage reflection
        - **Relevant statistics** or data points that add credibility
        - **Industry insights** or emerging trends to demonstrate expertise
        - **Practical examples** or case studies to illustrate key concepts
        - **Calls to action** to guide the audience toward specific actions or takeaways

        For the visualization suggestion:
        - Provide a clear and specific description of an image that would be relevant to the slide content.
        - Keep it very concise, using a maximum of 5 words.
        - Focus on concrete objects, scenes, or concepts that can be easily visualized.
        - Avoid abstract or overly complex ideas.
        - Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

        Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external information not found in the source material.

        {format_instructions}

        The final output **must** be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}", in **strict sequential order**.

        Each slide object should contain the following fields:
        - "subheading": A title of the slide.
        - "paragraphs": A list of concise paragraphs that communicate the main points of the slide.
        - "visualization_suggestion": A very specific, context-aware suggestion (no longer than 5 words).

        Please ensure the slides are generated in the correct order as defined in the embeddings or the document content.
        """
        slide_content_template = """
You are an expert presentation writer.

Your task is to generate exactly {{ num_slides }} professional, informative slides based on the provided context.

Each slide must follow this exact JSON format:
{{
  "slides": {{
    "slide_1": {{
      "type": "flash",
      "subheading": "Concise subheading",
      "paragraphs": [
        "Insightful paragraph 1",
        "Insightful paragraph 2 (optional)"
      ],
      "visualization_suggestion": "Max 5 words",
      "image": null
    }}
  }},
  "token_count": Integer (approximate)
}}

Rules:
- Return only the JSON.
- Do not ask for input.
- Do not include explanations.
- Only generate slide content using the context below.

### Context:
{{ chunk }}
"""
        parser = PydanticOutputParser(pydantic_object=StorigoContent)
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template_old)
        if GPU == 0:
            llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        else:
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                model="llama3:8b"
                #model = "deepseek-r1:8b"
            )
        # slide_content_chain_old = (
        #     {
        #         "context": lambda x: vectors.similarity_search(x["query"], k=3),
        #         "num_slides": lambda x: x["num_slides"],
        #         "format_instructions": lambda x: parser.get_format_instructions()
        #     }
        #     | slide_content_prompt
        #     | llm
        #     | parser
        # )
        slide_content_chain = (
            {
                "query": lambda x: x["query"],  # No vector store involved
                "num_slides": lambda x: x["num_slides"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_content_prompt
            | llm
            | parser
        )

        # 2. Iterate chunks in allocation order and collect
        #all_slides = {}
        all_slides = SlideCollection()
        counter = 1

        for chunk_idx in sorted(allocations):
            n = allocations[chunk_idx]
            # extract text
            chunk = chunks[chunk_idx]
            print("===========================================================")
            print(chunk)
            query = getattr(chunk, "page_content",
                            getattr(chunk, "content", str(chunk)))
            print("Query")
            print(query)
            print("Query end")
            # Invoke the chain for this chunk
            result = slide_content_chain.invoke({"query": query, "num_slides": n})
            raw = result.model_dump()
            slide_items = raw.get("slides", raw)

            # flatten into all_slides
            for slide_key in sorted(
                slide_items.keys(),
                key=lambda k: int(k.split("_")[1])
            ):
                all_slides.add_slide(f"slide_{counter}", slide_items[slide_key])
                counter += 1
        
        print("all_slides")
        print("slides_output\n", all_slides)
        print(type(all_slides))

        def custom_slide_sort_key(item_key: str):
            """
            Sort keys so that:
            - 'slide_1', 'slide_2', ..., 'slide_10' are in numeric order
            - 'mcq_1', 'mcq_2', ... come after slides in numeric order
            """
            prefix, num_str = item_key.split('_', 1)
            if prefix == "slide":
                group = 0
            elif prefix == "mcq":
                group = 1
            else:
                group = 999  # Fallback
            number = int(num_str) if num_str.isdigit() else 999
            return (group, number)

        # Sort the slides using the custom key
        ordered_slides = dict(
            sorted(all_slides.slides.items(), key=lambda kv: custom_slide_sort_key(kv[0]))
        )
        all_slides.slides = ordered_slides
        # Image handling logic
        if is_image:
            print(is_image)
            for slide_key, slide_content in all_slides.items():
                # Check if the slide has a visualization suggestion
                if slide_content.get('visualization_suggestion'):
                    image_url = get_valid_image_new(slide_content['visualization_suggestion'], slide_content)
                    print("ImageURL")
                    print(image_url)
                    if image_url:
                        slide_content['image'] = image_url  # Correctly modify the dictionary
                    else:
                        print(f"Warning: No suitable image found for slide {slide_key} after multiple attempts.")
                        slide_content['image'] = None
                else:
                    print(f"Warning: No visualization suggestion for slide {slide_key}.")
                    slide_content['image'] = None
        else:
            # If not an image slide, ensure the 'image' field is None
            for slide_content in all_slides.values():
                slide_content['image'] = None
        
        #print("all_slides")
        #print(all_slides)
        token_count = 0
        token_count_slides= 0
        for slide in all_slides.values():
            # Assuming slide is a dictionary
            text_content = f"{slide['subheading']} {' '.join(slide['paragraphs'])} {slide['visualization_suggestion']}"
            token_count += count_tokens(text_content)
            token_count_slides += count_tokens(text_content)
        # Calculate the total token count
        #mcqs = {}
        if is_question:
            print("MCQ Started")
            mcq_template = """
                Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

                **Context**: {context}

                The final output **must be valid JSON only**. Ensure all keys and string values are enclosed in double quotes and do not include any additional strings or explanations:
                

                Don't give any conversational like text, For example: "Here is the MCQ:". Just return JSON Output.
                Here is the pydantic class to validate the output that you give. Make sure the check always passes.

                class MCQContent(BaseModel):
                    type: str = Field("Question")
                    question: str = Field(..., description="The multiple-choice question")
                    options: List[str] = Field(..., description="A list of 4 answer options")
                    correct_answer: str = Field(..., description="The correct answer (e.g., 'a', 'b', 'c', or 'd')")
                
                CRITICAL: Return ONLY the JSON object with no additional text, explanations, or formatting.

                {format_instructions}

                JSON:"""

            mcq_output_parser_template = """
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

Context:
{context}

The final output MUST be a **valid JSON object**, and nothing else. Do not include any explanation or conversational text (e.g., "Here is your MCQ" or "Let's try again").

Output format example (use this exact structure):


{{
  "type": "Question",
  "question": "What is the capital of France?",
  "options": ["Berlin", "London", "Paris", "Madrid"],
  "correct_answer": "Paris"
}}


Ensure:
- All strings use double quotes
- No trailing commas
- Exactly 4 options
- Output must be enclosed in <json> ... </json> tags
- Only return the JSON block inside these tags
"""
            # Define a schema for the MCQ
                    
            mcq_parser = PydanticOutputParser(pydantic_object=MCQContent)
            
            mcq_prompt = ChatPromptTemplate.from_template(mcq_template)
            mcq_output_parser_template_prompt = ChatPromptTemplate.from_template(mcq_output_parser_template)
            # Generate MCQs
            mcqs = {}
            slide_keys = list(all_slides.keys())
            for i in range(0, len(slide_keys), 2):
                if len(mcqs) < num_mcqs:
                    context_slides = []
                    for j in range(i, min(i + 2, len(slide_keys))):
                        key = slide_keys[j]
                        slide = all_slides[key]  # slide is a dict
                        title = slide.get("subheading", "")
                        paras = slide.get("paragraphs", [])  # list of paragraph strings
                        context_slides.append(f"{title}: {' '.join(paras)}")

                    context_text = "\n".join(context_slides)
                    # Debug: show the context assembled for this MCQ
                    print("context_text")
                    print(context_text)
                    json_fixer = RunnableLambda(quick_json_fix)
                    # Create the OutputFixingParser
                    output_fixing_parser = OutputFixingParser.from_llm(
                        llm=llm,
                        parser=mcq_parser
                    )
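                    # Chain: prompt -> llm -> quick_json_fix (JSON cleanup helper
                    # defined earlier) -> OutputFixingParser, which re-asks the LLM
                    # to repair any output that fails MCQContent parsing.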
                    mcq_result = (
                        mcq_prompt
                        | llm
                        | json_fixer 
                        | output_fixing_parser  # This will automatically retry if parsing fails
                    ).invoke({"context": context_text,
                            "format_instructions": mcq_parser.get_format_instructions()})
                    print("======================================================================================")
                    mcqs[f"mcq_{len(mcqs) + 1}"] = mcq_result
            
            # Add MCQ tokens to the running total (mcq is a parsed MCQContent model)
            for mcq in mcqs.values():
                text_content = f"{mcq.question} {' '.join(mcq.options)} {mcq.correct_answer}"
                token_count += count_tokens(text_content)

            interleaved_content = {}
            mcq_counter = 0
            total_slides = num_slides
            total_mcqs = num_mcqs
            
            if question_position == 1:
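                # Spacing between MCQs; with the defaults below (29 slides,
                # 14 MCQs) this gives interval = 2, i.e. an MCQ after every
                # second slide.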
                interval = max(1, total_slides // total_mcqs) if total_mcqs > 0 else total_slides
                
                for idx, slide_key in enumerate(slide_keys):
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                    
                    if (idx + 1) % interval == 0 and mcq_counter < total_mcqs:
                        mcq_key = f"mcq_{mcq_counter + 1}"
                        print(mcq_key)
                        interleaved_content[mcq_key] = mcqs[mcq_key]
                        mcq_counter += 1
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count + token_count_slides)
                return storigo_content
            
            else:
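                # question_position != 1: emit all slides first, then append
                # the MCQs at the end of the deck.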
                for slide_key in slide_keys:
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                
                for mcq_counter in range(total_mcqs):
                    mcq_key = f"mcq_{mcq_counter + 1}"
                    if mcq_key in mcqs:
                        interleaved_content[mcq_key] = mcqs[mcq_key]
                    else:
                        print(f"[Warning] Skipping missing MCQ: {mcq_key}")
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count + token_count_slides)
                return storigo_content  
        else:
            storigo_content_without_mc = StorigoContent(slides=ordered_slides, token_count=token_count_slides)
            return storigo_content_without_mc

    except Exception as e:
        raise Exception(f"Error generating slide content: {e}") from e
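
# A minimal, hypothetical usage sketch (not part of the original pipeline): it
# shows one way the Pydantic model returned above could be persisted. The
# function name and default path are assumptions for illustration only.
def save_storigo_content(content, path="storigo_output.json"):
    """Serialize a StorigoContent / StorigoContentMCQMid model to a JSON file."""
    # Pydantic v2 exposes model_dump(); fall back to dict() on v1.
    data = content.model_dump() if hasattr(content, "model_dump") else content.dict()
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)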

def main(input, num_slides, num_mcqs, is_image, is_question, question_position, GPU):
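    # Alternative pipeline (disabled): parse a local document, split it on
    # markdown headers, and allocate slides per chunk.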
    # parse_data = parsing(input)
    # with open("parse_data.md", "w", encoding="utf-8") as f:
    #     f.write(parse_data)
    # headers_to_split_on = [
    # ("#", "Header 1"),
    # ("##", "Header 2"),
    # ("###", "Header 3"),
    # ("####", "Header 4")
    # ] 
    # with open("parse_data.md", "r", encoding="utf-8") as f:
    #     content = f.read()
    # chunks = marks_splitter(headers_to_split_on, content)
    # print("Chunks")
    # allocation = allocate_slides(chunks, num_slides, min_chars=100)
    # print(f"Total chunks: {len(chunks)}")
    # print(f"Valid chunks: {len(allocation)}")
    # print("\nSlide allocation:")
    # slide_content = generate_slide_content_alloc1(chunks,allocation,num_slides,num_mcqs, is_image, is_question, question_position,GPU)
    # print(slide_content)
    
    # HTML / YouTube
    #file_path = transcribe(input)
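    # transcribe_with_retry is the retry wrapper (defined earlier in this file)
    # around the transcript fetch.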
    file_path = transcribe_with_retry(input)
    # file_path = await crawlerrr(input)
    print("Transcribe Complete")
    print("----------------------------------------------------")
    print(file_path)
    embeddings = OllamaEmbeddings(model='nomic-embed-text')
    print("Embedding Initialize")
    text = load_txt(file_path)
    print(text)
    print("Txt Loaded Complete")
    # text = read_file_url(file_path)
    meaningful_content = clean_using_llm(text)
    print("Extracting Meaningful Content Complete")
    print(meaningful_content)
    print("SLides started")
    # split_documents = split_text_with_semantic_chunker_for_url(meaningful_content, embeddings)
    # print("Spliting Complete")
    slide = generate_slide_content_seperate(text, num_slides, num_mcqs, is_image, is_question, question_position, GPU)
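    # Note: the raw transcript (text) is passed here; meaningful_content from
    # clean_using_llm above is not used by this call.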
    print(slide)
    # create_and_save_embeddings(split_documents, client_id)
    # print("Embedding Saved")
    # merge_embeddings = merge_all_faiss(client_id)
    # print("Embedding Merged")
    # #slide_content = generate_slide_content_old(merge_embeddings,client_id, num_slides,num_mcqs, is_image, is_question, question_position,GPU)
    # slide_content = generate_slide_content_new(merge_embeddings, client_id, num_slides,num_mcqs, is_image, is_question, question_position, GPU)
    # print(slide_content)


if __name__ == "__main__":
    # Run the pipeline synchronously (the asyncio entry point below is disabled)
    #input = 'CleanMax_Code_Of_Conduct_Supplier.pdf'
    #input = 'https://edurigo.com/'
    input = 'https://www.youtube.com/watch?v=HsyYnY674sE'
    output_dir = f'temp/{input}'
    embedding_folder_base = 'output_embeddings'
    
    client_id = 508
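    # output_dir, embedding_folder_base and client_id are only needed by the
    # disabled embedding pipeline; the current main() does not take them.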
    num_slides = 29
    is_image = True
    num_mcqs = 14
    is_question = True
    question_position = 1
    GPU = 1
    #main(input, output_dir,client_id, is_image) 
    main(input, num_slides, num_mcqs, is_image, is_question, question_position, GPU)
    #asyncio.run(main(input, num_slides, num_mcqs, is_image, is_question, question_position, GPU))