import os
import time
import random
import re
import json
from jsonschema import validate, ValidationError
from langchain_core.messages import AIMessage
import requests
import numpy as np
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from crawl4ai import AsyncWebCrawler
import asyncio
from langchain.output_parsers import OutputFixingParser
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader, UnstructuredPowerPointLoader
from PyPDF2 import PdfReader
from langchain_experimental.text_splitter import SemanticChunker
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Union
from urllib.parse import urlparse
from langchain_core.runnables import RunnableLambda
import shutil
from pptx import Presentation
from langchain_ollama import OllamaLLM
from langchain_ollama import ChatOllama
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from pathlib import Path
from langchain_text_splitters import MarkdownHeaderTextSplitter

# API keys are read from the environment so secrets are never committed to source control.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
OLLAMA_MODEL = "nomic-embed-text"
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "")



class SlideContent(BaseModel):
    type: str = Field("flash")
    subheading: Optional[str] = Field(None, description="An optional subheading for the slide")
    paragraphs: List[str] = Field(..., description="List of paragraphs for the slide content")
    visualization_suggestion: str = Field(..., description="A specific and concise suggestion for a relevant visualization or image (max 5 words)")
    image: Optional[str] = Field(None, description="URL of the image for the slide")

class MCQContent(BaseModel):
    type: str = Field("Question")
    question: str = Field(..., description="The multiple-choice question")
    options: List[str] = Field(..., description="A list of 4 answer options")
    correct_answer: str = Field(..., description="The correct answer (e.g., 'a', 'b', 'c', or 'd')")

class StorigoContent(BaseModel):
    slides: Dict[str, SlideContent] = Field(..., description="Dictionary of slide contents with slide numbers as keys")
    token_count: int = Field(..., description="Total token count for all the generated content")


class StorigoContentMCQ(BaseModel):
    slides: Dict[str, SlideContent] = Field(..., description="Dictionary of slide contents with slide numbers as keys")
    mcqs: Dict[str, MCQContent] = Field(..., description="Dictionary of MCQs with identifiers like 'mcq_1' as keys")
    token_count: int = Field(..., description="Total token count for all the generated content")



class StorigoContentMCQMid(BaseModel):
    slides: Dict[str, Union[SlideContent, MCQContent]] = Field(..., description="Dictionary of slide contents keyed by slide number, interleaved with MCQs keyed by MCQ number")
    token_count: int = Field(..., description="Total token count for all the generated content")
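
# Example (illustrative only): constructing and serializing the models above.
# The values shown are placeholders, not output from any real generation run.
#
#   slide = SlideContent(
#       subheading="Why encryption matters",
#       paragraphs=["Encryption protects data at rest and in transit."],
#       visualization_suggestion="padlock over binary code",
#   )
#   mcq = MCQContent(
#       question="What does encryption primarily provide?",
#       options=["Speed", "Confidentiality", "Compression", "Caching"],
#       correct_answer="b",
#   )
#   content = StorigoContentMCQMid(slides={"slide_1": slide, "mcq_1": mcq}, token_count=0)
#   print(content.model_dump_json(indent=2))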

class CustomMCQParser(PydanticOutputParser):
    """A lenient parser that cleans common LLM formatting mistakes before validating.

    Note: the single-to-double quote replacement below is a crude heuristic and
    will corrupt apostrophes inside option text (e.g. "user's"); it is kept only
    because the upstream models frequently emit single-quoted pseudo-JSON.
    """

    def parse_result(self, result, *, partial=False):
        # Step 1: Normalize the input to a single string. LangChain may pass a
        # list of Generation objects, so prefer their .text attribute.
        if isinstance(result, list):
            result = " ".join(getattr(item, "text", str(item)) for item in result)

        # Step 2: Convert single quotes to double quotes (see caveat above)
        result = result.replace("'", "\"")

        # Step 3: Remove known preamble text the models tend to prepend
        result = result.replace("Here is the MCQ:", "").strip()

        # Step 4: Parse the cleaned output as JSON, then validate with Pydantic
        try:
            json_object = json.loads(result)
            return self.pydantic_object.model_validate(json_object)
        except json.JSONDecodeError as e:
            raise Exception(f"Error decoding JSON: {str(e)}")
        except Exception as e:
            raise Exception(f"Error parsing result: {str(e)}")
        
        
def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ""
        return text
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {str(e)}")

# def create_embeddings(text, client_id):
#     try:
#         embeddings = OllamaEmbeddings(model=OLLAMA_MODEL)
#         vectors = FAISS.from_texts([text], embeddings)
        
#         client_dir = f"my_embeddings/{client_id}"
#         os.makedirs(client_dir, exist_ok=True)
#         vectors.save_local(client_dir)
        
#         return vectors
#     except Exception as e:
#         raise Exception(f"Error creating embeddings: {str(e)}")

def _build_search_query(visualization_suggestion, context_keywords):
    """Combine the suggestion with context keywords, drop stop words,
    prioritize the suggestion's own words, and keep the first five."""
    combined_query = f"{visualization_suggestion} {' '.join(context_keywords)}"

    # Extract key words from the combined query
    words = re.findall(r'\w+', combined_query.lower())

    # Remove common words
    common_words = set(['and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
    filtered_words = [word for word in words if word not in common_words]

    # Prioritize words from the visualization suggestion
    suggestion_words = visualization_suggestion.lower().split()
    prioritized_words = suggestion_words + [word for word in filtered_words if word not in suggestion_words]

    # Take at most the first 5 words (slicing already handles shorter lists)
    return " ".join(prioritized_words[:5])

def generate_search_query(visualization_suggestion, slide_content):
    """Build an image-search query from a SlideContent model."""
    return _build_search_query(visualization_suggestion, extract_context_keywords(slide_content))

def generate_search_query_new(visualization_suggestion, slide_content):
    """Build an image-search query from a plain slide dict."""
    return _build_search_query(visualization_suggestion, extract_context_keywords_new(slide_content))

def extract_context_keywords(slide_content):
    # Extract keywords from a SlideContent model to provide context
    text = f"{slide_content.subheading or ''} {' '.join(slide_content.paragraphs)}"
    words = re.findall(r'\w+', text.lower())
    common_words = set(['and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
    keywords = [word for word in words if word not in common_words]
    return list(set(keywords))[:3]  # Up to 3 unique keywords (set order is arbitrary)

def extract_context_keywords_new(slide_content):
    # Same as above, but for a plain dict slide instead of a Pydantic model
    text = f"{slide_content.get('subheading', '')} {' '.join(slide_content.get('paragraphs', []))}"
    words = re.findall(r'\w+', text.lower())
    common_words = set(['and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
    keywords = [word for word in words if word not in common_words]
    return list(set(keywords))[:3]  # Up to 3 unique keywords (set order is arbitrary)

def fetch_pixabay_image(query):
    url = "https://pixabay.com/api/"
    params = {
        "key": PIXABAY_API_KEY,
        "q": query,
        "image_type": "photo",
        "orientation": "horizontal",
        "per_page": 5,  # Fetch top 5 images
        "safesearch": "true",
        "order": "relevance"
    }
    
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        if data["hits"]:
            # Pixabay does not expose a relevance score, so rank by a simple
            # popularity proxy: likes plus downloads.
            sorted_hits = sorted(data["hits"], key=lambda x: x["likes"] + x["downloads"], reverse=True)
            return sorted_hits[0]["webformatURL"]  # Return the top-ranked image
        else:
            print(f"No image found for query: {query}")
            return None
    except requests.RequestException as e:
        print(f"Error fetching image from Pixabay: {str(e)}")
        return None
    except Exception as e:
        print(f"Unexpected error in fetch_pixabay_image: {str(e)}")
        return None
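
# Example usage (requires PIXABAY_API_KEY to be set in the environment):
#
#   url = fetch_pixabay_image("python programming logo")
#   print(url)  # webformatURL of the top-ranked hit, or None on failure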

def get_valid_image(visualization_suggestion, slide_content, max_attempts=3):
    if not visualization_suggestion:
        print("No visualization suggestion provided.")
        return None

    for attempt in range(max_attempts):
        try:
            query = generate_search_query(visualization_suggestion, slide_content)
            print(f"Attempt {attempt + 1} to fetch image for query: {query}")
            image_url = fetch_pixabay_image(query)
            
            if image_url:
                print(f"Valid image found: {image_url}")
                return image_url
            else:
                print(f"No image URL returned for query: {query}")
            
            time.sleep(1)
        except Exception as e:
            print(f"Error in get_valid_image (attempt {attempt + 1}): {str(e)}")
    
    print(f"No valid image found after {max_attempts} attempts")
    return None

def get_valid_image_new(visualization_suggestion, slide_content, max_attempts=3):
    if not visualization_suggestion:
        print("No visualization suggestion provided.")
        return None

    for attempt in range(max_attempts):
        try:
            query = generate_search_query_new(visualization_suggestion, slide_content)
            print(f"Attempt {attempt + 1} to fetch image for query: {query}")
            image_url = fetch_pixabay_image(query)
            
            if image_url:
                print(f"Valid image found: {image_url}")
                return image_url
            else:
                print(f"No image URL returned for query: {query}")
            
            time.sleep(1)
        except Exception as e:
            print(f"Error in get_valid_image (attempt {attempt + 1}): {str(e)}")
    
    print(f"No valid image found after {max_attempts} attempts")
    return None

def count_tokens(text):
    # Approximate the token count by counting word-like chunks; this is a
    # rough proxy, not the model's real tokenizer.
    tokens = re.findall(r'\w+', text)
    return len(tokens)
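
# If a closer-to-real count is ever needed, a tokenizer-backed variant could look
# like this sketch (assumes the optional `tiktoken` package is installed; the
# encoding name is an assumption and should match the target model):
#
#   import tiktoken
#
#   def count_tokens_exact(text, encoding_name="cl100k_base"):
#       enc = tiktoken.get_encoding(encoding_name)
#       return len(enc.encode(text))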



def generate_slide_content_new_1(vectors, num_slides, num_mcqs, is_image, is_question, question_position):
    """Generate slides chunk-by-chunk from the vector store.

    Only the slide path is implemented here; the MCQ/image parameters are
    accepted for interface parity and currently ignored.
    """
    try:
        # 1. Instantiate your LLM
        llm = ChatOllama(
            base_url='http://127.0.0.1:11434',
            model="llama3:8b"
        )

        # 2. Your prompt template (unchanged)
        slide_content_template = """
        Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

        Each slide must include:
        - A clear and concise **sub-heading**
        - **Paragraphs** that effectively communicate the key ideas and insights
        - A specific, concise **visualization suggestion**

        **Context**: {context}

        {format_instructions}

        The final output **must** be a valid JSON object with keys
        `"slide_1"`, `"slide_2"`, …, `"slide_{num_slides}"` in strict sequential order.

        Each slide object should contain:
        - `"subheading"`
        - `"paragraphs"` (list of strings)
        - `"visualization_suggestion"`
        """

        # 3. Prepare parser & prompt
        parser = PydanticOutputParser(pydantic_object=StorigoContent)
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)

        # 4. Build the reusable chain
        def make_chain():
            return (
                {
                    "context": lambda x: x["context"],
                    "num_slides": lambda x: x["num_slides"],
                    "format_instructions": lambda x: parser.get_format_instructions()
                }
                | slide_content_prompt
                | llm
                | parser
            )

        # 5. Extract all chunks
        docs = list(vectors.docstore._dict.values())
        if not docs:
            raise ValueError("No chunks available in the vector store.")

        # 6. Loop over every chunk, generate slides, and merge
        all_slides = {}
        token_count = 0
        global_idx = 1

        def _slide_number(key):
            # "slide_12" -> 12; sort numerically so slide_10 follows slide_9
            _, _, num = key.partition('_')
            return int(num) if num.isdigit() else 999

        for doc in docs:
            chain = make_chain()
            result: StorigoContent = chain.invoke({
                "context": doc.page_content,
                "num_slides": num_slides
            })
            # result.slides is {"slide_1": {...}, ..., "slide_n": {...}}
            for _, slide_obj in sorted(result.slides.items(), key=lambda kv: _slide_number(kv[0])):
                all_slides[f"slide_{global_idx}"] = slide_obj
                text_content = f"{slide_obj.subheading} {' '.join(slide_obj.paragraphs)} {slide_obj.visualization_suggestion}"
                token_count += count_tokens(text_content)
                global_idx += 1

        # 7. Return a single Pydantic model containing all slides
        # (token_count is a required field on StorigoContent)
        return StorigoContent(slides=all_slides, token_count=token_count)

    except Exception as e:
        print(f"Error generating slides: {e}")
        return None
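
# Example usage (a sketch; assumes a local Ollama server with the llama3:8b and
# nomic-embed-text models pulled):
#
#   embeddings = OllamaEmbeddings(model=OLLAMA_MODEL)
#   vectors = FAISS.from_texts(["Encryption protects data in transit."], embeddings)
#   content = generate_slide_content_new_1(
#       vectors, num_slides=2, num_mcqs=0,
#       is_image=False, is_question=False, question_position=0,
#   )
#   if content:
#       print(content.model_dump_json(indent=2))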

def generate_slide_content_new(vectors, num_slides,num_mcqs, is_image, is_question, question_position):
    try:
        #llm = ChatGroq(model_name='llama-3.3-70b-versatile', groq_api_key=GROQ_API_KEY)
        llm = ChatOllama(
            base_url = 'http://127.0.0.1:11434',
            model = "llama3:8b"
            #model = "deepseek-r1:8b"
        )

        # Prompt template
        slide_content_template = """
        Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.
        
        Each slide must include:
        - A clear and concise **sub-heading**
        - **Paragraphs** that effectively communicate the key ideas and insights
        - A specific, concise **visualization suggestion**

        **Context**: {context}

        Focus on creating content that is both informative and engaging. Ensure each slide:
        - Has a well-structured subheading that captures the main point
        - Uses clear and concise paragraphs to communicate important information

        Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
        - **Thought-provoking questions** to encourage reflection
        - **Relevant statistics** or data points that add credibility
        - **Industry insights** or emerging trends to demonstrate expertise
        - **Practical examples** or case studies to illustrate key concepts
        - **Calls to action** to guide the audience toward specific actions or takeaways

        For the visualization suggestion:
        - Provide a clear and specific description of an image that would be relevant to the slide content.
        - Keep it very concise, using a maximum of 5 words.
        - Focus on concrete objects, scenes, or concepts that can be easily visualized.
        - Avoid abstract or overly complex ideas.
        - Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

        Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external information not found in the source material, and avoid filler phrases like
        "In this video".

        {format_instructions}

        The final output **must** be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}", in **strict sequential order**.

        Each slide object should contain the following fields:
       
        - "subheading" (if applicable): A title of the slide.
        - "paragraphs": A list of concise paragraphs that communicate the main points of the slide.
        -"visualization_suggestion":visualization_suggestion for each slide should be very specific, context-aware, and no longer than 5 words.
        
        Please ensure the slides are generated in the correct order as defined in the embeddings or the document content.
        """
        # - "heading": The main title of the slide.
        # Using Pydantic parser for output formatting
        parser = PydanticOutputParser(pydantic_object=StorigoContent)
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)
        # Creating the chain
        slide_content_chain = (
            {
                "context": lambda x: vectors.similarity_search(x["query"], k=3),
                "num_slides": lambda x: x["num_slides"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_content_prompt
            | llm
            
            | parser
        )

        # Invoke the chain with the query and number of slides
        result = slide_content_chain.invoke({"query": "", "num_slides": num_slides})

        # Sort result slides to ensure they are in the correct order (by slide number)
        #ordered_slides = dict(sorted(result.slides.items(), key=lambda item: item[0]))
        def custom_slide_sort_key(item_key: str):
            """
            Sort keys so that:
            - 'slide_1', 'slide_2', ..., 'slide_10' are in numeric order
            - 'mcq_1', 'mcq_2', ... come after slides in numeric order
            """
            prefix, num_str = item_key.split('_', 1)
            if prefix == "slide":
                group = 0
            elif prefix == "mcq":
                group = 1
            else:
                group = 999  # Fallback
            number = int(num_str) if num_str.isdigit() else 999
            return (group, number)

        # Sort the slides using the custom key
        ordered_slides = dict(
            sorted(result.slides.items(), key=lambda kv: custom_slide_sort_key(kv[0]))
        )


        # Image handling logic
        if is_image:
            for slide_key, slide_content in ordered_slides.items():
                if slide_content.visualization_suggestion:
                    image_url = get_valid_image(slide_content.visualization_suggestion, slide_content)
                    if image_url:
                        slide_content.image = image_url
                    else:
                        print(f"Warning: No suitable image found for slide {slide_key} after multiple attempts.")
                        slide_content.image = None
                else:
                    print(f"Warning: No visualization suggestion for slide {slide_key}.")
                    slide_content.image = None
        else:
            # If not an image slide, ensure the 'image' field is None
            for slide_content in ordered_slides.values():
                slide_content.image = None
        
        # Calculate the total token count across all slides
        token_count = 0
        for slide in ordered_slides.values():
            text_content = f"{slide.subheading} {' '.join(slide.paragraphs)} {slide.visualization_suggestion}"
            token_count += count_tokens(text_content)

        if is_question:
            mcq_template = """
            Based on the following context from the last two slides, generate one multiple-choice question (MCQ). 

            **Context**: {context}

            The MCQ should include:
            - A **question** relevant to the context  
            - Exactly **4 answer options** as a list  
            - The correct option letter (e.g., 'a', 'b', 'c', or 'd')

            The final output must be **strictly valid JSON** — no explanations, headers, or additional text.

            Example output:
            {{
                "question": "What is the primary benefit of using encryption in data security?",
                "options": [
                    "It makes data faster to process",
                    "It prevents unauthorized access",
                    "It reduces storage requirements",
                    "It improves internet speed"
                ],
                "correct_answer": "b"
            }}
            """

            mcq_parser = PydanticOutputParser(pydantic_object=MCQContent) 
            mcq_prompt = ChatPromptTemplate.from_template(mcq_template)
            # Generate MCQs
            mcqs = {}
            slide_keys = list(ordered_slides.keys())
            for i in range(0, len(slide_keys), 2):
                if len(mcqs) < num_mcqs:
                    context_slides = []
                    for j in range(i, min(i + 2, len(slide_keys))):
                        slide = ordered_slides[slide_keys[j]]
                        context_slides.append(f"{slide.subheading}: {' '.join(slide.paragraphs)}")

                    context_text = "\n".join(context_slides)

                    mcq_result = (
                        RunnableLambda(lambda x: {"context": context_text})
                        | mcq_prompt
                        | llm
                        | mcq_parser
                    ).invoke({})

                    mcqs[f"mcq_{len(mcqs) + 1}"] = mcq_result

            # Add MCQ text to the running token count; log the last raw output if any
            if mcqs:
                print("Generated MCQ Raw Output:", mcq_result)
            for mcq in mcqs.values():
                text_content = f"{mcq.question} {' '.join(mcq.options)} {mcq.correct_answer}"
                token_count += count_tokens(text_content)

            # Assemble the end-position variant (slides first, MCQs after); it is
            # superseded below when question_position asks for interleaving.
            storigo_content = StorigoContentMCQ(slides=ordered_slides, mcqs=mcqs, token_count=token_count)

            # Interleave slides and MCQs for display
            interleaved_content = {}
            mcq_counter = 0
            total_slides = num_slides
            total_mcqs = num_mcqs

            # Calculate interval for inserting MCQs
            if question_position == 1:
                # max(1, ...) guards against a zero interval when there are more MCQs than slides
                if total_mcqs > 0:
                    interval = max(1, total_slides // total_mcqs)
                else:
                    interval = total_slides  # No MCQs to insert

                for idx, slide_key in enumerate(slide_keys):
                    interleaved_content[slide_key] = ordered_slides[slide_key]

                    # Insert an MCQ based on the calculated interval
                    if (idx + 1) % interval == 0 and mcq_counter < total_mcqs:
                        mcq_key = f"mcq_{mcq_counter + 1}"
                        interleaved_content[mcq_key] = mcqs[mcq_key]
                        mcq_counter += 1
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                    #return interleaved_content 
                return storigo_content
            else:
                for slide_key in slide_keys:
                    interleaved_content[slide_key] = ordered_slides[slide_key]

                # Add all MCQs at the end
                for mcq_counter in range(total_mcqs):
                    mcq_key = f"mcq_{mcq_counter + 1}"
                    interleaved_content[mcq_key] = mcqs[mcq_key]

                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
        else:
            # No MCQs requested: return the slides alone
            return StorigoContent(slides=ordered_slides, token_count=token_count)

    except Exception as e:
        raise Exception(f"Error generating slide content: {str(e)}")
    

def generate_slide_content_old(vectors, num_slides,num_mcqs, is_image, is_question, question_position):
    try:
        #llm = ChatGroq(model_name='llama-3.3-70b-versatile', groq_api_key=GROQ_API_KEY)
        llm = ChatOllama(
            base_url = 'http://127.0.0.1:11434',
            model = "llama3:8b"
            #model = "deepseek-r1:8b"
        )
        # Prompt template
        slide_content_template = """
        Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.
        
        Each slide must include:
        - A clear and concise **sub-heading**
        - **Paragraphs** that effectively communicate the key ideas and insights
        - A specific, concise **visualization suggestion**

        **Context**: {context}

        Focus on creating content that is both informative and engaging. Ensure each slide:
        - Has a well-structured subheading that captures the main point
        - Uses clear and concise paragraphs to communicate important information

        Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
        - **Thought-provoking questions** to encourage reflection
        - **Relevant statistics** or data points that add credibility
        - **Industry insights** or emerging trends to demonstrate expertise
        - **Practical examples** or case studies to illustrate key concepts
        - **Calls to action** to guide the audience toward specific actions or takeaways

        For the visualization suggestion:
        - Provide a clear and specific description of an image that would be relevant to the slide content.
        - Keep it very concise, using a maximum of 5 words.
        - Focus on concrete objects, scenes, or concepts that can be easily visualized.
        - Avoid abstract or overly complex ideas.
        - Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

        Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external information not found in the source material, and avoid filler phrases like
        "In this video".

        {format_instructions}

        The final output **must** be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}", in **strict sequential order**.

        Each slide object should contain the following fields:
       
        - "subheading" (if applicable): A title of the slide.
        - "paragraphs": A list of concise paragraphs that communicate the main points of the slide.
        -"visualization_suggestion":visualization_suggestion for each slide should be very specific, context-aware, and no longer than 5 words.
        
        Please ensure the slides are generated in the correct order as defined in the embeddings or the document content.
        """
        # - "heading": The main title of the slide.
        # Using Pydantic parser for output formatting
        parser = PydanticOutputParser(pydantic_object=StorigoContent)
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)
        
        # Creating the chain
        slide_content_chain = (
            {
                "context": lambda x: vectors.similarity_search(x["query"], k=3),
                "num_slides": lambda x: x["num_slides"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_content_prompt
            | llm
            
            | parser
        )

        # Invoke the chain with the query and number of slides
        result = slide_content_chain.invoke({"query": "", "num_slides": num_slides})

        # Sort result slides to ensure they are in the correct order (by slide number)
        #ordered_slides = dict(sorted(result.slides.items(), key=lambda item: item[0]))
        def custom_slide_sort_key(item_key: str):
            """
            Sort keys so that:
            - 'slide_1', 'slide_2', ..., 'slide_10' are in numeric order
            - 'mcq_1', 'mcq_2', ... come after slides in numeric order
            """
            prefix, num_str = item_key.split('_', 1)
            if prefix == "slide":
                group = 0
            elif prefix == "mcq":
                group = 1
            else:
                group = 999  # Fallback
            number = int(num_str) if num_str.isdigit() else 999
            return (group, number)
        print("result")
        print(result)
        # Sort the slides using the custom key
        ordered_slides = dict(
            sorted(result.slides.items(), key=lambda kv: custom_slide_sort_key(kv[0]))
        )
        print("ordered_slides")
        print(ordered_slides)

        # Image handling logic
        if is_image:
            for slide_key, slide_content in ordered_slides.items():
                if slide_content.visualization_suggestion:
                    image_url = get_valid_image(slide_content.visualization_suggestion, slide_content)
                    if image_url:
                        slide_content.image = image_url
                    else:
                        print(f"Warning: No suitable image found for slide {slide_key} after multiple attempts.")
                        slide_content.image = None
                else:
                    print(f"Warning: No visualization suggestion for slide {slide_key}.")
                    slide_content.image = None
        else:
            # If not an image slide, ensure the 'image' field is None
            for slide_content in ordered_slides.values():
                slide_content.image = None
        
        token_count = 0
        for slide in ordered_slides.values():
            text_content = f"{slide.subheading} {' '.join(slide.paragraphs)} {slide.visualization_suggestion}"
            token_count += count_tokens(text_content)
        
        
        if is_question:
            mcq_template = """
                Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

                **Context**: {context}

                The MCQ should include:
                - A **question** related to the context
                - Exactly **4 answer options** (a, b, c, d)
                - A clear indication of the **correct answer**

                The final output **must be valid JSON only** with no additional text or formatting. Do not prefix the options with letters such as 'a', 'b', 'c', or 'd'. Ensure the response is strictly in this format:

                {{
                "question": "<The MCQ question>",
                "options": [
                    "Option 1",
                    "Option 2",
                    "Option 3",
                    "Option 4"
                ],
                "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"
                }}
                """

            mcq_parser = PydanticOutputParser(pydantic_object=MCQContent) 
            mcq_prompt = ChatPromptTemplate.from_template(mcq_template)
            # Generate MCQs
            mcqs = {}
            slide_keys = list(ordered_slides.keys())
            for i in range(0, len(slide_keys), 2):
                if len(mcqs) < num_mcqs:
                    context_slides = []
                    for j in range(i, min(i + 2, len(slide_keys))):
                        slide = ordered_slides[slide_keys[j]]
                        context_slides.append(f"{slide.subheading}: {' '.join(slide.paragraphs)}")

                    context_text = "\n".join(context_slides)

                    mcq_result = (
                        RunnableLambda(lambda x: {"context": context_text})
                        | mcq_prompt
                        | llm
                        | mcq_parser
                    ).invoke({})

                    mcqs[f"mcq_{len(mcqs) + 1}"] = mcq_result

            # Calculate token count
            
            for mcq in mcqs.values():
                text_content = f"{mcq.question} {' '.join(mcq.options)} {mcq.correct_answer}"
                token_count += count_tokens(text_content)

            # Assemble the end-position variant (slides first, MCQs after); it is
            # superseded below when question_position asks for interleaving.
            storigo_content = StorigoContentMCQ(slides=ordered_slides, mcqs=mcqs, token_count=token_count)

            # Interleave slides and MCQs for display
            interleaved_content = {}
            mcq_counter = 0
            total_slides = num_slides
            total_mcqs = num_mcqs

            # Calculate interval for inserting MCQs
            if question_position == 1:
                # max(1, ...) guards against a zero interval when there are more MCQs than slides
                if total_mcqs > 0:
                    interval = max(1, total_slides // total_mcqs)
                else:
                    interval = total_slides  # No MCQs to insert

                for idx, slide_key in enumerate(slide_keys):
                    interleaved_content[slide_key] = ordered_slides[slide_key]

                    # Insert an MCQ based on the calculated interval
                    if (idx + 1) % interval == 0 and mcq_counter < total_mcqs:
                        mcq_key = f"mcq_{mcq_counter + 1}"
                        interleaved_content[mcq_key] = mcqs[mcq_key]
                        mcq_counter += 1
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                    #return interleaved_content 
                return storigo_content
            else:
                for slide_key in slide_keys:
                    interleaved_content[slide_key] = ordered_slides[slide_key]

                # Add all MCQs at the end
                for mcq_counter in range(total_mcqs):
                    mcq_key = f"mcq_{mcq_counter + 1}"
                    interleaved_content[mcq_key] = mcqs[mcq_key]

                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
                
        else:
            storigo_content_without_mc = StorigoContent(slides=ordered_slides, token_count=token_count)
            return storigo_content_without_mc

    except Exception as e:
        raise Exception(f"Error generating slide content: {str(e)}")


def generate_slide_content(vectors,client_id, num_slides,num_mcqs, is_image, is_question, question_position, GPU):
    try:
        # GPU == 0 selects the hosted Groq model; otherwise use a local Ollama model
        if GPU == 0:
            llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        else:
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                model="gemma3:12b"
            )

        # Load all chunks from vector store
        all_chunks = list(vectors.docstore._dict.values())
        print(f"Loaded {len(all_chunks)} chunks from the vector store")
        total_chunks = len(all_chunks)

        # Step 1: Estimate total possible slides
        def estimate_slides(text, words_per_slide=50):
            return max(1, round(len(text.split()) / words_per_slide))
        

        total_possible = sum(estimate_slides(doc.page_content) for doc in all_chunks)
        print(f"🧠 Estimated possible slides from document: {total_possible}")

        # Step 2: Select chunks for context
        selected_chunks = []

        if num_slides <= total_chunks:
            print("📊 Scenario 1: Fewer slides than chunks — sampling evenly.")
            indices = np.linspace(0, total_chunks - 1, num=num_slides, dtype=int)
            selected_chunks = [all_chunks[i] for i in indices]
        else:
            print("📊 Scenario 2: More slides than chunks — distributing proportionally.")
            word_counts = [len(c.page_content.split()) for c in all_chunks]
            total_words = sum(word_counts)
            raw_alloc = [w / total_words * num_slides for w in word_counts]
            alloc = [max(1, round(x)) for x in raw_alloc]

            # Adjust total to exactly num_slides
            diff = num_slides - sum(alloc)
            i = 0
            while diff != 0:
                if diff > 0:
                    alloc[i] += 1
                    diff -= 1
                elif diff < 0 and alloc[i] > 1:
                    alloc[i] -= 1
                    diff += 1
                i = (i + 1) % total_chunks

            # Duplicate chunks proportionally
            for chunk, count in zip(all_chunks, alloc):
                selected_chunks.extend([chunk] * count)

        context_text = "\n\n".join(doc.page_content for doc in selected_chunks[:num_slides])
        print("Context text is ")
        print(context_text)
        # Prompt template
        slide_content_template = """
Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

STRICT RULES:
- ❗ ONLY use the content provided in the 'context' below.
- ❌ DO NOT introduce any external knowledge, definitions, or examples not present in the context.
- ⚠️ Do not assume common sense or use general facts. Stick to the exact information given.
- ⚠️ Avoid generic phrases like “as we know”, “in general”, or “in this video”.

Each slide must include:
- A clear and concise **sub-heading**
- **Exactly 2–4 concise paragraphs** derived solely from the context
- A **visualization suggestion** (max 5 words, specific to the content)

Important: Only output the final JSON object. No additional text, markdown, or explanation should be included.

Context:
{context}

{format_instructions}

The final output must be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}".
Each slide must contain:
- "subheading"
- "paragraphs"
- "visualization_suggestion"
"""

        # - "heading": The main title of the slide.
        # Using Pydantic parser for output formatting
        raw_parser = PydanticOutputParser(pydantic_object=StorigoContent)
        parser = OutputFixingParser.from_llm(parser=raw_parser, llm=llm)    
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)
        
        # Creating the chain
        slide_content_chain = (
            {
                #"context": lambda x: vectors.similarity_search(x["query"], k=3),
                #"context": lambda x: "\n\n".join([doc.page_content for doc in vectors.docstore._dict.values()]),
                "context": lambda x: context_text,
                "num_slides": lambda x: x["num_slides"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_content_prompt
            | llm
            
            | parser
        )

        # Invoke the chain with the query and number of slides
        result = slide_content_chain.invoke({"query": "", "num_slides": num_slides})

        # Sort result slides to ensure they are in the correct order (by slide number)
        #ordered_slides = dict(sorted(result.slides.items(), key=lambda item: item[0]))
        def custom_slide_sort_key(item_key: str):
            """
            Sort keys so that:
            - 'slide_1', 'slide_2', ..., 'slide_10' are in numeric order
            - 'mcq_1', 'mcq_2', ... come after slides in numeric order
            """
            prefix, num_str = item_key.split('_', 1)
            if prefix == "slide":
                group = 0
            elif prefix == "mcq":
                group = 1
            else:
                group = 999  # Fallback
            number = int(num_str) if num_str.isdigit() else 999
            return (group, number)

        # Sort the slides using the custom key
        ordered_slides = dict(
            sorted(result.slides.items(), key=lambda kv: custom_slide_sort_key(kv[0]))
        )


        # Image handling logic
        if is_image:
            for slide_key, slide_content in ordered_slides.items():
                if slide_content.visualization_suggestion:
                    image_url = get_valid_image(slide_content.visualization_suggestion, slide_content)
                    if image_url:
                        slide_content.image = image_url
                    else:
                        print(f"Warning: No suitable image found for slide {slide_key} after multiple attempts.")
                        slide_content.image = None
                else:
                    print(f"Warning: No visualization suggestion for slide {slide_key}.")
                    slide_content.image = None
        else:
            # If not an image slide, ensure the 'image' field is None
            for slide_content in ordered_slides.values():
                slide_content.image = None
        
        token_count = 0
        for slide in ordered_slides.values():
            text_content = f"{slide.subheading} {' '.join(slide.paragraphs)} {slide.visualization_suggestion}"
            token_count += count_tokens(text_content)
        
        
        if is_question:
            mcq_template = """
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

**Context**: {context}

The MCQ must include:
- A **question** related to the context  
- Exactly **4 answer options**  
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text.
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
- Dont give Here is the MCQ while generating MCQ
- Directly follow the format given below

The final output **must strictly follow** this format:
```json
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>",
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"

    Always give "question","options","correct_answer" these labels in double quotes only
}}






"""


            def is_valid_json(response):
                try:
                    json.loads(response)
                    return True
                except json.JSONDecodeError:
                    return False

            #mcq_parser = PydanticOutputParser(pydantic_object=MCQContent)
            mcq_prompt = ChatPromptTemplate.from_template(mcq_template)
            # MCQ generation always uses the local Ollama model, regardless of the GPU flag
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                model="gemma3:12b")
            
            mcqs = {}
            slide_keys = list(ordered_slides.keys())
            
            for i in range(0, len(slide_keys), 2):
                if len(mcqs) < num_mcqs:
                    context_slides = []
                    for j in range(i, min(i + 2, len(slide_keys))):
                        slide = ordered_slides[slide_keys[j]]
                        context_slides.append(f"{slide.subheading}: {' '.join(slide.paragraphs)}")

                    context_text = "\n".join(context_slides)
                    print("Context")
                    print(context_text)
                    try:
                        mcq_result = (
                            RunnableLambda(lambda x: {"context": context_text})
                            | mcq_prompt
                            | llm
                        ).invoke({})

                        print("qwer")
                        print(mcq_result)
                        print("END")
                        #tokens1 = mcq_result['content']
                        #tokens1 = mcq_result['usage_metadata']['total_tokens']
                        print("tokens1")
                        #print(tokens1)

                        if hasattr(mcq_result, 'content'):
                            # Extract the content (this is the LLM's response)
                            content = mcq_result.content
                        else:
                            # If mcq_result does not have 'content', try to convert it to a string
                            content = str(mcq_result)

                        # Extract the JSON object between the first balanced pair
                        # of curly braces in the response.
                        content_only = ""
                        json_object = None
                        start = content.find('{')  # Find the first opening curly brace
                        if start != -1:
                            open_braces = 0
                            end = -1
                            # Use a dedicated index so the outer slide loop's `i` is not shadowed
                            for pos, char in enumerate(content[start:], start=start):
                                if char == '{':
                                    open_braces += 1
                                elif char == '}':
                                    open_braces -= 1
                                    if open_braces == 0:
                                        end = pos
                                        break

                            # Parse the content inside the outermost curly braces
                            if end != -1:
                                content_only = content[start:end + 1]
                                print(content_only)
                                json_object = json.loads(content_only)
                            else:
                                print("No matching closing brace found.")
                        else:
                            print("No opening brace found.")
                        # Keep the raw text around for error reporting below
                        mcq_content = content

                        if not content_only or not content_only.strip() or json_object is None:
                            raise ValueError("Empty or invalid response from Ollama")

                        try:
                            formatted_mcq = {
                                "question": json_object.get("question", ""),
                                "options": json_object.get("options", []),
                                "correct_answer": json_object.get("correct_answer", "")
                            }

                            print("Formatted MCQ:")
                            print(json.dumps(formatted_mcq, indent=4))

                            if formatted_mcq["question"] and len(formatted_mcq["options"]) == 4 and formatted_mcq["correct_answer"]:
                                mcq_key = f"mcq_{len(mcqs) + 1}"
                                mcqs[mcq_key] = formatted_mcq
                                
                                print(f"✅ MCQ {mcq_key} generated and saved!")
                            else:
                                raise ValueError("Incomplete MCQ data")
            
                            
                        except json.JSONDecodeError as e:
                            print(f"JSON Decode Error: {e}")
                            print("Raw Response:", mcq_content)
                    except Exception as e:
                        print(f"Error generating MCQ: {e}")
            
            # Add MCQ text to the running token count
            for mcq in mcqs.values():
                text_content = f"{mcq['question']} {' '.join(mcq['options'])} {mcq['correct_answer']}"
                token_count += count_tokens(text_content)

            interleaved_content = {}
            mcq_counter = 0
            total_slides = num_slides
            total_mcqs = num_mcqs
            
            if question_position == 1:
                # max(1, ...) guards against a zero interval when there are more MCQs than slides
                interval = max(1, total_slides // total_mcqs) if total_mcqs > 0 else total_slides
                
                for idx, slide_key in enumerate(slide_keys):
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                    
                    # Insert the next MCQ if one is available (generation may have failed)
                    if (idx + 1) % interval == 0 and mcq_counter < len(mcqs):
                        mcq_key = f"mcq_{mcq_counter + 1}"
                        interleaved_content[mcq_key] = mcqs[mcq_key]
                        mcq_counter += 1
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
            
            else:
                for slide_key in slide_keys:
                    interleaved_content[slide_key] = ordered_slides[slide_key]
                
                # Append all generated MCQs (there may be fewer than requested)
                for mcq_key in mcqs:
                    interleaved_content[mcq_key] = mcqs[mcq_key]
                
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
        else:
            storigo_content_without_mc = StorigoContent(slides=ordered_slides, token_count=token_count)
            return storigo_content_without_mc

    except Exception as e:
        raise Exception(f"Error generating slide content: {str(e)}")


def generate_slide_content_chunks(vectors, client_id, num_slides, num_mcqs, is_image, is_question, question_position, GPU):

    try:
        # Setup LLM
        if GPU == 0:
            llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        else:
            llm = ChatOllama(base_url='http://127.0.0.1:11434', model="llama3:8b")

        # Get all chunks
        all_chunks = list(vectors.docstore._dict.values())
        total_chunks = len(all_chunks)

        print(f"📄 Total Chunks Available: {total_chunks}")

        # Select chunks based on num_slides
        if num_slides <= total_chunks:
            indices = np.linspace(0, total_chunks - 1, num=num_slides, dtype=int)
            selected_chunks = [all_chunks[i] for i in indices]
        else:
            selected_chunks = all_chunks.copy()
            remaining = num_slides - len(all_chunks)
            selected_chunks.extend(all_chunks[:remaining])  # Re-use some chunks

        slide_prompt_template = """
You are an assistant that generates professional slide content in JSON format, following a precise structure.

STRICT RULES:
- ❗ ONLY use the context provided.
- ❌ DO NOT use external info or extra explanations.
- ✅ RETURN ONLY JSON that matches the exact format shown below.

Expected format:
{{
  "slides": {{
    "slide_1": {{
      "type": "flash",
      "subheading": "Your subheading here",
      "paragraphs": [
        "Paragraph 1",
        "Paragraph 2"
      ],
      "visualization_suggestion": "5-word visual idea",
      "image": null
    }}
  }},
  "token_count": 0
}}

Only return JSON. No intro or commentary.

Context:
{context}

{format_instructions}
"""




        # Parser
        raw_parser = PydanticOutputParser(pydantic_object=StorigoContent)
        parser = OutputFixingParser.from_llm(parser=raw_parser, llm=llm)

        # Prompt
        slide_prompt = ChatPromptTemplate.from_template(slide_prompt_template)

        # Chain
        slide_chain = (
            {
                "context": lambda x: x["context"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_prompt
            | llm
            | parser
        )

        # Process each chunk → slide
        final_result = {}
        for i, chunk in enumerate(selected_chunks[:num_slides]):
            print(f"🧠 Generating slide {i+1}/{num_slides} from chunk {i+1}")
            context_text = chunk.page_content.strip()
            try:
                result = slide_chain.invoke({"context": context_text})
                # The prompt asks for a single slide; take the first entry whatever its key
                final_result[f"slide_{i+1}"] = next(iter(result.slides.values()))
            except Exception as e:
                print(f"❌ Slide {i+1} failed: {e}")
                final_result[f"slide_{i+1}"] = {
                    "subheading": "Generation Failed",
                    "paragraphs": [f"Error: {str(e)}"],
                    "visualization_suggestion": "error icon"
                }

        return final_result

    except Exception as e:
        print(f"❌ Critical error in slide generation: {e}")
        return None
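
# Example usage (a sketch; mirrors generate_slide_content but emits one slide per chunk
# and returns a plain dict rather than a StorigoContent model):
#
#   slides = generate_slide_content_chunks(
#       vectors, client_id="demo", num_slides=3, num_mcqs=0,
#       is_image=False, is_question=False, question_position=0, GPU=1,
#   )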

def load_pdf(file_path):
    """Load PDF and return documents."""
    loader = PyPDFLoader(file_path)
    return loader.load()

def count_total_words(docs):
    """Count the total number of words in the documents."""
    # Loaders return Document objects, so use attribute access, not subscripting
    return sum(len(doc.page_content.split()) for doc in docs)

def split_text_with_semantic_chunker(docs, embeddings):
    """Splits the text into semantic chunks using the given embeddings."""
    text_splitter = SemanticChunker(
        embeddings, breakpoint_threshold_type="percentile"  # Can be changed to "standard_deviation", "interquartile"
    )
    documents = text_splitter.create_documents([doc.page_content for doc in docs])
    print("Documents split into semantic chunks.")
    return documents
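
# Example (illustrative file name): chunk a loaded PDF with the nomic embedding model.
# _docs = load_pdf("sample.pdf")
# _chunks = split_text_with_semantic_chunker(_docs, OllamaEmbeddings(model=OLLAMA_MODEL))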

async def crawlerrr(file):
    # Create an instance of AsyncWebCrawler
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Run the crawler on a URL
        result = await crawler.arun(url=file)

        # Print the extracted content
        print(result.markdown)

        # Extract a safe filename from the URL
        parsed_url = urlparse(file)
        filename = parsed_url.netloc + parsed_url.path.replace('/', '_')
        if not filename.endswith('.txt'):
            filename += '.txt'

        # Save the result to the text file using the `filename`
        with open(filename, 'w', encoding='utf-8') as txt_file:
            txt_file.write(result.markdown)
        print(f"Saved crawled content to {filename}")
        return filename
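
# Example (illustrative URL): crawlerrr is a coroutine, so drive it with an event loop.
# saved_path = asyncio.run(crawlerrr("https://example.com/about"))
# print(read_file_url(saved_path))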

def read_file_url(path):
    try:
        # Check that the path points to an existing file
        if not path or not os.path.isfile(path):
            print(f"Error: The file '{path}' does not exist or the path is incorrect.")
            return ""

        # Read the entire content of the file
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Return the content (empty string if the file was empty)
        return content if content else ""

    except Exception as e:
        print(f"Error reading the file: {e}")
        return ""



def clean_using_llm(content):
    # Define the prompt template for meaningful content extraction
    prompt_template = """
    Extract only the meaningful content from the text below. Focus on descriptions, value propositions, mission statements, 
    features, and anything that provides valuable information about the company, products, or services. Ignore any URLs, 
    navigation links, contact forms, or irrelevant sections.

    Here is the content to process:

    {context}
    """

    # Initialize the LLM (ChatGroq alternative kept for reference)
    #llm = ChatGroq(model_name='llama-3.3-70b-versatile', groq_api_key=GROQ_API_KEY)
    llm = ChatOllama(
        base_url='http://127.0.0.1:11434',
        model="gemma3:12b"  # alternatives: "llama3:8b", "deepseek-r1:8b"
    )
    # Create the PromptTemplate object
    prompt = PromptTemplate(input_variables=["context"], template=prompt_template)

    # Compose the prompt and model into a runnable sequence
    runnable = prompt | llm

    # Run the chain and return only the message text
    filtered_content = runnable.invoke({"context": content})
    return filtered_content.content
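
# Example (illustrative): strip navigation noise from crawled page text.
# page_text = read_file_url("example.com_about.txt")  # hypothetical crawl output
# cleaned = clean_using_llm(page_text)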

def split_text_with_semantic_chunker_for_url(docs, embeddings):
    """Splits the text into semantic chunks using the given embeddings."""
    text_splitter = SemanticChunker(
        embeddings, breakpoint_threshold_type="percentile"
    )

    # Check if docs is a string instead of a list
    if isinstance(docs, str):
        # Convert the string to a list with one item
        docs = [docs]

    # Debugging: Print the type of items in docs
    print(f"Type of docs after conversion: {type(docs)}")
    print(f"First item in docs: {docs[0] if docs else 'Empty list'}")

    # Convert strings to dictionaries with 'page_content' if needed
    if isinstance(docs[0], str):
        docs = [{'page_content': doc} for doc in docs]

    # Ensure all docs have the correct structure
    if not all(isinstance(doc, dict) and 'page_content' in doc for doc in docs):
        print("Error: Invalid document structure.")
        return []

    # Create semantic chunks
    documents = text_splitter.create_documents([doc['page_content'] for doc in docs])
    print("Documents split into semantic chunks.")
    print(documents)
    return documents

def save_documents_to_txt(documents, output_dir):
    """Saves each document in the documents list as a separate .txt file."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)  # Create the output directory if it doesn't exist
    
    for i, document in enumerate(documents):
        file_name = f"document_part_{i+1}.txt"
        file_path = os.path.join(output_dir, file_name)
        
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(document.page_content)  # Assuming each document object has a 'page_content' attribute
        
        print(f"Saved: {file_path}")


def create_and_save_embeddings(split_documents, client_id):
    client_id = str(client_id)
    # Base folder structure: my_embeddings/{client_id}
    embedding_folder = os.path.join("my_embeddings", client_id)
    os.makedirs(embedding_folder, exist_ok=True)

    # Initialize the embedding model
    embeddings = OllamaEmbeddings(model='nomic-embed-text')

    # Build one FAISS index per document chunk, saved with an incremental name
    for idx, doc in enumerate(split_documents, start=1):
        temp_db = FAISS.from_documents([doc], embedding=embeddings)
        embedding_file_path = os.path.join(embedding_folder, f"faiss_index{idx}")
        temp_db.save_local(embedding_file_path)

        print(f"Saved FAISS embedding for document part {idx} as faiss_index{idx} in {embedding_folder}")

def create_embeddings(split_documents, client_id):
    client_id = str(client_id)

    # Initialize the embedding model
    embeddings = OllamaEmbeddings(model='nomic-embed-text')

    # Create a single FAISS index from all chunks and persist it to disk
    vectorstore = FAISS.from_documents(split_documents, embedding=embeddings)
    faiss_index_path = "faiss_supplier_index"
    vectorstore.save_local(faiss_index_path)

    print(f"✅ Created FAISS vectorstore for client {client_id} (saved to {faiss_index_path})")

    return vectorstore
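
# Example (illustrative query): build a single index from chunks and search it.
# _store = create_embeddings(_chunks, client_id="demo")
# print(_store.similarity_search("supplier obligations", k=2))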

import pickle

def save_faiss_per_chunk(documents, base_path="faiss_chunks", embedding_model=None, api_key=None):
    """
    Save each chunk in its own FAISS vector store directory.

    Args:
        documents (List[Document]): List of LangChain Document objects.
        base_path (str): Base directory to store all FAISS chunks.
        embedding_model: Optional embedding model instance; defaults to
            OllamaEmbeddings with 'nomic-embed-text'.
        api_key (str): Currently unused; kept for interface compatibility.

    Returns:
        List[str]: List of FAISS chunk folder paths.
    """
    if embedding_model is None:
        embedding_model = OllamaEmbeddings(model='nomic-embed-text')

    os.makedirs(base_path, exist_ok=True)
    chunk_paths = []

    for i, doc in enumerate(documents):
        chunk_dir = os.path.join(base_path, f"chunk_{i}")
        os.makedirs(chunk_dir, exist_ok=True)

        # Each FAISS requires at least 1 document
        vector_store = FAISS.from_documents([doc], embedding_model)

        # Save the FAISS index
        vector_store.save_local(chunk_dir)
        chunk_paths.append(chunk_dir)

        # Save the original document for reference
        with open(os.path.join(chunk_dir, "doc_metadata.pkl"), "wb") as f:
            pickle.dump(doc, f)

    print(f"✅ Saved {len(documents)} chunks as individual FAISS indexes in '{base_path}'")
    return chunk_paths
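
# Example (illustrative path): persist each chunk as its own FAISS index.
# _paths = save_faiss_per_chunk(_chunks, base_path="faiss_chunks_demo")
# print(_paths[:3])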

def create_and_save_embeddings_new(split_documents, client_id):
    """Alias kept for backward compatibility; identical to create_and_save_embeddings."""
    return create_and_save_embeddings(split_documents, client_id)



def merge_all_faiss1(client_id, base_path='my_embeddings'):
    """Legacy variant that merges with FAISS.merge_from; see merge_all_faiss below."""
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    # Initialize an empty FAISS vectorstore for merging
    merged_faiss = None
    
    # Construct the base folder path
    folder_path = f'{base_path}/{client_id}'
    
    # List all folders that match the pattern 'faiss_index{i}'
    faiss_folders = [
        folder for folder in os.listdir(folder_path) 
        if folder.startswith('faiss_index') and folder[len('faiss_index'):].isdigit()
    ]
    
    # Sort folders by the index number extracted from 'faiss_index{i}'
    sorted_folders = sorted(faiss_folders, key=lambda x: int(x.replace('faiss_index', '')))
    
    # Loop through the sorted folders and merge FAISS stores
    for folder in sorted_folders:
        faiss_path = os.path.join(folder_path, folder)
        print(f"Loading FAISS index from: {faiss_path}")  # Debugging: See the order of loading
        current_faiss = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)
        
        # If it's the first FAISS store, initialize merged_faiss
        if merged_faiss is None:
            merged_faiss = current_faiss
        else:
            # Merge current FAISS store into merged_faiss
            merged_faiss.merge_from(current_faiss)
    
    # Save the merged FAISS index to a new folder
    if merged_faiss is not None:
        merged_faiss.save_local(f'{folder_path}/merged_faiss')

    # Delete individual FAISS index folders, except for the 'merged_faiss'
    for folder in sorted_folders:
        faiss_path = os.path.join(folder_path, folder)
        try:
            # Delete the entire directory for the FAISS index (e.g., faiss_index3)
            shutil.rmtree(faiss_path)
            print(f"Deleted FAISS index folder: {faiss_path}")
        except FileNotFoundError:
            print(f"Folder not found: {faiss_path}")
        except OSError as e:
            print(f"Error deleting {faiss_path}: {e}")
    return merged_faiss

def merge_all_faiss(client_id, base_path='my_embeddings'):
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    merged_faiss = None
    
    folder_path = f'{base_path}/{client_id}'
    faiss_files = [
        folder for folder in os.listdir(folder_path) 
        if folder.startswith('faiss_index') and folder[len('faiss_index'):].isdigit()
    ]
    
    sorted_files = sorted(faiss_files, key=lambda x: int(x.replace('faiss_index', '')))

    for file in sorted_files:
        faiss_path = os.path.join(folder_path, file)
        print(f"Loading FAISS index from: {faiss_path}")
        current_faiss = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)

        # Extract document content
        current_texts = [current_faiss.docstore.search(doc_id).page_content 
                         for doc_id in current_faiss.index_to_docstore_id.values()]

        if merged_faiss is None:
            merged_faiss = current_faiss
        else:
            # add_texts re-embeds the chunk texts into the merged store
            # (metadata from the source docstore is not carried over)
            merged_faiss.add_texts(current_texts)

    if merged_faiss is not None:
        merged_faiss.save_local(f'{folder_path}/merged_faiss')
        print("Merged FAISS index saved as merged_faiss")

    # Clean up individual indexes
    for file in sorted_files:
        faiss_path = os.path.join(folder_path, file)
        try:
            shutil.rmtree(faiss_path)
            print(f"Deleted FAISS index folder: {faiss_path}")
        except FileNotFoundError:
            print(f"Folder not found: {faiss_path}")
        except OSError as e:
            print(f"Error deleting {faiss_path}: {e}")

    return merged_faiss
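
# Example (illustrative id): create_and_save_embeddings writes faiss_index1..N under
# my_embeddings/<client_id>/; merge_all_faiss then collapses them into one store.
# create_and_save_embeddings(_chunks, client_id="demo")
# _merged = merge_all_faiss(client_id="demo")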




# You Tube
from youtube_transcript_api import YouTubeTranscriptApi

def transcribe(youtube_video_url):
    """Fetch a YouTube transcript and save it to a text file named after the video id."""
    video_id = youtube_video_url.split("=")[1]
    transcript_text = YouTubeTranscriptApi.get_transcript(video_id)

    transcript = ""
    for i in transcript_text:
        transcript += " " + i["text"]

    with open(video_id, "w", encoding="utf-8") as f:
        f.write(transcript)

    print(f"Transcript saved to {video_id}")
    return video_id
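
# Example (illustrative URL): the video id is taken as everything after the first "=",
# so a plain "watch?v=<id>" link is expected.
# _txt = transcribe("https://www.youtube.com/watch?v=VIDEO_ID")
# _docs = load_txt(_txt)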

def load_txt(file_path):
    """Load a plain-text file and return documents."""
    loader = TextLoader(file_path)
    return loader.load()


def parsing(input):
    converter = PdfConverter(artifact_dict=create_model_dict())
    rendered = converter(input)
    text, _, images = text_from_rendered(rendered)
    return text

def marks_splitter(headers_to_split_on, content):
    """Split Markdown content into chunks on the given header levels."""
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(content)
    return md_header_splits
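
# Example (minimal sketch): split a small Markdown string on its headers.
# _headers = [("#", "Header 1"), ("##", "Header 2")]
# _demo = "# Intro\nText under intro.\n## Details\nMore text."
# for _part in marks_splitter(_headers, _demo):
#     print(_part.metadata, "->", _part.page_content)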

def allocate_slides(chunks, total_slides, min_chars=60):
    """
    Allocate slides based on chunk size (works on Document-like objects).

    Args:
        chunks: List of Document (or string) objects
        total_slides: Total number of slides to generate
        min_chars: Minimum character count to consider a chunk valid

    Returns:
        Dict mapping original chunk index to number of slides to generate
    """
    # Step 0: Extract text from each chunk
    extracted = []
    for i, chunk in enumerate(chunks):
        if hasattr(chunk, "page_content"):
            text = chunk.page_content
        elif hasattr(chunk, "content"):
            text = chunk.content
        elif isinstance(chunk, str):
            text = chunk
        else:
            text = str(chunk)
        extracted.append((i, text))

    # Step 1: Filter out too‑short chunks
    valid = [(i, txt) for i, txt in extracted if len(txt) >= min_chars]
    if not valid:
        return {}

    # Step 2: Total characters across valid chunks
    total_chars = sum(len(txt) for _, txt in valid)

    # Step 3: Initial proportional allocation
    allocations = {}
    remaining = total_slides
    for idx, txt in valid:
        prop = len(txt) / total_chars
        cnt = max(1, round(prop * total_slides))
        allocations[idx] = cnt
        remaining -= cnt

    # Step 4a: If we’ve overshot, remove from the largest buckets
    while remaining < 0:
        # pick the chunk with max slides
        max_idx = max(allocations, key=allocations.get)
        if allocations[max_idx] > 1:
            allocations[max_idx] -= 1
            remaining += 1
        else:
            break

    # Step 4b: If we’ve undershot, add to the largest chunks by text size
    while remaining > 0:
        # sort valid chunks by length descending
        sorted_by_size = sorted(valid, key=lambda x: len(x[1]), reverse=True)
        for i in range(min(remaining, len(sorted_by_size))):
            idx = sorted_by_size[i][0]
            allocations[idx] = allocations.get(idx, 0) + 1
            remaining -= 1

    return allocations
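
# Worked example (hypothetical sizes): three chunks of 600, 300 and 100 characters
# share 10 slides roughly in proportion to their length.
# allocate_slides(["a" * 600, "b" * 300, "c" * 100], total_slides=10)
# -> {0: 6, 1: 3, 2: 1}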


def generate_slide_content_alloc(vectors, chunks, allocations,
                                 is_image=False, is_question=False, question_position=None):
    """
    Generate slide JSON for each chunk according to allocations, then merge.

    Args:
        vectors: your VectorStore (for similarity_search).
        chunks: list of text‐chunks (strings or Document-like).
        allocations: dict mapping chunk_index -> num_slides.
        is_image, is_question, question_position: (passed through if needed).

    Returns:
        A dict {"slide_1": {...}, ..., "slide_N": {...}} of all slides in order.
    """
    # 1. Build the LangChain prompt/chain once
    slide_content_template = """
    Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

    Each slide must include:
    - A clear and concise **sub-heading**
    - **Paragraphs** that effectively communicate the key ideas and insights
    - A specific, concise **visualization suggestion**

    **Context**: {context}

    Focus on creating content that is both informative and engaging. Ensure each slide:
    - Has a well-structured sub-heading that captures the main point
    - Uses clear and concise paragraphs to communicate important information

    Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
    - **Thought-provoking questions** to encourage reflection
    - **Relevant statistics** or data points that add credibility
    - **Industry insights** or emerging trends to demonstrate expertise
    - **Practical examples** or case studies to illustrate key concepts
    - **Calls to action** to guide the audience toward specific actions or takeaways

    For the visualization suggestion:
    - Provide a clear and specific description of an image that would be relevant to the slide content.
    - Keep it very concise, using a maximum of 5 words.
    - Focus on concrete objects, scenes, or concepts that can be easily visualized.
    - Avoid abstract or overly complex ideas.
    - Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

    Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external information not found in the source material.

    {format_instructions}

    The final output **must** be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}", in **strict sequential order**.

    Each slide object should contain the following fields:
    - "subheading": A title of the slide.
    - "paragraphs": A list of concise paragraphs that communicate the main points of the slide.
    - "visualization_suggestion": A very specific, context-aware suggestion (no longer than 5 words).

    Please ensure the slides are generated in the correct order as defined in the embeddings or the document content.
    """
    parser = PydanticOutputParser(pydantic_object=StorigoContent)
    slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)
    llm = ChatOllama(base_url='http://127.0.0.1:11434', model="llama3:8b")

    slide_content_chain = (
        {
            "context": lambda x: vectors.similarity_search(x["query"], k=3),
            "num_slides": lambda x: x["num_slides"],
            "format_instructions": lambda x: parser.get_format_instructions()
        }
        | slide_content_prompt
        | llm
        | parser
    )

    # 2. Iterate chunks in allocation order and collect
    all_slides = {}
    counter = 1

    for chunk_idx in sorted(allocations):
        n = allocations[chunk_idx]

        # Extract raw text from chunk
        chunk = chunks[chunk_idx]
        if hasattr(chunk, "page_content"):
            query = chunk.page_content
        elif hasattr(chunk, "content"):
            query = chunk.content
        else:
            query = str(chunk)
        print(f"Generating {n} slide(s) for chunk {chunk_idx}")
        # Invoke the chain for this chunk
        result = slide_content_chain.invoke({"query": query, "num_slides": n})

        # Use model_dump() to get a Python dict
        raw_dict = result.model_dump()

        # Dive into the 'slides' field if it exists, else assume raw_dict is already the slide map
        slide_items = raw_dict.get("slides", raw_dict)

        # Sort keys to maintain order (slide_1, slide_2, ...)
        for slide_key in sorted(slide_items, key=lambda k: int(k.split("_")[1])):
            all_slides[f"slide_{counter}"] = slide_items[slide_key]
            counter += 1

    return all_slides


class SlideCollection:
    """Container class for slides with dict-like behavior"""
    def __init__(self):
        self.slides = {}
    
    def add_slide(self, key, content):
        self.slides[key] = content

    def keys(self):
        return self.slides.keys()

    def __iter__(self):
        return iter(self.slides)

    def __getitem__(self, key):
        return self.slides[key]
    
    def items(self):
        return self.slides.items()
    
    def values(self):
        return self.slides.values()
    
    def __repr__(self):
        return repr(self.slides)
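
# Example (minimal sketch): SlideCollection behaves like a read-mostly dict.
# _sc = SlideCollection()
# _sc.add_slide("slide_1", {"subheading": "Demo", "paragraphs": ["Hi"]})
# for _k, _v in _sc.items():
#     print(_k, _v["subheading"])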
    
def generate_slide_content_alloc1(chunks, allocations, num_slides, num_mcqs,
                                  is_image, is_question, question_position, GPU):
    """
    Generate slide JSON for each chunk according to allocations, then merge.

    Args:
        chunks: list of text chunks (strings or Document-like).
        allocations: dict mapping chunk_index -> number of slides for that chunk.
        num_slides, num_mcqs: totals requested for the presentation.
        is_image, is_question, question_position: output options.
        GPU: 0 routes generation to Groq; any other value uses a local Ollama model.

    Returns:
        A StorigoContent (or StorigoContentMCQ/StorigoContentMCQMid) object with
        all slides (and MCQs) in order.
    """
    try:
        # 1. Build the LangChain prompt/chain once
        slide_content_template_old = """
        Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

        Each slide must include:
        - A clear and concise **sub-heading**
        - **Paragraphs** that effectively communicate the key ideas and insights
        - A specific, concise **visualization suggestion**

        **Context**: {query}

        Focus on creating content that is both informative and engaging. Ensure each slide:
        - Has a well-structured sub-heading that captures the main point
        - Uses clear and concise paragraphs to communicate important information

        Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
        - **Thought-provoking questions** to encourage reflection
        - **Relevant statistics** or data points that add credibility
        - **Industry insights** or emerging trends to demonstrate expertise
        - **Practical examples** or case studies to illustrate key concepts
        - **Calls to action** to guide the audience toward specific actions or takeaways

        For the visualization suggestion:
        - Provide a clear and specific description of an image that would be relevant to the slide content.
        - Keep it very concise, using a maximum of 5 words.
        - Focus on concrete objects, scenes, or concepts that can be easily visualized.
        - Avoid abstract or overly complex ideas.
        - Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

        Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external information not found in the source material.

        {format_instructions}

        The final output **must** be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}", in **strict sequential order**.

        Each slide object should contain the following fields:
        - "subheading": A title of the slide.
        - "paragraphs": A list of concise paragraphs that communicate the main points of the slide.
        - "visualization_suggestion": A very specific, context-aware suggestion (no longer than 5 words).

        Please ensure the slides are generated in the correct order as defined in the embeddings or the document content.
        """
        slide_content_template = """
You are an expert presentation writer.

Your task is to generate exactly {{ num_slides }} professional, informative slides based on the provided context.

Each slide must follow this exact JSON format:
{{
  "slides": {{
    "slide_1": {{
      "type": "flash",
      "subheading": "Concise subheading",
      "paragraphs": [
        "Insightful paragraph 1",
        "Insightful paragraph 2 (optional)"
      ],
      "visualization_suggestion": "Max 5 words",
      "image": null
    }}
  }},
  "token_count": Integer (approximate)
}}

Rules:
- Return only the JSON.
- Do not ask for input.
- Do not include explanations.
- Only generate slide content using the context below.

### Context:
{{ chunk }}
"""
        parser = PydanticOutputParser(pydantic_object=StorigoContent)
        # Note: the "_old" template is the active one; slide_content_template above is an unused draft
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template_old)
        if GPU == 0:
            llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)
        else:
            llm = ChatOllama(
                base_url='http://127.0.0.1:11434',
                model="llama3:8b"  # alternative: "deepseek-r1:8b"
            )
        slide_content_chain = (
            {
                "query": lambda x: x["query"],  # No vector store involved
                "num_slides": lambda x: x["num_slides"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_content_prompt
            | llm
            | parser
        )

        # 2. Iterate chunks in allocation order and collect
        #all_slides = {}
        all_slides = SlideCollection()
        counter = 1

        for chunk_idx in sorted(allocations):
            n = allocations[chunk_idx]
            # extract text from the chunk, whatever its shape
            chunk = chunks[chunk_idx]
            query = getattr(chunk, "page_content",
                            getattr(chunk, "content", str(chunk)))
            print(f"Generating {n} slide(s) from chunk {chunk_idx}")

            # invoke chain
            result = slide_content_chain.invoke({"query": query, "num_slides": n})
            raw = result.model_dump()
            slide_items = raw.get("slides", raw)

            # flatten into all_slides
            for slide_key in sorted(
                slide_items.keys(),
                key=lambda k: int(k.split("_")[1])
            ):
                all_slides.add_slide(f"slide_{counter}", slide_items[slide_key])
                counter += 1
        
        print("all_slides")
        print("slides_output\n", all_slides)
        print(type(all_slides))

        def custom_slide_sort_key(item_key: str):
            """
            Sort keys so that:
            - 'slide_1', 'slide_2', ..., 'slide_10' are in numeric order
            - 'mcq_1', 'mcq_2', ... come after slides in numeric order
            """
            prefix, num_str = item_key.split('_', 1)
            if prefix == "slide":
                group = 0
            elif prefix == "mcq":
                group = 1
            else:
                group = 999  # Fallback
            number = int(num_str) if num_str.isdigit() else 999
            return (group, number)

        # Sort the slides using the custom key
        ordered_slides = dict(
            sorted(all_slides.slides.items(), key=lambda kv: custom_slide_sort_key(kv[0]))
        )
        all_slides.slides = ordered_slides
        # Image handling logic
        if is_image:
            for slide_key, slide_content in all_slides.items():
                # Check if the slide has a visualization suggestion
                if slide_content.get('visualization_suggestion'):
                    image_url = get_valid_image_new(slide_content['visualization_suggestion'], slide_content)
                    if image_url:
                        slide_content['image'] = image_url  # Correctly modify the dictionary
                    else:
                        print(f"Warning: No suitable image found for slide {slide_key} after multiple attempts.")
                        slide_content['image'] = None
                else:
                    print(f"Warning: No visualization suggestion for slide {slide_key}.")
                    slide_content['image'] = None
        else:
            # If not an image slide, ensure the 'image' field is None
            for slide_content in all_slides.values():
                slide_content['image'] = None
        
        #print("all_slides")
        #print(all_slides)
        token_count = 0
        for slide in all_slides.values():
            # Assuming slide is a dictionary
            text_content = f"{slide['subheading']} {' '.join(slide['paragraphs'])} {slide['visualization_suggestion']}"
            token_count += count_tokens(text_content)
        
        
        # Calculate the total token count
        #mcqs = {}
        if is_question:
            print("MCQ Started")
            mcq_template = """
                Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

                **Context**: {context}

                The final output **must be valid JSON only**. Ensure all keys and string values are enclosed in double quotes and do not include any additional strings or explanations:
                

                Don't give any conversational like text, For example: "Here is the MCQ:". Just return JSON Output.
                Here is the pydantic class to validate the output that you give. Make sure the check always passes.

                class MCQContent(BaseModel):
                type: str = Field("Question")
                question: str = Field(..., description="The multiple-choice question")
                options: List[str] = Field(..., description="A list of 4 answer options")
                correct_answer: str = Field(..., description="The correct answer (e.g., 'a', 'b', 'c', or 'd')")
                """

            # Define a schema for the MCQ
                    
            mcq_parser = PydanticOutputParser(pydantic_object=MCQContent)
            
            mcq_prompt = ChatPromptTemplate.from_template(mcq_template)
            # Generate MCQs
            mcqs = {}
            slide_keys = list(all_slides.keys())
            for i in range(0, len(slide_keys), 2):
                if len(mcqs) < num_mcqs:
                    context_slides = []
                    for j in range(i, min(i + 2, len(slide_keys))):
                        key = slide_keys[j]
                        slide = all_slides[key]                  # slide is a dict
                        title = slide.get("subheading", "")      # safely grab subheading
                        paras = slide.get("paragraphs", [])      # list of paragraphs
                        context_slides.append(f"{title}: {' '.join(paras)}")

                    context_text = "\n".join(context_slides)
                    # Wrap the parser so the LLM can repair malformed JSON output
                    output_fixing_parser = OutputFixingParser.from_llm(parser=mcq_parser, llm=llm)
                    mcq_result = (
                        RunnableLambda(lambda x: {"context": context_text})
                        | mcq_prompt
                        | llm
                        | output_fixing_parser
                    ).invoke({})
                    print("======================================================================================")
                    #print(mcq_result)
                    # print("EStartNd")

                    # print(mcq_result.content)
                    # print("ENd")
                    # mcq_result = mcq_result.content
                    # print(type(mcq_result))
                    # # Assuming mcq_result is the raw output from the model
                    # #mcq_result = str(mcq_result)  # Ensure it's a string

                    # # Remove specific unwanted text (e.g., "Here is the MCQ:")
                    # #mcq_result = mcq_result.replace("Here is the MCQ:", "").strip()
                    
                    # # Replace single quotes with double quotes if necessary
                    # # mcq_result1 = mcq_result.replace("'", "\"")
                    # # variable_to_replace = r'\"(\w+)'  # Regex to capture any word after the double quote
                    # #mcq_result = re.sub(variable_to_replace, r"'\1", mcq_result1)
                    # #mcq_result = mcq_result1.replace('"s', r'\'s')
                    # # Now try to parse the cleaned JSON
                    # # try:
                    # #     mcq_result = json.loads(mcq_result)
                    # # except json.JSONDecodeError as e:
                    # #     raise Exception(f"Error decoding JSON: {str(e)}")
                    # mcq_result = f'''{mcq_result}'''
                    # print(mcq_result)
                    # mcq_result = json.loads(mcq_result)
                    # print("Load")
                    # print(mcq_result)
                    mcqs[f"mcq_{len(mcqs) + 1}"] = mcq_result

            # Calculate token count
            
            for mcq in mcqs.values():
                text_content = f"{mcq.question} {' '.join(mcq.options)} {mcq.correct_answer}"
                token_count += count_tokens(text_content)

            # Interleave slides and MCQs for display, or append them at the end
            interleaved_content = {}
            mcq_counter = 0
            total_slides = num_slides
            total_mcqs = num_mcqs

            # Calculate interval for inserting MCQs
            if question_position == 1:
                if total_mcqs > 0:
                    # max(1, ...) guards against a zero interval when more MCQs than slides are requested
                    interval = max(1, total_slides // total_mcqs)
                else:
                    interval = total_slides  # No MCQs to insert

                for idx, slide_key in enumerate(slide_keys):
                    interleaved_content[slide_key] = all_slides[slide_key]

                    # Insert an MCQ based on the calculated interval
                    if (idx + 1) % interval == 0 and mcq_counter < total_mcqs:
                        mcq_key = f"mcq_{mcq_counter + 1}"
                        interleaved_content[mcq_key] = mcqs[mcq_key]
                        mcq_counter += 1
                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
            else:
                for slide_key in slide_keys:
                    interleaved_content[slide_key] = all_slides[slide_key]

                # Add all MCQs at the end
                for mcq_counter in range(total_mcqs):
                    mcq_key = f"mcq_{mcq_counter + 1}"
                    interleaved_content[mcq_key] = mcqs[mcq_key]

                storigo_content = StorigoContentMCQMid(slides=interleaved_content, token_count=token_count)
                return storigo_content
                
        else:
            storigo_content_without_mc = StorigoContent(slides=ordered_slides, token_count=token_count)
            return storigo_content_without_mc

    except Exception as e:
        raise Exception(f"Error generating slide content: {str(e)}")
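
# Example (illustrative inputs): four slides from two raw text chunks, no MCQs.
# _out = generate_slide_content_alloc1(["chunk one text ...", "chunk two text ..."],
#                                      {0: 2, 1: 2}, num_slides=4, num_mcqs=0,
#                                      is_image=False, is_question=False,
#                                      question_position=0, GPU=1)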

def load_ppt(file_path):
    """
    Load PowerPoint (PPT/PPTX) and return slides content as a list of strings.
    Each slide's content is combined into a single string.
    """
    presentation = Presentation(file_path)
    slides_content = []

    for slide in presentation.slides:
        slide_text = []
        for shape in slide.shapes:
            if shape.has_text_frame:
                slide_text.append(shape.text)
        slides_content.append("\n".join(slide_text))

    return slides_content

def main(input_path, num_slides, num_mcqs, is_image, is_question, question_position, GPU):
    # Parse the PDF to Markdown and persist it for inspection
    parse_data = parsing(input_path)
    with open("parse_data.md", "w", encoding="utf-8") as f:
        f.write(parse_data)

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4")
    ]
    with open("parse_data.md", "r", encoding="utf-8") as f:
        content = f.read()
    chunks = marks_splitter(headers_to_split_on, content)

    # Allocate slides to chunks proportionally to their length
    allocation = allocate_slides(chunks, num_slides, min_chars=100)
    print(f"Total chunks: {len(chunks)}")
    print(f"Valid chunks: {len(allocation)}")
    print("Slide allocation:", allocation)
    
    slide_content = generate_slide_content_alloc1(chunks, allocation, num_slides, num_mcqs,
                                                  is_image, is_question, question_position, GPU)
    print(slide_content)
    return slide_content

if __name__ == "__main__":
    input_path = 'CleanMax_Code_Of_Conduct_Supplier.pdf'
    #input_path = 'https://edurigo.com/'
    output_dir = f'temp/{input_path}'
    embedding_folder_base = 'output_embeddings'

    client_id = 1113331144
    num_slides = 30
    is_image = True
    num_mcqs = 8
    is_question = True
    question_position = 1
    GPU = 1
    main(input_path, num_slides, num_mcqs, is_image, is_question, question_position, GPU)