import os
import time
import re
import shutil
import asyncio
from urllib.parse import urlparse
from typing import List, Optional, Dict

import requests
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_experimental.text_splitter import SemanticChunker

# API keys are read from the environment rather than hard-coded, so secrets
# never land in version control. Set them before running, e.g.:
#   export GROQ_API_KEY=...
#   export PIXABAY_API_KEY=...
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
OLLAMA_MODEL = "nomic-embed-text"
PIXABAY_API_KEY = os.environ.get("PIXABAY_API_KEY")

# Stop-word list shared by the query-building helpers below.
COMMON_WORDS = {'and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

async def crawl_url(url):
    """Crawls a URL with crawl4ai, saves the extracted markdown to a .txt file,
    and returns the path of the saved file."""
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Run the crawler on the URL and print the extracted markdown
        result = await crawler.arun(url=url)
        print(result.markdown)

        # Derive a filesystem-safe filename from the URL
        parsed_url = urlparse(url)
        filename = parsed_url.netloc + parsed_url.path.replace('/', '_')
        if not filename.endswith('.txt'):
            filename += '.txt'

        # Save the markdown to the text file
        with open(filename, 'w', encoding='utf-8') as txt_file:
            txt_file.write(result.markdown)
        print(f"Saved crawl output to: {filename}")
        return filename
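# A minimal usage sketch (assumes crawl4ai is installed and the URL is reachable):
#
#     saved_path = asyncio.run(crawl_url("https://example.com/"))
#     print(saved_path)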

def read_file(path):
    """Reads a UTF-8 text file and returns its content, or "" on any failure."""
    try:
        # Check that the path points to an existing file
        if not path or not os.path.isfile(path):
            print(f"Error: The file '{path}' does not exist or the path is incorrect.")
            return ""

        # Read the entire content of the file
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Return the content (an empty file yields "")
        return content if content else ""

    except Exception as e:
        # Catch any other unexpected errors (permissions, decoding, etc.)
        print(f"Error reading the file: {e}")
        return ""

# Example usage:
#     content = read_file('result.txt')

def clean_using_llm(content):
    """Uses an LLM to strip navigation links, forms, and other noise from crawled
    text, keeping only the meaningful descriptive content."""
    # Prompt template for meaningful content extraction
    prompt_template = """
    Extract only the meaningful content from the text below. Focus on descriptions, value propositions, mission statements,
    features, and anything that provides valuable information about the company, products, or services. Ignore any URLs,
    navigation links, contact forms, or irrelevant sections.

    Here is the content to process:

    {context}
    """

    # Initialize the LLM (the API key comes from the environment)
    llm = ChatGroq(model_name='llama3-8b-8192', groq_api_key=GROQ_API_KEY)

    # Build the prompt and pipe it into the model (an LCEL runnable sequence)
    prompt = PromptTemplate(input_variables=["context"], template=prompt_template)
    runnable = prompt | llm

    # Run the sequence and return the filtered text from the resulting message
    filtered_content = runnable.invoke({"context": content})
    return filtered_content.content
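# The prompt | llm pipe above is standard LCEL; a minimal sketch of a direct call,
# assuming GROQ_API_KEY is set in the environment:
#
#     cleaned = clean_using_llm("Home | About | Contact | We build adaptive learning software.")
#     print(cleaned)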

def split_text_with_semantic_chunker(docs, embeddings):
    """Splits the text into semantic chunks using the given embeddings."""
    text_splitter = SemanticChunker(
        embeddings, breakpoint_threshold_type="percentile"
    )

    # Accept a bare string by wrapping it in a single-item list
    if isinstance(docs, str):
        docs = [docs]

    # Normalize plain strings into dicts with a 'page_content' key
    if docs and isinstance(docs[0], str):
        docs = [{'page_content': doc} for doc in docs]

    # Ensure all docs have the expected structure
    if not all(isinstance(doc, dict) and 'page_content' in doc for doc in docs):
        print("Error: Invalid document structure.")
        return []

    # Create semantic chunks
    documents = text_splitter.create_documents([doc['page_content'] for doc in docs])
    print(f"Documents split into {len(documents)} semantic chunks.")
    return documents
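# A sketch of calling the splitter directly; besides "percentile", SemanticChunker
# also accepts "standard_deviation" and "interquartile" breakpoint strategies:
#
#     chunks = split_text_with_semantic_chunker(
#         "First topic sentence. ... Second topic sentence.",
#         OllamaEmbeddings(model=OLLAMA_MODEL),
#     )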

def save_documents_to_txt(documents, output_dir):
    """Saves each document in the documents list as a separate .txt file."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)  # Create the output directory if it doesn't exist
    
    for i, document in enumerate(documents):
        file_name = f"document_part_{i+1}.txt"
        file_path = os.path.join(output_dir, file_name)
        
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(document.page_content)  # Assuming each document object has a 'page_content' attribute
        
        print(f"Saved: {file_path}")


def create_and_save_embeddings(split_documents, client_id):
    """Builds one FAISS index per chunk and saves each under my_embeddings/{client_id}/."""
    client_id = str(client_id)

    # Base folder structure: my_embeddings/{client_id}
    embedding_folder = os.path.join("my_embeddings", client_id)
    os.makedirs(embedding_folder, exist_ok=True)

    # Initialize the embedding model
    embeddings = OllamaEmbeddings(model=OLLAMA_MODEL)

    # Build and save a FAISS index for each document chunk
    for idx, doc in enumerate(split_documents, start=1):
        # Create a FAISS index for this chunk
        temp_db = FAISS.from_documents([doc], embedding=embeddings)

        # Save the index for this chunk under an incrementally numbered folder
        embedding_file_path = os.path.join(embedding_folder, f"faiss_index{idx}")
        temp_db.save_local(embedding_file_path)

        print(f"Saved FAISS embedding for document part {idx} in {embedding_file_path}")

def merge_all_faiss(client_id, base_path='my_embeddings'):
    """Merges every faiss_index{i} under base_path/{client_id} into one index,
    saves it as 'merged_faiss', deletes the per-chunk folders, and returns it."""
    embeddings = OllamaEmbeddings(model=OLLAMA_MODEL)
    merged_faiss = None

    # Construct the base folder path
    folder_path = os.path.join(base_path, str(client_id))

    # List all folders that match the pattern 'faiss_index{i}'
    faiss_folders = [
        folder for folder in os.listdir(folder_path)
        if folder.startswith('faiss_index') and folder[len('faiss_index'):].isdigit()
    ]

    # Sort folders numerically by the index extracted from 'faiss_index{i}'
    sorted_folders = sorted(faiss_folders, key=lambda x: int(x.replace('faiss_index', '')))

    # Load each store in order and merge it into the accumulator
    for folder in sorted_folders:
        faiss_path = os.path.join(folder_path, folder)
        print(f"Loading FAISS index from: {faiss_path}")
        current_faiss = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)

        # The first store initializes merged_faiss; later stores merge into it
        if merged_faiss is None:
            merged_faiss = current_faiss
        else:
            merged_faiss.merge_from(current_faiss)

    # Save the merged FAISS index to its own folder
    if merged_faiss is not None:
        merged_faiss.save_local(os.path.join(folder_path, 'merged_faiss'))

    # Delete the individual per-chunk index folders, keeping only 'merged_faiss'
    for folder in sorted_folders:
        faiss_path = os.path.join(folder_path, folder)
        try:
            shutil.rmtree(faiss_path)
            print(f"Deleted FAISS index folder: {faiss_path}")
        except FileNotFoundError:
            print(f"Folder not found: {faiss_path}")
        except OSError as e:
            print(f"Error deleting {faiss_path}: {e}")
    return merged_faiss
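# Sketch: the merged store behaves like any FAISS vectorstore:
#
#     merged = merge_all_faiss(198101)
#     hits = merged.similarity_search("company mission", k=3)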

class SlideContent(BaseModel):
    subheading: Optional[str] = Field(None, description="An optional subheading for the slide")
    paragraphs: List[str] = Field(..., description="List of paragraphs for the slide content")
    visualization_suggestion: str = Field(..., description="A specific and concise suggestion for a relevant visualization or image (max 5 words)")
    image: Optional[str] = Field(None, description="URL of the image for the slide")

class StorigoContent(BaseModel):
    slides: Dict[str, SlideContent] = Field(..., description="Dictionary of slide contents with slide numbers as keys")
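# The parser built from StorigoContent expects JSON shaped like this (illustrative values):
#
#     {"slides": {"slide_1": {"subheading": "Why It Matters",
#                             "paragraphs": ["..."],
#                             "visualization_suggestion": "team reviewing dashboard"}}}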
    
def generate_search_query(visualization_suggestion, slide_content):
    """Builds a short image-search query from the visualization suggestion plus slide context."""
    context_keywords = extract_context_keywords(slide_content)

    # Combine the visualization suggestion with context keywords
    combined_query = f"{visualization_suggestion} {' '.join(context_keywords)}"

    # Tokenize and drop common stop words
    words = re.findall(r'\w+', combined_query.lower())
    filtered_words = [word for word in words if word not in COMMON_WORDS]

    # Prioritize words from the visualization suggestion itself
    suggestion_words = visualization_suggestion.lower().split()
    prioritized_words = suggestion_words + [word for word in filtered_words if word not in suggestion_words]

    # Keep at most the first 5 words
    return " ".join(prioritized_words[:5])

def extract_context_keywords(slide_content):
    """Extracts up to 3 unique keywords from the slide's subheading and paragraphs."""
    text = f"{slide_content.subheading or ''} {' '.join(slide_content.paragraphs)}"
    words = re.findall(r'\w+', text.lower())
    keywords = [word for word in words if word not in COMMON_WORDS]
    # Note: set() does not preserve order, so the chosen keywords can vary between runs
    return list(set(keywords))[:3]
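# Worked example (hypothetical slide): with subheading "Adaptive Learning" and
# suggestion "students using tablets", generate_search_query yields something
# like "students using tablets adaptive learning".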

def fetch_pixabay_image(query):
    url = "https://pixabay.com/api/"
    params = {
        "key": PIXABAY_API_KEY,
        "q": query,
        "image_type": "photo",
        "orientation": "horizontal",
        "per_page": 5,  # Fetch top 5 images
        "safesearch": "true",
        "order": "relevance"
    }
    
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        if data["hits"]:
            # Pixabay has no explicit relevance score, so rank by engagement (likes + downloads)
            sorted_hits = sorted(data["hits"], key=lambda x: x["likes"] + x["downloads"], reverse=True)
            return sorted_hits[0]["webformatURL"]  # Return the top-ranked image
        else:
            print(f"No image found for query: {query}")
            return None
    except requests.RequestException as e:
        print(f"Error fetching image from Pixabay: {str(e)}")
        return None
    except Exception as e:
        print(f"Unexpected error in fetch_pixabay_image: {str(e)}")
        return None
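# Sketch (assumes PIXABAY_API_KEY is set and the network is reachable):
#
#     url = fetch_pixabay_image("students using tablets")
#     if url:
#         print(url)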

def get_valid_image(visualization_suggestion, slide_content, max_attempts=3):
    if not visualization_suggestion:
        print("No visualization suggestion provided.")
        return None

    for attempt in range(max_attempts):
        try:
            query = generate_search_query(visualization_suggestion, slide_content)
            print(f"Attempt {attempt + 1} to fetch image for query: {query}")
            image_url = fetch_pixabay_image(query)
            
            if image_url:
                print(f"Valid image found: {image_url}")
                return image_url
            else:
                print(f"No image URL returned for query: {query}")
            
            time.sleep(1)
        except Exception as e:
            print(f"Error in get_valid_image (attempt {attempt + 1}): {str(e)}")
    
    print(f"No valid image found after {max_attempts} attempts")
    return None

def generate_slide_content(vectors, num_slides, is_image):
    try:
        llm = ChatGroq(model_name='llama3-70b-8192', groq_api_key=GROQ_API_KEY)

        # Prompt template
        slide_content_template = """
        Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigo presentation.

        Each slide must include:
        - A clear and concise **subheading**
        - **Paragraphs** that effectively communicate the key ideas and insights
        - A specific, concise **visualization suggestion**

        **Context**: {context}

        Focus on creating content that is both informative and engaging. Ensure each slide:
        - Has a well-structured subheading that captures the main point
        - Uses clear and concise paragraphs to communicate important information

        Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
        - **Thought-provoking questions** to encourage reflection
        - **Relevant statistics** or data points that add credibility
        - **Industry insights** or emerging trends to demonstrate expertise
        - **Practical examples** or case studies to illustrate key concepts
        - **Calls to action** to guide the audience toward specific actions or takeaways

        For the visualization suggestion:
        - Provide a clear and specific description of an image that would be relevant to the slide content.
        - Keep it very concise, using a maximum of 5 words.
        - Focus on concrete objects, scenes, or concepts that can be easily visualized.
        - Avoid abstract or overly complex ideas.
        - Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

        Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external information not found in the source material.

        {format_instructions}

        The final output **must** be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}", in **strict sequential order**.

        Each slide object should contain the following fields:

        - "subheading" (if applicable): The title of the slide.
        - "paragraphs": A list of concise paragraphs that communicate the main points of the slide.
        - "visualization_suggestion": A suggestion that is very specific, context-aware, and no longer than 5 words.
        
        Please ensure the slides are generated in the correct order as defined in the embeddings or the document content.
        """
        # - "heading": The main title of the slide.
        # Using Pydantic parser for output formatting
        parser = PydanticOutputParser(pydantic_object=StorigoContent)
        print("BYEE")
        print(parser.get_format_instructions())
        #structured_llm = llm.with_structured_output(StorigoContent)
        slide_content_prompt = ChatPromptTemplate.from_template(slide_content_template)

        # Build the chain: gather inputs, fill the prompt, call the LLM, parse the JSON
        slide_content_chain = (
            {
                "context": lambda x: vectors.similarity_search(x["query"], k=3),
                "num_slides": lambda x: x["num_slides"],
                "format_instructions": lambda x: parser.get_format_instructions()
            }
            | slide_content_prompt
            | llm
            | parser
        )

        # Invoke the chain; the empty query simply retrieves the chunks nearest
        # to the empty-string embedding as generic context
        result = slide_content_chain.invoke({"query": "", "num_slides": num_slides})
        
        # Sort slides numerically by their index; a plain string sort would put
        # "slide_10" before "slide_2"
        ordered_slides = dict(sorted(result.slides.items(), key=lambda item: int(item[0].rsplit('_', 1)[-1])))

        # Image handling logic
        if is_image:
            for slide_key, slide_content in ordered_slides.items():
                if slide_content.visualization_suggestion:
                    image_url = get_valid_image(slide_content.visualization_suggestion, slide_content)
                    if image_url:
                        slide_content.image = image_url
                    else:
                        print(f"Warning: No suitable image found for slide {slide_key} after multiple attempts.")
                        slide_content.image = None
                else:
                    print(f"Warning: No visualization suggestion for slide {slide_key}.")
                    slide_content.image = None
        else:
            # If not an image slide, ensure the 'image' field is None
            for slide_content in ordered_slides.values():
                slide_content.image = None

        # Return the ordered slide content
        return StorigoContent(slides=ordered_slides)
    
    except Exception as e:
        raise RuntimeError(f"Error generating slide content: {e}") from e
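# Sketch of a direct call, assuming a merged FAISS store from merge_all_faiss:
#
#     content = generate_slide_content(merged_store, 4, is_image=False)
#     print(content.slides["slide_1"].subheading)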

async def main(file_path, output_dir, client_id, num_slides, is_image=False):
    start_time = time.time()
    print(f"Start Time: {start_time}")

    # Crawl the site and load the saved markdown. Note: the URL is currently
    # hard-coded, so the file_path argument is ignored at this step.
    url = "https://edurigo.com/"
    saved_file = await crawl_url(url)
    docs = read_file(saved_file)

    # Clean the crawled text with the LLM, then split it into semantic chunks
    clean = clean_using_llm(docs)
    embeddings = OllamaEmbeddings(model=OLLAMA_MODEL)
    split_documents = split_text_with_semantic_chunker(clean, embeddings)

    # Persist the chunks, build per-chunk FAISS indexes, merge, and generate slides
    save_documents_to_txt(split_documents, output_dir)
    create_and_save_embeddings(split_documents, client_id)
    merged_embeddings = merge_all_faiss(client_id)
    slide_content = generate_slide_content(merged_embeddings, num_slides, is_image)
    print(slide_content)

    end_time = time.time()
    print(f"Process took {end_time - start_time:.2f} seconds")
    return slide_content
    
if __name__ == "__main__":
    input_path = 'result.txt'
    output_dir = f'temp/{input_path}'

    client_id = 198101
    num_slides = 4
    is_image = True

    # Run the async pipeline end to end
    asyncio.run(main(input_path, output_dir, client_id, num_slides, is_image))