from phi.knowledge.pdf import PDFKnowledgeBase, PDFReader
#from phi.vectordb.pgvector import PgVector
from phi.embedder.sentence_transformer import SentenceTransformerEmbedder
from phi.agent import Agent
#from knowledge_base import knowledge_base
from phi.vectordb.chroma import ChromaDb
from phi.model.ollama import Ollama
from phi.model.groq import Groq
import os
import time
import random
import re
import json
import requests
#from langchain_community.embeddings import OllamaEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from crawl4ai import AsyncWebCrawler
import asyncio
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader, UnstructuredPowerPointLoader
from PyPDF2 import PdfReader
from langchain_experimental.text_splitter import SemanticChunker
from pydantic import BaseModel, Field
from typing import List, Optional, Dict
from urllib.parse import urlparse
from typing import Union, Dict
from langchain_core.runnables import RunnableLambda
import shutil
from pptx import Presentation
from langchain_ollama import OllamaLLM
from langchain_ollama import ChatOllama

groq_api_key = os.environ.get("GROQ_API_KEY")  # never hardcode secrets; export GROQ_API_KEY instead
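# The key is only needed if a Groq-backed model is actually constructed; nothing below
# does that yet. Hedged sketch (model names are assumptions, not taken from this file):
# groq_model = Groq(id="llama-3.3-70b-versatile", api_key=groq_api_key)
# groq_chat = ChatGroq(model="llama-3.1-8b-instant", api_key=groq_api_key)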


pdf_urls = [
    "1742835480Cybersecurity_Awareness_Training_Content.pdf",
    #"https://api.edurigo.com/uploads/reference_materials/1725368588CleanMax_Code_Of_Conduct_Supplier_POs.pdf",
]
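# pdf_urls is never consumed below; this hedged sketch fetches any http(s) entries into
# data/ so the local listing that follows picks them up (bare filenames are skipped).
os.makedirs("data", exist_ok=True)
for url in pdf_urls:
    if url.startswith("http"):
        target = os.path.join("data", os.path.basename(urlparse(url).path))
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(target, "wb") as f:
            f.write(response.content)
        print(f"Downloaded {url} -> {target}")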
pdf_files = [f for f in os.listdir('data') if f.endswith('.pdf')]
print("PDF Files:", pdf_files)

knowledge_base = PDFKnowledgeBase(
    path="data",
    num_documents=5,  # number of relevant documents returned per knowledge search
    vector_db=ChromaDb(
        collection="multi_pdf_collection",
        path="output_chat",  # local storage path for Chroma
        #embedder=Ollama(id="nomic-embed-text:latest")
        embedder=SentenceTransformerEmbedder(),
    ),
)
for pdf in pdf_files:
    try:
        print(f"Attempting to load: {pdf}")
        # Load each PDF in isolation so one bad file does not abort the whole run
        temp_base = PDFKnowledgeBase(
            path=f"data/{pdf}",
            vector_db=ChromaDb(
                collection=f"collection_{os.path.splitext(pdf)[0]}",  # drop '.pdf' to keep the collection name clean
                path="output_chat",
                embedder=SentenceTransformerEmbedder(),
            ),
        )
        temp_base.load(recreate=True)
        print(f"Successfully loaded: {pdf}")
    except Exception as e:
        print(f"Error loading {pdf}: {e}")

agent = Agent(
    model=Ollama(id="llama3.1:8b"),
    knowledge=knowledge_base,
    search_knowledge=True,
)
agent.knowledge.load(recreate=True)
#print(knowledge_base.documents)


agent.print_response("What is this document about?")
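# Optional sanity check, bypassing the LLM: query the vector store directly.
# Hedged sketch; assumes AgentKnowledge.search() and that the PDFs are already embedded.
# for doc in knowledge_base.search("What topics does the training cover?", num_documents=3):
#     print(doc.name, doc.content[:200])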

# def load_pdf(file_path):
#     """Load PDF and return documents."""
#     #loader = PyPDFLoader(f"{file_path}.pdf")
#     loader = PyPDFLoader(file_path)
#     print(loader)
#     return loader.load()

# def split_text_with_semantic_chunker(docs, embeddings):
#     """Splits the text into semantic chunks using the given embeddings."""
#     text_splitter = SemanticChunker(
#         embeddings, breakpoint_threshold_type="percentile"  # Can be changed to "standard_deviation", "interquartile"
#     )
#     documents = text_splitter.create_documents([doc.page_content for doc in docs])
#     #documents = text_splitter.create_documents([doc['page_content'] for doc in docs])
#     print("Documents split into semantic chunks.")
#     return documents


# def create_and_save_embeddings(split_documents, client_id):

#     client_id = str(client_id)
#     # Base folder structure: my_embeddings/{client_id}
#     embedding_folder_base = os.path.join("my_embeddings", client_id)
    
#     # Make sure the base embedding folder exists
#     os.makedirs(embedding_folder_base, exist_ok=True)
    
#     # Initialize the embedding model
#     embeddings = OllamaEmbeddings(model='nomic-embed-text')
    
#     # Iterate over each document chunk and persist a FAISS index per chunk
#     for idx, doc in enumerate(split_documents, start=1):
#         # All per-chunk indexes are written directly into the client's base folder
#         embedding_folder = embedding_folder_base
#         os.makedirs(embedding_folder, exist_ok=True)
        
#         # Create a FAISS index for this chunk
#         temp_db = FAISS.from_documents([doc], embedding=embeddings)
        
#         # Save the FAISS index for this chunk with an incremental filename
#         embedding_file_path = os.path.join(embedding_folder, f"faiss_index{idx}")
#         temp_db.save_local(embedding_file_path)
        
#         print(f"Saved FAISS embedding for document part {idx} as faiss_index{idx} in {embedding_folder}")

    

# def merge_all_faiss(client_id, base_path='my_embeddings'):
#     embeddings = OllamaEmbeddings(model="nomic-embed-text")
#     merged_faiss = None
    
#     folder_path = f'{base_path}/{client_id}'
#     faiss_files = [
#         folder for folder in os.listdir(folder_path) 
#         if folder.startswith('faiss_index') and folder[len('faiss_index'):].isdigit()
#     ]
    
#     sorted_files = sorted(faiss_files, key=lambda x: int(x.replace('faiss_index', '')))

#     for file in sorted_files:
#         faiss_path = os.path.join(folder_path, file)
#         print(f"Loading FAISS index from: {faiss_path}")
#         current_faiss = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)

#         # Extract document content
#         current_texts = [current_faiss.docstore.search(doc_id).page_content 
#                          for doc_id in current_faiss.index_to_docstore_id.values()]

#         if merged_faiss is None:
#             merged_faiss = current_faiss
#         else:
#             # Only add texts — this adds both vectors and metadata
#             merged_faiss.add_texts(current_texts)

#     if merged_faiss is not None:
#         merged_faiss.save_local(f'{folder_path}/merged_faiss')
#         print(f"Merged FAISS index saved as merged_faiss")

#     # Clean up individual indexes
#     for file in sorted_files:
#         faiss_path = os.path.join(folder_path, file)
#         try:
#             shutil.rmtree(faiss_path)
#             print(f"Deleted FAISS index folder: {faiss_path}")
#         except FileNotFoundError:
#             print(f"Folder not found: {faiss_path}")
#         except OSError as e:
#             print(f"Error deleting {faiss_path}: {e}")

#     return merged_faiss
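# # Hedged usage sketch: after merge_all_faiss() runs, the merged index can be reloaded
# # and queried directly (client_id 1673 mirrors the __main__ block below):
# # db = FAISS.load_local('my_embeddings/1673/merged_faiss',
# #                       OllamaEmbeddings(model='nomic-embed-text'),
# #                       allow_dangerous_deserialization=True)
# # for hit in db.similarity_search("phishing warning signs", k=3):
# #     print(hit.page_content[:200])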


# def main(file_path, output_dir, client_id, is_image):
#     embeddings = OllamaEmbeddings(model='nomic-embed-text')
#     text = load_pdf(file_path)
#     split_documents = split_text_with_semantic_chunker(text, embeddings)
#     # create_and_save_embeddings writes one FAISS index per chunk; it returns nothing
#     create_and_save_embeddings(split_documents, client_id)
#     merged_faiss = merge_all_faiss(client_id)


# if __name__ == "__main__":
#     # Run the pipeline synchronously on the sample PDF
#     input = 'cyber.pdf'
#     #input = 'https://edurigo.com/'
#     output_dir = f'temp/{input}'
#     embedding_folder_base = 'output_embeddings'
    
#     client_id = 1673
#     num_slides = 15
#     is_image = False
#     main(input, output_dir, client_id, is_image)
#     #asyncio.run(main(input, output_dir,client_id, is_image))


