from dotenv import load_dotenv
load_dotenv()

# bring in deps
from llama_cloud_services import LlamaParse
from llama_index.core import SimpleDirectoryReader
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama


# Embedding model used further down when the markdown chunks are indexed.
embedding_model = OllamaEmbeddings(model='nomic-embed-text')

# Prompt handed to LlamaParse describing the markdown layout we want back.
parsing_instruction = """
Please parse the document into markdown format using the following rules:

1. Use `#` only for the main document title .
2. Use `##` for top-level sections like.
3. Use `###` for subsections under those .
4. Preserve the original order and structure of sections as in the document.
5. Do not infer or hallucinate section headers — only convert explicitly visible ones.
6. Ensure that all content under a heading is properly indented and grouped.
7. Tables, bullet points, and bold formatting should be preserved as they appear.

Output clean, readable markdown text without metadata or footers unless explicitly part of the content.
"""

# set up parser ("markdown" and "text" are the available result types)
parser = LlamaParse(
    result_type="markdown",
    system_prompt=parsing_instruction,
)

# Route every .pdf that SimpleDirectoryReader loads through LlamaParse.
file_extractor = {".pdf": parser}

# Only `input_files` is passed: SimpleDirectoryReader gives it precedence over
# `input_dir`, so the `input_dir="parser_folder"` that used to be passed here
# was silently ignored. 'cyber.pdf' is therefore resolved relative to the
# current working directory, exactly as before.
documents = SimpleDirectoryReader(
    input_files=['cyber.pdf'],
    file_extractor=file_extractor,
).load_data()
print("Done1")
# print(f"Total files parsed: {len(documents)}")
# for i, doc in enumerate(documents):
#     with open(f"parsed_outputs_{i+1}.md", "w", encoding="utf-8") as f:
#         f.write(doc.text)  # or doc.page_content depending on LlamaIndex version

#documents = parser.load_data(["/parser_folder"])
import os 
# folder_path = "parse_folder1"
# combined_text = ""

# # Loop through all .md files in the folder
# for filename in os.listdir(folder_path):
#     if filename.endswith(".md"):
#         with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
#             combined_text += f.read() + "\n\n"  # Add spacing between files

# # Save combined content to a new markdown file
# with open("combined_output1.md", "w", encoding="utf-8") as f:
#     f.write(combined_text)

# print("✅ Combined markdown saved as combined_output.md")
import re

# Directory scanned for the .md files to merge. This rebinds `folder_path`,
# which was set to "parser_folder" earlier in the script.
folder_path = "parse_folder1"  # Replace with your folder path
# Merged markdown text; filled in by the loop further down.
combined_text = ""

# Function to extract numbers for sorting
def extract_number(filename):
    """Return the first run of digits in *filename* as an int.

    Names that contain no digit map to ``float('inf')`` so that they sort
    after every numbered file.
    """
    found = re.search(r'(\d+)', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

# Collect the markdown file names and order them by their embedded number.
sorted_files = [entry for entry in os.listdir(folder_path) if entry.endswith(".md")]
sorted_files.sort(key=extract_number)

# Read each file in order, following every file's text with a blank line.
pieces = []
for filename in sorted_files:
    md_path = os.path.join(folder_path, filename)
    with open(md_path, "r", encoding="utf-8") as handle:
        pieces.append(handle.read())
    pieces.append("\n\n")  # Add spacing between files
combined_text += "".join(pieces)

# Write the merged text out as a single markdown file.
with open("combined_output1.md", "w", encoding="utf-8") as out_file:
    out_file.write(combined_text)

print("✅ Combined markdown saved as combined_output1.md")
from langchain_text_splitters import MarkdownHeaderTextSplitter

md_text = """# What is Cyber security and Information security

# Definition

Information security: Protecting information, regardless of its format, from unauthorized access, use, disclosure, disruption, modification, or destruction.

Broader: Covers all forms of information, including physical and digital.

Cybersecurity: Protecting systems, networks, and data from cyber threats, focusing on technology and digital environments.

Narrower: Focuses specifically on protecting digital systems, networks, and data.

# Principles of Cybersecurity

In cybersecurity, the CIA Triad is a foundational model representing the three core principles of information security. These principles ensure the effective protection of data and systems.

- Confidentiality: Ensures that information is accessible only to those authorized to access it.
- Integrity: Ensures that data is accurate, complete, and unaltered during storage or transmission.
- Availability: Ensures that information and resources are accessible to authorized users when needed.

# Different types of cyber threats

- Social Engineering: Social engineering is a manipulation technique that exploits human nature to gain private information, access, or valuables. In cybercrime, these “human hacking” scams tend to lure unsuspecting users into exposing data, spreading malware infections, or giving access to restricted systems. Attacks can happen online, in-person, and via other interactions.
- Malware: Harmful software that can access a system's data, such as viruses, spyware, ransomware, and worms.
- Phishing:

# Cybersecurity Threats

- Phishing: A deceptive attack where cybercriminals impersonate legitimate entities to trick victims into revealing sensitive information
- Denial-of-Service (DoS) attack: A cyber attack that overwhelms a system's resources, making it unable to respond to legitimate service requests
- Distributed denial-of-service (DDoS) attack: A similar attack to a DoS attack, but initiated by many malware-infected host machines
- SQL injection: An attack that exploits vulnerabilities in databases by injecting malicious code into user inputs
- Cross-Site Scripting (XSS): An attack that involves injecting malicious code into a website, but the code only runs in the user's browser
- Ransomware: A malicious form of software that encrypts a victim's files or locks them out of their computer system, demanding a ransom payment in exchange

# Safe Internet and Device Usage

# Verify Website Security

- Look for HTTPS in the URL.
- Avoid clicking on pop-ups or ads promising free products or rewards.

# Use Strong Passwords

- Make passwords at least 8 characters long with a mix of letters, numbers, and symbols.
- Avoid using common words or phrases.
- Use a password manager to store and generate secure passwords.

# Enable Multi-Factor Authentication (MFA)

- Add an extra layer of protection to your accounts by requiring a second form of verification (e.g., a code sent to your phone).

# Secure Browsing

- Keep Software Updated: Ensure your browser and plugins are up to date to patch vulnerabilities.
- Use a Secure Browser: Consider browsers with built‑in security features.
- Avoid Clicking on Ads: Block or ignore pop‑ups and ads to reduce risk.
- Use Private Browsing: Use incognito or private modes for sensitive activities.
- Limit Cookies: Manage browser settings to minimize cookie tracking.
- Avoid Public Wi-Fi: Use a VPN when browsing on unsecured networks.
- Log Out After Use: Always log out of accounts, especially on shared or public devices.
- Be cautious of typosquatting domains (e.g., goggle.com instead of google.com)

# Tips for Public Wi-Fi Use

- Avoid Sensitive Transactions: Do not access banking or sensitive accounts on public Wi-Fi.
- Use a VPN: Encrypt your internet connection to protect your data from potential eavesdroppers.
- Turn Off Sharing: Disable file and printer sharing while connected to public networks.
- Forget Networks After Use: Ensure your device does not automatically reconnect to public Wi-Fi.
- Verify Network Authenticity: Confirm the network name with the provider to avoid connecting to fake networks.
- Keep Software Updated: Regularly update your device’s operating system and security software.

# Email Security Tips

- Think Before You Click: Avoid clicking on suspicious links or attachments in emails.
- Verify Senders: Check the sender’s email address carefully for signs of spoofing.
- Beware of Urgent Language: Scammers often use urgency to trick you into taking action.
- Enable Spam Filters: Use your email provider’s spam filtering features to reduce unwanted emails.
- Do Not Share Sensitive Information: Avoid sending passwords, financial details, or personal information via email.
- Use Encryption: For highly sensitive communications, use email encryption tools.

# Update Email Passwords Regularly

Use strong, unique passwords and change them periodically.

# Report Suspicious Emails

Notify your IT team or email provider about phishing attempts.

# Dos and Don’ts

- Do not change any hardware configuration, settings in the operating systems or any applications installed on their desktops.
- Do not install any software or applications on your desktops/ laptops that is not authorized by the CMES’s IT team or is not essential to CMES’s business.
- USB ports, Compact Disk (CD)/ DVD, memory card access are disabled by default. If the user needs to enable them, approval from the user’s manager and IT team shall be required.
- Take appropriate measures for physical protection of laptops such as not leaving laptops unattended in public places or while travelling.
- Do not connect removable media such as CD/ DVD, USB drives, and other portable storage media from an unknown source to a CMES system.
- Removable media containing sensitive or confidential information shall be stored in encrypted format by using CMES approved tool.
- Removable media containing sensitive information must not be left out in the open or allowed to be vulnerable to opportunistic theft.

# Clear Desk and Clear Screen

- Ensure that desks and other work areas are kept clear of papers and any storage media when unattended.
- All workstations shall have password-locked screen savers enabled to activate after 5 minutes of inactivity.

# Anti-virus/ Anti-malware

- Users shall not disable the installed anti-virus agent or change its settings defined during installation.
- Users shall report to the IT team for unpatched systems or any virus that is detected in the system and not cleaned by the anti-virus software.

# Hardware and Software

- If you suspect any non-adherence or suspicious activities or email, kindly inform IT team at itsupport@cleanmax.com
- To prevent the introduction of malicious code and protect the integrity of CMES assets, all hardware and software shall be obtained by raising request through the IT team.
- CMES’s IT team shall ensure that an approved list of authorized software is maintained.
- All employees shall abide by the software copyright law and shall not obtain, install, replicate, transfer or use software except as permitted under the licensing agreements.
- Users shall ensure that personal software are not used on CMES owned assets to protect the integrity of CMES’s assets and information.

# Incident Reporting

# Blogging and social media

- Access to social media, blogging/ micro-blogging, and video sharing websites such as Facebook, Twitter, LinkedIn, YouTube, etc. shall be restricted to ensure productivity of the employees.
- No official matter, incidents and happenings that can align the name of CMES shall be discussed/ posted on social media platform by employees. Reporting of such an event may lead to sever consequences for the employees(s).

"""
# (heading marker, metadata key) pairs handed to MarkdownHeaderTextSplitter.
# Level 3 is "###" (it was "####"), matching the parsing prompt, which asks
# for `###` subsections; with "####" those headings would never be split on.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
from langchain_chroma import Chroma

# Split the markdown text into chunks at the configured heading levels.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
md_header_splits = markdown_splitter.split_text(md_text)

# On-disk location of the Chroma index; shared by the write and the re-open.
persist_directory = "vectorstores/chroma_index"

# Embed the chunks and store them. With `persist_directory` set, langchain_chroma
# writes to disk on its own; its Chroma class has no persist() method, so the
# explicit persist() call that used to follow has been removed.
# NOTE(review): each run adds the chunks to the same on-disk collection again;
# confirm whether duplicate entries across runs matter.
vectorstore = Chroma.from_documents(
    documents=md_header_splits,
    embedding=embedding_model,
    persist_directory=persist_directory,
)
print(f"✅ Vectorstore saved at {persist_directory}")

# Re-open the index from disk, the way a later run would load it.
vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory=persist_directory,
)

# Retrieve the 5 closest chunks for a sample query as a smoke test.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

query = "Different types of cyber threats"
# `invoke` replaces the deprecated `get_relevant_documents`.
docs = retriever.invoke(query)
print(docs)

#all_docs = vectorstore.similarity_search("", k=len(vectorstore.index_to_docstore_id))
#full_text = "\n\n".join([doc.page_content for doc in all_docs])


import re

import ollama


def _parse_slide_count(reply):
    """Return the first positive whole number found in *reply*.

    Only the first run of digits is used, so a reply such as "10-12 slides"
    yields 10 rather than the concatenation 1012.

    Raises:
        ValueError: if *reply* contains no digits or the number is zero.
    """
    found = re.search(r"\d+", reply)
    if found is None:
        raise ValueError(f"No slide count found in LLM reply: {reply!r}")
    count = int(found.group())
    if count < 1:
        raise ValueError(f"LLM returned a non-positive slide count: {reply!r}")
    return count


# Ask the model how many slides the document is worth; a bare number is expected.
slide_count_prompt = f"""
You are an AI presentation planner.

Given the following document content, estimate how many clear, presentable slides can be made from it. Just return a number — no explanation.

---START---
{md_text}
---END---
"""

response = ollama.chat(model="llama3:8b", messages=[
    {"role": "user", "content": slide_count_prompt}
])

# The model may wrap the number in extra words, so parse it defensively.
num_slides = _parse_slide_count(response['message']['content'])
print(f"🧠 LLaMA suggests creating {num_slides} slides.")

import json
import re

# Ask the model for the slide contents as a JSON array of {title, bullets} objects.
slide_gen_prompt = f"""
You are a presentation-writing assistant.

Based on the following content, generate exactly {num_slides} slides.
Each slide must include:
- A title
- 3 to 8 concise bullet points
- Dont loose information of topics
Output a valid JSON array like:
[
  {{
    "title": "Slide Title",
    "bullets": ["Bullet 1", "Bullet 2", ...]
  }},
  ...
]

---START---
{md_text}
---END---
"""

response = ollama.chat(model="llama3:8b", messages=[
    {"role": "user", "content": slide_gen_prompt}
])
print("📝 LLM Response:")
print(response['message']['content'])

raw_output = response['message']['content']

# The model may surround the JSON with prose, so cut out the span that runs
# from the first "[ {" to the last "} ]" (the pattern is greedy) and decode it.
match = re.search(r'\[\s*{.*}\s*]', raw_output, re.DOTALL)
slides = None
if match:
    try:
        slides = json.loads(match.group(0))
    except json.JSONDecodeError as err:
        print(f"❌ Could not decode the JSON array: {err}")

if slides is None:
    print("❌ No valid JSON found.")
    print("🧾 Raw output was:\n", raw_output)
    # Stop here: continuing without `slides` would only fail later with a
    # NameError when the presentation is built.
    raise SystemExit(1)

print(f"✅ Parsed {len(slides)} slides from LLaMA output")

from pptx import Presentation

def build_ppt_from_json(slides, filename="ai_slides.pptx"):
    """Write a PowerPoint file with one slide per entry in *slides*.

    Args:
        slides: List of dicts, each with a "title" and a "bullets" list, as
            decoded from the LLM's JSON. Because that JSON is not guaranteed
            to be complete, a missing or null key is treated as empty and a
            bare string under "bullets" is treated as a single bullet.
        filename: Path of the .pptx file to write.
    """
    prs = Presentation()
    layout = prs.slide_layouts[1]  # Title + content

    for slide in slides:
        title = slide.get("title") or ""
        bullets = slide.get("bullets") or []
        if isinstance(bullets, str):
            # A lone string would otherwise be iterated character by character.
            bullets = [bullets]

        s = prs.slides.add_slide(layout)
        s.shapes.title.text = str(title)
        # One line per bullet in the content placeholder.
        s.placeholders[1].text = "\n".join(f"- {b}" for b in bullets)

    prs.save(filename)
    print(f"✅ PPT created: {filename} with {len(slides)} slides")

# Render the parsed slide dicts into the default output file (ai_slides.pptx).
build_ppt_from_json(slides)