from dotenv import load_dotenv
load_dotenv()

# bring in deps
from llama_cloud_services import LlamaParse
from llama_index.core import SimpleDirectoryReader
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
import os
import re
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_chroma import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

# Run cyber.pdf through Marker's PdfConverter and print the extracted text.
converter = PdfConverter(artifact_dict=create_model_dict())
rendered = converter("cyber.pdf")
# text_from_rendered yields three values; the middle one is not needed here.
text, _, images = text_from_rendered(rendered)
print(text)
# NOTE(review): a bare `exit` expression used to follow this print. It never
# called the builtin, so it did nothing and the script always continued. It
# has been removed; call sys.exit() here if stopping after the Marker dump was
# actually the intent.

# Ollama embedding model used by the semantic chunker and the FAISS index.
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
# Read the OpenAI key from the environment instead of committing it to source
# control; load_dotenv() at the top of the file also loads it from a .env file.
# The value is None when the variable is not set.
# NOTE(review): the key that was previously hard-coded on this line must be
# treated as leaked and revoked.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Prompt given to LlamaParse as its system prompt: it spells out how heading
# levels, ordering and inline formatting should appear in the markdown output.
# The text starts and ends with a newline.
parsing_instruction = (
    "\n"
    "Please parse the document into markdown format using the following rules:\n"
    "\n"
    "1. Use `#` only for the main document title .\n"
    "2. Use `##` for top-level sections like.\n"
    "3. Use `###` for subsections under those .\n"
    "4. Preserve the original order and structure of sections as in the document.\n"
    "5. Do not infer or hallucinate section headers — only convert explicitly visible ones.\n"
    "6. Ensure that all content under a heading is properly indented and grouped.\n"
    "7. Tables, bullet points, and bold formatting should be preserved as they appear.\n"
    "\n"
    "Output clean, readable markdown text without metadata or footers unless explicitly part of the content.\n"
)

# LlamaParse client that returns markdown and is steered by parsing_instruction.
# result_type may be "markdown" or "text".
parser = LlamaParse(system_prompt=parsing_instruction, result_type="markdown")
# Load cyber.pdf through SimpleDirectoryReader, sending every .pdf to the
# LlamaParse client, then echo what came back.
folder_path = "parse_folder"
file_extractor = {".pdf": parser}
# NOTE(review): both input_files and input_dir are supplied here; confirm
# which of the two the reader honours when given both.
documents = SimpleDirectoryReader(
    input_dir=folder_path,
    input_files=["cyber.pdf"],
    file_extractor=file_extractor,
).load_data()
print(documents)
print("Done1")

# Directory the later combine step lists for .md files, and the string that
# step appends to.
folder_path, combined_text = "parse_folder123", ""

print(f"Total files parsed: {len(documents)}")

# Write each parsed document to parsed_outputs_<n>.md (numbered from 1) in the
# current working directory.
# NOTE(review): these files are not written into folder_path, which is the
# directory the combine step reads from - confirm that is intended.
for page_no, doc in enumerate(documents, start=1):
    output_name = f"parsed_outputs_{page_no}.md"
    with open(output_name, "w", encoding="utf-8") as out_file:
        # or doc.page_content depending on LlamaIndex version
        out_file.write(doc.text)

# NOTE(review): this rebinds `documents`, but nothing later in the file reads
# it, and "/parser_folder" differs from the "parse_folder" name used earlier -
# verify the path or whether this call is still needed.
documents = parser.load_data(["/parser_folder"])

#folder_path = "parse_folder1"  # Replace with your folder path
# combined_text = ""

def extract_number(filename):
    """Return the first run of digits in ``filename`` as an int.

    Names that contain no digits yield ``float('inf')`` so that, when this is
    used as a sort key, they are ordered after every numbered name.
    """
    found = re.search(r'(\d+)', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

# Collect the markdown files in folder_path, ordered by the first number in
# each name (extract_number puts names without a number last).
sorted_files = sorted(
    (name for name in os.listdir(folder_path) if name.endswith(".md")),
    key=extract_number,
)

# Read each file once and join the pieces in a single pass rather than growing
# the string with repeated `+=`. Every file's text is followed by a blank line.
file_texts = []
for filename in sorted_files:
    with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
        file_texts.append(f.read() + "\n\n")
combined_text += "".join(file_texts)

print("Combined_text ::")
print(combined_text)

# Save combined content
with open("combined_output1.md", "w", encoding="utf-8") as f:
    f.write(combined_text)
# The combined text is echoed once (before saving); the second, identical dump
# that used to follow the save has been dropped.
print("✅ Combined markdown saved as combined_output1.md")


# Hard-coded markdown sample that the chunking/embedding step and the
# slide-generation prompt further down take as their input. Those steps read
# md_text, not the combined_text assembled from files earlier in the script.
md_text = """# What is Cyber security and Information security

## Definition
Information security: Protecting information, regardless of its format, from unauthorized access, use, disclosure, disruption, modification, or destruction.
Broader: Covers all forms of information, including physical and digital.
Cybersecurity: Protecting systems, networks, and data from cyber threats, focusing on technology and digital environments.
Narrower: Focuses specifically on protecting digital systems, networks, and data.

## Principles of Cybersecurity
In cybersecurity, the CIA Triad is a foundational model representing the three core principles of information security. These principles ensure the effective protection of data and systems.
- **Confidentiality**: Ensures that information is accessible only to those authorized to access it.
- **Integrity**: Ensures that data is accurate, complete, and unaltered during storage or transmission.
- **Availability**: Ensures that information and resources are accessible to authorized users when needed.

## Different types of cyber threats
- **Social Engineering**
Social engineering is a manipulation technique that exploits human nature to gain private information, access, or valuables. In cybercrime, these “human hacking” scams tend to lure unsuspecting users into exposing data, spreading malware infections, or giving access to restricted systems. Attacks can happen online, in-person, and via other interactions.

- **Malware**
Harmful software that can access a system's data, such as viruses, spyware, ransomware, and worms.

- **Phishing**

# Cybersecurity Threats and Safe Practices

## Types of Cyber Attacks
### Phishing
A deceptive attack where cybercriminals impersonate legitimate entities to trick victims into revealing sensitive information.

### Denial-of-Service (DoS) attack
A cyber attack that overwhelms a system's resources, making it unable to respond to legitimate service requests.

### Distributed denial-of-service (DDoS) attack
A similar attack to a DoS attack, but initiated by many malware-infected host machines.

### SQL injection
An attack that exploits vulnerabilities in databases by injecting malicious code into user inputs.

### Cross-Site Scripting (XSS)
An attack that involves injecting malicious code into a website, but the code only runs in the user's browser.

### Ransomware
A malicious form of software that encrypts a victim's files or locks them out of their computer system, demanding a ransom payment in exchange.

## Safe Internet and Device Usage
### Verify Website Security
- Look for HTTPS in the URL.
- Avoid clicking on pop-ups or ads promising free products or rewards.

### Use Strong Passwords
- Make passwords at least 8 characters long with a mix of letters, numbers, and symbols.
- Avoid using common words or phrases.
- Use a password manager to store and generate secure passwords.

### Enable Multi-Factor Authentication (MFA)
- Add an extra layer of protection to your accounts by requiring a second form of verification (e.g., a code sent to your phone).

# Secure Browsing

## Tips for Secure Browsing
- Keep Software Updated: Ensure your browser and plugins are up to date to patch vulnerabilities.
- Use a Secure Browser: Consider browsers with built‑in security features.
- Avoid Clicking on Ads: Block or ignore pop‑ups and ads to reduce risk.
- Use Private Browsing: Use incognito or private modes for sensitive activities.
- Limit Cookies: Manage browser settings to minimize cookie tracking.
- Avoid Public Wi-Fi: Use a VPN when browsing on unsecured networks.
- Log Out After Use: Always log out of accounts, especially on shared or public devices.
- Be cautious of typosquatting domains (e.g., goggle.com instead of google.com).

## Tips for Public Wi-Fi Use
- Avoid Sensitive Transactions: Do not access banking or sensitive accounts on public Wi-Fi.
- Use a VPN: Encrypt your internet connection to protect your data from potential eavesdroppers.
- Turn Off Sharing: Disable file and printer sharing while connected to public networks.
- Forget Networks After Use: Ensure your device does not automatically reconnect to public Wi-Fi.
- Verify Network Authenticity: Confirm the network name with the provider to avoid connecting to fake networks.
- Keep Software Updated: Regularly update your device’s operating system and security software.

## Email Security Tips
- Think Before You Click: Avoid clicking on suspicious links or attachments in emails.
- Verify Senders: Check the sender’s email address carefully for signs of spoofing.
- Beware of Urgent Language: Scammers often use urgency to trick you into taking action.
- Enable Spam Filters: Use your email provider’s spam filtering features to reduce unwanted emails.
- Do Not Share Sensitive Information: Avoid sending passwords, financial details, or personal information via email.
- Use Encryption: For highly sensitive communications, use email encryption tools.

# Security Guidelines

## Update Email Passwords Regularly
- Use strong, unique passwords and change them periodically.
- Report Suspicious Emails: Notify your IT team or email provider about phishing attempts.

## Dos and Don’ts
- Do not change any hardware configuration, settings in the operating systems or any applications installed on their desktops.
- Do not install any software or applications on your desktops/laptops that is not authorized by the CMES’s IT team or is not essential to CMES’s business.
- USB ports, Compact Disk (CD)/DVD, memory card access are disabled by default. If the user needs to enable them, approval from the user’s manager and IT team shall be required.
- Take appropriate measures for physical protection of laptops such as not leaving laptops unattended in public places or while travelling.
- Do not connect removable media such as CD/DVD, USB drives, and other portable storage media from an unknown source to a CMES system.
- Removable media containing sensitive or confidential information shall be stored in encrypted format by using CMES approved tool.
- Removable media containing sensitive information must not be left out in the open or allowed to be vulnerable to opportunistic theft.

## Clear Desk and Clear Screen
- Ensure that desks and other work areas are kept clear of papers and any storage media when unattended.
- All workstations shall have password-locked screen savers enabled to activate after 5 minutes of inactivity.

## Anti-virus/ Anti-malware
- Users shall not disable the installed anti-virus agent or change its settings defined during installation.
- Users shall report to the IT team for unpatched systems or any virus that is detected in the system and not cleaned by the anti-virus software.

# CURRENT PAGE

## Hardware and Software
•  If you suspect any non-adherence or suspicious activities or email, kindly inform IT team at itsupport@cleanmax.com
•  To prevent the introduction of malicious code and protect the integrity of CMES assets, all hardware and software shall be obtained by raising request through the IT team.
•  CMES’s IT team shall ensure that an approved list of authorized software is maintained.
•  All employees shall abide by the software copyright law and shall not obtain, install, replicate, transfer or use software except as permitted under the licensing agreements.
•  Users shall ensure that personal software are not used on CMES owned assets to protect the integrity of CMES’s assets and information.

## Incident Reporting
•  Access to social media, blogging/ micro-blogging, and video sharing websites such as Facebook, Twitter, LinkedIn, YouTube, etc. shall be restricted to ensure productivity of the employees.
•  No official matter, incidents and happenings that can align the name of CMES shall be discussed/ posted on social media platform by employees. Reporting of such an event may lead to sever consequences for the employees(s).


"""

# (heading marker, metadata key) pairs intended for MarkdownHeaderTextSplitter
# (its use is currently commented out). The third level is "###" to match the
# subsection headings that actually occur in md_text; the previous "####"
# marker matched no heading in the document even though it was labelled
# "Header 3".
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
# md_header_splits = markdown_splitter.split_text(md_text)
# print("First")
# print(md_header_splits[0].page_content)
# print("Second")
# print(md_header_splits[1])
# vectorstore = FAISS.from_documents(md_header_splits, embedding_model)
# vectorstore.save_local("vectorstores/markdown_index")

print("Open AI Embeddings")
# NOTE(review): this OpenAI embeddings object is built but the active splitter
# and index use the Ollama embedding_model instead; only the commented-out
# splitter variant refers to it.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# text_splitter = SemanticChunker(embeddings)

# Split md_text into chunks with SemanticChunker and show the first chunk.
text_splitter = SemanticChunker(embedding_model)
docs = text_splitter.create_documents([md_text])
print(docs[0].page_content)

# Build a FAISS index over the chunks and persist it to disk.
vectorstore = FAISS.from_documents(documents=docs, embedding=embedding_model)
vectorstore.save_local(folder_path="vectorstores/markdown_embeddings_index_ollama")

# slide_count_prompt = f"""
# You are an AI presentation planner.

# Given the following document content, estimate how many clear, presentable slides can be made from it. Just return a number — no explanation.

# ---START---
# {md_text}
# ---END---
# """
# import ollama

# response = ollama.chat(model="llama3:8b", messages=[
#     {"role": "user", "content": slide_count_prompt}
# ])

# num_slides = int(''.join(filter(str.isdigit, response['message']['content'])))
# print(f"🧠 LLaMA suggests creating {num_slides} slides.")
import openai
import ollama

# Prompt asking for a 24-slide deck as a JSON array of {title, bullets}
# objects, with md_text embedded between the START/END markers. The doubled
# braces are f-string escapes for literal braces.
slide_gen_prompt = f"""
You are a presentation-writing assistant.

Based on the following content, generate 24 slides.
Each slide must include:
- A title
- 3 to 8 concise bullet points
- Dont loose information of topics
Output a valid JSON array like:
[
  {{
    "title": "Slide Title",
    "bullets": ["Bullet 1", "Bullet 2", ...]
  }},
  ...
]

---START---
{md_text}
---END---
"""

# Send the prompt to the llama3:8b model as a single user message and echo the
# text of its reply.
response = ollama.chat(
    model="llama3:8b",
    messages=[dict(role="user", content=slide_gen_prompt)],
)
print("📝 LLM Response:")
print(response["message"]["content"])
import json
import re


def _extract_slides(raw_text):
    """Return the first JSON array of objects embedded in ``raw_text``.

    Every position where ``[`` is followed (after optional whitespace) by
    ``{`` is tried in order. The JSON decoder parses one complete value from
    that position and ignores whatever text follows it, so commentary or extra
    brackets after the array do not break parsing.

    Returns:
        The decoded list, or ``None`` when no candidate position holds a
        valid JSON array.
    """
    decoder = json.JSONDecoder()
    for candidate in re.finditer(r'\[\s*{', raw_text):
        try:
            parsed, _end = decoder.raw_decode(raw_text[candidate.start():])
        except json.JSONDecodeError:
            # Not valid JSON from this bracket; try the next candidate.
            continue
        if isinstance(parsed, list):
            return parsed
    return None


raw_output = response['message']['content']

# Pull the slide list out of the model reply. Malformed JSON is reported the
# same way as missing JSON instead of raising.
slides = _extract_slides(raw_output)
if slides is not None:
    print(f"✅ Parsed {len(slides)} slides from LLaMA output")
else:
    # Keep `slides` defined on the failure path so later code can rely on it.
    slides = []
    print("❌ No valid JSON found.")
    print("🧾 Raw output was:\n", raw_output)
