# pip install -U pymupdf  (provides the `fitz` module imported below)
# pip install -U langchain langchain-community

import fitz  # PyMuPDF

def _block_to_markdown(text):
    """Render one stripped, non-empty text block as a Markdown fragment.

    Heuristics, checked in this order:

    * Text starting with "•" or "-" becomes one list item per bullet line.
    * All-uppercase text of at most 5 words becomes a ``###`` heading.
    * Title-case text of at most 6 words becomes a ``##`` heading.
    * Anything else is emitted as a paragraph.

    Bullets are tested first so that a short bullet such as "• Item One"
    is not mistaken for a title-case heading.
    """
    bullet_markers = ("•", "-")

    if text.startswith(bullet_markers):
        items = []
        for line in text.splitlines():
            line = line.strip()
            if not line:
                continue
            if line.startswith(bullet_markers):
                # Each marker inside the block starts a new list item.
                items.append(line[1:].strip())
            else:
                # A wrapped line continues the previous item.
                items[-1] = f"{items[-1]} {line}"
        return "".join(f"\n- {item}" for item in items)

    words = text.split()
    # Collapse internal whitespace/newlines: a Markdown heading must sit on
    # a single line.
    single_line = " ".join(words)

    if text.isupper() and len(words) <= 5:
        return f"\n\n### {single_line}\n"
    if len(words) <= 6 and text.istitle():
        return f"\n\n## {single_line}\n"
    return f"\n\n{text}"


def pdf_to_markdown_cleaned(pdf_path, md_path="output.md"):
    """Convert the text of a PDF into a simple Markdown file.

    Each page's text blocks are read with PyMuPDF, ordered top-to-bottom
    then left-to-right, and classified as heading, list item, or paragraph
    by ``_block_to_markdown``.

    Args:
        pdf_path: Path of the PDF to read (passed to ``fitz.open``).
        md_path: Path of the Markdown file to write (UTF-8, overwritten).

    Returns:
        None. The result is written to ``md_path`` and a confirmation line
        is printed.
    """
    fragments = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Sort by (y0, x0): top to bottom, then left to right.
            blocks = sorted(
                page.get_text("blocks"), key=lambda b: (b[1], b[0])
            )

            for block in blocks:
                # Per the PyMuPDF docs a block tuple is
                # (x0, y0, x1, y1, text, block_no, block_type) with
                # block_type 0 for text; skip non-text (image) blocks so
                # their placeholder text does not leak into the output.
                if len(block) > 6 and block[6] != 0:
                    continue

                text = block[4].strip()
                if text:
                    fragments.append(_block_to_markdown(text))

    with open(md_path, "w", encoding="utf-8") as f:
        f.write("".join(fragments).strip())

    print(f"✅ Markdown saved to {md_path}")



# Script entry: convert "cyber.pdf" into "sample_output.md".
# NOTE(review): this is a top-level statement, so it also runs whenever the
# module is imported; consider an `if __name__ == "__main__":` guard.
pdf_to_markdown_cleaned(pdf_path="cyber.pdf", md_path="sample_output.md")