import fitz  # PyMuPDF
import json

def extract_structured_content_with_pymupdf(pdf_path, output_path="structured_output_ff11_rich.json"):
    """Extract heading/content sections from a PDF and save them as JSON.

    Walks every text span in the document and treats spans whose font size
    exceeds 12pt as headings; all other non-empty text is accumulated under
    the most recent heading.

    Args:
        pdf_path: Path to the input PDF file.
        output_path: Path of the JSON file the sections are written to.

    Returns:
        A list of {"heading": str, "content": str} dicts in document order
        (the same list that is written to ``output_path``).

    Notes:
        - Text appearing before the first detected heading is discarded.
        - A trailing heading with no body text after it is dropped.
    """
    structured_content = []
    current_heading = None
    current_content = []

    # Context manager guarantees the document handle is released even if
    # parsing raises (the original version never closed it — a leak).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                # Image blocks carry no "lines" key; skip them early.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        font_size = span["size"]

                        # Heuristic: large font => heading. Adjust threshold as needed.
                        if font_size > 12 and text:
                            # Flush the section collected so far before
                            # starting a new one.
                            if current_heading and current_content:
                                structured_content.append({
                                    "heading": current_heading,
                                    "content": " ".join(current_content),
                                })
                            current_heading = text
                            current_content = []
                        elif text:
                            current_content.append(text)

    # Flush the final section.
    if current_heading and current_content:
        structured_content.append({
            "heading": current_heading,
            "content": " ".join(current_content),
        })

    # Save to JSON (keep non-ASCII characters readable in the output file).
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(structured_content, f, indent=4, ensure_ascii=False)

    print(f"✅ Structured content saved to {output_path}")

    return structured_content

# Script-level driver: parse the book PDF and echo the extracted sections.
# NOTE(review): this runs at import time; consider guarding it with
# `if __name__ == "__main__":` so importing this module doesn't re-parse
# the PDF. Assumes 'rich_dad.pdf' exists in the working directory — verify.
pdf = extract_structured_content_with_pymupdf('rich_dad.pdf')
print("pdf")
print(pdf)
from langchain.docstore.document import Document
def load_documents_from_json(json_path):
    """Load structured-output JSON and wrap each section in a Document.

    Each entry's "content" becomes the document text; its "heading" is
    preserved in the metadata dict.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        sections = json.load(f)

    documents = []
    for section in sections:
        documents.append(
            Document(
                page_content=section["content"],
                metadata={"heading": section["heading"]},
            )
        )
    return documents

