from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_ollama import OllamaEmbeddings
import os
# SECURITY: a live-looking "sk-proj-..." OpenAI key was previously hard-coded
# here and committed to source. That key must be treated as compromised —
# revoke/rotate it. Read the key from the environment instead; empty string
# means "not configured" (the key is only used by commented-out experiments).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# Path of the markdown document used for the splitting experiments below.
md_path = "temp_content1.md"

# Load the markdown file into LangChain Document objects in one step.
data = UnstructuredMarkdownLoader(md_path).load()
#print(data)

# splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
#     ("###", "Section"),
#     ("####", "Subsection"),
# ])
# from langchain.text_splitter import CharacterTextSplitter
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# split_documents = text_splitter.split_documents(data)
# print(split_documents[1])
# markdown_text = "\n".join(doc.page_content for doc in data)
# chunks = splitter.split_text(markdown_text)
# print(f"Chunks created: {len(chunks)}")
# print(chunks[0].page_content)
# print(chunks[0].metadata)

# sentence_text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1500,
#     chunk_overlap=200,
#     separators=["\n\n", "\n", " "]
# )
# NOTE(review): OpenAIEmbeddings and ChatGroq are imported here but only
# referenced inside commented-out experiments — confirm before deleting,
# since importing them pulls in their packages' side effects.
from langchain_openai import OpenAIEmbeddings
from langchain_groq import ChatGroq

# Local Ollama embedding model; in the active code path this object is
# created but only consumed by the disabled SemanticChunker lines below.
embeddings = OllamaEmbeddings(model='nomic-embed-text')
#text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
# text_splitter = SemanticChunker(
#     OpenAIEmbeddings(OPENAI_API_KEY=OPENAI_API_KEY), breakpoint_threshold_type="standard_deviation"
# )

# Convert "cyber.pdf" to markdown with marker, then persist the text to disk.
converter = PdfConverter(artifact_dict=create_model_dict())
rendered = converter("cyber.pdf")
# text_from_rendered returns (text, metadata, images); metadata is unused here.
text, _, images = text_from_rendered(rendered)
print(text)

# Rebind md_path so the sections below read the freshly converted file.
# (The previous no-op statement `text = text` was removed.)
md_path = "temp_content_cyber.md"
with open(md_path, "w", encoding="utf-8") as f:
    f.write(text)


# NOTE(review): md_text1 is sample markdown for manual experimentation; it is
# never referenced by the active code below — confirm before removing.
md_text1 = """### EMPLOYEE CODE OF CONDUCT

ACTING WITH INTEGRITY
#dasda 
![](_page_0_Picture_3.jpeg)

#### LEADERSHIP

We believe that leadership is a team effort built upon the mutual respect and fair treatment of employees, customers, and suppliers along with strong community relationships, all developed through honesty and accountability. By demonstrating integrity, humility, and trustworthiness our companies stand apart from our peers and make positive impacts in our communities. Everyone in the organization has the opportunity to lead by example: showing respect in all interactions, taking responsibility for their own actions, inspiring trust through honesty, and contributing to the success of our company.

#### PEOPLE

We have a personal and professional commitment to protecting the health and safety of our employees, customers, suppliers, service providers and the people in the communities in which we operate. Reliance believes that one person can make a difference, but that ongoing success requires a diverse team of dedicated people and companies working together to make a significant difference. Both the individual and combined strengths of our Family of Companies make Reliance the industry leader.

#waewa
#### SERVICE

Our aim is to deliver value to our customers by providing the highest service levels possible. We deliver what we promise when we promise it, while always striving to improve and exceed customer expectations. Serving customers the right way is the only way we operate. Our customers, no matter their size, end market, or unique supply requirements, will never receive less than our absolute best effort to deliver excellence on all measures of quality and service. We strive to always be flexible and agile in servicing our customers' needs.

#### DIVERSITY"""
# Header levels to split on; each tuple is (markdown prefix, metadata key).
# NOTE(review): "###" is skipped while "####" maps to "Header 3" — confirm this
# asymmetry matches the converted document's heading structure.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("####", "Header 3"),
]

# Split the marker-converted text (`text`) on the headers above.
# (Removed two dead statements: a file read into `md_content` and a
# `markdown_text` join over `data`, neither of which was ever used.)
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
md_header_splits = markdown_splitter.split_text(text)
print("HEllo")
# Guard the debug print: short documents may yield fewer than four chunks,
# which previously raised IndexError.
if len(md_header_splits) > 3:
    print(md_header_splits[3])
else:
    print(f"only {len(md_header_splits)} header splits produced")
# #docs = text_splitter.split_documents([data])
# embeddings = OllamaEmbeddings(model='nomic-embed-text')
# #embeddings = ChatGroq(model='meta-llama/llama-4-maverick-17b-128e-instruct',groq_api_key="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")
# text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")

# split_documents =text_splitter.split_documents(data)
# print(split_documents[1])
