import pandas as pd
import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString
import re

def clean_xml_string(text):
    """Clean string to make it valid for XML"""
    if text is None:
        return ""
    
    # Convert to string if not already
    text = str(text)
    
    # Replace invalid XML characters
    text = re.sub(r'[^\x09\x0A\x0D\x20-\xD7FF\xE000-\xFFFD\U00010000-\U0010FFFF]', '', text)
    
    # Replace &, <, >, ", ' with XML entities
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    text = text.replace('"', '&quot;')
    text = text.replace("'", '&apos;')
    
    return text

def excel_to_xml(excel_file_path, output_xml_path):
    # Read the Excel file
    df = pd.read_excel(excel_file_path)
    
    # Clean up data - replace NaN values with empty strings
    df = df.fillna('')
    
    # Create root XML element
    root = ET.Element("root")
    
    # Track current elements to build the hierarchy
    current = {
        "parent": None,
        "parent_elem": None,
        "child1": None,
        "child1_elem": None,
        "child2": None,
        "child2_elem": None,
        "child3": None,
        "child3_elem": None
    }
    
    # Process each row
    for _, row in df.iterrows():
        try:
            parent_tag = str(row['PARENT TAG']).strip() if not pd.isna(row['PARENT TAG']) else ""
            child_tag1 = str(row['CHILD TAG 1']).strip() if not pd.isna(row['CHILD TAG 1']) else ""
            child_tag2 = str(row['CHILD TAG 2']).strip() if not pd.isna(row['CHILD TAG 2']) else ""
            child_tag3 = str(row['CHILD TAG 3']).strip() if not pd.isna(row['CHILD TAG 3']) else ""
            data = row['DATA'] if not pd.isna(row['DATA']) else ""
            
            # Make sure tag names are valid XML
            if parent_tag:
                parent_tag = re.sub(r'[^a-zA-Z0-9_-]', '_', parent_tag)
            if child_tag1:
                child_tag1 = re.sub(r'[^a-zA-Z0-9_-]', '_', child_tag1)
            if child_tag2:
                child_tag2 = re.sub(r'[^a-zA-Z0-9_-]', '_', child_tag2)
            if child_tag3:
                child_tag3 = re.sub(r'[^a-zA-Z0-9_-]', '_', child_tag3)
            
            # Handle parent tag
            if parent_tag and parent_tag != current["parent"]:
                parent_elem = ET.SubElement(root, parent_tag)
                current["parent"] = parent_tag
                current["parent_elem"] = parent_elem
                # Reset child elements
                current["child1"] = None
                current["child1_elem"] = None
                current["child2"] = None
                current["child2_elem"] = None
                current["child3"] = None
                current["child3_elem"] = None
            
            # Handle child1 tag
            if child_tag1:
                if child_tag1 != current["child1"]:
                    if current["parent_elem"] is not None:
                        child1_elem = ET.SubElement(current["parent_elem"], child_tag1)
                        current["child1"] = child_tag1
                        current["child1_elem"] = child1_elem
                        # Reset deeper child elements
                        current["child2"] = None
                        current["child2_elem"] = None
                        current["child3"] = None
                        current["child3_elem"] = None
                
            # Handle child2 tag
            if child_tag2:
                if child_tag2 != current["child2"]:
                    if current["child1_elem"] is not None:
                        child2_elem = ET.SubElement(current["child1_elem"], child_tag2)
                        current["child2"] = child_tag2
                        current["child2_elem"] = child2_elem
                        # Reset deepest child element
                        current["child3"] = None
                        current["child3_elem"] = None
                
            # Handle child3 tag
            if child_tag3:
                if child_tag3 != current["child3"]:
                    if current["child2_elem"] is not None:
                        child3_elem = ET.SubElement(current["child2_elem"], child_tag3)
                        current["child3"] = child_tag3
                        current["child3_elem"] = child3_elem
            
            # Add data to the appropriate element
            if data:
                data_str = clean_xml_string(data)
                if current["child3_elem"] is not None and child_tag3:
                    current["child3_elem"].text = data_str
                elif current["child2_elem"] is not None and child_tag2:
                    current["child2_elem"].text = data_str
                elif current["child1_elem"] is not None and child_tag1:
                    current["child1_elem"].text = data_str
                elif current["parent_elem"] is not None and parent_tag:
                    current["parent_elem"].text = data_str
                    
        except Exception as e:
            print(f"Error processing row: {e}")
            continue
    
    try:
        # Convert the XML tree to a string
        rough_string = ET.tostring(root, 'utf-8')
        
        # Parse and pretty print
        reparsed = parseString(rough_string)
        pretty_xml = reparsed.toprettyxml(indent="  ")
        
        # Remove empty lines
        pretty_xml = '\n'.join([line for line in pretty_xml.split('\n') if line.strip()])
        
        # Save to file
        with open(output_xml_path, 'w', encoding='utf-8') as f:
            f.write(pretty_xml)
        
        return pretty_xml
    
    except Exception as e:
        print(f"Error generating XML: {e}")
        
        # Fallback: direct string representation without pretty printing
        try:
            with open(output_xml_path, 'wb') as f:
                f.write(rough_string)
            return "XML saved with basic formatting (pretty printing failed)"
        except Exception as fallback_error:
            print(f"Fallback error: {fallback_error}")
            return None

# Example usage
if __name__ == "__main__":
    input_file = "XML_to_Excel.xlsx"  # Replace with your actual file path
    output_file = "output.xml"
    
    xml_content = excel_to_xml(input_file, output_file)
    print(f"XML file has been created at {output_file}")