from lxml import etree
from difflib import SequenceMatcher

def get_all_text_xml_file(root):
    """Collect the text nodes located under ``CHAR`` elements.

    Args:
        root: Object exposing an ``xpath`` method; the script passes the
            root element of a document parsed with lxml.

    Returns:
        The result of evaluating ``//CHAR/text()`` on *root*, unchanged.

    Side effects:
        Prints the number of collected items to stdout.
    """
    # Only CHAR text nodes are queried. A broader query that also reads
    # TEXTART Text attributes was tried previously:
    #     "//CHAR/text() | //TEXTART/@Text"
    char_texts = root.xpath("//CHAR/text()")
    print(f'all_text length: {len(char_texts)}')
    return char_texts

def _is_similar(word, candidate, threshold=0.9):
    """Return True when difflib rates *word* vs *candidate* at >= *threshold*."""
    matcher = SequenceMatcher(None, word, candidate)
    # real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio(),
    # so a pair failing either bound can be rejected without the full match.
    return (
        matcher.real_quick_ratio() >= threshold
        and matcher.quick_ratio() >= threshold
        and matcher.ratio() >= threshold
    )


def find_typos_and_spaces(original, target):
    """Report words and spacing in *original* that disagree with *target*.

    Typos: every whitespace-separated word of *original* is compared with
    the whitespace-separated words of *target*. A word is reported when no
    target word reaches a ``SequenceMatcher`` ratio of 0.9. Words are
    compared with words (not with whole text segments); otherwise a correct
    word inside a multi-word segment would always score low and be flagged.

    Spaces: segments are paired positionally with ``zip`` (extra segments
    of the longer list are ignored). Each pair is compared over its common
    prefix length only, and the number of ``' '`` characters is counted on
    both sides.

    Args:
        original: Iterable of text segments (strings) to check.
        target: Iterable of reference text segments (strings).

    Returns:
        A ``(typos, spaces)`` tuple. ``typos`` lists the offending words in
        order of appearance (repeated words appear repeatedly). ``spaces``
        lists ``(index, original_count, target_count)`` for every paired
        segment whose space counts differ.

    Side effects:
        Prints the summed absolute space-count difference to stdout.
    """
    target_words = {word for text in target for word in text.split()}

    # Verdict per distinct word, so repeated words are only matched once.
    verdicts = {}
    typos = []
    for text in original:
        for word in text.split():
            if word not in verdicts:
                # An exact hit has ratio 1.0, so the set lookup is an
                # equivalent (and much cheaper) shortcut.
                verdicts[word] = word not in target_words and not any(
                    _is_similar(word, candidate) for candidate in target_words
                )
            if verdicts[word]:
                typos.append(word)

    spaces = []
    space_differences = 0
    for index, (orig_text, targ_text) in enumerate(zip(original, target)):
        # Only the common-length prefix of the two segments is compared.
        common = min(len(orig_text), len(targ_text))
        orig_spaces = orig_text.count(' ', 0, common)
        targ_spaces = targ_text.count(' ', 0, common)
        if orig_spaces != targ_spaces:
            space_differences += abs(orig_spaces - targ_spaces)
            spaces.append((index, orig_spaces, targ_spaces))

    print(f'space_differences : {space_differences}')

    return typos, spaces

# Locations of the two HML (XML) documents to compare.
original_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-010128-윤빈.hml"
target_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-009939-이준.hml"
# Alternative target kept for reference:
# target_file = r"C:\Users\dra\project\HWP-Scoring\output\원본 copy.hml"

# Parse the original document and pull out its text.
tree = etree.parse(original_file)
original_text = get_all_text_xml_file(tree.getroot())

# Parse the target document; `tree` and `root` end up referring to it.
tree = etree.parse(target_file)
root = tree.getroot()
target_text = get_all_text_xml_file(root)

# Compare the two documents (the summary is printed by the callee).
typos, spaces = find_typos_and_spaces(original_text, target_text)
# print(f'Typos: {typos}')
# print(f'Spaces: {spaces}')