"""Compare the text of two .hml (XML) documents for typos and spacing differences."""

from difflib import SequenceMatcher


def get_all_text_xml_file(root):
    """Return every text node that sits directly under a ``CHAR`` element.

    Args:
        root: A parsed XML element exposing an ``xpath`` method
            (the script passes an lxml root element).

    Returns:
        The list produced by ``root.xpath("//CHAR/text()")``.
    """
    # Alternative query that also collects the TEXTART ``Text`` attribute:
    # all_text = root.xpath("//CHAR/text() | //TEXTART/@Text")
    all_text = root.xpath("//CHAR/text()")
    print(f'all_text length: {len(all_text)}')
    return all_text


def _has_close_match(word, candidates, threshold):
    """Return True if any candidate is at least *threshold* similar to *word*."""
    if word in candidates:
        # An identical word has ratio 1.0, so the fuzzy comparison is unnecessary.
        return True
    return any(
        SequenceMatcher(None, word, candidate).ratio() >= threshold
        for candidate in candidates
    )


def find_typos_and_spaces(original, target, threshold=0.9):
    """Find likely typos and space-count mismatches between two text lists.

    Typo check: every whitespace-separated word of ``original`` is compared
    with the whitespace-separated words of ``target``. A word is reported
    when no target word reaches ``threshold`` similarity
    (``difflib.SequenceMatcher.ratio``).

    Space check: texts are paired positionally with ``zip``. Each pair is
    cut to the length of the shorter text and the number of ``' '``
    characters in the two prefixes is compared. The summed absolute
    difference is printed.

    Args:
        original: Iterable of strings from the document being checked.
        target: Iterable of strings from the reference document.
        threshold: Minimum similarity ratio, expected in ``[0, 1]``, for a
            word to count as matched. Defaults to ``0.9``.

    Returns:
        A ``(typos, spaces)`` tuple. ``typos`` lists the unmatched words in
        document order (repeats included). ``spaces`` lists one
        ``(index, orig_spaces, targ_spaces)`` tuple for every pair whose
        space counts differ.
    """
    original = list(original)
    target = list(target)

    # Compare words with words: matching a single word against a whole text
    # segment would flag nearly every word of a multi-word segment.
    target_words = {word for text in target for word in text.split()}

    # Typo check. Verdicts are memoised because documents repeat words a lot
    # and each fuzzy comparison is relatively expensive.
    typos = []
    is_typo = {}
    for text in original:
        for word in text.split():
            if word not in is_typo:
                is_typo[word] = not _has_close_match(word, target_words, threshold)
            if is_typo[word]:
                typos.append(word)

    # Space-difference check.
    spaces = []
    space_differences = 0
    for index, (orig_text, targ_text) in enumerate(zip(original, target)):
        # Only the common-length prefix of each pair is compared.
        min_length = min(len(orig_text), len(targ_text))
        orig_spaces = orig_text[:min_length].count(' ')
        targ_spaces = targ_text[:min_length].count(' ')
        if orig_spaces != targ_spaces:
            space_differences += abs(orig_spaces - targ_spaces)
            spaces.append((index, orig_spaces, targ_spaces))

    print(f'space_differences : {space_differences}')
    return typos, spaces


# Input documents to compare.
original_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-010128-윤빈.hml"
target_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-009939-이준.hml"
# target_file = r"C:\Users\dra\project\HWP-Scoring\output\원본 copy.hml"


def _read_texts(path):
    """Parse the XML file at *path* and return its ``CHAR`` text nodes."""
    # Imported here, the only place that needs it, so the comparison helpers
    # above remain importable in environments without lxml.
    from lxml import etree

    tree = etree.parse(path)
    return get_all_text_xml_file(tree.getroot())


def main():
    """Parse both documents and run the typo / spacing comparison."""
    original_text = _read_texts(original_file)
    target_text = _read_texts(target_file)

    typos, spaces = find_typos_and_spaces(original_text, target_text)
    # Uncomment to inspect the detailed results:
    # print(f'Typos: {typos}')
    # print(f'Spaces: {spaces}')


if __name__ == "__main__":
    main()