50 lines
1.7 KiB
Python
50 lines
1.7 KiB
Python
|
|
from lxml import etree
|
||
|
|
from difflib import SequenceMatcher
|
||
|
|
|
||
|
|
def get_all_text_xml_file(root):
    """Return the text nodes of every ``CHAR`` element found under *root*.

    The XPath query ``//CHAR/text()`` is evaluated through ``root.xpath``
    and the number of results is printed before they are returned.

    Args:
        root: Object exposing an ``xpath(query)`` method; the script passes
            the root element of an lxml-parsed document.

    Returns:
        Whatever ``root.xpath`` produced for the query, unmodified.
    """
    # An earlier variant of the query also selected ``//TEXTART/@Text``:
    # root.xpath("//CHAR/text() | //TEXTART/@Text")
    char_texts = root.xpath("//CHAR/text()")
    count = len(char_texts)
    print(f'all_text length: {count}')
    return char_texts
|
||
|
|
|
||
|
|
def _has_close_match(word, candidates, threshold):
    """Return True if *word* is at least *threshold* similar to a candidate."""
    # Identical strings have ratio 1.0, so an exact hit needs no matcher.
    if word in candidates:
        return True
    for candidate in candidates:
        matcher = SequenceMatcher(None, word, candidate)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); most pairs are rejected before the expensive comparison.
        if (matcher.real_quick_ratio() >= threshold
                and matcher.quick_ratio() >= threshold
                and matcher.ratio() >= threshold):
            return True
    return False


def find_typos_and_spaces(original, target, *, threshold=0.9):
    """Compare two lists of text runs for misspelled words and spacing drift.

    Typo check: every whitespace-separated word of *original* is compared
    with every whitespace-separated word of *target*. A word is reported
    when no target word reaches ``threshold`` similarity
    (``difflib.SequenceMatcher.ratio``). Comparing word against word matters:
    matching a single word against a whole multi-word run almost never
    reaches the threshold, which would flag nearly every word.

    Spacing check: runs are paired positionally (``zip``; surplus runs in
    the longer list are ignored). Both runs of a pair are cut to their common
    length and their space characters are counted. The sum of the absolute
    differences is printed once.

    Args:
        original: Iterable-once-safe sequence of strings to check (it is
            traversed twice, so pass a list, not a generator).
        target: Sequence of reference strings (also traversed twice).
        threshold: Minimum similarity, expected in ``[0, 1]``, for a word to
            count as present in *target*. Defaults to 0.9.

    Returns:
        A ``(typos, spaces)`` tuple. ``typos`` lists the unmatched words in
        order of appearance, repeated once per occurrence. ``spaces`` lists
        ``(index, original_space_count, target_space_count)`` for every
        pair whose space counts differ.
    """
    # --- typo check ---
    target_words = {word for text in target for word in text.split()}
    verdicts = {}  # word -> bool, so repeated words are matched only once
    typos = []
    for text in original:
        for word in text.split():
            known = verdicts.get(word)
            if known is None:
                known = _has_close_match(word, target_words, threshold)
                verdicts[word] = known
            if not known:
                typos.append(word)

    # --- spacing check (runs exactly once, independent of the typo loop) ---
    spaces = []
    space_differences = 0
    for index, (orig_text, targ_text) in enumerate(zip(original, target)):
        # Only the common-length prefix of the two runs is compared.
        common = min(len(orig_text), len(targ_text))
        orig_spaces = orig_text[:common].count(' ')
        targ_spaces = targ_text[:common].count(' ')
        if orig_spaces != targ_spaces:
            spaces.append((index, orig_spaces, targ_spaces))
            space_differences += abs(orig_spaces - targ_spaces)

    print(f'space_differences : {space_differences}')

    return typos, spaces
|
||
|
|
|
||
|
|
# Input documents: two HML (XML) exports whose text is compared.
original_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-010128-윤빈.hml"
target_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-009939-이준.hml"
# target_file = r"C:\Users\dra\project\HWP-Scoring\output\원본 copy.hml"

# Parse the first document and extract its CHAR text nodes.
tree = etree.parse(original_file)
root = tree.getroot()
original_text = get_all_text_xml_file(root=root)

# Parse the second document the same way; ``tree`` and ``root`` are rebound.
tree = etree.parse(target_file)
root = tree.getroot()
target_text = get_all_text_xml_file(root=root)

# Run the comparison; detailed output is currently disabled.
typos, spaces = find_typos_and_spaces(original=original_text, target=target_text)
# print(f'Typos: {typos}')
# print(f'Spaces: {spaces}')
|