score4.py

from lxml import etree
from difflib import SequenceMatcher

def get_all_text_xml_file(root):
    """Return the text nodes of every ``CHAR`` element found via *root*.

    Args:
        root: An object exposing ``xpath(query)`` (the script passes the
            root element of a parsed lxml tree).

    Returns:
        Whatever ``root.xpath("//CHAR/text()")`` yields. The number of
        results is also printed to stdout as a progress/debug aid.
    """
    # Alternative query that additionally collects TEXTART ``Text`` attributes:
    #   root.xpath("//CHAR/text() | //TEXTART/@Text")
    char_texts = root.xpath("//CHAR/text()")
    fragment_count = len(char_texts)
    print(f'all_text length: {fragment_count}')
    return char_texts

def find_typos_and_spaces(original, target):
    """Compare two lists of text fragments for misspelled words and spacing.

    Args:
        original: Sequence of text fragments to be checked.
        target: Sequence of reference text fragments.

    Returns:
        A ``(typos, spaces)`` tuple.

        * ``typos`` lists every whitespace-separated word of ``original``
          (in order, duplicates kept) for which no word of ``target`` has a
          ``SequenceMatcher`` similarity ratio of at least 0.9.
        * ``spaces`` lists ``(original_prefix, target_prefix)`` pairs whose
          space counts differ. Fragments are paired positionally and only
          their common-length prefix is compared.

    The summed absolute difference in space counts is printed to stdout.
    """
    typos = []
    spaces = []
    space_differences = 0

    # Typo check: compare word against word. Matching a single word against
    # a whole multi-word fragment can never reach the 0.9 ratio, which would
    # flag correctly spelled words as typos.
    target_words = list(dict.fromkeys(
        word for text in target for word in text.split()
    ))
    known_words = set(target_words)

    for text in original:
        for word in text.split():
            if word in known_words:
                # Identical word exists in the reference (ratio would be 1.0).
                continue
            if not any(
                SequenceMatcher(None, word, target_word).ratio() >= 0.9
                for target_word in target_words
            ):
                typos.append(word)

    # Spacing check: fragments are paired by position; zip() ignores any
    # unpaired trailing fragments of the longer list.
    for orig_text, targ_text in zip(original, target):
        min_length = min(len(orig_text), len(targ_text))
        orig_prefix = orig_text[:min_length]
        targ_prefix = targ_text[:min_length]
        difference = abs(orig_prefix.count(' ') - targ_prefix.count(' '))
        if difference:
            spaces.append((orig_prefix, targ_prefix))
            space_differences += difference

    print(f'space_differences : {space_differences}')

    return typos, spaces

# Parse the two HML (XML) documents and compare their extracted text.
original_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-010128-윤빈.hml"
target_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-009939-이준.hml"
# Alternative target kept for reference:
# target_file = r"C:\Users\dra\project\HWP-Scoring\output\원본 copy.hml"

# Text fragments of the document being checked.
original_text = get_all_text_xml_file(etree.parse(original_file).getroot())

# Text fragments of the reference document; ``tree``/``root`` stay bound to
# this second document after the script has run.
tree = etree.parse(target_file)
root = tree.getroot()
target_text = get_all_text_xml_file(root)

typos, spaces = find_typos_and_spaces(original_text, target_text)
# Debug output, enable when needed:
# print(f'Typos: {typos}')
# print(f'Spaces: {spaces}')
1번문항채점가능 2025-01-08 17:43:24 +09:00

```python
from lxml import etree
from difflib import SequenceMatcher

def get_all_text_xml_file(root):
    # all_text = root.xpath("//CHAR/text() | //TEXTART/@Text")
    all_text = root.xpath("//CHAR/text()")
    print(f'all_text length: {len(all_text)}')
    return all_text

def find_typos_and_spaces(original, target):
    typos = []
    spaces = []
    space_differences = 0

    for text in original:
        # 오타 검사
        words = text.split()
        for word in words:
            if not any(SequenceMatcher(None, word, target_word).ratio() >= 0.9 for target_word in target):
                typos.append(word)

    # 공백 차이 검사
    for orig_text, targ_text in zip(original, target):
        min_length = min(len(orig_text), len(targ_text))
        orig_text = orig_text[:min_length]
        targ_text = targ_text[:min_length]
        orig_spaces = orig_text.count(' ')
        targ_spaces = targ_text.count(' ')
        space_differences += abs(orig_spaces - targ_spaces)

    print(f'space_differences : {space_differences}')

    return typos, spaces

# XML 파일 파싱
original_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-010128-윤빈.hml"
target_file = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-009939-이준.hml"
# target_file = r"C:\Users\dra\project\HWP-Scoring\output\원본 copy.hml"

tree = etree.parse(original_file)
root = tree.getroot()
original_text = get_all_text_xml_file(root)

tree = etree.parse(target_file)
root = tree.getroot()
target_text = get_all_text_xml_file(root)

typos, spaces = find_typos_and_spaces(original_text, target_text)
# print(f'Typos: {typos}')
# print(f'Spaces: {spaces}')
```