diff_hwp.py

import olefile
import difflib

def extract_text_from_hwp(file_path):
    """
    Extract text from an HWP file.

    Opens the file as an OLE compound document, reads the
    ``BodyText/Section0`` stream and decodes its bytes as UTF-16LE,
    silently dropping byte sequences that cannot be decoded.

    Args:
        file_path: Path of the HWP file to read.

    Returns:
        The decoded contents of ``BodyText/Section0``, or ``''`` when the
        stream does not exist or any error occurs. A message is printed
        in both of those cases.
    """
    try:
        ole = olefile.OleFileIO(file_path)
        try:
            if not ole.exists('BodyText/Section0'):
                print(f"No 'BodyText/Section0' stream found in {file_path}")
                return ''
            # NOTE(review): the stream bytes are decoded as-is. HWP body
            # streams are presumably deflate-compressed and record-structured,
            # in which case this yields noise rather than readable text —
            # TODO confirm against the HWP 5.0 format specification.
            content = ole.openstream('BodyText/Section0').read()
            return content.decode('utf-16le', errors='ignore')
        finally:
            # Release the underlying file handle on every path (success,
            # missing stream, or read/decode failure).
            ole.close()
    except Exception as e:
        # Best-effort by design: report the problem and signal failure
        # with an empty string instead of propagating.
        print(f"Error extracting text from {file_path}: {e}")
        return ''

def compare_texts(text1, text2):
    """
    Compare two texts line by line and count the differing lines.

    Both texts are split into lines and diffed with ``difflib.ndiff``.
    Every diff entry marked as an addition (``'+ '``) or a removal
    (``'- '``) counts as one difference; unchanged lines and ``'? '``
    hint lines are not counted.

    Args:
        text1: The first text.
        text2: The second text.

    Returns:
        The number of added plus removed lines reported by the diff.
    """
    before_lines = text1.splitlines()
    after_lines = text2.splitlines()
    changed_markers = ('+ ', '- ')
    return sum(
        1
        for entry in difflib.ndiff(before_lines, after_lines)
        if entry.startswith(changed_markers)
    )

def main():
    """
    Compare two hard-coded HWP files and print the number of differences.

    Extracts text from both files; if either extraction yields an empty
    string, prints a failure message and returns without comparing.
    """
    input_paths = (
        r'C:\Users\dra\project\HWP-Scoring\input\원본.hwp',
        r'C:\Users\dra\project\HWP-Scoring\input\원본 copy.hwp',
    )
    # Both files are always read, in order, before the emptiness check.
    original_text, copy_text = [
        extract_text_from_hwp(path) for path in input_paths
    ]

    if not (original_text and copy_text):
        print("텍스트를 추출하지 못했습니다.")
        return

    print(f"차이점 개수: {compare_texts(original_text, copy_text)}")

# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
1번문항채점가능 2025-01-08 17:43:24 +09:00			`import olefile`
			`import difflib`

			`def extract_text_from_hwp(file_path):`
			`"""`
			`HWP 파일에서 텍스트를 추출합니다.`
			`"""`
			`try:`
			`ole = olefile.OleFileIO(file_path)`
			`if ole.exists('BodyText/Section0'):`
			`content = ole.openstream('BodyText/Section0').read()`
			`text = content.decode('utf-16le', errors='ignore')`
			`return text`
			`else:`
			`print(f"No 'BodyText/Section0' stream found in {file_path}")`
			`return ''`
			`except Exception as e:`
			`print(f"Error extracting text from {file_path}: {e}")`
			`return ''`

			`def compare_texts(text1, text2):`
			`"""`
			`두 텍스트의 차이를 비교하고 차이점을 카운트합니다.`
			`"""`
			`diff = difflib.ndiff(text1.splitlines(), text2.splitlines())`
			`diff_count = 0`
			`for line in diff:`
			`if line.startswith('+ ') or line.startswith('- '):`
			`diff_count += 1`
			`return diff_count`

			`def main():`
			`file1 = r'C:\Users\dra\project\HWP-Scoring\input\원본.hwp'`
			`file2 = r'C:\Users\dra\project\HWP-Scoring\input\원본 copy.hwp'`

			`text1 = extract_text_from_hwp(file1)`
			`text2 = extract_text_from_hwp(file2)`

			`if not text1 or not text2:`
			`print("텍스트를 추출하지 못했습니다.")`
			`return`

			`diff_count = compare_texts(text1, text2)`
			`print(f"차이점 개수: {diff_count}")`

			`if __name__ == "__main__":`
			`main()`