47 lines
1.4 KiB
Python
47 lines
1.4 KiB
Python
|
|
import olefile
|
||
|
|
import difflib
|
||
|
|
|
||
|
|
def extract_text_from_hwp(file_path):
    """
    Extract text from an HWP file.

    Opens ``file_path`` as an OLE compound file, reads the
    ``BodyText/Section0`` stream and decodes its bytes as UTF-16LE,
    silently dropping byte sequences that cannot be decoded.

    Args:
        file_path: Location of the HWP file, passed to ``olefile.OleFileIO``.

    Returns:
        The decoded text. An empty string is returned (after printing a
        message) when the stream does not exist or when any error occurs.
    """
    try:
        ole = olefile.OleFileIO(file_path)
        try:
            if not ole.exists('BodyText/Section0'):
                print(f"No 'BodyText/Section0' stream found in {file_path}")
                return ''

            content = ole.openstream('BodyText/Section0').read()
            # NOTE(review): the stream bytes are decoded as-is. HWP 5.x body
            # sections are presumably zlib-compressed record streams -- confirm
            # whether decompression/record parsing is needed before decoding.
            return content.decode('utf-16le', errors='ignore')
        finally:
            # Release the underlying file handle on every path (success,
            # missing stream, or exception); previously it was never closed.
            ole.close()
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and fall back
        # to an empty string so the caller can detect the failure.
        print(f"Error extracting text from {file_path}: {e}")
        return ''
|
||
|
|
|
||
|
|
def compare_texts(text1, text2):
    """
    Compare two texts line by line and count the differences.

    Both texts are split into lines and fed to ``difflib.ndiff``; every
    resulting delta line marked as an addition (``'+ '``) or a removal
    (``'- '``) counts as one difference.

    Args:
        text1: The first text.
        text2: The second text.

    Returns:
        The number of added plus removed lines reported by the diff.
    """
    changed_markers = ('+ ', '- ')
    delta = difflib.ndiff(text1.splitlines(), text2.splitlines())
    return sum(1 for entry in delta if entry.startswith(changed_markers))
|
||
|
|
|
||
|
|
def main(file1=r'C:\Users\dra\project\HWP-Scoring\input\원본.hwp',
         file2=r'C:\Users\dra\project\HWP-Scoring\input\원본 copy.hwp'):
    """
    Extract the text of two HWP files and print how many lines differ.

    Args:
        file1: Path of the first HWP file. Defaults to the original
            hard-coded input path, so calling ``main()`` with no arguments
            behaves as before.
        file2: Path of the second HWP file. Defaults to the original
            hard-coded copy path.

    Returns:
        None. The result (or a failure message) is printed.
    """
    text1 = extract_text_from_hwp(file1)
    text2 = extract_text_from_hwp(file2)

    # extract_text_from_hwp returns '' on failure, so an empty result from
    # either file means there is nothing meaningful to compare.
    if not (text1 and text2):
        print("텍스트를 추출하지 못했습니다.")
        return

    diff_count = compare_texts(text1, text2)
    print(f"차이점 개수: {diff_count}")
|
||
|
|
|
||
|
|
# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|