# Files
# diw/diff_hwp.py
# 2025-01-08 17:43:24 +09:00
#
# 47 lines
# 1.4 KiB
# Python

import difflib
import zlib

import olefile
def extract_text_from_hwp(file_path):
    """
    Extract text from the first body section of an HWP file.

    Opens ``file_path`` as an OLE compound file, reads the
    ``BodyText/Section0`` stream, inflates it when the ``FileHeader``
    stream flags the document as compressed, and decodes the resulting
    bytes as UTF-16LE (undecodable bytes are dropped).

    Args:
        file_path: Path of the HWP file to read.

    Returns:
        The decoded text, or ``''`` when the stream is missing or any
        error occurs. A diagnostic message is printed in both cases.
    """
    try:
        ole = olefile.OleFileIO(file_path)
        try:
            if not ole.exists('BodyText/Section0'):
                print(f"No 'BodyText/Section0' stream found in {file_path}")
                return ''

            # HWP 5 FileHeader layout: 32-byte signature, 4-byte version,
            # then a little-endian DWORD of property flags whose bit 0
            # means "body streams are compressed".
            compressed = False
            if ole.exists('FileHeader'):
                header = ole.openstream('FileHeader').read()
                if len(header) >= 40:
                    flags = int.from_bytes(header[36:40], 'little')
                    compressed = bool(flags & 0x01)

            content = ole.openstream('BodyText/Section0').read()
            if compressed:
                # Body streams are raw deflate data (no zlib header),
                # hence the negative window size.
                content = zlib.decompress(content, -15)
        finally:
            # Always release the underlying file handle, even on errors
            # or the early return above.
            ole.close()

        # NOTE(review): the whole stream is decoded, as before; per the HWP
        # spec it is a sequence of tagged records, so the result presumably
        # still contains non-text record bytes -- confirm whether
        # record-level parsing is wanted.
        return content.decode('utf-16le', errors='ignore')
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller
        # treat the empty string as "nothing extracted".
        print(f"Error extracting text from {file_path}: {e}")
        return ''
def compare_texts(text1, text2):
    """
    Compare two texts line by line and count the differing lines.

    Both texts are split with ``str.splitlines`` and fed to
    ``difflib.ndiff``; every delta entry marked as an addition (``'+ '``)
    or a removal (``'- '``) counts as one difference. Unchanged lines and
    ``'? '`` hint lines are not counted.

    Args:
        text1: The first text.
        text2: The second text.

    Returns:
        The number of added plus removed lines.
    """
    change_prefixes = ('+ ', '- ')
    delta = difflib.ndiff(text1.splitlines(), text2.splitlines())
    return sum(1 for entry in delta if entry.startswith(change_prefixes))
def main(
    file1=r'C:\Users\dra\project\HWP-Scoring\input\원본.hwp',
    file2=r'C:\Users\dra\project\HWP-Scoring\input\원본 copy.hwp',
):
    """
    Extract text from two HWP files and print how many lines differ.

    Args:
        file1: Path of the first HWP file. Defaults to the original
            hard-coded input so ``main()`` keeps its previous behavior.
        file2: Path of the second HWP file. Defaults to the original
            hard-coded copy.

    Returns:
        None. Prints a failure message and returns early when either
        extraction yields an empty string; otherwise prints the number
        of differences.
    """
    text1 = extract_text_from_hwp(file1)
    text2 = extract_text_from_hwp(file2)
    if not text1 or not text2:
        print("텍스트를 추출하지 못했습니다.")
        return
    diff_count = compare_texts(text1, text2)
    print(f"차이점 개수: {diff_count}")


if __name__ == "__main__":
    import sys

    # Up to two optional command-line paths override the default inputs;
    # with no arguments the script compares the default files as before.
    main(*sys.argv[1:3])