Files
diw/binaryToChartxml.py

56 lines
1.9 KiB
Python

from pathlib import Path
from lxml import etree as ET
import base64
import re
class binaryToChartxml:
    """Extract the chart XML embedded in an OLE binary item of an XML document.

    The document is parsed once in ``__init__``; ``decoding_bindata`` then
    base64-decodes the ``BINDATA`` payload referenced by the ``BINITEM``
    whose ``Format`` is ``"OLE"`` and stores the embedded chart XML bytes
    in ``self.xml_data``.
    """

    # Marker for the start of the embedded XML document.
    _XML_START = b'<?xml'
    # Closing tag that ends the embedded chart document.
    _CHART_END = b'</c:chartSpace>'

    def __init__(self, xml_path):
        """Parse the document at ``xml_path`` and keep its tree and root."""
        self.tree = ET.parse(xml_path)
        self.root = self.tree.getroot()

    def decoding_bindata(self):
        """Decode the OLE ``BINDATA`` payload and store the chart XML.

        On return ``self.xml_data`` holds the bytes from the first
        ``<?xml`` marker up to and including the first following
        ``</c:chartSpace>``. If the decoded payload does not contain a
        complete chart document, ``self.xml_data`` is set to ``b''``
        instead of an arbitrary slice of the binary blob.

        Raises:
            IndexError: if the document has no ``BINDATA`` text matching
                a ``BINITEM`` with ``Format="OLE"``.
            binascii.Error: if the payload is not decodable base64.
        """
        matches = self.root.xpath(
            '//BINDATA[@Id=//BINITEM[@Format="OLE"]/@BinData]/text()'
        )
        if not matches:
            # Same exception type as indexing the empty result would raise,
            # but with a message that says what is actually missing.
            raise IndexError(
                'no BINDATA text found for a BINITEM with Format="OLE"'
            )

        encoded_data = matches[0].encode('utf-8')
        # Defensive clean-up: drop any literal BINDATA tags and CRLF line
        # breaks so only the base64 characters remain.
        encoded_data = re.sub(b'<BINDATA.*?>', b'', encoded_data)
        encoded_data = encoded_data.replace(b'</BINDATA>', b'')
        encoded_data = encoded_data.replace(b'\r\n', b'')

        # Appending '==' lets payloads stored without their trailing padding
        # decode; the lenient decoder ignores padding that is not needed.
        decoded_data = base64.b64decode(encoded_data + b'==')

        # Keep only the XML document embedded in the binary blob. The closing
        # tag is searched *after* the XML start so that an unrelated earlier
        # occurrence cannot produce a reversed or truncated slice.
        start = decoded_data.find(self._XML_START)
        end = decoded_data.find(self._CHART_END, start) if start != -1 else -1
        if start == -1 or end == -1:
            self.xml_data = b''
            return
        self.xml_data = decoded_data[start:end + len(self._CHART_END)]
# def save_chart_xml(self, xml_output_path):
# def save_chart_xml(self):
# with open(, 'wb') as file:
# file.write(self.xml_data)
# Read the XML file
# xml_path = r"C:\Users\dra\project\HWP-Scoring\output\워드(한글)-009865-고미송.hml"
# tree = ET.parse(xml_path)
# root = tree.getroot()
# # Extract the binary section with XPath
# binary_data = root.xpath('//BINDATA[@Id=//BINITEM[@Format="OLE"]/@BinData]/text()')
# binary_data = str(binary_data[0])
# print(f'binary : {binary_data}')
# # Base64 decoding
# decoded_data = base64.b64decode(binary_data)
# # Save the decoded data to a file
# output_file = f"decoded_output_test.bin"
# with open(output_file, "wb") as decoded_file:
# decoded_file.write(decoded_data)