import base64
import json
import os
import re
from datetime import datetime
from difflib import SequenceMatcher
from pathlib import Path

from lxml import etree as ET
import pandas as pd

# from xpathSearch import XMLPathHandler


class XMLScorer:
    """Score ``*.hml`` XML answer files against criteria loaded from a JSON file.

    Each criterion is a JSON object keyed by an id such as ``"1-2"``; the part
    before the first ``-`` is used as the section for partial-score totals.
    The keys read from every criterion are ``path``, ``path2``,
    ``searchValue``, ``value``, ``points``, ``category`` and ``item``.
    """

    # Largest absolute difference tolerated for integer (size) answers
    # before the criterion's points are lost.
    SIZE_TOLERANCE = 5

    # Byte markers delimiting the chart XML inside the decoded OLE payload.
    # NOTE(review): the original marker literals were empty in the reviewed
    # copy (apparently stripped as markup). They are reconstructed here from
    # the chart namespace prefix ``c`` used by the scorer - confirm that the
    # embedded document really starts with ``<?xml`` and ends with
    # ``</c:chartSpace>``.
    CHART_XML_START = b'<?xml'
    CHART_XML_END = b'</c:chartSpace>'

    def __init__(self, scoring_criteria_path):
        """Load the scoring criteria from the JSON file at *scoring_criteria_path*."""
        self.scoring_criteria = self._load_scoring_criteria(scoring_criteria_path)

    def _load_scoring_criteria(self, file_path):
        """Return the parsed content of the UTF-8 JSON criteria file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _none_if_empty(result):
        """Map an empty XPath node list to ``None``; pass anything else through."""
        if type(result) is list and len(result) == 0:
            return None
        return result

    def query_xml(self, root, *args):
        """Evaluate a primary XPath and, if it under-scores, a fallback XPath.

        Args:
            root: lxml element the expressions are evaluated against.
            *args: ``(first_xpath, second_xpath, points)``. ``second_xpath``
                may be ``None`` to disable the fallback.

        Returns:
            The XPath result, or ``None`` when the primary result is an empty
            list, the returned fallback result is an empty list, or an
            expression is invalid.
        """
        first_xpath, second_xpath, points = args[0], args[1], args[2]
        try:
            result = self._none_if_empty(root.xpath(first_xpath))
            if result is None or second_xpath is None:
                return result
            # Only numeric/boolean results can be compared with the point
            # value; node lists and strings are returned unchanged instead of
            # raising TypeError on the comparison.
            comparable = isinstance(result, (int, float)) and points is not None
            if comparable and result < points:
                return self._none_if_empty(root.xpath(second_xpath))
            return result
        except ET.XPathEvalError:
            return None

    def chart_query_xml(self, tree, xpath, namespaces):
        """Evaluate *xpath* on the chart XML using the given namespace map.

        Returns ``None`` for an empty node list or an invalid expression,
        mirroring ``query_xml``.
        """
        try:
            return self._none_if_empty(tree.xpath(xpath, namespaces=namespaces))
        except ET.XPathEvalError:
            return None

    def find_similar_text(self, root, target_text, threshold=0.5):
        """Find the document text most similar to *target_text*.

        Candidates are the text nodes of ``CHAR`` elements and the ``Text``
        attribute of ``TEXTART`` elements anywhere under *root*.

        Args:
            root: lxml root element of the document.
            target_text (str): text to look for.
            threshold (float, optional): minimum ``SequenceMatcher`` ratio
                the best candidate must reach. Defaults to 0.5.

        Returns:
            str | None: the first candidate with the highest ratio, or
            ``None`` when that ratio is below *threshold*.
        """
        all_text = root.xpath("//CHAR/text() | //TEXTART/@Text")

        max_score = 0
        similar_text = ''
        for text in all_text:
            score = SequenceMatcher(None, target_text, text).ratio()
            if score > max_score:
                max_score, similar_text = score, text

        return similar_text if max_score >= threshold else None

    @staticmethod
    def _extract_answer(result, right_answer, points):
        """Turn a raw XPath result into the value compared with the answer."""
        # Exact type checks are deliberate: bool is a subclass of int, and
        # boolean XPath results must not be treated as numbers here.
        if type(result) is not list:
            # A float result is a sum of partial scores; cap it at the
            # criterion's maximum.
            if type(result) is float and result > points:
                return float(points)
            return result
        if not result:
            return None
        first = result[0]
        if type(right_answer) is int:
            try:
                return int(first)
            except (TypeError, ValueError):
                # Not numeric: keep the raw value so it scores as a mismatch
                # instead of aborting the whole file.
                return first
        return first

    def _award_points(self, actual_answer, right_answer, points):
        """Return the points earned for one criterion.

        * float answer: already a partial-score total, used as the score;
        * int answer: full points unless it differs from the expected value
          by more than ``SIZE_TOLERANCE``;
        * anything else: full points only when equal to the expected value.
        """
        if type(actual_answer) is float:
            return actual_answer
        if type(actual_answer) is int:
            within = abs(actual_answer - right_answer) <= self.SIZE_TOLERANCE
            return points if within else 0
        return points if right_answer == actual_answer else 0

    def _score_xml_file(self, xml_path, chart_xml):
        """Score one answer file.

        Args:
            xml_path: path of the ``.hml`` file.
            chart_xml: chart XML bytes extracted from the file, or ``None``
                when the file has no embedded chart.

        Returns:
            dict: ``filename``, ``score_results``, ``total_score`` and
            ``partial_scores`` on success; ``filename``, ``error`` and
            ``total_score`` (0) when XML parsing fails.
        """
        try:
            tree = ET.parse(str(xml_path))
            root = tree.getroot()

            namespaces = {
                'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
                'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart'
            }

            # Without a chart, query an empty element so every chart
            # criterion finds nothing and scores zero. Parsing an empty
            # string raises instead, which turned such files into errors.
            if chart_xml is None:
                chart_tree = ET.Element('root')
            else:
                chart_tree = ET.fromstring(chart_xml)

            total_score = 0
            partial_score = 0
            previous_section = None

            results = {
                'filename': os.path.basename(xml_path),
                'score_results': [],
                'total_score': 0,
                'partial_scores': []
            }
            print(f"File name: {results['filename']}")

            for criterion_id, criterion in self.scoring_criteria.items():
                # Close the running section total when the section changes.
                section = criterion_id.split('-')[0]
                if previous_section is not None and section != previous_section:
                    results['partial_scores'].append({
                        'section': previous_section,
                        'score': partial_score
                    })
                    partial_score = 0
                previous_section = section

                xpath = criterion['path']
                xpath2 = criterion.get('path2')
                search_value = criterion.get('searchValue')
                right_answer = criterion['value']
                points = criterion['points']
                category = criterion['category']
                item = criterion['item']

                # Substitute the closest text actually present in the
                # document (or the search value itself when nothing is
                # similar enough) into the XPath template.
                if search_value is not None:
                    similar_text = self.find_similar_text(root, search_value)
                    replacement = search_value if similar_text is None else similar_text
                    xpath = xpath.replace('{searchValue}', replacement)

                if "chart_xml" in category:
                    result = self.chart_query_xml(chart_tree, xpath, namespaces)
                else:
                    result = self.query_xml(root, xpath, xpath2, points)

                actual_answer = self._extract_answer(result, right_answer, points)
                scoring = {
                    'category': category,
                    'item': item,
                    'right_answer': right_answer,
                    'actual_answer': actual_answer,
                    'points': self._award_points(actual_answer, right_answer, points),
                    'deductions': []  # deduction reasons (not implemented yet)
                }

                results['score_results'].append(scoring)
                total_score += scoring['points']
                partial_score += scoring['points']
                print(f'scoring: {scoring}')

            results['total_score'] = total_score
            if previous_section is not None:
                results['partial_scores'].append({
                    'section': previous_section,
                    'score': partial_score
                })
            return results

        except ET.ParseError as e:
            return {
                'filename': os.path.basename(xml_path),
                'error': f"XML 파싱 오류: {str(e)}",
                'total_score': 0
            }

    def binary_to_chartxml(self, xml_path):
        """Extract the chart XML embedded as a base64 OLE object in an HML file.

        The extracted bytes are also written next to the input file, using
        the same base name with an ``.xml`` extension.

        Returns:
            bytes | None: the chart XML, or ``None`` when the file cannot be
            parsed, has no OLE ``BINDATA``, the payload is not valid base64,
            or the chart markers are not found.
        """
        print(f'binary_to_chartxml {xml_path}')
        try:
            tree = ET.parse(str(xml_path))
        except ET.ParseError:
            # The parse error is reported per file by _score_xml_file.
            return None
        root = tree.getroot()

        binary_data = root.xpath('//BINDATA[@Id=//BINITEM[@Format="OLE"]/@BinData]/text()')
        if not binary_data:
            return None

        # Drop line breaks and other whitespace from the base64 payload.
        encoded_data = b''.join(binary_data[0].encode('utf-8').split())

        try:
            # Extra '=' padding tolerates payloads stored without padding.
            decoded_data = base64.b64decode(encoded_data + b'==')
        except ValueError:  # binascii.Error is a ValueError subclass
            return None

        start = decoded_data.find(self.CHART_XML_START)
        if start == -1:
            return None
        end = decoded_data.find(self.CHART_XML_END, start)
        if end == -1:
            return None
        xml_data = decoded_data[start:end + len(self.CHART_XML_END)]

        base_filename = os.path.splitext(xml_path)[0]
        new_filename = f'{base_filename}.xml'
        with open(new_filename, 'wb') as file:
            file.write(xml_data)

        return xml_data

    def score_directory(self, xml_directory):
        """Score every ``*.hml`` file directly inside *xml_directory*.

        Files are processed in sorted path order so the result order is
        reproducible. Returns a list with one result dict per file.
        """
        results = []
        for xml_file in sorted(Path(xml_directory).glob('*.hml')):
            chart_xml = self.binary_to_chartxml(xml_file)
            results.append(self._score_xml_file(xml_file, chart_xml))
        return results

    def parse_filename(self, filename):
        """Split a ``<prefix>-<number>-<name>.hml`` file name.

        *filename* may be a string or a dict holding the name under the
        ``'파일명'`` key. Returns ``(number, name)`` as strings, or
        ``(None, None)`` when the name does not match the pattern.
        """
        if isinstance(filename, dict):
            filename = filename.get('파일명', '')

        match = re.match(r'.*-(\d+)-(.+)\.hml', filename)
        if match:
            return match.group(1), match.group(2)
        return None, None

    def export_to_excel(self, results, output_path=None):
        """Write scoring results to an Excel workbook and return its path.

        Two sheets are written: a per-file summary (file name, total and any
        error) and a per-criterion detail sheet. When *output_path* is
        ``None`` a timestamped ``scoring_results_<YYYYmmdd_HHMMSS>.xlsx``
        name is used.
        """
        if output_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"scoring_results_{timestamp}.xlsx"

        summary_data = []
        detail_data = []

        for result in results:
            summary_row = {
                '파일명': result['filename'],
                '총점': result.get('total_score', 0)
            }
            if 'error' in result:
                summary_row['오류'] = result['error']
            summary_data.append(summary_row)

            if 'score_results' in result:
                number, name = self.parse_filename(result['filename'])
                if number is None or name is None:
                    detail_row = {'채점항목': result['filename']}
                else:
                    detail_row = {'채점항목': f"{number}-{name}"}

                # Criteria are numbered 1..n in the order they were scored.
                for i, scoring in enumerate(result['score_results'], start=1):
                    detail_row[f'{i}'] = scoring['points']
                detail_row['총점'] = result.get('total_score', 0)
                detail_data.append(detail_row)

        summary_df = pd.DataFrame(summary_data)
        # NOTE(review): after transpose() the row labels ('채점항목', '1', ...,
        # '총점') live in the index, and index=False below leaves them out of
        # the sheet. Kept as-is to preserve the current layout - confirm this
        # is intended.
        detail_df = pd.DataFrame(detail_data).transpose()

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            summary_df.to_excel(writer, sheet_name='채점결과요약', index=False)
            detail_df.to_excel(writer, sheet_name='채점상세내역', index=False)

        return output_path


def main():
    """Score every HML file in ``./output`` and export the results to Excel."""
    scoring_criteria_path = r'./scoring_criteria.json'
    # Directory holding the xml (hml) answer files.
    xml_directory = r'./output'

    scorer = XMLScorer(scoring_criteria_path)
    results = scorer.score_directory(xml_directory)

    output_excel_path = scorer.export_to_excel(results)
    print(f"채점 결과 엑셀 파일: {output_excel_path}")


if __name__ == '__main__':
    main()