# diw/score5.py

import tkinter as tk
from tkinter import filedialog, messagebox
from datetime import datetime
import difflib
import json
from pathlib import Path
import os
from lxml import etree as ET
import re
from difflib import SequenceMatcher
import pandas as pd
import base64
# from xpathSearch import XMLPathHandler

class XMLScorer:
    # 채점 기준 경로 초기화
    def __init__(self, scoring_criteria_path):
        # 채점 기준 로드
        self.scoring_criteria = self._load_scoring_criteria(scoring_criteria_path)

    def set_typo_score(self, score) -> None:
        """Remember ``score`` as this scorer's typo score.

        Args:
            score: Value stored in ``self.typo_score`` and later handed back
                by ``get_typo_score``.
        """
        self.typo_score = score

    def get_typo_score(self):
        """Return the value most recently stored in ``self.typo_score``."""
        stored_score = self.typo_score
        return stored_score

    # 채점 기준파일 로드(JSON 파일)
    def _load_scoring_criteria(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    # XML 파일에서 element의 값을 찾아 반환
    def query_xml(self, root, *args):
        first_xpath = args[0]
        second_xpath = args[1]
        points = args[2]
        category = args[3]

        if ("특수문자" in category) and (second_xpath is not None):
            try:
                result = root.xpath(first_xpath)
                # 결과값이 리스트형인데 내부에 정보가 없는경우
                # 결과값이 없음
                if type(result) is list and len(result) == 0:
                    return None
                elif result < points:
                    result = root.xpath(second_xpath)
                    return result
                else:
                    return result

            except ET.XPathEvalError as e:
                return None

        elif second_xpath is not None:
            try:
                result1 = root.xpath(first_xpath)
                result2 = root.xpath(second_xpath)
                if (type(result1) is list and len(result1) == 0) and (type(result2) is list and len(result2) == 0):
                    return None
                return result1 if result1 else result2

            except ET.XPathEvalError as e:
                return None

        else:
            try:
                result = root.xpath(first_xpath)
                if type(result) is list and len(result) == 0:
                    return None
                return result
            except ET.XPathEvalError as e:
                return None

    def chart_query_xml(self, tree, xpath, namespaces):

        result = tree.xpath(xpath, namespaces=namespaces)
        if type(result) is list and len(result) == 0:
            return None

        return result


    # 유사한 텍스트 찾기
    def find_similar_text(self, root, target_text, threshold=0.7):
        """
        전체 문서에서 유사한 텍스트를 찾아 반환

        Args:
            root (_type_): xml root element 객체
            target_text (_type_): 찾을 텍스트
            threshold (float, optional): 유사도 설정 Defaults to 0.3.

        Returns:
            str: 유사도 기준을 만족하는 텍스트
        """
        # 전체 텍스트 추출
        # all_text = root.xpath(f"//CHAR/text()")
        # all_text.append(root.xpath(f"//TEXTART/@text"))

        namespaces = {
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart'
        }

        all_text = root.xpath(f"//BODY//text() | //TEXTART/@Text | //c:chart//text()", namespaces=namespaces)

        # 유사도 비교
        max_score = 0
        similar_text = ''

        for text in all_text:
            score = SequenceMatcher(None, target_text, text).ratio()

            if score > max_score:
                max_score = score
                similar_text = text

        if max_score >= threshold:
            return similar_text
        else:
            return None

    # 하나의 XML 파일 채점
    def _score_xml_file(self, xml_path, chart_xml):
        """Score one hml/XML document against every loaded criterion.

        Args:
            xml_path: Path of the document to parse and score.
            chart_xml: Chart XML handed to ``ET.fromstring``, or ``None``, in
                which case an empty placeholder document is parsed instead.

        Returns:
            dict: ``filename``, ``score_results`` (one entry per criterion),
            ``total_score`` and ``partial_scores`` (one entry per run of
            criteria sharing the key prefix before the first '-'). When
            parsing raises ``ET.ParseError`` the dict instead holds
            ``filename``, ``error`` and a ``total_score`` of 0.
        """
        try:
            tree = ET.parse(xml_path)
            root = tree.getroot()

            # Namespace prefixes passed to the chart XPath queries.
            namespaces = {
            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
            'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart'
            }

            # When there is no chart XML, parse an empty placeholder document
            # so chart criteria can still be evaluated (intended to score 0).
            if chart_xml is None:
                chart_tree = ET.fromstring('<xml></xml>')
            else:
                chart_tree = ET.fromstring(chart_xml)

            total_score = 0
            partial_score = 0
            previous_first_digit = None

            # Results are collected in a dictionary.
            results = {
                'filename': os.path.basename(xml_path),
                'score_results': [],
                'total_score': 0,
                'partial_scores': []
            }

            print(f"File name: {results['filename']}")

            for criterion_id, criterion in self.scoring_criteria.items():

                # Section prefix: the part of the key before the first '-'.
                # When it changes, flush the running subtotal of the
                # previous section and start a new one.
                first_digit = criterion_id.split('-')[0]
                if (previous_first_digit is not None) and (first_digit != previous_first_digit):
                    results['partial_scores'].append({
                        'section': previous_first_digit,
                        'score': partial_score
                    })
                    partial_score = 0

                previous_first_digit = first_digit

                id = criterion_id
                xpath = criterion['path']
                xpath2 = criterion['path2']
                search_value = criterion['searchValue']
                right_answer = criterion['value']
                points = criterion['points']
                category = criterion['category']
                item = criterion['item']
                similar_text = None

                # Criteria scored against the chart XML.
                if "chart_xml" in category:
                    if search_value is not None:
                        # Substitute the closest text found in the document
                        # for the placeholder; fall back to the literal
                        # search value when nothing is similar enough.
                        similar_text = self.find_similar_text(chart_tree, search_value)
                        if similar_text is None:
                            xpath = xpath.replace('{searchValue}', search_value)
                        else:
                            xpath = xpath.replace('{searchValue}', similar_text)

                    result = self.chart_query_xml(chart_tree, xpath, namespaces)

                # Everything else is scored against the hml document.
                else:
                    if search_value is not None:
                        similar_text = self.find_similar_text(root, search_value)
                        if similar_text is None:
                            xpath = xpath.replace('{searchValue}', search_value)
                        else:
                            xpath = xpath.replace('{searchValue}', similar_text)

                    result = self.query_xml(root, xpath, xpath2, points, category)

                # [ boolean results ]
                # 1. Effects such as italic, bold or underline may or may not
                #    produce [ITALIC] [BOLD] [UNDERLINE] tags, so their
                #    presence is judged as True/False.
                # 2. When two or more conditions must all hold they are joined
                #    with `and`, so the result is again True/False.
                # [ float results ]
                # 1. A sum of partial scores comes back as a float.
                if type(result) is not list:
                    # A float above the criterion's points is capped at them.
                    if type(result) is float and (result > points):
                        actual_answer = float(points)
                    else:
                        actual_answer = result
                else:
                    # List result: only the first item is used; it is
                    # converted to int when the expected answer is an int.
                    if type(right_answer) is int:
                        actual_answer = int(result[0])
                    else:
                        actual_answer = result[0]

                # Typo-deduction criteria take their points from the stored
                # typo score instead of the criteria file.
                if "오타감점" in category:
                    points = self.get_typo_score()

                scoring = {
                    'id': id,
                    'category': category,  # scoring category
                    'item': item,  # scoring item
                    'right_answer': right_answer,  # expected answer
                    'actual_answer': actual_answer,  # answer found in the document
                    'points': points,
                    'deductions': []  # per-criterion deduction details
                }

                # Deduction rules
                # 1. A float answer is the summed partial score of the
                #    criterion, so it is used as the score as-is.
                # 2. An int answer (size comparison) is deducted when it is
                #    outside the tolerance.
                # 3. Otherwise the points are deducted when the expected and
                #    actual answers differ.
                if type(actual_answer) is float:
                    scoring['points'] = actual_answer

                elif type(actual_answer) is int:
                    # Deduct when the difference is greater than 3
                    # (subtracting the full points leaves 0).
                    if abs(actual_answer - right_answer) > 3:
                        scoring['points'] -= points
                else:
                    # NOTE(review): the original comment says a null
                    # right_answer (the JSON value field) should mean no
                    # deduction, but this comparison only skips the deduction
                    # when actual_answer is also None - confirm the intent.
                    if right_answer != actual_answer:
                        scoring['points'] -= points


                # Deduction reasons are not written yet (in development).
                results['score_results'].append(scoring)
                total_score += scoring['points']
                partial_score += scoring['points']

                print(f'scoring: {scoring}')

            results['total_score'] = total_score

            # Flush the subtotal of the last section.
            if previous_first_digit is not None:
                results['partial_scores'].append({
                    'section': previous_first_digit,
                    'score': partial_score
                })

            return results

        except ET.ParseError as e:
            # Unparseable input: report the error with a total score of 0.
            return {
                'filename': os.path.basename(xml_path),
                'error': f"XML 파싱 오류: {str(e)}",
                'total_score': 0
            }

    def binary_to_chartxml(self, xml_path):
        """Extract the chart XML embedded in an hml file's OLE binary data.

        The base64 text of the BINDATA element referenced by the OLE BINITEM
        is decoded, and the bytes from ``<?xml`` up to and including
        ``</c:chartSpace>`` are taken as the chart document. On success the
        extracted bytes are also written next to the input file, using the
        same base name with an ``.xml`` extension.

        Args:
            xml_path: Path of the hml file to read.

        Returns:
            bytes | None: The extracted chart XML, or ``None`` when the file
            has no OLE binary data or the decoded data holds no complete
            chart document.
        """
        tree = ET.parse(xml_path)
        root = tree.getroot()

        payloads = root.xpath('//BINDATA[@Id=//BINITEM[@Format="OLE"]/@BinData]/text()')
        if not payloads:
            return None
        encoded_data = payloads[0].encode('utf-8')

        # Strip any BINDATA tag remnants and line breaks from the payload.
        encoded_data = re.sub(b'<BINDATA.*?>', b'', encoded_data)
        encoded_data = encoded_data.replace(b'</BINDATA>', b'')
        encoded_data = encoded_data.replace(b'\r\n', b'')

        # Extra '=' padding is appended in case the payload's own padding
        # was lost.
        decoded_data = base64.b64decode(encoded_data + b'==')

        # Locate the chart document before slicing. The closing tag is
        # searched from the XML declaration onwards so an earlier stray
        # occurrence cannot produce an empty or reversed slice.
        closing_tag = b'</c:chartSpace>'
        start = decoded_data.find(b'<?xml')
        if start == -1:
            return None
        end = decoded_data.find(closing_tag, start)
        if end == -1:
            return None
        xml_data = decoded_data[start:end + len(closing_tag)]

        # Keep a copy of the extracted chart XML beside the source file.
        base_filename = os.path.splitext(xml_path)[0]
        new_filename = f'{base_filename}.xml'
        with open(new_filename, 'wb') as file:
            file.write(xml_data)

        return xml_data

    def typo_check(self, xml_path_origin, xml_path):
        """Compare a submission's text with the answer file's text.

        The CHAR text of both documents (text outside HEADER/TABLE first,
        then TABLE text) is stripped of spaces, of "<digits>." markers and
        of '-' characters, joined into one string each, and diffed character
        by character. The typo score ``40 - min(2 * missing, 40)`` is stored
        through ``set_typo_score``, where ``missing`` counts characters of
        the answer that are absent from the submission.

        Args:
            xml_path_origin: Path of the answer (reference) hml file.
            xml_path: Path of the submitted hml file.

        Returns:
            list: The typo score followed by the differences, formatted as
            ``-x=>y`` (replaced), ``-x`` (missing) or ``+y`` (extra).
        """
        root = ET.parse(xml_path).getroot()
        root_origin = ET.parse(xml_path_origin).getroot()

        body_query = '//CHAR//text()[not(ancestor::HEADER) and not(ancestor::TABLE)]'
        table_query = '//TABLE//CHAR//text()'

        def collect(document_root):
            # Body text first, table text after it; then drop spaces,
            # numbering markers such as "1." and '-' characters.
            pieces = document_root.xpath(body_query) + document_root.xpath(table_query)
            return [
                re.sub(r'\d+\.\s*|-', '', piece.replace(' ', ''))
                for piece in pieces
            ]

        # Flatten each document into one string.
        input_text_str = ''.join(collect(root))
        input_text_origin_str = ''.join(collect(root_origin))

        print("input_text as string:")
        print(input_text_str)
        print("\ninput_text_origin as string:")
        print(input_text_origin_str)

        # Character-level diff: answer text versus submitted text.
        diff_list = list(difflib.ndiff(input_text_origin_str, input_text_str))

        result_diff = []
        # Characters missing from the submission; only these cost points.
        missing_list = []

        position = 0
        total = len(diff_list)
        while position < total:
            entry = diff_list[position]
            position += 1

            if entry.startswith('- '):
                removed = entry.replace('- ', '-')
                follows_with_add = position < total and diff_list[position].startswith('+ ')
                if follows_with_add:
                    # A removal directly followed by an addition is reported
                    # as one replaced character.
                    added = diff_list[position].replace('+ ', '')
                    result_diff.append(removed + '=>' + added)
                    position += 1
                else:
                    # Missing character.
                    result_diff.append(removed)
                    missing_list.append(removed)
            elif entry.startswith('+ '):
                # Extra character that is not in the answer.
                result_diff.append(entry.replace('+ ', '+'))

        # Two points off per missing character, capped at 40.
        temp = 40 - min(len(missing_list) * 2, 40)
        self.set_typo_score(temp)

        # The score goes in front of the difference list.
        result_diff.insert(0, temp)
        return result_diff

    # XML 파일 채점
    def score_directory(self, xml_directory, answer_path):
        # xml 파일 불러오기
        xml_files = Path(xml_directory).glob('*.hml')

        # 결과 저장할 리스트
        results = []

        for xml_file in xml_files:
            result = {}
            chart_xml = self.binary_to_chartxml(xml_file)
            result['typo'] = self.typo_check(answer_path, xml_file)
            result['score'] = self._score_xml_file(xml_file, chart_xml)
            # result['score']['score_results'][2]['points'] = result['typo'][0]
            results.append(result)
        return results

    def parse_filename(self, filename):
        """Pull the number and the name out of a ``...-<number>-<name>.hml`` filename.

        Args:
            filename: The filename string, or a dict whose ``'파일명'`` entry
                holds it (a missing entry is treated as an empty string).

        Returns:
            tuple: ``(number, name)`` as strings, or ``(None, None)`` when
            the filename does not match the pattern.
        """
        if isinstance(filename, dict):
            filename = filename.get('파일명', '')

        found = re.match(r'.*-(\d+)-(.+)\.hml', filename)
        if found is None:
            return None, None
        return found.group(1, 2)

    def export_to_excel(self, results, output_path=None):
        if output_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") #연월일_시분초
            # timestamp = datetime.now().strftime("%Y%m%d") #연월일
            output_path = f"scoring_results_{timestamp}.xlsx"

        summary_data = []
        detail_data = []
        typo_data = []

        for temp in results:
            # 요약 정보
            result = temp['score']
            summary_row = {
                '파일명': result['filename'],
                '총점': result.get('total_score', 0)
            }
            if 'error' in result:
                summary_row['오류'] = result['error']

            summary_data.append(summary_row)

            # 상세 정보
            if 'score_results' in result:
                filename = {'파일명': result['filename']}
                number, name = self.parse_filename(filename)
                if (number or name) is None:
                    detail_row = {'채점항목': result['filename'] }
                else:
                    detail_row = {'채점항목':f"{number}-{name}"}

                section_num = None
                partial_idx = 0
                row_index = []
                for i, score_result in enumerate(result['score_results']):
                    current_section = int(score_result['id'].split('-')[0])

                    if section_num is None:
                        section_num = current_section

                    # 다음 섹션(페이지)로 넘어갔을 경우
                    if current_section != section_num:
                        # 이전 섹션의 부분합을 출력
                        detail_row[f'문제{section_num}'] = result['partial_scores'][partial_idx]['score']
                        row_index.append(f'문제{section_num}')
                        partial_idx += 1
                        section_num = current_section

                    detail_row[f'{i+1}'] = score_result['points']
                    row_index.append(score_result['id'])

                # 마지막 섹션의 부분합을 출력
                if section_num is not None and partial_idx < len(result['partial_scores']):
                    detail_row[f'문제{section_num}'] = result['partial_scores'][partial_idx]['score']
                    row_index.append(f'문제{section_num}')

                detail_row['총점'] = result.get('total_score', 0)
                row_index.append('총점')
                detail_data.append(detail_row)

        summary_df = pd.DataFrame(summary_data)
        detail_df = pd.DataFrame(detail_data).transpose()
        detail_df.columns = detail_df.iloc[0]
        detail_df = detail_df[1:]

        detail_df.index = row_index
        # detail_df = pd.DataFrame(detail_data)

        for temp in results:
            result = temp['typo']
            typo_data.append(result)

        typo_df = pd.DataFrame(typo_data).transpose()
        # detail_df = pd.DataFrame(detail_data)

        # ExcelWriter 객체 생성
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            summary_df.to_excel(writer, sheet_name='채점결과요약', index=False)
            detail_df.to_excel(writer, sheet_name='채점상세내역', index=True)
            typo_df.to_excel(writer, sheet_name='오타내역', index=False)

            # 열 너비 자동 조정
            # for sheet_name in writer.sheets:
            #     worksheet = writer.sheets[sheet_name]
            #     for column_cells in worksheet.columns:
            #         max_length = 0
            #         column = column_cells[0].column_letter  # 열의 문자
            #         for cell in column_cells:
            #             try:
            #                 if cell.value:
            #                     max_length = max(max_length, len(str(cell.value)))
            #             except:
            #                 pass
            #         adjusted_width = (max_length + 2)
            #         worksheet.column_dimensions[column].width = adjusted_width

        return output_path


def main(variant='B'):
    """Score every hml file of one exam variant and export the results to Excel.

    Args:
        variant: Exam-variant letter used to build every path. The default
            ``'B'`` selects ``./DIW_2502B.json`` (criteria), ``./output/B``
            (submissions), ``./output/B/DIW_2502B.hml`` (answer file) and
            ``<yymmdd>_DIW_2502B_채점결과.xlsx`` (report).
    """
    exam_code = f'DIW_2502{variant}'

    # Scoring criteria (JSON) for this variant.
    scoring_criteria_path = f'./{exam_code}.json'

    # Directory holding the submitted hml files.
    xml_directory = f'./output/{variant}'

    # Answer file used as the reference for the typo check.
    # NOTE: it sits inside xml_directory, so score_directory's '*.hml' glob
    # scores it together with the submissions.
    answer_path = f'{xml_directory}/{exam_code}.hml'

    # Name of the Excel report.
    timestamp = datetime.now().strftime("%y%m%d")
    output_path = f"{timestamp}_{exam_code}_채점결과.xlsx"

    scorer = XMLScorer(scoring_criteria_path)

    # Score every hml file in the directory.
    results = scorer.score_directory(xml_directory, answer_path)

    # Save the results as an Excel workbook.
    output_excel_path = scorer.export_to_excel(results, output_path)
    print(f"채점 결과 엑셀 파일: {output_excel_path}")

# Run the batch scoring only when executed as a script, not on import.
if __name__ == '__main__':
    main()