# diw/score5.py

from datetime import datetime
import json
from pathlib import Path
import os
from lxml import etree as ET
import re
from difflib import SequenceMatcher
import pandas as pd
# from xpathSearch import XMLPathHandler

from binaryToChartxml import binaryToChartxml


class XMLScorer:
    # 채점 기준 경로 초기화
    def __init__(self, scoring_criteria_path):
        # 채점 기준 로드
        self.scoring_criteria = self._load_scoring_criteria(scoring_criteria_path)

    # 채점 기준파일 로드(JSON 파일)
    def _load_scoring_criteria(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    # XML 파일에서 element의 값을 찾아 반환
    def query_xml(self, root, *args):
        points = args[2]
        if args[1] is not None:
            try:
                result = root.xpath(args[0])
                if type(result) is list and len(result) == 0:
                    return None
                elif result < points:
                    result = root.xpath(args[1])
                    return result
                else:
                    return result
                    # result = root.xpath(args[1])
                    # print(f'result : {result}')
                    # return result
            except ET.XPathEvalError as e:
                return None
        else:
            try:
                result = root.xpath(args[0])
                if type(result) is list and len(result) == 0:
                    return None

                return result
            except ET.XPathEvalError as e:
                return None

    # 유사한 텍스트 찾기
    def find_similar_text(self, root, target_text, threshold=0.5):
        """
        전체 문서에서 유사한 텍스트를 찾아 반환

        Args:
            root (_type_): xml root element 객체
            target_text (_type_): 찾을 텍스트
            threshold (float, optional): 유사도 설정 Defaults to 0.3.

        Returns:
            str: 유사도 기준을 만족하는 텍스트
        """
        # 전체 텍스트 추출
        # all_text = root.xpath(f"//CHAR/text()")
        # all_text.append(root.xpath(f"//TEXTART/@text"))

        all_text = root.xpath(f"//CHAR/text() | //TEXTART/@Text")

        # 유사도 비교
        max_score = 0
        similar_text = ''

        for text in all_text:
            score = SequenceMatcher(None, target_text, text).ratio()

            if score > max_score:
                max_score = score
                similar_text = text

        if max_score >= threshold:
            return similar_text
        else:
            return None

    # Score a single XML file
    def _score_xml_file(self, xml_path):
        """Score one XML (.hml) file against every loaded criterion.

        For each criterion the submitted answer is located with ``query_xml``.
        When the criterion has a search value, the ``{searchValue}``
        placeholder in its XPath is first replaced with the most similar text
        found in the document (or with the search value itself when nothing
        similar is found). The answer is then compared with the expected
        value to decide the awarded points.

        Criterion keys read: ``path``, ``value``, ``points``, ``category`` and
        ``item`` (required); ``path2`` and ``searchValue`` (optional, treated
        as None when missing).

        Args:
            xml_path: Path of the XML file to parse and score.

        Returns:
            dict: ``filename``, ``score_results`` (one entry per criterion
            with ``category``, ``item``, ``right_answer``, ``actual_answer``,
            ``points`` and ``deductions``) and ``total_score``. When the file
            cannot be parsed, a dict with ``filename``, ``error`` and a
            ``total_score`` of 0 is returned instead.
        """
        filename = os.path.basename(xml_path)

        # Only parsing can raise ParseError, so keep the try block minimal
        try:
            root = ET.parse(xml_path).getroot()
        except ET.ParseError as e:
            return {
                'filename': filename,
                'error': f"XML 파싱 오류: {str(e)}",
                'total_score': 0
            }

        # Results are collected in a dictionary
        results = {
            'filename': filename,
            'score_results': [],
            'total_score': 0,
        }

        print(f"File name: {results['filename']}")

        total_score = 0
        for criterion in self.scoring_criteria.values():
            xpath = criterion['path']
            search_value = criterion.get('searchValue')
            right_answer = criterion['value']
            points = criterion['points']

            # With a search value, substitute the closest document text
            if search_value is not None:
                similar_text = self.find_similar_text(root, search_value)
                replacement = search_value if similar_text is None else similar_text
                xpath = xpath.replace('{searchValue}', replacement)

            # Locate the submitted answer via XPath
            result = self.query_xml(root, xpath, criterion.get('path2'), points)
            actual_answer = self._resolve_actual_answer(result, right_answer, points)

            scoring = {
                'category': criterion['category'],  # scoring category
                'item': criterion['item'],  # scoring item
                'right_answer': right_answer,  # expected answer
                'actual_answer': actual_answer,  # submitted answer
                'points': self._award_points(actual_answer, right_answer, points),
                'deductions': []  # per-criterion deduction notes (not populated yet)
            }

            results['score_results'].append(scoring)
            total_score += scoring['points']

            print(f'scoring: {scoring}')

        results['total_score'] = total_score
        return results

    @staticmethod
    def _resolve_actual_answer(result, right_answer, points):
        """Reduce a ``query_xml`` result to the single value that is scored.

        * Non-list results (boolean, float, string, None) are used as they
          are, except that a float above ``points`` is capped at ``points``
          because a float is a sum of partial scores.
        * For list results the first entry is used; it is converted to int
          when the expected answer is an int (size comparison). An empty list
          yields None, and an entry that cannot be converted is kept
          unconverted so it is scored as a mismatch instead of raising.
        """
        if not isinstance(result, list):
            if isinstance(result, float) and result > points:
                return float(points)
            return result

        if not result:
            return None

        first = result[0]
        expects_int = isinstance(right_answer, int) and not isinstance(right_answer, bool)
        if not expects_int:
            return first
        try:
            return int(first)
        except (TypeError, ValueError):
            return first

    @staticmethod
    def _award_points(actual_answer, right_answer, points):
        """Return the points earned for one criterion.

        1. A float answer is a sum of partial scores and is used as the score.
        2. An int answer (size comparison) earns full points unless it differs
           from the expected value by more than the tolerance of 5.
        3. Any other answer earns full points only when it equals the expected
           value; None compared with an expected value of None (JSON null)
           therefore earns full points.
        """
        tolerance = 5

        if isinstance(actual_answer, float):
            return actual_answer
        if isinstance(actual_answer, int) and not isinstance(actual_answer, bool):
            return 0 if abs(actual_answer - right_answer) > tolerance else points
        return 0 if right_answer != actual_answer else points
    # def binary_to_chartxml(self, xml_path):

    # XML 파일 채점
    def score_directory(self, xml_directory):

        # xml 파일 불러오기
        xml_files = Path(xml_directory).glob('*.hml')

        # 결과 저장할 리스트
        results = []

        for xml_file in xml_files:
            self.binary_to_chartxml(xml_file)
            result = self._score_xml_file(xml_file)
            results.append(result)
        return results

    def parse_filename(self, filename):
        """Extract the examinee number and name from an ``.hml`` file name.

        The name is matched against ``<anything>-<digits>-<name>.hml``.

        Args:
            filename: File name string, or a dict whose ``'파일명'`` entry
                holds the file name (a missing entry counts as an empty
                string).

        Returns:
            tuple: ``(number, name)`` as strings, or ``(None, None)`` when
            the name does not match the pattern.
        """
        raw_name = filename.get('파일명', '') if isinstance(filename, dict) else filename

        found = re.match(r'.*-(\d+)-(.+)\.hml', raw_name)
        if found is None:
            return None, None
        return found.groups()

    def export_to_excel(self, results, output_path=None):
        """Write scoring results to an Excel workbook with two sheets.

        * ``채점결과요약`` -- one row per file with its name, total score and,
          when present, the error message.
        * ``채점상세내역`` -- per-file point breakdown, transposed so that each
          scored file becomes a column.

        Args:
            results: Iterable of result dicts as produced by
                ``_score_xml_file``.
            output_path: Destination file. When None, a name of the form
                ``scoring_results_<YYYYmmdd_HHMMSS>.xlsx`` is generated from
                the current local time.

        Returns:
            The path the workbook was written to.
        """
        if output_path is None:
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"scoring_results_{stamp}.xlsx"

        summary_rows = []
        detail_rows = []

        for entry in results:
            # Summary sheet: file name, total score and optional error
            overview = {
                '파일명': entry['filename'],
                '총점': entry.get('total_score', 0)
            }
            if 'error' in entry:
                overview['오류'] = entry['error']
            summary_rows.append(overview)

            # Detail sheet: only files that were actually scored
            if 'score_results' not in entry:
                continue

            number, name = self.parse_filename(entry['filename'])
            breakdown = {'수험자': f"{number}-{name}"}
            for position, scoring in enumerate(entry['score_results'], start=1):
                breakdown[f'점수_{position}'] = scoring['points']
            breakdown['총점'] = entry.get('total_score', 0)
            detail_rows.append(breakdown)

        summary_frame = pd.DataFrame(summary_rows)
        # NOTE(review): after the transpose the row labels ('수험자',
        # '점수_N', '총점') live in the index, which index=False below leaves
        # out of the sheet -- confirm that this layout is intended.
        detail_frame = pd.DataFrame(detail_rows).T

        # Write both sheets through a single ExcelWriter
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            summary_frame.to_excel(writer, sheet_name='채점결과요약', index=False)
            detail_frame.to_excel(writer, sheet_name='채점상세내역', index=False)
            # Column widths are left at their defaults (no auto-fit)

        return output_path


def main():
    """Score every .hml file in the configured directory and export to Excel."""
    # Location of the JSON scoring criteria
    criteria_file = r'C:\Users\dra\project\HWP-Scoring\scoring_criteria.json'

    # Directory holding the xml (.hml) files to score
    hml_directory = r'C:\Users\dra\project\HWP-Scoring\output'

    # Build the scorer and score every file in the directory
    scorer = XMLScorer(criteria_file)
    scored_files = scorer.score_directory(hml_directory)

    # Save the scoring results as an Excel workbook and report its path
    excel_file = scorer.export_to_excel(scored_files)
    print(f"채점 결과 엑셀 파일: {excel_file}")

# Run the scoring pipeline only when this file is executed as a script
if __name__ == '__main__':
    main()