# diw/score2.py

import json
import xml.etree.ElementTree as ET
import os
from pathlib import Path
import pandas as pd
from datetime import datetime
from Levenshtein import distance as levenshtein_distance

class XMLScorer:
    """Score XML files against a JSON rubric while tolerating small typos.

    Each rubric entry names an element (``ele``), an attribute (``arg``), the
    expected attribute value (``value``) and the points it is worth
    (``points``). Element names, attribute names and values are compared with
    a normalised Levenshtein similarity; the average of the three selects the
    fraction of the points that is awarded.
    """

    def __init__(self, scoring_criteria_path):
        """Load the rubric JSON file and set up the typo penalty table.

        Args:
            scoring_criteria_path (str): Path to the rubric JSON file.
        """
        self.scoring_criteria = self._load_scoring_criteria(scoring_criteria_path)
        # Fraction of a criterion's points awarded for each match quality.
        self.typo_penalties = {
            'exact': 1.0,     # perfect match: no penalty
            'slight': 0.9,    # a few typos: 90% of the points
            'moderate': 0.7,  # noticeable typos: 70% of the points
            'severe': 0.0,    # too different: no points
        }

    def _load_scoring_criteria(self, file_path):
        """Read the rubric from a UTF-8 encoded JSON file.

        Args:
            file_path (str): Path to the JSON file.

        Returns:
            dict: The parsed rubric.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _calculate_similarity_score(self, str1, str2):
        """Return the normalised Levenshtein similarity of two strings.

        Args:
            str1 (str): First string, or None.
            str2 (str): Second string, or None.

        Returns:
            float: 0.0 when either argument is None, 1.0 for equal strings,
                otherwise ``1 - distance / max(len(str1), len(str2))``.
        """
        if str1 is None or str2 is None:
            return 0.0

        # Equal strings have distance 0; skip the distance computation.
        if str1 == str2:
            return 1.0

        max_len = max(len(str1), len(str2))
        if max_len == 0:
            return 1.0

        distance = levenshtein_distance(str1, str2)
        return 1 - (distance / max_len)

    def _get_penalty_factor(self, similarity):
        """Map a similarity to the fraction of points that is awarded.

        Args:
            similarity (float): Similarity score.

        Returns:
            float: 1.0 for a perfect match (similarity >= 1.0), the 'slight'
                factor for >= 0.9, the 'moderate' factor for >= 0.7 and the
                'severe' factor otherwise.
        """
        if similarity >= 1.0:
            # A flawless answer must not be charged the typo penalty. The
            # default keeps this working if typo_penalties was replaced by a
            # table without an 'exact' entry.
            return self.typo_penalties.get('exact', 1.0)
        if similarity >= 0.9:
            return self.typo_penalties['slight']
        if similarity >= 0.7:
            return self.typo_penalties['moderate']
        return self.typo_penalties['severe']

    def _find_best_matching_element(self, root, target_element):
        """Find the element whose tag is most similar to ``target_element``.

        The whole tree below ``root`` (including ``root``) is searched and the
        first element with the highest similarity wins. Tags are compared
        verbatim.

        Args:
            root (Element): Root element of the XML tree.
            target_element (str): Element name to look for.

        Returns:
            tuple: ``(element, similarity)``; ``(None, 0.0)`` when no element
                has a similarity above zero.
        """
        best_match = None
        best_similarity = 0.0

        for element in root.iter():
            # Comment / processing-instruction nodes have non-string tags.
            if not isinstance(element.tag, str):
                continue
            similarity = self._calculate_similarity_score(element.tag, target_element)
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = element
                if best_similarity >= 1.0:
                    break  # nothing can beat an exact match

        return best_match, best_similarity

    def _find_element_value(self, root, element_name, attribute_name):
        """Look up an attribute value, tolerating typos in both names.

        Args:
            root (Element): Root element of the XML tree.
            element_name (str): Element name to look for.
            attribute_name (str): Attribute name to look for.

        Returns:
            tuple: ``(attribute value, element similarity, attribute
                similarity)``. The value is None when the best-matching
                element has no attribute with a similarity above zero;
                ``(None, 0.0, 0.0)`` when no element matches at all.
        """
        element, element_similarity = self._find_best_matching_element(root, element_name)
        if element is None:
            return None, 0.0, 0.0

        # Pick the attribute whose name is closest to the requested one.
        best_attr_value = None
        best_attr_similarity = 0.0
        for attr_name, attr_value in element.attrib.items():
            attr_similarity = self._calculate_similarity_score(attr_name, attribute_name)
            if attr_similarity > best_attr_similarity:
                best_attr_similarity = attr_similarity
                best_attr_value = attr_value
                if best_attr_similarity >= 1.0:
                    break  # nothing can beat an exact match

        return best_attr_value, element_similarity, best_attr_similarity

    def _score_criterion(self, root, criterion):
        """Score one rubric entry against a parsed XML tree.

        Args:
            root (Element): Root element of the XML tree.
            criterion (dict): Rubric entry with the keys ``ele``, ``arg``,
                ``value`` and ``points``.

        Returns:
            dict: Match record with the expected and actual values, the three
                similarities, their average, the penalty factor and the
                awarded points.
        """
        element_name = criterion['ele']
        attribute_name = criterion['arg']
        expected_value = criterion['value']
        points = criterion['points']

        actual_value, element_similarity, attr_similarity = self._find_element_value(
            root, element_name, attribute_name)

        # A missing attribute must score zero; converting None to the text
        # "None" would compare that literal against the expected value.
        if actual_value is None:
            value_similarity = 0.0
        else:
            value_similarity = self._calculate_similarity_score(
                str(actual_value), str(expected_value))

        # Overall similarity is the plain average of the three parts.
        total_similarity = (element_similarity + attr_similarity + value_similarity) / 3
        penalty_factor = self._get_penalty_factor(total_similarity)

        return {
            'criterion': f"{element_name}.{attribute_name}",
            'expected': expected_value,
            'actual': actual_value,
            'element_similarity': round(element_similarity, 3),
            'attribute_similarity': round(attr_similarity, 3),
            'value_similarity': round(value_similarity, 3),
            'total_similarity': round(total_similarity, 3),
            'penalty_factor': penalty_factor,
            'points': round(points * penalty_factor, 2)
        }

    def score_xml_file(self, xml_path):
        """Score a single XML file against every rubric entry.

        Args:
            xml_path (str): Path to the XML file.

        Returns:
            dict: ``filename``, ``criteria_matches`` and ``total_score`` on
                success; ``filename``, ``error`` and ``total_score`` (0) when
                the file is not well-formed XML.
        """
        filename = os.path.basename(xml_path)

        try:
            root = ET.parse(xml_path).getroot()
        except ET.ParseError as e:
            return {
                'filename': filename,
                'error': f"XML 파싱 오류: {str(e)}",
                'total_score': 0
            }

        matches = [
            self._score_criterion(root, criterion)
            for criterion in self.scoring_criteria.values()
        ]
        total_score = sum(match['points'] for match in matches)

        return {
            'filename': filename,
            'criteria_matches': matches,
            'total_score': round(total_score, 2)
        }

    def score_directory(self, xml_directory):
        """Score every ``*.xml`` file directly inside a directory.

        Args:
            xml_directory (str): Directory containing the XML files.

        Returns:
            list: One result dict per file, ordered by path.
        """
        # glob() yields files in filesystem order; sort for stable reports.
        xml_files = sorted(Path(xml_directory).glob('*.xml'))
        return [self.score_xml_file(str(xml_file)) for xml_file in xml_files]

    def export_to_excel(self, results, output_path=None):
        """Write the scoring results to an Excel workbook with two sheets.

        The first sheet holds one summary row per file, the second one row
        per scored criterion.

        Args:
            results (list): Result dicts as returned by ``score_xml_file``.
            output_path (str, optional): Output file path. Defaults to
                ``scoring_results_<timestamp>.xlsx`` in the current directory.

        Returns:
            str: Path of the workbook that was written.
        """
        if output_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"scoring_results_{timestamp}.xlsx"

        summary_data = []
        detail_data = []

        for result in results:
            # Summary row; the error column only exists for failed files.
            summary_row = {
                '파일명': result['filename'],
                '총점': result.get('total_score', 0)
            }
            if 'error' in result:
                summary_row['오류'] = result['error']
            summary_data.append(summary_row)

            # Detail rows; failed files have no 'criteria_matches' entry.
            for match in result.get('criteria_matches', []):
                detail_data.append({
                    '파일명': result['filename'],
                    '채점항목': match['criterion'],
                    '기대값': match['expected'],
                    '실제값': match['actual'],
                    '요소유사도': match['element_similarity'],
                    '속성유사도': match['attribute_similarity'],
                    '값유사도': match['value_similarity'],
                    '전체유사도': match['total_similarity'],
                    '감점계수': match['penalty_factor'],
                    '획득점수': match['points']
                })

        summary_df = pd.DataFrame(summary_data)
        detail_df = pd.DataFrame(detail_data)

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            summary_df.to_excel(writer, sheet_name='채점결과요약', index=False)
            detail_df.to_excel(writer, sheet_name='채점상세내역', index=False)

            # Widen each column to its longest cell text plus a small margin.
            for worksheet in writer.sheets.values():
                for column in worksheet.columns:
                    cells = tuple(column)
                    if not cells:
                        continue
                    max_length = max(len(str(cell.value)) for cell in cells)
                    worksheet.column_dimensions[cells[0].column_letter].width = max_length + 2

        return output_path

# Usage example: score every XML file in a directory and export the results.
def main(scoring_criteria_path="scoring_criteria.json",
         xml_directory=r"C:\Users\gzero-ser7-win11\Documents\hwpTest\Output"):
    """Score every XML file in a directory, print the results and export them.

    Args:
        scoring_criteria_path (str): Path to the rubric JSON file. Defaults
            to the path that was previously hard-coded.
        xml_directory (str): Directory containing the XML files to score.
            Defaults to the directory that was previously hard-coded.
    """
    scorer = XMLScorer(scoring_criteria_path)

    # Score all XML files found in the directory.
    results = scorer.score_directory(xml_directory)

    # Print a per-file report; files that failed to parse only show the error.
    for result in results:
        print(f"\n파일: {result['filename']}")
        if 'error' in result:
            print(f"오류: {result['error']}")
            continue

        print(f"총점: {result['total_score']}")
        print("\n채점 세부사항:")
        for match in result['criteria_matches']:
            print(f"기준: {match['criterion']}")
            print(f"기대값: {match['expected']}")
            print(f"실제값: {match['actual']}")
            print(f"획득 점수: {match['points']}")
            print("---")

    # Save the results to an Excel workbook.
    excel_path = scorer.export_to_excel(results)
    print(f"\n채점 결과가 다음 경로에 저장되었습니다: {excel_path}")

# Run the usage example only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()