# score5.py

from datetime import datetime
import json
import glob
from pathlib import Path
import os
from lxml import etree as ET
from difflib import SequenceMatcher
import pandas as pd
# from xpathSearch import XMLPathHandler


class XMLScorer:
    # 채점 기준 경로 초기화
    def __init__(self, scoring_criteria_path):
        # 채점 기준 로드
        self.scoring_criteria = self._load_scoring_criteria(scoring_criteria_path)
    
    # 채점 기준파일 로드(JSON 파일)
    def _load_scoring_criteria(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    # XML 파일에서 element의 값을 찾아 반환  
    def query_xml(self, root, query):
        try:
            result = root.xpath(query)
            if type(result) is list and len(result) == 0:
                return None
            
            return result
        except ET.XPathEvalError as e:
            return None
    
    # 유사한 텍스트 찾기
    def find_similar_text(self, root, target_text, threshold=0.3):
        """ 
        전체 문서에서 유사한 텍스트를 찾아 반환

        Args:
            root (_type_): xml root element 객체
            target_text (_type_): 찾을 텍스트
            threshold (float, optional): 유사도 설정 Defaults to 0.3.

        Returns:
            str: 유사도 기준을 만족하는 텍스트 
        """
        # 전체 텍스트 추출
        # all_text = root.xpath(f"//CHAR/text()")
        # all_text.append(root.xpath(f"//TEXTART/@text"))
        all_text = root.xpath(f"//CHAR/text() | //TEXTART/@Text")
        
        # 유사도 비교
        max_score = 0
        similar_text = ''
        
        for text in all_text:
            score = SequenceMatcher(None, target_text, text).ratio()
            
            if score > max_score:
                max_score = score
                similar_text = text
                
        if max_score >= threshold:
            return similar_text
        else:
            return None
    
    # 하나의 XML 파일 채점
    def _score_xml_file(self, xml_path):
        """Score one XML (HML) answer file against every loaded criterion.

        For each entry of ``self.scoring_criteria`` the XPath stored under
        ``'path'`` is evaluated on the parsed document. When the criterion
        has a non-null ``'searchValue'``, the ``{searchValue}`` placeholder in
        the XPath is first replaced by the most similar text found in the
        document (or by an empty string when nothing is similar enough).

        Scoring rule per criterion:
            * a ``float`` query result is taken as the earned points as-is
              (the XPath itself computed a partial-score sum);
            * otherwise the full ``'points'`` are earned when the result
              equals ``'value'``, and 0 when it does not.

        Args:
            xml_path: Path of the file to parse and score.

        Returns:
            dict: On success ``{'filename', 'score_results', 'total_score'}``
            where ``score_results`` holds one dict per criterion with the keys
            ``category``, ``item``, ``right_answer``, ``actual_answer``,
            ``points`` and ``deductions``. When parsing raises
            ``ET.ParseError`` the dict is
            ``{'filename', 'error', 'total_score': 0}`` instead.
        """
        filename = os.path.basename(xml_path)

        # Only parsing can raise ParseError, so keep the guarded region small.
        try:
            root = ET.parse(xml_path).getroot()
        except ET.ParseError as e:
            return {
                'filename': filename,
                'error': f"XML 파싱 오류: {str(e)}",
                'total_score': 0
            }

        results = {
            'filename': filename,
            'score_results': [],
            'total_score': 0,
        }

        print(f"File name: {results['filename']}")

        total_score = 0

        for criterion in self.scoring_criteria.values():
            xpath = criterion['path']
            # 'searchValue' is optional: a missing key is treated like null.
            search_value = criterion.get('searchValue')
            right_answer = criterion['value']
            points = criterion['points']
            category = criterion['category']
            item = criterion['item']

            # Anchor the XPath on the text the examinee actually typed, which
            # may differ slightly from the expected text.
            if search_value is not None:
                similar_text = self.find_similar_text(root, search_value)
                replacement = '' if similar_text is None else similar_text
                xpath = xpath.replace('{searchValue}', replacement)

            result = self.query_xml(root, xpath)

            # query_xml yields None, a scalar (bool/float/str) or a non-empty
            # list; for a list only the first match is compared.
            actual_answer = result[0] if isinstance(result, list) else result

            if isinstance(actual_answer, float):
                # The XPath already summed the partial scores.
                earned = actual_answer
            elif right_answer != actual_answer:
                earned = 0
            else:
                earned = points

            scoring = {
                'category': category,
                'item': item,
                'right_answer': right_answer,
                'actual_answer': actual_answer,
                'points': earned,
                'deductions': []  # reasons for deductions; not populated yet
            }

            results['score_results'].append(scoring)
            total_score += earned

            if earned > 0:
                print(f'scoring: {scoring}')

        results['total_score'] = total_score
        return results
        
    # XML 파일 채점
    def score_directory(self, xml_directory):
        
        # xml 파일 불러오기
        xml_files = Path(xml_directory).glob('*.hml')
    
        # 결과 저장할 리스트
        results = []
    
        for xml_file in xml_files:
            result = self._score_xml_file(xml_file)
            results.append(result)
            
        return results

    def export_to_excel(self, results, output_path=None):
        """Write scoring results to an Excel workbook with two sheets.

        The '채점결과요약' sheet has one row per file: file name, total score
        and, for files whose result carries an ``'error'`` key, the error
        message. The '채점상세내역' sheet has one row per file that has
        ``'score_results'``, with a '점수_N' column for the N-th criterion
        (1-based) followed by the total score.

        Args:
            results: Result dicts as produced by ``_score_xml_file``.
            output_path: Destination ``.xlsx`` path. When ``None`` a relative
                file name of the form ``scoring_results_YYYYMMDD_HHMMSS.xlsx``
                built from the current local time is used.

        Returns:
            The path the workbook was written to.
        """
        if output_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"scoring_results_{timestamp}.xlsx"

        summary_data = []
        detail_data = []

        for result in results:
            # Summary row: present for every file, including failed ones.
            summary_row = {
                '파일명': result['filename'],
                '총점': result.get('total_score', 0)
            }
            if 'error' in result:
                summary_row['오류'] = result['error']
            summary_data.append(summary_row)

            # Results without per-criterion scores get no detail row.
            if 'score_results' not in result:
                continue

            detail_row = {'파일명': result['filename']}
            for i, scoring in enumerate(result['score_results'], start=1):
                detail_row[f'점수_{i}'] = scoring['points']
            detail_row['총점'] = result.get('total_score', 0)
            detail_data.append(detail_row)

        summary_df = pd.DataFrame(summary_data)
        detail_df = pd.DataFrame(detail_data)

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            summary_df.to_excel(writer, sheet_name='채점결과요약', index=False)
            detail_df.to_excel(writer, sheet_name='채점상세내역', index=False)

        return output_path


def main(scoring_criteria_path=None, xml_directory=None):
    """Score every HML file in a directory and export the results to Excel.

    Args:
        scoring_criteria_path: JSON file with the scoring criteria. When
            omitted, the project's original hard-coded location is used.
        xml_directory: Directory containing the ``*.hml`` files to score.
            When omitted, the project's original hard-coded output
            directory is used.
    """
    # Defaults preserve the behaviour of calling main() with no arguments.
    if scoring_criteria_path is None:
        scoring_criteria_path = r'C:\Users\dra\project\HWP-Scoring\scoring_criteria.json'
    if xml_directory is None:
        xml_directory = r'C:\Users\dra\project\HWP-Scoring\output'

    scorer = XMLScorer(scoring_criteria_path)

    # Score every matching file in the directory.
    results = scorer.score_directory(xml_directory)

    # Persist the results and report where the workbook was written.
    output_excel_path = scorer.export_to_excel(results)
    print(f"채점 결과 엑셀 파일: {output_excel_path}")

# Run the scoring pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
1번문항채점가능 2025-01-08 17:43:24 +09:00			`from datetime import datetime`
			`import json`
			`import glob`
			`from pathlib import Path`
			`import os`
			`from lxml import etree as ET`
			`from difflib import SequenceMatcher`
			`import pandas as pd`
			`# from xpathSearch import XMLPathHandler`


			`class XMLScorer:`
			`# 채점 기준 경로 초기화`
			`def __init__(self, scoring_criteria_path):`
			`# 채점 기준 로드`
			`self.scoring_criteria = self._load_scoring_criteria(scoring_criteria_path)`

			`# 채점 기준파일 로드(JSON 파일)`
			`def _load_scoring_criteria(self, file_path):`
			`with open(file_path, 'r', encoding='utf-8') as f:`
			`return json.load(f)`

			`# XML 파일에서 element의 값을 찾아 반환`
			`def query_xml(self, root, query):`
			`try:`
			`result = root.xpath(query)`
			`if type(result) is list and len(result) == 0:`
			`return None`
1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00
1번문항채점가능 2025-01-08 17:43:24 +09:00			`return result`
			`except ET.XPathEvalError as e:`
			`return None`

			`# 유사한 텍스트 찾기`
			`def find_similar_text(self, root, target_text, threshold=0.3):`
			`"""`
			`전체 문서에서 유사한 텍스트를 찾아 반환`

			`Args:`
			`root (_type_): xml root element 객체`
			`target_text (_type_): 찾을 텍스트`
			`threshold (float, optional): 유사도 설정 Defaults to 0.3.`

			`Returns:`
			`str: 유사도 기준을 만족하는 텍스트`
			`"""`
			`# 전체 텍스트 추출`
			`# all_text = root.xpath(f"//CHAR/text()")`
			`# all_text.append(root.xpath(f"//TEXTART/@text"))`
			`all_text = root.xpath(f"//CHAR/text() \| //TEXTART/@Text")`

			`# 유사도 비교`
			`max_score = 0`
			`similar_text = ''`

			`for text in all_text:`
			`score = SequenceMatcher(None, target_text, text).ratio()`

			`if score > max_score:`
			`max_score = score`
			`similar_text = text`

			`if max_score >= threshold:`
			`return similar_text`
			`else:`
			`return None`

			`# 하나의 XML 파일 채점`
			`def _score_xml_file(self, xml_path):`
			`try:`
			`tree = ET.parse(xml_path)`
			`root = tree.getroot()`

			`total_score = 0`

			`# 결과값을 Dictionary로 저장`
			`results = {`
			`'filename': os.path.basename(xml_path),`
			`'score_results': [],`
			`'total_score': 0,`
			`}`

			`print(f"File name: {results['filename']}")`

			`for criterion_id, criterion in self.scoring_criteria.items():`
			`xpath = criterion['path']`
			`search_value = criterion['searchValue']`
			`right_answer = criterion['value']`
			`points = criterion['points']`
			`category = criterion['category']`
			`item = criterion['item']`

			`simliar_text = None`

			`# searchValue가 있을 경우 유사한 텍스트 찾기`
			`if search_value is not None:`
			`simliar_text = self.find_similar_text(root, search_value)`
			`if simliar_text is None:`
1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00			`xpath = xpath.replace('{searchValue}', '')`
1번문항채점가능 2025-01-08 17:43:24 +09:00			`else:`
			`xpath = xpath.replace('{searchValue}', simliar_text)`

			`# xpath로 실제 작성 답안 찾기`
			`result = self.query_xml(root, xpath)`
1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00
1번문항채점가능 2025-01-08 17:43:24 +09:00			`# [ boolean 타입 ]`
			`# 1. 이텔릭체, 굵게, 밑줄 등 효과가 적용 여부에 따라`
			`# [ITALIC] [BOLD] [UNDERLINE] 태그가 있거나 없을 수 있으므로`
			`# 존재 유무에 따라 True, False로 판단`
			`# 2. 두 가지 이상의 조건을 모두 만족해야 하는 경우 and 연산자로 연결되어`
			`# 반환값 True/False로 판단`
			`# [ float 타입 ]`
			`# 1. 부분점수의 합산으로 반환되는 경우 float 타입으로 반환`
			`if type(result) is not list:`
			`actual_answer = result`
1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00
			`# 표 같이 여러 조건을 동시에 검사 해야하는 경우우`
			`# elif type(result) is list and len(result) > 1:`
			`# xpath2 = criterion['path2']`
			`# for i in result:`
			`# xpath2 = xpath2.replace('{path_result_list}', str(i))`
			`# print(f"xpath2: {xpath2}")`

1번문항채점가능 2025-01-08 17:43:24 +09:00			`else:`
			`actual_answer = result[0]`

			`scoring = {`
			`'category': category, # 채점 분류`
			`'item': item, # 채점 항목`
			`'right_answer': right_answer, # 정답`
			`'actual_answer': actual_answer, # 실제 작성 답안`
			`'points': 0,`
			`'deductions': [] # 각 기준별 감점 내역`
			`}`

			`scoring['points'] = points`

			`# 점수 차감 조건`
			`# 1. 정답이 실수형으로 반환받은 경우는 채점항목의 부분점수 합산 결과이므로`
			`# 반환받은 값 그대로를 점수로 사용`
			`# 2. 그 외의 경우 정답과 실제 작성 답안이 다른 경우 점수 차감`
			`if type(actual_answer) is float:`
			`scoring['points'] = actual_answer`
			`else:`
			`if right_answer != actual_answer:`
			`scoring['points'] -= points`

1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00			`# 점수 차감 이유 작성 (개발중)`

1번문항채점가능 2025-01-08 17:43:24 +09:00			`results['score_results'].append(scoring)`
			`total_score += scoring['points']`
1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00
			`if scoring['points'] > 0:`
			`print(f'scoring: {scoring}')`
1번문항채점가능 2025-01-08 17:43:24 +09:00
			`results['total_score'] = total_score`
			`return results`

			`except ET.ParseError as e:`
			`return {`
			`'filename': os.path.basename(xml_path),`
			`'error': f"XML 파싱 오류: {str(e)}",`
			`'total_score': 0`
			`}`

			`# XML 파일 채점`
			`def score_directory(self, xml_directory):`

			`# xml 파일 불러오기`
			`xml_files = Path(xml_directory).glob('*.hml')`

			`# 결과 저장할 리스트`
			`results = []`

			`for xml_file in xml_files:`
			`result = self._score_xml_file(xml_file)`
			`results.append(result)`

			`return results`

			`def export_to_excel(self, results, output_path=None):`
			`if output_path is None:`
			`timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")`
			`output_path = f"scoring_results_{timestamp}.xlsx"`

			`summary_data = []`
			`detail_data = []`
1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00			`header_added = False`

1번문항채점가능 2025-01-08 17:43:24 +09:00			`for result in results:`
			`# 요약 정보`
			`summary_row = {`
			`'파일명': result['filename'],`
			`'총점': result.get('total_score', 0)`
			`}`
			`if 'error' in result:`
			`summary_row['오류'] = result['error']`

			`summary_data.append(summary_row)`

			`# 상세 정보`
			`if 'score_results' in result:`
1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00			`detail_row = {'파일명': result['filename']}`
			`for i, scoring in enumerate(result['score_results']):`
			`detail_row[f'점수_{i+1}'] = scoring['points']`

			`detail_row['총점'] = result.get('total_score', 0)`
			`detail_data.append(detail_row)`
1번문항채점가능 2025-01-08 17:43:24 +09:00
			`summary_df = pd.DataFrame(summary_data)`
			`detail_df = pd.DataFrame(detail_data)`
1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00			`# detail_df = pd.DataFrame(detail_data)`
1번문항채점가능 2025-01-08 17:43:24 +09:00
			`# ExcelWriter 객체 생성`
			`with pd.ExcelWriter(output_path, engine='openpyxl') as writer:`
			`summary_df.to_excel(writer, sheet_name='채점결과요약', index=False)`
			`detail_df.to_excel(writer, sheet_name='채점상세내역', index=False)`

			`# 열 너비 자동 조정`
1,2 페이지 채점기준 항목 모두 적용 2025-01-10 19:45:08 +09:00			`# for sheet_name in writer.sheets:`
			`# worksheet = writer.sheets[sheet_name]`
			`# for column_cells in worksheet.columns:`
			`# max_length = 0`
			`# column = column_cells[0].column_letter # 열의 문자`
			`# for cell in column_cells:`
			`# try:`
			`# if cell.value:`
			`# max_length = max(max_length, len(str(cell.value)))`
			`# except:`
			`# pass`
			`# adjusted_width = (max_length + 2)`
			`# worksheet.column_dimensions[column].width = adjusted_width`
1번문항채점가능 2025-01-08 17:43:24 +09:00
			`return output_path`


			`def main():`
			`scoring_criteria_path = r'C:\Users\dra\project\HWP-Scoring\scoring_criteria.json'`

			`# xml(hml)파일 디렉토리 경로`
			`xml_directory = r'C:\Users\dra\project\HWP-Scoring\output'`

			`# 채점 클래스 초기화`
			`scorer = XMLScorer(scoring_criteria_path)`

			`# 폴더 내 모든 xml 파일 채점`
			`results = scorer.score_directory(xml_directory)`

			`# for result in results:`
			`# print(f"\n파일: {result['filename']}")`
			`# if 'error' in result:`
			`# print(f"오류: {result['error']}")`
			`# continue`

			`# print(f"총점: {result['total_score']}")`
			`# print("\n채점 세부사항:")`
			`# for scoring in result['score_results']:`
			`# print(f"채점분류: {scoring['category']}")`
			`# print(f"채점항목: {scoring['item']}")`
			`# print(f"요구 답안: {scoring['right_answer']}")`
			`# print(f"작성 답안: {scoring['actual_answer']}")`
			`# print(f"획득 점수: {scoring['points']}")`
			`# print(f"감점 내역: {scoring['deductions']}")`
			`# print("---")`

			`# 채점 결과 엑셀로 저장`
			`output_excel_path = scorer.export_to_excel(results)`
			`print(f"채점 결과 엑셀 파일: {output_excel_path}")`

			`if __name__ == '__main__':`
			`main()`