import gradio as gr
import os
import time
import re
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from openai import OpenAI
import random
from concurrent.futures import TimeoutError as FuturesTimeoutError
from openai import APIStatusError, APITimeoutError, APIConnectionError
import traceback
from dotenv import load_dotenv
from prompts import (
    USER_PROMPT,
    WRAPPER_PROMPT,
    CALL_1_SYSTEM_PROMPT,
    CALL_2_SYSTEM_PROMPT,
    CALL_3_SYSTEM_PROMPT,
)
import difflib
import csv
from threading import Lock
import threading


load_dotenv()

BASE_URL = "https://api.upstage.ai/v1"
API_KEY = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=API_KEY, base_url=BASE_URL)


def postprocess_pronoun(text: str) -> str:
| | """ |
| | '์ด์ฌ๋ช
๋ํ'๊ฐ ํฌํจ๋ ๋ชจ๋ ๋จ์ด๋ฅผ '์ด์ฌ๋ช
๋ํต๋ น'์ผ๋ก ๊ต์ฒดํ๋ฉฐ, |
| | ๋ค๋ฐ๋ฅด๋ ์กฐ์ฌ๊ฐ ์์ ๊ฒฝ์ฐ ํจ๊ป ์์ ํฉ๋๋ค. |
| | """ |
| | |
| | |
| | correction_map = { |
| | '๋': '์', '๊ฐ': '์ด', '๋ฅผ': '์', '์': '๊ณผ', '๋ก': '์ผ๋ก', |
| | '์ฌ': '์ด์ฌ', '๋ผ': '์ด๋ผ', '๋': '์ด๋', |
| | '๋ค': '์ด๋ค', '์๋ค': '์ด์๋ค', '๋ผ๋ฉด': '์ด๋ผ๋ฉด', '๋ผ์': '์ด๋ผ์' |
| | } |
| | |
| | |
| | all_target_particles = list(correction_map.keys()) + ['๋ก๋ถํฐ', '๋ง', '๋', '๊ป์'] |
| | |
| | particle_pattern = "|".join(re.escape(p) for p in all_target_particles) |
| | |
| | |
| | regex = re.compile(f"(์ด์ฌ๋ช
\s*๋ํ)({particle_pattern})?") |
| |
|
| | def replace_func(match): |
| | particle = match.group(2) |
| | new_phrase = "์ด์ฌ๋ช
๋ํต๋ น" |
| | |
| | if particle: |
| | new_phrase += correction_map.get(particle, particle) |
| | |
| | return new_phrase |
| |
|
| | return regex.sub(replace_func, text) |
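
# Illustrative example for postprocess_pronoun (not executed by the pipeline):
#   postprocess_pronoun("이재명 대표가 발표했다.")
#   -> "이재명 대통령이 발표했다."   (the particle '가' becomes '이')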


def extract_json_from_text(text):
    """
    Extract the JSON portion of a model response.
    Several patterns are tried in order: a ```json fenced block, a plain
    fenced block, a bare object, a bare array, and finally the raw text.

    Args:
        text: text that may contain JSON

    Returns:
        dict: the parsed JSON object, or None if nothing could be parsed
    """
    if not text or not text.strip():
        return None

    text = text.strip()

    # 1) ```json fenced code block
    json_code_block_pattern = r'```json\s*(.*?)\s*```'
    match = re.search(json_code_block_pattern, text, re.DOTALL)
    if match:
        try:
            extracted = match.group(1).strip()
            if extracted:
                return json.loads(extracted)
        except json.JSONDecodeError:
            pass

    # 2) generic fenced code block
    code_block_pattern = r'```\s*(.*?)\s*```'
    match = re.search(code_block_pattern, text, re.DOTALL)
    if match:
        try:
            extracted = match.group(1).strip()
            if extracted:
                return json.loads(extracted)
        except json.JSONDecodeError:
            pass

    # 3) bare JSON object
    json_object_pattern = r'\{.*\}'
    match = re.search(json_object_pattern, text, re.DOTALL)
    if match:
        try:
            extracted = match.group(0).strip()
            if extracted:
                return json.loads(extracted)
        except json.JSONDecodeError:
            pass

    # 4) bare JSON array
    json_array_pattern = r'\[.*\]'
    match = re.search(json_array_pattern, text, re.DOTALL)
    if match:
        try:
            extracted = match.group(0).strip()
            if extracted:
                return json.loads(extracted)
        except json.JSONDecodeError:
            pass

    # 5) the text itself may already be JSON
    try:
        if text.startswith('{') or text.startswith('['):
            return json.loads(text)
    except json.JSONDecodeError:
        pass

    return None
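
# Illustrative example for extract_json_from_text (not executed by the pipeline):
#   extract_json_from_text('```json\n{"output": "교정된 문장"}\n```')
#   -> {"output": "교정된 문장"}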


def load_vocabulary():
    vocabulary = {}
    with open("Vocabulary.csv", "r", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Log the column names once, on the first row only.
            if len(vocabulary) == 0:
                print("CSV columns:", list(row.keys()))
            vocabulary[row["original"]] = row["corrected"]
    return vocabulary
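
# Expected Vocabulary.csv layout: a header row with "original" and "corrected"
# columns and one replacement pair per row. The entry below is a hypothetical
# example, not taken from the real file:
#   original,corrected
#   컨텐츠,콘텐츠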


VOCABULARY = load_vocabulary()

# Shared progress counters used by the worker threads.
counter_lock = Lock()
processed_count = 0
total_bulks = 0


def apply_vocabulary_correction(text):
    # Deterministic find-and-replace pass driven by Vocabulary.csv.
    for original, corrected in VOCABULARY.items():
        text = text.replace(original, corrected)
    return text


def create_bulk_paragraphs(text, max_chars=500):
    """
    Split the text into "bulks" of roughly 500 characters, keeping paragraph
    boundaries intact.

    Args:
        text: input text
        max_chars: maximum characters per bulk (default: 500)

    Returns:
        List[str]: the text split into bulk-sized chunks
    """
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]

    if not paragraphs:
        return []

    bulks = []
    current_bulk = []
    current_length = 0

    for para in paragraphs:
        para_length = len(para)

        # A paragraph longer than the limit becomes a bulk of its own.
        if para_length > max_chars:
            # Flush whatever has been accumulated so far.
            if current_bulk:
                bulks.append("\n".join(current_bulk))
                current_bulk = []
                current_length = 0

            bulks.append(para)
        else:
            # Account for the newline separators when checking the limit.
            if (
                current_length + para_length + len(current_bulk) > max_chars
                and current_bulk
            ):
                bulks.append("\n".join(current_bulk))
                current_bulk = [para]
                current_length = para_length
            else:
                current_bulk.append(para)
                current_length += para_length

    # Flush the final bulk.
    if current_bulk:
        bulks.append("\n".join(current_bulk))

    return bulks
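
# Illustrative behaviour: with max_chars=500, three 200-character paragraphs
# yield two bulks -- the first two paragraphs joined by "\n" (401 characters
# including the separator) and the third paragraph on its own.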


def process_bulk(bulk_text, bulk_index, max_retries=3, article_info=""):
    """
    Run one bulk through the correction pipeline.
    If an API error occurs, the output of the last successful step is returned.
    """
    global processed_count
    thread_id = threading.get_ident()
    start = time.time()

    # Intermediate results for each pipeline stage.
    step0, proofread_result, step1, step1_explanation, step2, step2_explanation, step3, step4, step5 = (None,) * 9

    # Fallback: whatever the last successful stage produced.
    last_successful_output = bulk_text

    for attempt in range(max_retries):
        try:
            # Step 0: deterministic vocabulary correction.
            step0 = apply_vocabulary_correction(bulk_text)
            last_successful_output = step0

            print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - Calling proofread...")
            proofread_result = call_proofread(step0)

            # Protect paragraph breaks while the text passes through the LLM steps.
            step0 = step0.replace("\n", "<paragraph_separator>")
            proofread_result = proofread_result.replace("\n", "<paragraph_separator>")

            # Step 1: first LLM pass; the user message carries both the original
            # text and the proofread draft.
            system_step1 = WRAPPER_PROMPT.format(system_prompt=CALL_1_SYSTEM_PROMPT)
            user_step1 = USER_PROMPT.format(original=step0, proofread=proofread_result)

            print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - Calling step1...")
            step1_json = call_solar_pro2(system_step1, user_step1)
            try:
                parsed_json = json.loads(step1_json)
                step1 = parsed_json.get('output', step0)
                step1_explanation = parsed_json.get('explanation', '')
                last_successful_output = step1
            except json.JSONDecodeError:
                print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - Step1 JSON parse failed, trying extraction...")
                extracted_json = extract_json_from_text(step1_json)
                if extracted_json and 'output' in extracted_json:
                    step1 = extracted_json['output']
                    step1_explanation = extracted_json.get('explanation', '')
                    last_successful_output = step1
                    print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - JSON extraction succeeded")
                else:
                    step1 = step0
                    step1_explanation = ""
                    print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - JSON extraction failed")

            # Step 2: second LLM pass.
            print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - Calling step2...")
            step2_json = call_solar_pro2(CALL_2_SYSTEM_PROMPT, step1)
            try:
                parsed_json = json.loads(step2_json)
                step2 = parsed_json.get('output', step1)
                step2_explanation = parsed_json.get('explanation', '')
                last_successful_output = step2
            except json.JSONDecodeError:
                print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - Step2 JSON parse failed, trying extraction...")
                extracted_json = extract_json_from_text(step2_json)
                if extracted_json and 'output' in extracted_json:
                    step2 = extracted_json['output']
                    step2_explanation = extracted_json.get('explanation', '')
                    last_successful_output = step2
                    print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - JSON extraction succeeded")
                else:
                    step2 = step1
                    step2_explanation = ""
                    print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - JSON extraction failed")

            # Step 3: third LLM pass.
            print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - Calling step3...")
            step3_json = call_solar_pro2(CALL_3_SYSTEM_PROMPT, step2)
            try:
                parsed_json = json.loads(step3_json)
                step3 = parsed_json.get('output', step2)
                last_successful_output = step3
            except json.JSONDecodeError:
                print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - Step3 JSON parse failed, trying extraction...")
                extracted_json = extract_json_from_text(step3_json)
                if extracted_json and 'output' in extracted_json:
                    step3 = extracted_json['output']
                    last_successful_output = step3
                    print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - JSON extraction succeeded")
                else:
                    step3 = step2
                    print(f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} Attempt {attempt+1} - JSON extraction failed")

            # Step 4: run the vocabulary correction again on the LLM output.
            step4 = apply_vocabulary_correction(step3)

            # Step 5: rule-based pronoun/particle post-processing.
            step5 = postprocess_pronoun(step4)
            last_successful_output = step5

            # Restore the original paragraph breaks.
            step5 = step5.replace("<paragraph_separator>", "\n")
            last_successful_output = last_successful_output.replace("<paragraph_separator>", "\n")

            elapsed = time.time() - start

            with counter_lock:
                processed_count += 1

            return {
                "bulk_index": bulk_index,
                "original": bulk_text,
                "final": last_successful_output,
                "processing_time": elapsed,
                "character_count": len(bulk_text),
                "attempts": attempt + 1,
            }

        except Exception as e:
            if attempt < max_retries - 1:
                print(
                    f"{article_info}[Thread-{thread_id}] Bulk {bulk_index+1} attempt {attempt+1} failed, retrying: {type(e).__name__}"
                )
                time.sleep(1 * (attempt + 1))
                continue
            else:
                print(f"🔥🔥🔥 {article_info}[Thread-{thread_id}] Bulk {bulk_index+1} failed for good; using the last successful output. 🔥🔥🔥")
                traceback.print_exc()

                return {
                    "bulk_index": bulk_index,
                    "original": bulk_text,
                    # Strip any leftover separator tokens from the fallback output.
                    "final": last_successful_output.replace("<paragraph_separator>", "\n"),
                    "processing_time": time.time() - start,
                    "character_count": len(bulk_text),
                    "error": traceback.format_exc(),
                    "attempts": max_retries,
                }

    # Defensive fallback; the loop above always returns before reaching here.
    return {"bulk_index": bulk_index, "final": bulk_text, "error": "unknown_flow_error"}


def call_solar_pro2(system, user, temperature=0.0, model_name="solar-pro2"):
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        stream=False,
        temperature=temperature,
    )
    return response.choices[0].message.content


def call_proofread(paragraph):
    # Fine-tuned proofreading model; the system prompt asks it to produce a
    # corrected version of the input document.
    prompt = "입력된 문서에 대한 교열 결과를 생성해 주세요."
    response = client.chat.completions.create(
        model="ft:solar-news-correction-dev",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": paragraph},
        ],
        stream=False,
        temperature=0.0,
    )
    return response.choices[0].message.content


def highlight_diff(original, corrected):
    matcher = difflib.SequenceMatcher(None, original, corrected)
    result_html = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            result_html.append(f"<span>{original[i1:i2]}</span>")
        elif tag == "replace":
            result_html.append(
                f'<span style="background:#ffecec;text-decoration:line-through;">{original[i1:i2]}</span>'
            )
            result_html.append(
                f'<span style="background:#e6ffec;">{corrected[j1:j2]}</span>'
            )
        elif tag == "delete":
            result_html.append(
                f'<span style="background:#ffecec;text-decoration:line-through;">{original[i1:i2]}</span>'
            )
        elif tag == "insert":
            result_html.append(
                f'<span style="background:#e6ffec;">{corrected[j1:j2]}</span>'
            )
    return "".join(result_html)


def process_text_parallel(input_text, max_workers=10):
    """Process the text in parallel, one bulk at a time."""
    global processed_count, total_bulks

    # Split the input into bulks and reset the shared progress counters.
    bulks = create_bulk_paragraphs(input_text)
    total_bulks = len(bulks)
    processed_count = 0

    if not bulks:
        return []

    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit one job per bulk and remember which future maps to which index.
        future_to_bulk = {
            executor.submit(process_bulk, bulk, i): i for i, bulk in enumerate(bulks)
        }

        # Collect results as they finish; failures fall back to the original text.
        for future in as_completed(future_to_bulk):
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                bulk_index = future_to_bulk[future]
                print(f"Exception while processing bulk {bulk_index+1}: {e}")
                results.append(
                    {
                        "bulk_index": bulk_index,
                        "original": bulks[bulk_index],
                        "final": bulks[bulk_index],
                        "processing_time": 0,
                        "character_count": len(bulks[bulk_index]),
                        "error": str(e),
                    }
                )

    # Restore the original document order.
    results.sort(key=lambda x: x["bulk_index"])

    return results
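
# Illustrative usage: process_text_parallel(article_text) returns one result
# dict per bulk, sorted by "bulk_index" so the corrected pieces can simply be
# joined back together in document order.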


def demo_fn(input_text):
    # Run the full pipeline over the input.
    bulk_results = process_text_parallel(input_text, max_workers=10)

    if not bulk_results:
        return input_text, input_text

    # Reassemble the corrected bulks in order.
    final_texts = [r["final"] for r in bulk_results]
    final_result = "\n".join(final_texts)

    # Build an HTML view that highlights what changed.
    highlighted = highlight_diff(input_text, final_result)

    return final_result, highlighted


with gr.Blocks() as demo:
    gr.Markdown("# 교열 모델 데모")
    input_text = gr.Textbox(
        label="원문 입력", lines=10, placeholder="문단 단위로 입력해 주세요."
    )
    btn = gr.Button("교열하기")
    output_corrected = gr.Textbox(label="교열 결과", lines=10)
    output_highlight = gr.HTML(label="수정된 부분 강조")

    btn.click(
        fn=demo_fn, inputs=input_text, outputs=[output_corrected, output_highlight]
    )


if __name__ == "__main__":
    demo.launch()