feat: OpenRouter 외부 AI 연동 (STT 교정 + OCR Vision)

This commit is contained in:
root
2026-04-28 15:38:06 +09:00
parent f9075ae3f6
commit f35fe1143a
5 changed files with 667 additions and 299 deletions

View File

@@ -1,9 +1,8 @@
"""
OCR Celery Tasks — PaddleOCR 3.x + Ollama Vision
OCR Celery Tasks
backend: paddle | ollama | openrouter
"""
import os
import base64
import os, base64
import httpx
from celery import Celery
import openpyxl
@@ -17,11 +16,8 @@ OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "600"))
celery_app = Celery("ocr_tasks", broker=REDIS_URL, backend=REDIS_URL)
celery_app.conf.update(
task_serializer="json",
result_serializer="json",
accept_content=["json"],
task_track_started=True,
result_expires=3600,
task_serializer="json", result_serializer="json",
accept_content=["json"], task_track_started=True, result_expires=3600,
)
_ocr_engine = None
@@ -46,12 +42,28 @@ def get_structure():
return _struct_engine
# ════════════════════════════════════════════════════════════════
# 메인 Task
# ════════════════════════════════════════════════════════════════
@celery_app.task(bind=True, name="tasks.ocr_task", queue="ocr")
def ocr_task(self, file_id, image_path, mode="text",
backend="paddle", ollama_model="granite3.2-vision", custom_prompt=""):
self.update_state(state="PROGRESS", meta={"progress": 8, "message": "엔진 준비 중..."})
def ocr_task(
self,
file_id: str,
image_path: str,
mode: str = "text",
backend: str = "paddle",
ollama_model: str = "granite3.2-vision",
openrouter_model: str = "",
openrouter_url: str = "",
openrouter_key: str = "",
custom_prompt: str = "",
):
self.update_state(state="PROGRESS", meta={"progress":8,"message":"엔진 준비 중..."})
try:
if backend == "ollama":
if backend == "openrouter":
result = _run_openrouter(self, file_id, image_path, mode,
openrouter_model, openrouter_url, openrouter_key, custom_prompt)
elif backend == "ollama":
result = _run_ollama(self, file_id, image_path, mode, ollama_model, custom_prompt)
else:
result = _run_paddle(self, file_id, image_path, mode)
@@ -64,34 +76,124 @@ def ocr_task(self, file_id, image_path, mode="text",
raise Exception(f"OCR 실패: {str(e)}")
_OLLAMA_PROMPTS = {
# ════════════════════════════════════════════════════════════════
# OpenRouter Vision 백엔드 (OpenAI 호환)
# ════════════════════════════════════════════════════════════════
# Default extraction prompts, keyed by OCR mode. A backend falls back to these
# when the caller does not supply a custom prompt.
_PROMPTS = dict(
    # "Extract all text accurately; keep the original line breaks and paragraphs."
    text="이 이미지에서 모든 텍스트를 정확하게 추출해줘. 원본의 줄 구분과 단락 구조를 유지해줘.",
    # "Render tables as Markdown tables; keep the original structure for other text."
    structure="이 이미지를 분석해서 표는 마크다운 표 형식으로, 나머지 텍스트는 원본 구조를 유지하며 추출해줘.",
)
def _openrouter_image_data_url(image_path):
    """Read the image at *image_path* and return it as a base64 ``data:`` URL."""
    with open(image_path, "rb") as f:
        raw = f.read()
    # The MIME type is guessed from the file extension only; anything
    # unrecognised is sent as JPEG.
    ext = image_path.rsplit(".", 1)[-1].lower()
    mime = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "bmp": "image/bmp",
        "gif": "image/gif",
        "webp": "image/webp",
    }.get(ext, "image/jpeg")
    return f"data:{mime};base64,{base64.b64encode(raw).decode()}"


def _openrouter_error_detail(response):
    """Return ``error.message`` from an error response body, or "" if absent."""
    try:
        detail = response.json().get("error", {}).get("message", "")
    except (ValueError, AttributeError):
        # Body is not JSON, or is JSON that is not shaped like {"error": {...}}.
        return ""
    return str(detail) if detail else ""


def _openrouter_extract_text(response):
    """Return the stripped assistant text from a chat-completions response."""
    try:
        payload = response.json()
    except ValueError as exc:
        raise Exception("OpenRouter 응답 형식 오류: JSON이 아닙니다") from exc
    try:
        content = payload["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        # A successful status can still carry a body without usable choices
        # (for example only an "error" object); surface that message if present.
        detail = ""
        if isinstance(payload, dict) and isinstance(payload.get("error"), dict):
            detail = payload["error"].get("message", "")
        raise Exception(
            f"OpenRouter 응답 형식 오류: {detail or 'choices 없음'}"
        ) from exc
    # ``content`` may be null; treat anything that is not a string as empty.
    return content.strip() if isinstance(content, str) else ""


def _run_openrouter(task, file_id, image_path, mode,
                    model, base_url, api_key, custom_prompt):
    """Run OCR through an OpenAI-compatible vision chat endpoint (OpenRouter).

    The image is sent inline as a base64 data URL together with a text prompt
    to ``{base_url}/chat/completions``. The returned text is written to
    ``{file_id}_ocr.txt`` in ``OUTPUT_DIR``; in ``"structure"`` mode Markdown
    tables found in the text are also exported to ``{file_id}_tables.xlsx``.

    Args:
        task: Bound Celery task; only ``update_state`` is called on it.
        file_id: Identifier used as the prefix of the output file names.
        image_path: Path of the image file to recognise.
        mode: ``"text"`` or ``"structure"``; selects the default prompt and
            whether tables are parsed.
        model: Model identifier sent in the request body.
        base_url: API base URL; ``/chat/completions`` is appended.
        api_key: Bearer token for the ``Authorization`` header.
        custom_prompt: Optional prompt overriding the mode default.

    Returns:
        A result dict with the same keys the other backends return
        (``mode``, ``backend``, ``full_text``, ``lines``, ``tables``, ...).

    Raises:
        Exception: On missing configuration, HTTP/connection/timeout errors,
            a malformed response body, or an empty model answer.
    """
    # Validate configuration before touching the task or the file system.
    if not api_key:
        raise Exception("OpenRouter API 키가 설정되지 않았습니다")
    if not model:
        raise Exception("OpenRouter 모델이 선택되지 않았습니다")
    if not base_url or not base_url.strip():
        # Without this, the request would go to the relative URL "/chat/completions".
        raise Exception("OpenRouter URL이 설정되지 않았습니다")

    task.update_state(state="PROGRESS",
                      meta={"progress": 15, "message": f"OpenRouter ({model}) 연결 중..."})
    data_url = _openrouter_image_data_url(image_path)
    prompt = (custom_prompt or "").strip() or _PROMPTS.get(mode, _PROMPTS["text"])
    endpoint = f"{base_url.strip().rstrip('/')}/chat/completions"

    task.update_state(state="PROGRESS", meta={"progress": 30, "message": "모델 추론 중..."})
    try:
        resp = httpx.post(
            endpoint,
            headers={
                "Authorization": f"Bearer {api_key}",
                "HTTP-Referer": "https://voicescript.local",
                "X-Title": "VoiceScript",
                "Content-Type": "application/json",
            },
            json={
                "model": model,
                "messages": [{
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": data_url}},
                        {"type": "text", "text": prompt},
                    ],
                }],
                "temperature": 0.1,
            },
            timeout=float(OLLAMA_TIMEOUT),
        )
        resp.raise_for_status()
    except httpx.HTTPStatusError as exc:
        status = exc.response.status_code
        detail = _openrouter_error_detail(exc.response)
        if status == 400:
            # Keep the vision hint, but do not hide what the server actually said.
            message = f"이 모델은 이미지를 지원하지 않습니다 — Vision 모델을 선택하세요\n({model})"
            if detail:
                message = f"{message}\n{detail}"
            raise Exception(message) from exc
        raise Exception(f"OpenRouter 오류 ({status}): {detail or str(exc)}") from exc
    except httpx.ConnectError as exc:
        raise Exception(f"OpenRouter 서버 연결 실패 ({base_url})") from exc
    except httpx.TimeoutException as exc:
        raise Exception("OpenRouter 응답 시간 초과. OLLAMA_TIMEOUT 값을 늘려주세요.") from exc

    task.update_state(state="PROGRESS", meta={"progress": 85, "message": "결과 저장 중..."})
    full_text = _openrouter_extract_text(resp)
    if not full_text:
        raise Exception("OpenRouter 빈 응답")

    tables = _parse_md_tables(full_text) if mode == "structure" else []
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    txt_file = f"{file_id}_ocr.txt"
    with open(os.path.join(OUTPUT_DIR, txt_file), "w", encoding="utf-8") as f:
        f.write(f"# OCR 결과 (OpenRouter / {model})\n\n{full_text}")

    xlsx_file = None
    if tables:
        xlsx_file = f"{file_id}_tables.xlsx"
        _save_excel(tables, os.path.join(OUTPUT_DIR, xlsx_file))

    tables_html = [_md_table_to_html(table) for table in tables]
    # The model reports no per-line confidence or boxes, so fixed placeholders are used.
    lines = [
        {"text": line, "confidence": 1.0, "bbox": []}
        for line in full_text.splitlines()
        if line.strip()
    ]
    tables_meta = [
        {
            "html": html,
            "rows": len(table),
            "cols": max(len(row) for row in table) if table else 0,
        }
        for html, table in zip(tables_html, tables)
    ]
    return {
        "mode": mode, "backend": "openrouter", "openrouter_model": model,
        "ollama_model": "",
        "full_text": full_text, "lines": lines, "line_count": len(lines),
        "txt_file": txt_file,
        "tables": tables_meta,
        "xlsx_file": xlsx_file,
    }
# ════════════════════════════════════════════════════════════════
# Ollama Vision 백엔드
# ════════════════════════════════════════════════════════════════
def _run_ollama(task, file_id, image_path, mode, ollama_model, custom_prompt):
task.update_state(state="PROGRESS",
meta={"progress": 15, "message": f"Ollama ({ollama_model}) 연결 중..."})
meta={"progress":15,"message":f"Ollama ({ollama_model}) 연결 중..."})
with open(image_path, "rb") as f:
img_b64 = base64.b64encode(f.read()).decode()
prompt = custom_prompt.strip() or _OLLAMA_PROMPTS.get(mode, _OLLAMA_PROMPTS["text"])
task.update_state(state="PROGRESS", meta={"progress": 30, "message": "모델 추론 중..."})
prompt = custom_prompt.strip() or _PROMPTS.get(mode, _PROMPTS["text"])
task.update_state(state="PROGRESS", meta={"progress":30,"message":"모델 추론 중..."})
try:
resp = httpx.post(f"{OLLAMA_URL}/api/chat", json={
"model": ollama_model,
"messages": [{"role": "user", "content": prompt, "images": [img_b64]}],
"stream": False, "options": {"temperature": 0.1},
"messages": [{"role":"user","content":prompt,"images":[img_b64]}],
"stream": False, "options": {"temperature":0.1},
}, timeout=float(OLLAMA_TIMEOUT))
resp.raise_for_status()
except httpx.ConnectError:
raise Exception(f"Ollama 서버 연결 실패 ({OLLAMA_URL})")
except httpx.TimeoutException:
raise Exception(f"Ollama 응답 시간 초과 ({OLLAMA_TIMEOUT}초). OLLAMA_TIMEOUT 값을 늘려주세요.")
raise Exception(f"Ollama 응답 시간 초과 ({OLLAMA_TIMEOUT}초)")
task.update_state(state="PROGRESS", meta={"progress": 85, "message": "결과 저장 중..."})
full_text = resp.json().get("message", {}).get("content", "").strip()
if not full_text:
raise Exception("Ollama 빈 응답. 모델이 설치되어 있는지 확인하세요.")
task.update_state(state="PROGRESS", meta={"progress":85,"message":"결과 저장 중..."})
full_text = resp.json().get("message",{}).get("content","").strip()
if not full_text: raise Exception("Ollama 빈 응답. 모델이 Vision을 지원하는지 확인하세요.")
tables = _parse_md_tables(full_text) if mode == "structure" else []
os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -103,74 +205,66 @@ def _run_ollama(task, file_id, image_path, mode, ollama_model, custom_prompt):
xlsx_file = f"{file_id}_tables.xlsx"
_save_excel(tables, os.path.join(OUTPUT_DIR, xlsx_file))
tables_html = [_md_table_to_html(t) for t in tables]
lines = [{"text": l, "confidence": 1.0, "bbox": []}
for l in full_text.splitlines() if l.strip()]
lines = [{"text":l,"confidence":1.0,"bbox":[]} for l in full_text.splitlines() if l.strip()]
return {
"mode": mode, "backend": "ollama", "ollama_model": ollama_model,
"openrouter_model": "",
"full_text": full_text, "lines": lines, "line_count": len(lines),
"txt_file": txt_file,
"tables": [{"html": h, "rows": len(t), "cols": max(len(r) for r in t) if t else 0}
"tables": [{"html":h,"rows":len(t),"cols":max(len(r) for r in t) if t else 0}
for h, t in zip(tables_html, tables)],
"xlsx_file": xlsx_file,
}
# ════════════════════════════════════════════════════════════════
# PaddleOCR 백엔드
# ════════════════════════════════════════════════════════════════
def _run_paddle(task, file_id, image_path, mode):
import cv2
img = cv2.imread(image_path)
if img is None:
raise ValueError("이미지를 읽을 수 없습니다")
if img is None: raise ValueError("이미지를 읽을 수 없습니다")
os.makedirs(OUTPUT_DIR, exist_ok=True)
return _paddle_structure(task, file_id, img) if mode == "structure" \
else _paddle_text(task, file_id, img)
return _paddle_structure(task, file_id, img) if mode == "structure" else _paddle_text(task, file_id, img)
def _paddle_text(task, file_id, img):
task.update_state(state="PROGRESS", meta={"progress": 30, "message": "텍스트 인식 중..."})
task.update_state(state="PROGRESS", meta={"progress":30,"message":"텍스트 인식 중..."})
result = get_ocr().ocr(img)
task.update_state(state="PROGRESS", meta={"progress": 80, "message": "결과 정리 중..."})
task.update_state(state="PROGRESS", meta={"progress":80,"message":"결과 정리 중..."})
lines = []
if result and len(result) > 0:
r = result[0]
if isinstance(r, dict):
texts = r.get("rec_texts", [])
scores = r.get("rec_scores", [])
for text, conf in zip(texts, scores):
if text.strip():
lines.append({"text": text, "confidence": round(float(conf), 3), "bbox": []})
for text, conf in zip(r.get("rec_texts",[]), r.get("rec_scores",[])):
if text.strip(): lines.append({"text":text,"confidence":round(float(conf),3),"bbox":[]})
elif isinstance(r, list):
for item in r:
if item and len(item) == 2:
if item and len(item)==2:
_, (text, conf) = item
if text.strip():
lines.append({"text": text, "confidence": round(float(conf), 3), "bbox": []})
if text.strip(): lines.append({"text":text,"confidence":round(float(conf),3),"bbox":[]})
full_text = "\n".join(l["text"] for l in lines)
txt_file = f"{file_id}_ocr.txt"
with open(os.path.join(OUTPUT_DIR, txt_file), "w", encoding="utf-8") as f:
f.write(full_text)
return {"mode": "text", "backend": "paddle", "ollama_model": "",
"full_text": full_text, "lines": lines,
"line_count": len(lines), "txt_file": txt_file,
"tables": [], "xlsx_file": None}
txt_file = f"{file_id}_ocr.txt"
with open(os.path.join(OUTPUT_DIR, txt_file), "w", encoding="utf-8") as f: f.write(full_text)
return {"mode":"text","backend":"paddle","ollama_model":"","openrouter_model":"",
"full_text":full_text,"lines":lines,"line_count":len(lines),
"txt_file":txt_file,"tables":[],"xlsx_file":None}
def _paddle_structure(task, file_id, img):
task.update_state(state="PROGRESS", meta={"progress": 20, "message": "레이아웃 분석 중..."})
task.update_state(state="PROGRESS", meta={"progress":20,"message":"레이아웃 분석 중..."})
result = get_structure()(img)
task.update_state(state="PROGRESS", meta={"progress": 60, "message": "표 구조 추출 중..."})
task.update_state(state="PROGRESS", meta={"progress":60,"message":"표 구조 추출 중..."})
text_blocks, tables_html, tables_data = [], [], []
for region in result:
rtype = region.get("type", "").lower()
rtype = region.get("type","").lower()
if rtype == "table":
html = region.get("res", {}).get("html", "")
if html:
tables_html.append(html)
tables_data.append(_html_table_to_list(html))
elif rtype in ("text", "title", "figure_caption"):
for line in (region.get("res", []) or []):
if isinstance(line, (list, tuple)) and len(line) == 2:
_, (text, _conf) = line
text_blocks.append(text)
html = region.get("res",{}).get("html","")
if html: tables_html.append(html); tables_data.append(_html_table_to_list(html))
elif rtype in ("text","title","figure_caption"):
for line in (region.get("res",[]) or []):
if isinstance(line,(list,tuple)) and len(line)==2:
_, (text, _conf) = line; text_blocks.append(text)
full_text = "\n".join(text_blocks)
task.update_state(state="PROGRESS", meta={"progress": 80, "message": "Excel 생성 중..."})
task.update_state(state="PROGRESS", meta={"progress":80,"message":"Excel 생성 중..."})
xlsx_file = None
if tables_data:
xlsx_file = f"{file_id}_tables.xlsx"
@@ -178,15 +272,17 @@ def _paddle_structure(task, file_id, img):
txt_file = f"{file_id}_ocr.txt"
with open(os.path.join(OUTPUT_DIR, txt_file), "w", encoding="utf-8") as f:
f.write("# 텍스트\n\n" + full_text)
lines = [{"text": t, "confidence": 1.0, "bbox": []} for t in text_blocks]
tables_meta = [{"html": h, "rows": len(d), "cols": max(len(r) for r in d) if d else 0}
lines = [{"text":t,"confidence":1.0,"bbox":[]} for t in text_blocks]
tables_meta = [{"html":h,"rows":len(d),"cols":max(len(r) for r in d) if d else 0}
for h, d in zip(tables_html, tables_data)]
return {"mode": "structure", "backend": "paddle", "ollama_model": "",
"full_text": full_text, "lines": lines,
"line_count": len(lines), "txt_file": txt_file,
"tables": tables_meta, "xlsx_file": xlsx_file}
return {"mode":"structure","backend":"paddle","ollama_model":"","openrouter_model":"",
"full_text":full_text,"lines":lines,"line_count":len(lines),
"txt_file":txt_file,"tables":tables_meta,"xlsx_file":xlsx_file}
# ════════════════════════════════════════════════════════════════
# 공통 유틸
# ════════════════════════════════════════════════════════════════
def _parse_md_tables(text):
tables, current = [], []
for line in text.splitlines():
@@ -204,8 +300,8 @@ def _md_table_to_html(table):
if not table: return ""
rows = ""
for i, row in enumerate(table):
tag = "th" if i == 0 else "td"
rows += "<tr>" + "".join(f"<{tag}>{c}</{tag}>" for c in row) + "</tr>"
tag = "th" if i==0 else "td"
rows += "<tr>"+"".join(f"<{tag}>{c}</{tag}>" for c in row)+"</tr>"
return f"<table>{rows}</table>"
def _html_table_to_list(html):
@@ -213,36 +309,31 @@ def _html_table_to_list(html):
class P(HTMLParser):
def __init__(self):
super().__init__()
self.rows, self._row, self._cell, self._in = [], [], [], False
def handle_starttag(self, tag, attrs):
if tag == "tr": self._row = []
elif tag in ("td","th"): self._cell = []; self._in = True
def handle_endtag(self, tag):
if tag in ("td","th"): self._row.append("".join(self._cell).strip()); self._in = False
elif tag == "tr":
self.rows,self._row,self._cell,self._in=[],[],[],False
def handle_starttag(self,tag,attrs):
if tag=="tr": self._row=[]
elif tag in("td","th"): self._cell=[];self._in=True
def handle_endtag(self,tag):
if tag in("td","th"): self._row.append("".join(self._cell).strip());self._in=False
elif tag=="tr":
if self._row: self.rows.append(self._row)
def handle_data(self, data):
def handle_data(self,data):
if self._in: self._cell.append(data)
p = P(); p.feed(html); return p.rows
p=P();p.feed(html);return p.rows
def _save_excel(tables, path):
wb = openpyxl.Workbook(); wb.remove(wb.active)
for i, table in enumerate(tables, 1):
ws = wb.create_sheet(f"{i}")
thin = Side(style="thin", color="2A2A33")
bdr = Border(left=thin, right=thin, top=thin, bottom=thin)
for r_idx, row in enumerate(table, 1):
for c_idx, val in enumerate(row, 1):
cell = ws.cell(row=r_idx, column=c_idx, value=val)
cell.border = bdr
cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
if r_idx == 1:
cell.fill = PatternFill("solid", fgColor="1A1A2E")
cell.font = Font(color="00E5A0", bold=True, size=10)
else:
cell.font = Font(size=10)
wb=openpyxl.Workbook();wb.remove(wb.active)
for i,table in enumerate(tables,1):
ws=wb.create_sheet(f"{i}")
thin=Side(style="thin",color="2A2A33");bdr=Border(left=thin,right=thin,top=thin,bottom=thin)
for r_idx,row in enumerate(table,1):
for c_idx,val in enumerate(row,1):
cell=ws.cell(row=r_idx,column=c_idx,value=val)
cell.border=bdr;cell.alignment=Alignment(horizontal="center",vertical="center",wrap_text=True)
if r_idx==1: cell.fill=PatternFill("solid",fgColor="1A1A2E");cell.font=Font(color="00E5A0",bold=True,size=10)
else: cell.font=Font(size=10)
for col in ws.columns:
w = max((len(str(c.value or "")) for c in col), default=8)
ws.column_dimensions[col[0].column_letter].width = min(w + 4, 40)
w=max((len(str(c.value or "")) for c in col),default=8)
ws.column_dimensions[col[0].column_letter].width=min(w+4,40)
if not wb.sheetnames: wb.create_sheet("Sheet1")
wb.save(path)