# whisper-stt/app/ocr_tasks.py

"""
OCR Celery Tasks
backend: paddle | ollama | openrouter
"""
import os, base64
import httpx
from celery import Celery
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side

# Runtime configuration, read from the environment once at import time.
# REDIS_URL      - used below as both the Celery broker and the result backend.
# OUTPUT_DIR     - directory where the OCR text / Excel result files are written.
# OCR_LANG       - language passed to the PaddleOCR and PPStructure engines.
# OLLAMA_URL     - base URL of the Ollama server ("/api/chat" is appended to it).
# OLLAMA_TIMEOUT - HTTP timeout used for the Ollama request and also for the
#                  OpenRouter request.
REDIS_URL      = os.getenv("REDIS_URL", "redis://redis:6379/0")
OUTPUT_DIR     = os.getenv("OUTPUT_DIR", "/data/outputs")
OCR_LANG       = os.getenv("OCR_LANG", "korean")
OLLAMA_URL     = os.getenv("OLLAMA_URL", "http://192.168.0.126:11434")
OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "600"))

# Celery application: JSON-only serialization, "started" state tracking enabled,
# and result_expires set to 3600.
celery_app = Celery("ocr_tasks", broker=REDIS_URL, backend=REDIS_URL)
celery_app.conf.update(
    task_serializer="json", result_serializer="json",
    accept_content=["json"], task_track_started=True, result_expires=3600,
)

# Module-level caches for the PaddleOCR engines; they stay None until
# get_ocr() / get_structure() build them on first use.
_ocr_engine    = None
_struct_engine = None

def get_ocr():
    """Return the shared PaddleOCR engine, building it on the first call.

    The engine is stored in the module-level ``_ocr_engine`` so later calls
    reuse the same instance. ``paddleocr`` is imported only when the engine
    actually has to be created.
    """
    global _ocr_engine
    if _ocr_engine is not None:
        return _ocr_engine

    from paddleocr import PaddleOCR

    print(f"[PaddleOCR] 로딩 (lang={OCR_LANG})")
    _ocr_engine = PaddleOCR(use_angle_cls=True, lang=OCR_LANG)
    print("[PaddleOCR] 완료")
    return _ocr_engine

def get_structure():
    """Return the shared PPStructure engine, building it on the first call.

    The engine is stored in the module-level ``_struct_engine`` so later calls
    reuse the same instance. ``paddleocr`` is imported only when the engine
    actually has to be created.
    """
    global _struct_engine
    if _struct_engine is not None:
        return _struct_engine

    from paddleocr import PPStructure

    print("[PPStructure] 로딩")
    _struct_engine = PPStructure(table=True, ocr=True, lang=OCR_LANG)
    print("[PPStructure] 완료")
    return _struct_engine


# ════════════════════════════════════════════════════════════════
#  메인 Task
# ════════════════════════════════════════════════════════════════
@celery_app.task(bind=True, name="tasks.ocr_task", queue="ocr")
def ocr_task(
    self,
    file_id:          str,
    image_path:       str,
    mode:             str  = "text",
    backend:          str  = "paddle",
    ollama_model:     str  = "granite3.2-vision",
    openrouter_model: str  = "",
    openrouter_url:   str  = "",
    openrouter_key:   str  = "",
    custom_prompt:    str  = "",
):
    """Run OCR on one image with the selected backend and return its result.

    Args:
        file_id: Identifier used as the prefix of the generated output files.
        image_path: Path of the image to process. The file is deleted once the
            task finishes, whether it succeeded or failed.
        mode: ``"structure"`` for table/layout extraction; any other value is
            treated as plain text extraction by the backends.
        backend: ``"openrouter"`` or ``"ollama"``; every other value falls
            through to the PaddleOCR backend.
        ollama_model: Model name sent to Ollama.
        openrouter_model: Model name sent to OpenRouter.
        openrouter_url: Base URL of the OpenRouter-compatible API.
        openrouter_key: API key for OpenRouter.
        custom_prompt: Optional prompt that overrides the built-in one for the
            vision backends.

    Returns:
        The result dict produced by the chosen backend.

    Raises:
        Exception: Any backend failure, re-raised with an ``"OCR 실패: "``
            prefix and the original exception attached as ``__cause__``.
    """
    self.update_state(state="PROGRESS", meta={"progress":8,"message":"엔진 준비 중..."})
    try:
        if backend == "openrouter":
            return _run_openrouter(self, file_id, image_path, mode,
                                   openrouter_model, openrouter_url, openrouter_key, custom_prompt)
        if backend == "ollama":
            return _run_ollama(self, file_id, image_path, mode, ollama_model, custom_prompt)
        return _run_paddle(self, file_id, image_path, mode)
    except Exception as e:
        raise Exception(f"OCR 실패: {str(e)}") from e
    finally:
        # Best-effort removal of the uploaded image: a missing or invalid path
        # must never hide the task's real outcome, so those errors are ignored.
        try:
            os.remove(image_path)
        except (OSError, TypeError, ValueError):
            pass


# ════════════════════════════════════════════════════════════════
#  OpenRouter Vision 백엔드 (OpenAI 호환)
# ════════════════════════════════════════════════════════════════
_PROMPTS = {
    "text":      "이 이미지에서 모든 텍스트를 정확하게 추출해줘. 원본의 줄 구분과 단락 구조를 유지해줘.",
    "structure": "이 이미지를 분석해서 표는 마크다운 표 형식으로, 나머지 텍스트는 원본 구조를 유지하며 추출해줘.",
}

def _run_openrouter(task, file_id, image_path, mode,
                    model, base_url, api_key, custom_prompt):
    if not api_key:
        raise Exception("OpenRouter API 키가 설정되지 않았습니다")
    if not model:
        raise Exception("OpenRouter 모델이 선택되지 않았습니다")

    task.update_state(state="PROGRESS",
                      meta={"progress":15,"message":f"OpenRouter ({model}) 연결 중..."})

    with open(image_path, "rb") as f:
        raw = f.read()

    # 이미지 MIME 타입 감지
    ext = image_path.rsplit(".", 1)[-1].lower()
    mime = {"jpg":"image/jpeg","jpeg":"image/jpeg","png":"image/png",
            "bmp":"image/bmp","gif":"image/gif","webp":"image/webp"}.get(ext, "image/jpeg")
    b64 = base64.b64encode(raw).decode()
    data_url = f"data:{mime};base64,{b64}"

    prompt = custom_prompt.strip() or _PROMPTS.get(mode, _PROMPTS["text"])

    task.update_state(state="PROGRESS", meta={"progress":30,"message":"모델 추론 중..."})

    try:
        resp = httpx.post(
            f"{base_url.rstrip('/')}/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "HTTP-Referer":  "https://voicescript.local",
                "X-Title":       "VoiceScript",
                "Content-Type":  "application/json",
            },
            json={
                "model": model,
                "messages": [{
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": data_url}},
                        {"type": "text",      "text": prompt},
                    ],
                }],
                "temperature": 0.1,
            },
            timeout=float(OLLAMA_TIMEOUT),
        )
        resp.raise_for_status()
    except httpx.HTTPStatusError as e:
        body = ""
        try: body = e.response.json().get("error",{}).get("message","")
        except: pass
        if e.response.status_code == 400:
            raise Exception(f"이 모델은 이미지를 지원하지 않습니다 — Vision 모델을 선택하세요\n({model})")
        raise Exception(f"OpenRouter 오류 ({e.response.status_code}): {body or str(e)}")
    except httpx.TimeoutException:
        raise Exception(f"OpenRouter 응답 시간 초과. OLLAMA_TIMEOUT 값을 늘려주세요.")

    task.update_state(state="PROGRESS", meta={"progress":85,"message":"결과 저장 중..."})

    full_text = resp.json()["choices"][0]["message"]["content"].strip()
    if not full_text:
        raise Exception("OpenRouter 빈 응답")

    tables = _parse_md_tables(full_text) if mode == "structure" else []
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    txt_file = f"{file_id}_ocr.txt"
    with open(os.path.join(OUTPUT_DIR, txt_file), "w", encoding="utf-8") as f:
        f.write(f"# OCR 결과 (OpenRouter / {model})\n\n{full_text}")
    xlsx_file = None
    if tables:
        xlsx_file = f"{file_id}_tables.xlsx"
        _save_excel(tables, os.path.join(OUTPUT_DIR, xlsx_file))
    tables_html = [_md_table_to_html(t) for t in tables]
    lines = [{"text":l,"confidence":1.0,"bbox":[]} for l in full_text.splitlines() if l.strip()]
    return {
        "mode": mode, "backend": "openrouter", "openrouter_model": model,
        "ollama_model": "",
        "full_text": full_text, "lines": lines, "line_count": len(lines),
        "txt_file": txt_file,
        "tables": [{"html":h,"rows":len(t),"cols":max(len(r) for r in t) if t else 0}
                   for h, t in zip(tables_html, tables)],
        "xlsx_file": xlsx_file,
    }


# ════════════════════════════════════════════════════════════════
#  Ollama Vision 백엔드
# ════════════════════════════════════════════════════════════════
def _run_ollama(task, file_id, image_path, mode, ollama_model, custom_prompt):
    """Run OCR through an Ollama vision model via ``/api/chat``.

    The image is sent base64-encoded alongside the prompt in a single
    non-streaming request. The answer is written to ``<file_id>_ocr.txt``; in
    ``"structure"`` mode any markdown tables found in it are also exported to
    ``<file_id>_tables.xlsx``.

    Args:
        task: Bound Celery task, used only for progress updates.
        file_id: Prefix for the generated output file names.
        image_path: Path of the image to read.
        mode: ``"structure"`` enables table parsing; it also selects the
            built-in prompt (unknown modes use the ``"text"`` prompt).
        ollama_model: Model name sent to Ollama.
        custom_prompt: Optional prompt overriding the built-in one.

    Returns:
        Result dict with the extracted text, per-line entries, output file
        names and table metadata.

    Raises:
        Exception: Connection failure, timeout, HTTP error status, or an empty
            model response.
    """
    task.update_state(state="PROGRESS",
                      meta={"progress":15,"message":f"Ollama ({ollama_model}) 연결 중..."})
    with open(image_path, "rb") as f:
        img_b64 = base64.b64encode(f.read()).decode()
    prompt = (custom_prompt or "").strip() or _PROMPTS.get(mode, _PROMPTS["text"])
    task.update_state(state="PROGRESS", meta={"progress":30,"message":"모델 추론 중..."})
    try:
        # rstrip avoids a "//api/chat" URL when OLLAMA_URL ends with a slash.
        resp = httpx.post(f"{OLLAMA_URL.rstrip('/')}/api/chat", json={
            "model": ollama_model,
            "messages": [{"role":"user","content":prompt,"images":[img_b64]}],
            "stream": False, "options": {"temperature":0.1},
        }, timeout=float(OLLAMA_TIMEOUT))
        resp.raise_for_status()
    except httpx.ConnectError as e:
        raise Exception(f"Ollama 서버 연결 실패 ({OLLAMA_URL})") from e
    except httpx.TimeoutException as e:
        raise Exception(f"Ollama 응답 시간 초과 ({OLLAMA_TIMEOUT}초)") from e
    except httpx.HTTPStatusError as e:
        # Surface the server's own explanation (e.g. an unknown model) when the
        # error body is JSON with an "error" field; otherwise use httpx's message.
        try:
            payload = e.response.json()
        except ValueError:
            payload = None
        detail = str(payload.get("error") or "") if isinstance(payload, dict) else ""
        raise Exception(f"Ollama 오류 ({e.response.status_code}): {detail or str(e)}") from e

    task.update_state(state="PROGRESS", meta={"progress":85,"message":"결과 저장 중..."})
    # Tolerate a missing or null "message"/"content" and treat it as empty output.
    payload = resp.json()
    message = payload.get("message") if isinstance(payload, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    full_text = content.strip() if isinstance(content, str) else ""
    if not full_text:
        raise Exception("Ollama 빈 응답. 모델이 Vision을 지원하는지 확인하세요.")

    tables = _parse_md_tables(full_text) if mode == "structure" else []
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    txt_file = f"{file_id}_ocr.txt"
    with open(os.path.join(OUTPUT_DIR, txt_file), "w", encoding="utf-8") as f:
        f.write(f"# OCR 결과 (Ollama / {ollama_model})\n\n{full_text}")
    xlsx_file = None
    if tables:
        xlsx_file = f"{file_id}_tables.xlsx"
        _save_excel(tables, os.path.join(OUTPUT_DIR, xlsx_file))
    tables_html = [_md_table_to_html(t) for t in tables]
    # The model gives no per-line score or position, so use fixed placeholders.
    lines = [{"text":l,"confidence":1.0,"bbox":[]} for l in full_text.splitlines() if l.strip()]
    return {
        "mode": mode, "backend": "ollama", "ollama_model": ollama_model,
        "openrouter_model": "",
        "full_text": full_text, "lines": lines, "line_count": len(lines),
        "txt_file": txt_file,
        "tables": [{"html":h,"rows":len(t),"cols":max(len(r) for r in t) if t else 0}
                   for h, t in zip(tables_html, tables)],
        "xlsx_file": xlsx_file,
    }


# ════════════════════════════════════════════════════════════════
#  PaddleOCR 백엔드
# ════════════════════════════════════════════════════════════════
def _run_paddle(task, file_id, image_path, mode):
    """Load the image with OpenCV and dispatch to the PaddleOCR pipeline.

    ``"structure"`` mode runs the layout/table pipeline; every other mode runs
    plain text recognition. The output directory is created before dispatch.

    Raises:
        ValueError: If OpenCV cannot read the image.
    """
    import cv2

    image = cv2.imread(image_path)
    if image is None:
        raise ValueError("이미지를 읽을 수 없습니다")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    if mode == "structure":
        return _paddle_structure(task, file_id, image)
    return _paddle_text(task, file_id, image)

def _paddle_text(task, file_id, img):
    """Recognize plain text with PaddleOCR and save it to ``<file_id>_ocr.txt``.

    Only the first entry of the OCR result is used. Two shapes are accepted for
    it: a dict with parallel ``rec_texts`` / ``rec_scores`` lists, or a list of
    two-element items whose second element is a ``(text, score)`` pair. Blank
    texts are skipped. Bounding boxes are not kept (``bbox`` is always empty).

    Returns:
        Result dict with the joined text, the per-line entries (confidence
        rounded to three decimals) and the output file name.
    """
    task.update_state(state="PROGRESS", meta={"progress":30,"message":"텍스트 인식 중..."})
    ocr_output = get_ocr().ocr(img)
    task.update_state(state="PROGRESS", meta={"progress":80,"message":"결과 정리 중..."})

    first_page = ocr_output[0] if ocr_output and len(ocr_output) > 0 else None

    def iter_pairs(page):
        # Lazily yield (text, score) pairs from whichever result shape was returned.
        if isinstance(page, dict):
            yield from zip(page.get("rec_texts",[]), page.get("rec_scores",[]))
        elif isinstance(page, list):
            for entry in page:
                if entry and len(entry)==2:
                    _, (text, score) = entry
                    yield text, score

    lines = [
        {"text":text,"confidence":round(float(score),3),"bbox":[]}
        for text, score in iter_pairs(first_page)
        if text.strip()
    ]

    full_text = "\n".join(entry["text"] for entry in lines)
    txt_file = f"{file_id}_ocr.txt"
    out_path = os.path.join(OUTPUT_DIR, txt_file)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(full_text)

    return {
        "mode":"text","backend":"paddle","ollama_model":"","openrouter_model":"",
        "full_text":full_text,"lines":lines,"line_count":len(lines),
        "txt_file":txt_file,"tables":[],"xlsx_file":None,
    }

def _structure_line_text(line):
    """Return the recognized text of one PPStructure line entry, or None.

    Accepts both a ``[box, (text, score)]`` pair and a dict carrying the text
    under the ``"text"`` key; anything else is not recognized.
    """
    if isinstance(line, dict):
        text = line.get("text")
        return text if isinstance(text, str) else None
    if isinstance(line, (list, tuple)) and len(line) == 2:
        _, (text, _conf) = line
        return text
    return None


def _paddle_structure(task, file_id, img):
    """Run PPStructure layout analysis and save the text and tables.

    Table regions contribute their HTML, which is also parsed into rows for the
    Excel export (``<file_id>_tables.xlsx``, only written when a table was
    found). Regions typed ``text``, ``title`` or ``figure_caption`` contribute
    their recognized lines to ``<file_id>_ocr.txt``; other region types are
    ignored.

    Returns:
        Result dict with the joined text, per-line entries (fixed confidence
        1.0, empty bbox), table metadata and output file names.
    """
    task.update_state(state="PROGRESS", meta={"progress":20,"message":"레이아웃 분석 중..."})
    result = get_structure()(img)
    task.update_state(state="PROGRESS", meta={"progress":60,"message":"표 구조 추출 중..."})
    text_blocks, tables_html, tables_data = [], [], []
    for region in result:
        rtype = (region.get("type") or "").lower()
        res = region.get("res")
        if rtype == "table":
            # Guard against a table region whose result is missing or not a dict.
            html = res.get("html", "") if isinstance(res, dict) else ""
            if html:
                tables_html.append(html)
                tables_data.append(_html_table_to_list(html))
        elif rtype in ("text","title","figure_caption"):
            for line in (res or []):
                text = _structure_line_text(line)
                if text is not None:
                    text_blocks.append(text)
    full_text = "\n".join(text_blocks)
    task.update_state(state="PROGRESS", meta={"progress":80,"message":"Excel 생성 중..."})
    xlsx_file = None
    if tables_data:
        xlsx_file = f"{file_id}_tables.xlsx"
        _save_excel(tables_data, os.path.join(OUTPUT_DIR, xlsx_file))
    txt_file = f"{file_id}_ocr.txt"
    with open(os.path.join(OUTPUT_DIR, txt_file), "w", encoding="utf-8") as f:
        f.write("# 텍스트\n\n" + full_text)
    lines = [{"text":t,"confidence":1.0,"bbox":[]} for t in text_blocks]
    tables_meta = [{"html":h,"rows":len(d),"cols":max(len(r) for r in d) if d else 0}
                   for h, d in zip(tables_html, tables_data)]
    return {"mode":"structure","backend":"paddle","ollama_model":"","openrouter_model":"",
            "full_text":full_text,"lines":lines,"line_count":len(lines),
            "txt_file":txt_file,"tables":tables_meta,"xlsx_file":xlsx_file}


# ════════════════════════════════════════════════════════════════
#  공통 유틸
# ════════════════════════════════════════════════════════════════
def _parse_md_tables(text):
    tables, current = [], []
    for line in text.splitlines():
        s = line.strip()
        if s.startswith("|") and s.endswith("|"):
            if all(c in "| -:" for c in s): continue
            current.append([c.strip() for c in s.strip("|").split("|")])
        else:
            if len(current) >= 2: tables.append(current)
            current = []
    if len(current) >= 2: tables.append(current)
    return tables

def _md_table_to_html(table):
    if not table: return ""
    rows = ""
    for i, row in enumerate(table):
        tag = "th" if i==0 else "td"
        rows += "<tr>"+"".join(f"<{tag}>{c}</{tag}>" for c in row)+"</tr>"
    return f"<table>{rows}</table>"

def _html_table_to_list(html):
    from html.parser import HTMLParser
    class P(HTMLParser):
        def __init__(self):
            super().__init__()
            self.rows,self._row,self._cell,self._in=[],[],[],False
        def handle_starttag(self,tag,attrs):
            if tag=="tr": self._row=[]
            elif tag in("td","th"): self._cell=[];self._in=True
        def handle_endtag(self,tag):
            if tag in("td","th"): self._row.append("".join(self._cell).strip());self._in=False
            elif tag=="tr":
                if self._row: self.rows.append(self._row)
        def handle_data(self,data):
            if self._in: self._cell.append(data)
    p=P();p.feed(html);return p.rows

def _save_excel(tables, path):
    """Write each table to its own styled worksheet and save the workbook.

    Args:
        tables: Iterable of tables, each a list of rows of cell values. The
            first row of every table is styled as a header.
        path: Destination path of the ``.xlsx`` file.

    Sheets are named ``표 1``, ``표 2``, ... Column widths follow the longest
    cell text (plus padding), capped at 40. If *tables* is empty a single empty
    ``Sheet1`` is created so the workbook can still be saved.
    """
    wb = openpyxl.Workbook()
    wb.remove(wb.active)  # drop the default sheet; one sheet per table is added below

    # Style objects do not depend on the cell, so build them once and reuse them.
    thin = Side(style="thin", color="2A2A33")
    border = Border(left=thin, right=thin, top=thin, bottom=thin)
    centered = Alignment(horizontal="center", vertical="center", wrap_text=True)
    header_fill = PatternFill("solid", fgColor="1A1A2E")
    header_font = Font(color="00E5A0", bold=True, size=10)
    body_font = Font(size=10)

    for i, table in enumerate(tables, 1):
        ws = wb.create_sheet(f"표 {i}")
        for r_idx, row in enumerate(table, 1):
            for c_idx, val in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=val)
                if isinstance(val, str):
                    # OCR output is literal text: without this, a value starting
                    # with "=" would be stored as a spreadsheet formula.
                    cell.data_type = "s"
                cell.border = border
                cell.alignment = centered
                if r_idx == 1:
                    cell.fill = header_fill
                    cell.font = header_font
                else:
                    cell.font = body_font
        for col in ws.columns:
            w = max((len(str(c.value or "")) for c in col), default=8)
            ws.column_dimensions[col[0].column_letter].width = min(w + 4, 40)

    if not wb.sheetnames:
        wb.create_sheet("Sheet1")
    wb.save(path)