whisper-stt/app/ocr_tasks.py

"""
OCR Celery Tasks — PaddleOCR 3.x + Ollama Vision + OpenRouter Vision

backend:
  paddle      → PaddleOCR 3.x 로컬 (PPStructure 제거됨, 표는 마크다운 파싱)
  ollama      → Ollama Vision API
  openrouter  → OpenRouter Vision API (OpenAI 호환)
"""
import os, base64, json
import httpx
from celery import Celery
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side

REDIS_URL      = os.getenv("REDIS_URL", "redis://redis:6379/0")
OUTPUT_DIR     = os.getenv("OUTPUT_DIR", "/data/outputs")
OCR_LANG       = os.getenv("OCR_LANG", "korean")
OLLAMA_URL     = os.getenv("OLLAMA_URL", "http://192.168.0.126:11434")
OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "600"))

celery_app = Celery("ocr_tasks", broker=REDIS_URL, backend=REDIS_URL)
celery_app.conf.update(
    task_serializer="json",
    result_serializer="json",
    accept_content=["json"],
    task_track_started=True,
    result_expires=3600,
)

# PaddleOCR 싱글톤
_ocr_engine = None

def get_ocr():
    global _ocr_engine
    if _ocr_engine is None:
        from paddleocr import PaddleOCR
        print(f"[PaddleOCR] 로딩 (lang={OCR_LANG})")
        _ocr_engine = PaddleOCR(use_angle_cls=True, lang=OCR_LANG)
        print("[PaddleOCR] 완료")
    return _ocr_engine


# ════════════════════════════════════════════════════════════════
#  메인 Celery Task
#  인자: file_id, image_path, mode, backend,
#        ollama_model, openrouter_model,
#        openrouter_url, openrouter_key,
#        custom_prompt
# ════════════════════════════════════════════════════════════════
@celery_app.task(bind=True, name="tasks.ocr_task", queue="ocr")
def ocr_task(
    self,
    file_id:          str,
    image_path:       str,
    mode:             str = "text",
    backend:          str = "paddle",
    ollama_model:     str = "granite3.2-vision",
    openrouter_model: str = "",
    openrouter_url:   str = "",
    openrouter_key:   str = "",
    custom_prompt:    str = "",
):
    self.update_state(state="PROGRESS", meta={"progress": 8, "message": "엔진 준비 중..."})
    try:
        if backend == "openrouter":
            result = _run_openrouter(
                self, file_id, image_path, mode,
                openrouter_model, openrouter_url, openrouter_key, custom_prompt
            )
        elif backend == "ollama":
            result = _run_ollama(
                self, file_id, image_path, mode, ollama_model, custom_prompt
            )
        else:
            result = _run_paddle(self, file_id, image_path, mode)

        try: os.remove(image_path)
        except: pass
        return result

    except Exception as e:
        try: os.remove(image_path)
        except: pass
        raise Exception(f"OCR 실패: {str(e)}")


# ════════════════════════════════════════════════════════════════
#  공통 프롬프트
# ════════════════════════════════════════════════════════════════
_PROMPT_TEXT = (
    "이 이미지에서 모든 텍스트를 정확하게 추출해줘. "
    "원본의 줄 구분과 단락 구조를 최대한 유지해줘. "
    "이미지에 없는 내용은 절대 추가하지 마."
)
_PROMPT_STRUCTURE = (
    "이 이미지를 분석해서 다음을 수행해줘:\n"
    "1. 표(table)가 있으면 반드시 마크다운 표 형식(| col | col |)으로 변환\n"
    "2. 나머지 텍스트는 원본 구조를 유지하며 추출\n"
    "3. 표와 텍스트를 구분해서 순서대로 출력\n"
    "이미지에 없는 내용은 추가하지 마."
)

def _get_prompt(mode, custom_prompt):
    if custom_prompt and custom_prompt.strip():
        return custom_prompt.strip()
    return _PROMPT_STRUCTURE if mode == "structure" else _PROMPT_TEXT


# ════════════════════════════════════════════════════════════════
#  OpenRouter Vision 백엔드
# ════════════════════════════════════════════════════════════════
def _run_openrouter(task, file_id, image_path, mode,
                    model, base_url, api_key, custom_prompt):
    if not api_key:
        raise Exception("OpenRouter API 키가 설정되지 않았습니다. 설정 → OpenRouter에서 저장하세요.")
    if not model:
        raise Exception("OpenRouter 모델이 선택되지 않았습니다.")

    task.update_state(state="PROGRESS",
                      meta={"progress": 15, "message": f"OpenRouter ({model}) 연결 중..."})

    # 이미지 → base64 data URL
    with open(image_path, "rb") as f:
        raw = f.read()
    ext  = image_path.rsplit(".", 1)[-1].lower()
    mime = {"jpg":"image/jpeg","jpeg":"image/jpeg","png":"image/png",
            "bmp":"image/bmp","gif":"image/gif","webp":"image/webp"}.get(ext, "image/jpeg")
    data_url = f"data:{mime};base64,{base64.b64encode(raw).decode()}"

    prompt = _get_prompt(mode, custom_prompt)
    task.update_state(state="PROGRESS", meta={"progress": 30, "message": "모델 추론 중..."})

    try:
        resp = httpx.post(
            f"{base_url.rstrip('/')}/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "HTTP-Referer":  "https://voicescript.local",
                "X-Title":       "VoiceScript",
                "Content-Type":  "application/json",
            },
            json={
                "model": model,
                "messages": [{
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": data_url}},
                        {"type": "text",      "text": prompt},
                    ],
                }],
                "temperature": 0.1,
            },
            timeout=float(OLLAMA_TIMEOUT),
        )
        resp.raise_for_status()
    except httpx.HTTPStatusError as e:
        body = ""
        try: body = e.response.json().get("error", {}).get("message", "")
        except: pass
        if e.response.status_code == 400:
            raise Exception(
                f"이 모델은 이미지를 지원하지 않습니다.\n"
                f"Vision 기능을 지원하는 모델을 선택하세요 (Claude-3, GPT-4o, Gemini 등)\n"
                f"모델: {model}"
            )
        raise Exception(f"OpenRouter 오류 ({e.response.status_code}): {body or str(e)}")
    except httpx.TimeoutException:
        raise Exception(f"OpenRouter 응답 시간 초과 ({OLLAMA_TIMEOUT}초). OLLAMA_TIMEOUT 값을 늘려주세요.")

    task.update_state(state="PROGRESS", meta={"progress": 85, "message": "결과 저장 중..."})

    choices = resp.json().get("choices", [])
    if not choices:
        raise Exception("OpenRouter 빈 응답")
    full_text = choices[0]["message"]["content"].strip()
    if not full_text:
        raise Exception("OpenRouter 빈 응답")

    return _build_result(
        task, file_id, full_text, mode,
        backend="openrouter", ollama_model="", openrouter_model=model
    )


# ════════════════════════════════════════════════════════════════
#  Ollama Vision 백엔드
# ════════════════════════════════════════════════════════════════
def _run_ollama(task, file_id, image_path, mode, ollama_model, custom_prompt):
    task.update_state(state="PROGRESS",
                      meta={"progress": 15, "message": f"Ollama ({ollama_model}) 연결 중..."})

    with open(image_path, "rb") as f:
        img_b64 = base64.b64encode(f.read()).decode()

    prompt = _get_prompt(mode, custom_prompt)
    task.update_state(state="PROGRESS", meta={"progress": 30, "message": "모델 추론 중..."})

    try:
        resp = httpx.post(
            f"{OLLAMA_URL}/api/chat",
            json={
                "model":    ollama_model,
                "messages": [{"role": "user", "content": prompt, "images": [img_b64]}],
                "stream":   False,
                "options":  {"temperature": 0.1},
            },
            timeout=float(OLLAMA_TIMEOUT),
        )
        resp.raise_for_status()
    except httpx.ConnectError:
        raise Exception(f"Ollama 서버 연결 실패 ({OLLAMA_URL})")
    except httpx.TimeoutException:
        raise Exception(f"Ollama 응답 시간 초과 ({OLLAMA_TIMEOUT}초). 설정에서 타임아웃을 늘려주세요.")

    task.update_state(state="PROGRESS", meta={"progress": 85, "message": "결과 저장 중..."})

    full_text = resp.json().get("message", {}).get("content", "").strip()
    if not full_text:
        raise Exception(
            f"Ollama 빈 응답.\n"
            f"이 모델이 Vision(이미지)을 지원하는지 확인하세요: {ollama_model}\n"
            f"Vision 지원 모델: granite3.2-vision, llava 등"
        )

    return _build_result(
        task, file_id, full_text, mode,
        backend="ollama", ollama_model=ollama_model, openrouter_model=""
    )


# ════════════════════════════════════════════════════════════════
#  PaddleOCR 백엔드 (3.x — PPStructure 미사용)
# ════════════════════════════════════════════════════════════════
def _run_paddle(task, file_id, image_path, mode):
    import cv2
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError("이미지를 읽을 수 없습니다. 지원 형식: jpg, png, bmp, tiff, webp")

    task.update_state(state="PROGRESS", meta={"progress": 30, "message": "텍스트 인식 중..."})
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    result = get_ocr().ocr(img)
    task.update_state(state="PROGRESS", meta={"progress": 80, "message": "결과 정리 중..."})

    lines = []
    if result and len(result) > 0:
        r = result[0]
        if isinstance(r, dict):
            # PaddleOCR 3.x 딕셔너리 형태
            texts  = r.get("rec_texts",  [])
            scores = r.get("rec_scores", [])
            polys  = r.get("rec_polys",  [None] * len(texts))
            for text, conf, poly in zip(texts, scores, polys):
                if text.strip():
                    lines.append({
                        "text":       text,
                        "confidence": round(float(conf), 3),
                        "bbox":       poly.tolist() if poly is not None and hasattr(poly, 'tolist') else [],
                    })
        elif isinstance(r, list):
            # 구버전 호환 [[bbox, (text, conf)], ...]
            for item in r:
                if item and len(item) == 2:
                    bbox, (text, conf) = item
                    if text.strip():
                        lines.append({"text": text, "confidence": round(float(conf), 3), "bbox": []})

    full_text = "\n".join(l["text"] for l in lines)

    # structure 모드: 텍스트에서 마크다운 표 파싱 시도
    tables = []
    xlsx_file = None
    if mode == "structure":
        tables = _parse_md_tables(full_text)
        if tables:
            xlsx_file = f"{file_id}_tables.xlsx"
            _save_excel(tables, os.path.join(OUTPUT_DIR, xlsx_file))

    txt_file = f"{file_id}_ocr.txt"
    with open(os.path.join(OUTPUT_DIR, txt_file), "w", encoding="utf-8") as f:
        f.write(full_text)

    tables_html = [_md_table_to_html(t) for t in tables]
    tables_meta = [{"html": h, "rows": len(t), "cols": max(len(r) for r in t) if t else 0}
                   for h, t in zip(tables_html, tables)]

    return {
        "mode":             mode,
        "backend":          "paddle",
        "ollama_model":     "",
        "openrouter_model": "",
        "full_text":        full_text,
        "lines":            lines,
        "line_count":       len(lines),
        "txt_file":         txt_file,
        "tables":           tables_meta,
        "xlsx_file":        xlsx_file,
    }


# ════════════════════════════════════════════════════════════════
#  공통 결과 빌더 (Ollama / OpenRouter 공용)
# ════════════════════════════════════════════════════════════════
def _build_result(task, file_id, full_text, mode,
                  backend, ollama_model, openrouter_model):
    """마크다운 표 파싱 → Excel 생성 → 결과 딕셔너리 반환"""
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    tables = _parse_md_tables(full_text) if mode == "structure" else []

    txt_file = f"{file_id}_ocr.txt"
    label = ollama_model if backend == "ollama" else openrouter_model
    with open(os.path.join(OUTPUT_DIR, txt_file), "w", encoding="utf-8") as f:
        f.write(f"# OCR 결과 ({backend} / {label})\n\n{full_text}")

    xlsx_file = None
    if tables:
        task.update_state(state="PROGRESS", meta={"progress": 92, "message": "Excel 생성 중..."})
        xlsx_file = f"{file_id}_tables.xlsx"
        _save_excel(tables, os.path.join(OUTPUT_DIR, xlsx_file))

    tables_html = [_md_table_to_html(t) for t in tables]
    tables_meta = [{"html": h, "rows": len(t), "cols": max(len(r) for r in t) if t else 0}
                   for h, t in zip(tables_html, tables)]

    lines = [{"text": l, "confidence": 1.0, "bbox": []}
             for l in full_text.splitlines() if l.strip()]

    return {
        "mode":             mode,
        "backend":          backend,
        "ollama_model":     ollama_model,
        "openrouter_model": openrouter_model,
        "full_text":        full_text,
        "lines":            lines,
        "line_count":       len(lines),
        "txt_file":         txt_file,
        "tables":           tables_meta,
        "xlsx_file":        xlsx_file,
    }


# ════════════════════════════════════════════════════════════════
#  마크다운 표 파싱
# ════════════════════════════════════════════════════════════════
def _parse_md_tables(text: str) -> list:
    """텍스트에서 마크다운 표 추출 → [[row, row, ...], ...]"""
    tables, current = [], []
    for line in text.splitlines():
        s = line.strip()
        if s.startswith("|") and s.endswith("|"):
            # 구분선 (|---|---) 건너뜀
            if all(c in "| -:" for c in s):
                continue
            cells = [c.strip() for c in s.strip("|").split("|")]
            current.append(cells)
        else:
            if len(current) >= 2:
                tables.append(current)
            current = []
    if len(current) >= 2:
        tables.append(current)
    return tables


def _md_table_to_html(table: list) -> str:
    if not table: return ""
    rows = ""
    for i, row in enumerate(table):
        tag = "th" if i == 0 else "td"
        rows += "<tr>" + "".join(f"<{tag}>{c}</{tag}>" for c in row) + "</tr>"
    return f"<table>{rows}</table>"


# ════════════════════════════════════════════════════════════════
#  Excel 저장
# ════════════════════════════════════════════════════════════════
def _save_excel(tables: list, path: str):
    wb = openpyxl.Workbook()
    wb.remove(wb.active)

    hfill = PatternFill("solid", fgColor="1A1A2E")
    hfont = Font(color="00E5A0", bold=True, size=10)
    cfont = Font(size=10)
    center = Alignment(horizontal="center", vertical="center", wrap_text=True)
    thin   = Side(style="thin", color="2A2A33")
    bdr    = Border(left=thin, right=thin, top=thin, bottom=thin)

    for i, table in enumerate(tables, 1):
        ws = wb.create_sheet(f"표 {i}")
        if not table:
            continue
        for r_idx, row in enumerate(table, 1):
            for c_idx, val in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=val)
                cell.border    = bdr
                cell.alignment = center
                if r_idx == 1:
                    cell.fill = hfill
                    cell.font = hfont
                else:
                    cell.font = cfont
                    if r_idx % 2 == 0:
                        cell.fill = PatternFill("solid", fgColor="0F0F14")
        for col in ws.columns:
            w = max((len(str(c.value or "")) for c in col), default=8)
            ws.column_dimensions[col[0].column_letter].width = min(w + 4, 40)

    if not wb.sheetnames:
        wb.create_sheet("Sheet1")
    wb.save(path)