feat: OpenRouter 외부 AI 연동 (STT 교정 + OCR Vision)

This commit is contained in:
root
2026-04-28 15:38:06 +09:00
parent f9075ae3f6
commit f35fe1143a
5 changed files with 667 additions and 299 deletions

View File

@@ -15,15 +15,12 @@ OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.126:11434")
# ── Runtime configuration (environment-driven) ───────────────────
# Shared HTTP timeout (seconds) for both Ollama and OpenRouter post-processing calls.
OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "600"))
_cpu_threads_env = int(os.getenv("CPU_THREADS", "0"))
# 0 / unset -> None, which lets faster-whisper auto-detect the CPU thread count.
CPU_THREADS = _cpu_threads_env if _cpu_threads_env > 0 else None

# Celery worker app: Redis serves as both broker and result backend.
celery_app = Celery("whisper_tasks", broker=REDIS_URL, backend=REDIS_URL)
celery_app.conf.update(
    task_serializer="json",
    result_serializer="json",
    accept_content=["json"],
    task_track_started=True,
    result_expires=3600,  # keep task results in Redis for one hour
)

# Lazily-loaded Whisper model singleton; populated by get_model().
_model = None
def get_model():
    """Return the process-wide faster-whisper model, loading it on first call.

    The import is deferred so the worker starts quickly and only pays the
    model-load cost when the first STT task arrives.
    """
    # NOTE(review): `global` reconstructed from context — the singleton
    # assignment below requires it; confirm against the original file.
    global _model
    if _model is None:
        from faster_whisper import WhisperModel
        kwargs = dict(device=DEVICE, compute_type=COMPUTE_TYPE)
        # Only pass cpu_threads when explicitly configured; None means auto.
        if CPU_THREADS is not None:
            kwargs["cpu_threads"] = CPU_THREADS
        print(f"[Whisper] 로딩: {MODEL_SIZE} / {DEVICE} / {COMPUTE_TYPE} / threads={CPU_THREADS or 'auto'}")
        _model = WhisperModel(MODEL_SIZE, **kwargs)
        print("[Whisper] 로드 완료")
    return _model
# ── 후처리: Ollama ────────────────────────────────────────────
def _ollama_postprocess(text: str, model: str) -> str:
if not model or not text.strip():
return text
if not model or not text.strip(): return text
prompt = (
"다음은 음성 인식으로 추출된 텍스트입니다. "
"내용은 절대 변경하지 말고, 문장 부호를 추가하고 자연스럽게 다듬어줘. "
"결과 텍스트만 출력하고 설명은 하지 마.\n\n"
f"{text}"
"결과 텍스트만 출력하고 설명은 하지 마.\n\n" + text
)
try:
resp = httpx.post(f"{OLLAMA_URL}/api/chat", json={
"model": model,
"messages": [{"role":"user","content":prompt}],
"stream": False, "options": {"temperature": 0.1},
}, timeout=float(OLLAMA_TIMEOUT))
resp.raise_for_status()
result = resp.json().get("message",{}).get("content","").strip()
return result if result else text
except Exception as e:
print(f"[Ollama 후처리 실패] {e}"); return text
# ── 후처리: OpenRouter (OpenAI 호환) ─────────────────────────
def _openrouter_postprocess(text: str, model: str, base_url: str, api_key: str) -> str:
if not model or not api_key or not text.strip(): return text
prompt = (
"다음은 음성 인식으로 추출된 텍스트입니다. "
"내용은 절대 변경하지 말고, 문장 부호를 추가하고 자연스럽게 다듬어줘. "
"결과 텍스트만 출력하고 설명은 하지 마.\n\n" + text
)
try:
resp = httpx.post(
f"{OLLAMA_URL}/api/chat",
json={"model": model,
"messages": [{"role": "user", "content": prompt}],
"stream": False, "options": {"temperature": 0.1}},
f"{base_url.rstrip('/')}/chat/completions",
headers={
"Authorization": f"Bearer {api_key}",
"HTTP-Referer": "https://voicescript.local",
"X-Title": "VoiceScript",
"Content-Type": "application/json",
},
json={
"model": model,
"messages": [{"role":"user","content":prompt}],
"temperature": 0.1,
},
timeout=float(OLLAMA_TIMEOUT),
)
resp.raise_for_status()
result = resp.json().get("message", {}).get("content", "").strip()
result = resp.json()["choices"][0]["message"]["content"].strip()
return result if result else text
except Exception as e:
print(f"[Ollama 후처리 실패] {e}")
return text
print(f"[OpenRouter 후처리 실패] {e}"); return text
# ════════════════════════════════════════════════════════════════
# STT Task
# ════════════════════════════════════════════════════════════════
@celery_app.task(bind=True, name="tasks.transcribe_task", queue="stt")
def transcribe_task(self, file_id: str, audio_path: str,
use_ollama: bool = False, ollama_model: str = ""):
self.update_state(state="PROGRESS", meta={"progress": 5, "message": "모델 준비 중..."})
def transcribe_task(
self,
file_id: str,
audio_path: str,
use_ollama: bool = False,
ollama_model: str = "",
use_openrouter: bool = False,
openrouter_model: str = "",
openrouter_url: str = "",
openrouter_key: str = "",
):
self.update_state(state="PROGRESS", meta={"progress":5,"message":"모델 준비 중..."})
try:
model = get_model()
self.update_state(state="PROGRESS", meta={"progress": 15, "message": "오디오 분석 중..."})
self.update_state(state="PROGRESS", meta={"progress":15,"message":"오디오 분석 중..."})
segments_gen, info = model.transcribe(
audio_path,
language=LANGUAGE,
beam_size=BEAM_SIZE,
initial_prompt=INITIAL_PROMPT,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500),
word_timestamps=False,
audio_path, language=LANGUAGE, beam_size=BEAM_SIZE,
initial_prompt=INITIAL_PROMPT, vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500), word_timestamps=False,
)
self.update_state(state="PROGRESS", meta={"progress": 30, "message": "텍스트 변환 중..."})
self.update_state(state="PROGRESS", meta={"progress":30,"message":"텍스트 변환 중..."})
segments, parts = [], []
duration = info.duration
for seg in segments_gen:
segments.append({"start": round(seg.start, 2),
"end": round(seg.end, 2),
"text": seg.text.strip()})
segments.append({"start":round(seg.start,2),"end":round(seg.end,2),"text":seg.text.strip()})
parts.append(seg.text.strip())
if duration > 0:
pct = 30 + int((seg.end / duration) * 50)
self.update_state(
state="PROGRESS",
meta={"progress": min(pct, 80),
"message": f"변환 중... {seg.end:.0f}s / {duration:.0f}s"},
)
pct = 30 + int((seg.end/duration)*50)
self.update_state(state="PROGRESS",
meta={"progress":min(pct,80),"message":f"변환 중... {seg.end:.0f}s / {duration:.0f}s"})
raw_text = "\n".join(parts)
full_text = raw_text
# Ollama 후처리
if use_ollama and ollama_model:
self.update_state(state="PROGRESS",
meta={"progress": 85,
"message": f"Ollama({ollama_model}) 후처리 중..."})
meta={"progress":85,"message":f"Ollama({ollama_model}) 후처리 중..."})
full_text = _ollama_postprocess(raw_text, ollama_model)
self.update_state(state="PROGRESS", meta={"progress": 95, "message": "파일 저장 중..."})
# OpenRouter 후처리
elif use_openrouter and openrouter_model and openrouter_key:
self.update_state(state="PROGRESS",
meta={"progress":85,"message":f"OpenRouter({openrouter_model}) 후처리 중..."})
full_text = _openrouter_postprocess(raw_text, openrouter_model, openrouter_url, openrouter_key)
self.update_state(state="PROGRESS", meta={"progress":95,"message":"파일 저장 중..."})
os.makedirs(OUTPUT_DIR, exist_ok=True)
output_filename = f"{file_id}.txt"
with open(os.path.join(OUTPUT_DIR, output_filename), "w", encoding="utf-8") as f:
f.write(f"# 변환 결과\n# 언어: {info.language} | 재생 시간: {duration:.1f}")
if use_ollama and ollama_model:
f.write(f" | Ollama 후처리: {ollama_model}")
f.write(f" | Ollama: {ollama_model}")
elif use_openrouter and openrouter_model:
f.write(f" | OpenRouter: {openrouter_model}")
f.write("\n\n## 전체 텍스트\n\n" + full_text + "\n\n")
f.write("## 타임스탬프별 세그먼트\n\n")
for seg in segments:
@@ -128,14 +161,16 @@ def transcribe_task(self, file_id: str, audio_path: str,
except: pass
return {
"text": full_text,
"raw_text": raw_text,
"segments": segments,
"language": info.language,
"duration": round(duration, 1),
"output_file": output_filename,
"ollama_used": use_ollama and bool(ollama_model),
"ollama_model": ollama_model if (use_ollama and ollama_model) else "",
"text": full_text,
"raw_text": raw_text,
"segments": segments,
"language": info.language,
"duration": round(duration, 1),
"output_file": output_filename,
"ollama_used": use_ollama and bool(ollama_model),
"ollama_model": ollama_model if (use_ollama and ollama_model) else "",
"openrouter_used": use_openrouter and bool(openrouter_model) and bool(openrouter_key),
"openrouter_model": openrouter_model if (use_openrouter and openrouter_model) else "",
}
except Exception as e: