PDF변환 추가

This commit is contained in:
root
2026-05-07 17:45:54 +09:00
parent c3cb7a6e8f
commit 148d8b3483
9 changed files with 960 additions and 189 deletions

View File

@@ -5,6 +5,7 @@ import os, json, subprocess, tempfile
import httpx
from celery import Celery
from ocr_tasks import ocr_task # noqa: F401
from pdf_tasks import pdf_convert_task # noqa: F401
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
MODEL_SIZE = os.getenv("WHISPER_MODEL", "medium")
@@ -112,6 +113,22 @@ def _llm_call(prompt, model, use_openrouter, openrouter_url, openrouter_key, tim
f"해결: 설정에서 Ollama 타임아웃을 늘리거나, 더 작은 모델을 사용하세요."
)
def _extract_json_array(raw: str) -> str:
    """Extract the text of a JSON array from an LLM response.

    Candidates are examined in this order:

    1. The contents of each fenced (```) code block, regardless of the
       language tag on the fence.
    2. The whole stripped response.

    Within a candidate every "[" is tried as a possible array start, and the
    first span that actually parses as JSON is returned.

    If nothing parses, the span from the first "[" to the last "]" of the
    response is returned (or the stripped response itself when no such span
    exists), so the caller's ``json.loads`` still reports malformed output.

    Args:
        raw: Raw response text.

    Returns:
        The substring expected to hold a JSON array.
    """
    s = raw.strip()
    decoder = json.JSONDecoder()

    def first_array(text):
        # Return the first slice of ``text`` that decodes as a JSON array,
        # or None when no "[" in it starts valid JSON.
        pos = text.find("[")
        while pos != -1:
            try:
                _, stop = decoder.raw_decode(text, pos)
            except ValueError:
                # Not JSON at this bracket (e.g. "[note]" in prose);
                # move on to the next one.
                pos = text.find("[", pos + 1)
            else:
                return text[pos:stop]
        return None

    # Odd indexes of the split are the insides of code fences; they take
    # priority over the surrounding prose.
    for candidate in s.split("```")[1::2] + [s]:
        found = first_array(candidate)
        if found is not None:
            return found

    # Nothing parsed: fall back to the outermost bracket span.
    start, end = s.find("["), s.rfind("]")
    if start != -1 and end > start:
        return s[start:end + 1]
    return s
def _translate_batch(texts, target_lang, use_or, model, or_url, or_key, timeout):
if not texts or not model: return texts
prompt = (
@@ -120,16 +137,17 @@ def _translate_batch(texts, target_lang, use_or, model, or_url, or_key, timeout)
f"입력과 동일한 개수와 순서를 유지해.\n\n"
f"{json.dumps(texts, ensure_ascii=False)}"
)
try:
raw = _llm_call(prompt, model, use_or, or_url, or_key, timeout)
if "```" in raw: raw=raw.split("```")[1].lstrip("json\n").rstrip()
result = json.loads(raw)
if isinstance(result,list) and len(result)==len(texts):
return [str(r) for r in result]
return texts
except Exception as e:
print(f"[번역 실패] {e}")
return texts
raw = _llm_call(prompt, model, use_or, or_url, or_key, timeout)
json_str = _extract_json_array(raw)
result = json.loads(json_str)
if not isinstance(result, list):
raise Exception(f"번역 결과가 배열 형식이 아닙니다 (모델: {model})")
if len(result) != len(texts):
raise Exception(
f"번역 결과 개수 불일치: 입력 {len(texts)}개, 출력 {len(result)}"
f"(모델: {model}). 더 작은 청크 크기나 다른 모델을 시도하세요."
)
return [str(r) for r in result]
def _refine_batch(texts, model, use_or, or_url, or_key, timeout):
if not texts or not model: return texts
@@ -175,7 +193,7 @@ def _api_transcribe(audio_path, api_key, base_url, language, model="whisper-larg
"""Groq / OpenAI Whisper API 호출"""
with open(audio_path,"rb") as f:
data = f.read()
params = {"model":model}
params = {"model":model, "response_format":"verbose_json"}
if language: params["language"] = language
try:
resp = httpx.post(
@@ -416,7 +434,10 @@ def subtitle_pipeline_task(
_prog(min(pct,95),3,f"{min(start+CHUNK,total)}/{total} 번역",
f"Step 3/3 — {_lang_name(translate_to)}로 번역 중... ({min(start+CHUNK,total)}/{total})")
batch=[s["text"] for s in chunk]
trans_texts.extend(_translate_batch(batch,translate_to,use_or,trans_model,openrouter_url,openrouter_key,timeout))
try:
trans_texts.extend(_translate_batch(batch,translate_to,use_or,trans_model,openrouter_url,openrouter_key,timeout))
except Exception as e:
raise Exception(f"LLM 번역 실패 ({_lang_name(translate_to)}, 청크 {ci+1}): {e}")
translated_segments=[{**seg,"text":trans_texts[i] if i<len(trans_texts) else seg["text"]}
for i,seg in enumerate(segments)]
trans_suffix=translate_to