feat: 자막 생성 탭 (ffmpeg+Whisper+LLM 3단계 파이프라인)

2026-05-02 07:28:34 +09:00
parent 4fc3da1a2d
commit b3805c2b0b
3 changed files with 884 additions and 608 deletions
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -1,10 +1,12 @@
 """
-STT Celery Tasks
- faster-whisper 변환
- Ollama / OpenRouter 후처리 (교정 또는 번역)
- SRT / VTT / TXT 자막 파일 생성
+STT + Subtitle Pipeline Celery Tasks
+
+subtitle_pipeline_task:
+  Step 1: ffmpeg  → 16kHz WAV 추출
+  Step 2: Whisper → 원어 SRT / VTT 생성
+  Step 3: LLM     → 번역 SRT / VTT 생성 (선택)
 """
-import os, json
+import os, json, subprocess, tempfile
 import httpx
 from celery import Celery
 from ocr_tasks import ocr_task  # noqa: F401
@@ -29,221 +31,267 @@ celery_app.conf.update(
    accept_content=["json"], task_track_started=True, result_expires=3600,
 )

-_model = None
+_whisper_model = None

 def get_model():
-    global _model
-    if _model is None:
+    global _whisper_model
+    if _whisper_model is None:
        from faster_whisper import WhisperModel
        kwargs = dict(device=DEVICE, compute_type=COMPUTE_TYPE)
        if CPU_THREADS is not None: kwargs["cpu_threads"] = CPU_THREADS
        print(f"[Whisper] 로딩: {MODEL_SIZE}/{DEVICE}/{COMPUTE_TYPE}/threads={CPU_THREADS or 'auto'}")
-        _model = WhisperModel(MODEL_SIZE, **kwargs)
+        _whisper_model = WhisperModel(MODEL_SIZE, **kwargs)
        print("[Whisper] 로드 완료")
-    return _model
+    return _whisper_model


 # ══════════════════════════════════════════════════════════════
-#  언어 코드 매핑
+#  언어 코드 → 표시명
 # ══════════════════════════════════════════════════════════════
 LANG_NAMES = {
-    "ko":"한국어","en":"English","ja":"日本語","zh":"中文","fr":"Français",
-    "de":"Deutsch","es":"Español","it":"Italiano","pt":"Português","ru":"Русский",
-    "ar":"العربية","vi":"Tiếng Việt","th":"ไทย","id":"Bahasa Indonesia",
-    "nl":"Nederlands","pl":"Polski","tr":"Türkçe","sv":"Svenska","uk":"Українська",
+    "ko":"한국어","en":"English","ja":"日本語","zh":"中文(简体)",
+    "zh-tw":"中文(繁體)","fr":"Français","de":"Deutsch","es":"Español",
+    "it":"Italiano","pt":"Português","ru":"Русский","ar":"العربية",
+    "vi":"Tiếng Việt","th":"ไทย","id":"Bahasa Indonesia",
+    "nl":"Nederlands","pl":"Polski","tr":"Türkçe","sv":"Svenska",
+    "uk":"Українська","hi":"हिन्दी","bn":"বাংলা",
 }
-
-def _lang_name(code: str) -> str:
-    return LANG_NAMES.get(code, code)
+def _lang_name(code): return LANG_NAMES.get(code, code)


 # ══════════════════════════════════════════════════════════════
 #  자막 포맷 생성
 # ══════════════════════════════════════════════════════════════
-def _fmt_srt_time(s: float) -> str:
-    """초 → SRT 시간 포맷 00:00:00,000"""
+def _srt_time(s: float) -> str:
    ms = int(round(s * 1000))
-    h, rem = divmod(ms, 3600000)
-    m, rem = divmod(rem, 60000)
-    sec, ms = divmod(rem, 1000)
+    h, r = divmod(ms, 3600000); m, r = divmod(r, 60000); sec, ms = divmod(r, 1000)
    return f"{h:02d}:{m:02d}:{sec:02d},{ms:03d}"

-def _fmt_vtt_time(s: float) -> str:
-    """초 → VTT 시간 포맷 00:00:00.000"""
-    return _fmt_srt_time(s).replace(",", ".")
+def _vtt_time(s: float) -> str:
+    return _srt_time(s).replace(",", ".")

-def _make_srt(segments: list) -> str:
-    lines = []
+def make_srt(segments: list) -> str:
+    out = []
    for i, seg in enumerate(segments, 1):
-        lines.append(str(i))
-        lines.append(f"{_fmt_srt_time(seg['start'])} --> {_fmt_srt_time(seg['end'])}")
-        lines.append(seg["text"].strip())
-        lines.append("")
-    return "\n".join(lines)
+        out += [str(i), f"{_srt_time(seg['start'])} --> {_srt_time(seg['end'])}", seg["text"].strip(), ""]
+    return "\n".join(out)

-def _make_vtt(segments: list) -> str:
-    lines = ["WEBVTT", ""]
+def make_vtt(segments: list) -> str:
+    out = ["WEBVTT", ""]
    for i, seg in enumerate(segments, 1):
-        lines.append(f"{i}")
-        lines.append(f"{_fmt_vtt_time(seg['start'])} --> {_fmt_vtt_time(seg['end'])}")
-        lines.append(seg["text"].strip())
-        lines.append("")
-    return "\n".join(lines)
+        out += [str(i), f"{_vtt_time(seg['start'])} --> {_vtt_time(seg['end'])}", seg["text"].strip(), ""]
+    return "\n".join(out)


 # ══════════════════════════════════════════════════════════════
-#  번역 (Ollama / OpenRouter)
+#  LLM 번역 (세그먼트 배치)
 # ══════════════════════════════════════════════════════════════
-def _translate_segments(segments: list, target_lang: str,
-                         use_openrouter: bool, model: str,
-                         openrouter_url: str, openrouter_key: str,
-                         task_self=None) -> list:
-    """세그먼트 텍스트를 target_lang으로 번역해서 새 세그먼트 리스트 반환"""
-    if not model or not target_lang:
-        return segments
-
+def _translate_batch(texts: list, target_lang: str,
+                     use_openrouter: bool, model: str,
+                     openrouter_url: str, openrouter_key: str) -> list:
+    """texts 리스트 → 번역된 texts 리스트"""
+    if not texts or not model: return texts
    lang_name = _lang_name(target_lang)
-    translated = []
+    prompt = (
+        f"아래 자막 문장 배열을 {lang_name}로 번역해줘.\n"
+        f"반드시 JSON 문자열 배열로만 답해. 설명·마크다운 없이 배열만 출력.\n"
+        f"입력과 동일한 개수와 순서를 유지해.\n\n"
+        f"{json.dumps(texts, ensure_ascii=False)}"
+    )
+    try:
+        if use_openrouter and openrouter_key:
+            resp = httpx.post(
+                f"{openrouter_url.rstrip('/')}/chat/completions",
+                headers={"Authorization": f"Bearer {openrouter_key}",
+                         "HTTP-Referer": "https://voicescript.local",
+                         "Content-Type": "application/json"},
+                json={"model": model,
+                      "messages": [{"role":"user","content":prompt}],
+                      "temperature": 0.2},
+                timeout=float(OLLAMA_TIMEOUT),
+            )
+            resp.raise_for_status()
+            raw = resp.json()["choices"][0]["message"]["content"].strip()
+        else:
+            resp = httpx.post(f"{OLLAMA_URL}/api/chat",
+                json={"model": model,
+                      "messages": [{"role":"user","content":prompt}],
+                      "stream": False, "options": {"temperature": 0.2}},
+                timeout=float(OLLAMA_TIMEOUT))
+            resp.raise_for_status()
+            raw = resp.json().get("message",{}).get("content","").strip()

-    # 세그먼트를 청크로 묶어서 번역 (API 호출 최소화)
-    # 최대 20개씩 묶음
-    CHUNK = 20
-    chunks = [segments[i:i+CHUNK] for i in range(0, len(segments), CHUNK)]
-
-    for ci, chunk in enumerate(chunks):
-        if task_self:
-            pct = 85 + int((ci / len(chunks)) * 10)
-            task_self.update_state(state="PROGRESS",
-                meta={"progress": pct,
-                      "message": f"번역 중... ({ci*CHUNK+1}/{len(segments)})"})
-
-        # JSON 배열로 텍스트만 전달
-        texts = [seg["text"].strip() for seg in chunk]
-        prompt = (
-            f"다음 문장들을 {lang_name}로 번역해줘.\n"
-            f"JSON 배열 형식으로만 답해줘. 설명 없이 번역된 문장 배열만 출력해.\n"
-            f"입력 배열과 동일한 개수, 동일한 순서로 출력해.\n\n"
-            f"입력: {json.dumps(texts, ensure_ascii=False)}"
-        )
-
-        try:
-            if use_openrouter and openrouter_key:
-                resp = httpx.post(
-                    f"{openrouter_url.rstrip('/')}/chat/completions",
-                    headers={"Authorization": f"Bearer {openrouter_key}",
-                             "HTTP-Referer": "https://voicescript.local",
-                             "Content-Type": "application/json"},
-                    json={"model": model,
-                          "messages": [{"role":"user","content":prompt}],
-                          "temperature": 0.2},
-                    timeout=float(OLLAMA_TIMEOUT),
-                )
-                resp.raise_for_status()
-                raw = resp.json()["choices"][0]["message"]["content"].strip()
-            else:
-                resp = httpx.post(f"{OLLAMA_URL}/api/chat",
-                    json={"model": model,
-                          "messages": [{"role":"user","content":prompt}],
-                          "stream": False, "options": {"temperature": 0.2}},
-                    timeout=float(OLLAMA_TIMEOUT))
-                resp.raise_for_status()
-                raw = resp.json().get("message",{}).get("content","").strip()
-
-            # JSON 파싱
-            # 코드블록 제거
-            if "```" in raw:
-                raw = raw.split("```")[1].lstrip("json").strip()
-            trans_texts = json.loads(raw)
-            if not isinstance(trans_texts, list):
-                trans_texts = texts  # 파싱 실패 시 원본 유지
-
-        except Exception as e:
-            print(f"[번역 실패 chunk {ci}] {e}")
-            trans_texts = texts  # 실패 시 원본 유지
-
-        # 번역된 텍스트를 세그먼트에 결합
-        for seg, t_text in zip(chunk, trans_texts):
-            translated.append({**seg, "text": t_text})
-        # 남은 세그먼트 (번역 누락)
-        if len(trans_texts) < len(chunk):
-            for seg in chunk[len(trans_texts):]:
-                translated.append(seg)
-
-    return translated
+        # 코드블록 제거 후 JSON 파싱
+        if "```" in raw:
+            raw = raw.split("```")[1].lstrip("json\n").rstrip()
+        result = json.loads(raw)
+        if isinstance(result, list) and len(result) == len(texts):
+            return [str(r) for r in result]
+        return texts
+    except Exception as e:
+        print(f"[번역 실패] {e}")
+        return texts  # 실패 시 원본 유지


 # ══════════════════════════════════════════════════════════════
-#  Ollama 텍스트 후처리 (교정)
+#  STT + Ollama/OpenRouter 후처리 (기존 음성변환용)
 # ══════════════════════════════════════════════════════════════
 def _ollama_postprocess(text: str, model: str) -> str:
    if not model or not text.strip(): return text
-    prompt = (
-        "다음은 음성 인식으로 추출된 텍스트입니다. "
-        "내용은 절대 변경하지 말고, 문장 부호를 추가하고 자연스럽게 다듬어줘. "
-        "결과 텍스트만 출력하고 설명은 하지 마.\n\n" + text
-    )
+    prompt = ("다음은 음성 인식으로 추출된 텍스트입니다. "
+              "내용은 절대 변경하지 말고, 문장 부호를 추가하고 자연스럽게 다듬어줘. "
+              "결과 텍스트만 출력하고 설명은 하지 마.\n\n" + text)
    try:
        resp = httpx.post(f"{OLLAMA_URL}/api/chat",
            json={"model":model,"messages":[{"role":"user","content":prompt}],
                  "stream":False,"options":{"temperature":0.1}},
            timeout=float(OLLAMA_TIMEOUT))
        resp.raise_for_status()
-        result = resp.json().get("message",{}).get("content","").strip()
-        return result if result else text
-    except Exception as e:
-        print(f"[Ollama 후처리 실패] {e}"); return text
+        return resp.json().get("message",{}).get("content","").strip() or text
+    except: return text

 def _openrouter_postprocess(text: str, model: str, base_url: str, api_key: str) -> str:
    if not model or not api_key or not text.strip(): return text
-    prompt = (
-        "다음은 음성 인식으로 추출된 텍스트입니다. "
-        "내용은 절대 변경하지 말고, 문장 부호를 추가하고 자연스럽게 다듬어줘. "
-        "결과 텍스트만 출력하고 설명은 하지 마.\n\n" + text
-    )
+    prompt = ("다음은 음성 인식으로 추출된 텍스트입니다. "
+              "내용은 절대 변경하지 말고, 문장 부호를 추가하고 자연스럽게 다듬어줘. "
+              "결과 텍스트만 출력하고 설명은 하지 마.\n\n" + text)
    try:
        resp = httpx.post(f"{base_url.rstrip('/')}/chat/completions",
            headers={"Authorization":f"Bearer {api_key}","HTTP-Referer":"https://voicescript.local","Content-Type":"application/json"},
            json={"model":model,"messages":[{"role":"user","content":prompt}],"temperature":0.1},
            timeout=float(OLLAMA_TIMEOUT))
        resp.raise_for_status()
-        result = resp.json()["choices"][0]["message"]["content"].strip()
-        return result if result else text
-    except Exception as e:
-        print(f"[OpenRouter 후처리 실패] {e}"); return text
+        return resp.json()["choices"][0]["message"]["content"].strip() or text
+    except: return text


 # ══════════════════════════════════════════════════════════════
-#  메인 STT Task
+#  기존 STT 태스크 (음성변환 탭용)
 # ══════════════════════════════════════════════════════════════
 @celery_app.task(bind=True, name="tasks.transcribe_task", queue="stt")
 def transcribe_task(
    self,
-    file_id:           str,
-    audio_path:        str,
-    # 후처리
-    use_ollama:        bool = False,
-    ollama_model:      str  = "",
-    use_openrouter:    bool = False,
-    openrouter_model:  str  = "",
-    openrouter_url:    str  = "",
-    openrouter_key:    str  = "",
-    # 자막
-    subtitle_mode:     bool = False,   # True → 자막 파일 생성
-    subtitle_format:   str  = "srt",   # srt | vtt | both
-    translate_to:      str  = "",      # 번역 대상 언어 코드 (없으면 원어 자막)
-    translate_model:   str  = "",      # 번역에 쓸 모델
-    translate_via:     str  = "ollama",# ollama | openrouter
-    # 원본 언어 강제 지정 (없으면 auto)
-    force_language:    str  = "",
+    file_id: str, audio_path: str,
+    use_ollama: bool = False, ollama_model: str = "",
+    use_openrouter: bool = False, openrouter_model: str = "",
+    openrouter_url: str = "", openrouter_key: str = "",
 ):
    self.update_state(state="PROGRESS", meta={"progress":5,"message":"모델 준비 중..."})
    try:
        model = get_model()
        self.update_state(state="PROGRESS", meta={"progress":15,"message":"오디오 분석 중..."})
-
-        lang = force_language.strip() or LANGUAGE
        segments_gen, info = model.transcribe(
-            audio_path,
+            audio_path, language=LANGUAGE, beam_size=BEAM_SIZE,
+            initial_prompt=INITIAL_PROMPT, vad_filter=True,
+            vad_parameters=dict(min_silence_duration_ms=500), word_timestamps=False,
+        )
+        self.update_state(state="PROGRESS", meta={"progress":30,"message":"텍스트 변환 중..."})
+        segments, parts = [], []
+        duration = info.duration
+        for seg in segments_gen:
+            segments.append({"start":round(seg.start,3),"end":round(seg.end,3),"text":seg.text.strip()})
+            parts.append(seg.text.strip())
+            if duration > 0:
+                pct = 30 + int((seg.end/duration)*50)
+                self.update_state(state="PROGRESS",
+                    meta={"progress":min(pct,80),"message":f"변환 중... {seg.end:.0f}s / {duration:.0f}s"})
+
+        raw_text = "\n".join(parts)
+        full_text = raw_text
+
+        if use_ollama and ollama_model:
+            self.update_state(state="PROGRESS",meta={"progress":85,"message":f"Ollama({ollama_model}) 교정 중..."})
+            full_text = _ollama_postprocess(raw_text, ollama_model)
+        elif use_openrouter and openrouter_model and openrouter_key:
+            self.update_state(state="PROGRESS",meta={"progress":85,"message":f"OpenRouter({openrouter_model}) 교정 중..."})
+            full_text = _openrouter_postprocess(raw_text, openrouter_model, openrouter_url, openrouter_key)
+
+        self.update_state(state="PROGRESS",meta={"progress":95,"message":"파일 저장 중..."})
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+        output_filename = f"{file_id}.txt"
+        with open(os.path.join(OUTPUT_DIR, output_filename),"w",encoding="utf-8") as f:
+            f.write(f"# 변환 결과\n# 언어: {info.language}  |  재생 시간: {duration:.1f}초\n\n## 전체 텍스트\n\n{full_text}\n\n## 타임스탬프별 세그먼트\n\n")
+            for seg in segments:
+                m,s=divmod(int(seg['start']),60)
+                f.write(f"[{m:02d}:{s:02d}]  {seg['text']}\n")
+        try: os.remove(audio_path)
+        except: pass
+        return {
+            "text":full_text,"raw_text":raw_text,"segments":segments,
+            "language":info.language,"duration":round(duration,1),
+            "output_file":output_filename,
+            "ollama_used":use_ollama and bool(ollama_model),
+            "ollama_model":ollama_model if (use_ollama and ollama_model) else "",
+            "openrouter_used":use_openrouter and bool(openrouter_model) and bool(openrouter_key),
+            "openrouter_model":openrouter_model if (use_openrouter and openrouter_model) else "",
+        }
+    except Exception as e:
+        raise Exception(f"변환 실패: {str(e)}")
+
+
+# ══════════════════════════════════════════════════════════════
+#  자막 파이프라인 태스크
+#  Step 1: ffmpeg → WAV
+#  Step 2: Whisper → 원어 SRT/VTT
+#  Step 3: LLM → 번역 SRT/VTT (선택)
+# ══════════════════════════════════════════════════════════════
+@celery_app.task(bind=True, name="tasks.subtitle_pipeline_task", queue="stt")
+def subtitle_pipeline_task(
+    self,
+    file_id:        str,
+    video_path:     str,
+    src_language:   str  = "",      # 원어 코드 (빈칸=자동)
+    subtitle_fmt:   str  = "srt",   # srt | vtt | both
+    translate_to:   str  = "",      # 번역 대상 (빈칸=번역 안 함)
+    trans_model:    str  = "",      # 번역 모델
+    trans_via:      str  = "ollama",# ollama | openrouter
+    openrouter_url: str  = "",
+    openrouter_key: str  = "",
+):
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    wav_path = os.path.join(os.path.dirname(video_path), f"{file_id}_audio.wav")
+    result_files = {}
+
+    try:
+        # ── Step 1: ffmpeg 오디오 추출 ────────────────────────
+        self.update_state(state="PROGRESS", meta={
+            "progress": 5,
+            "step": 1,
+            "step_msg": "오디오 추출 중...",
+            "message": "Step 1/3 — ffmpeg 오디오 추출 중..."
+        })
+
+        cmd = [
+            "ffmpeg", "-y",
+            "-i", video_path,
+            "-vn",              # 비디오 스트림 제거
+            "-ar", "16000",     # 16kHz — Whisper 최적
+            "-ac", "1",         # 모노
+            "-c:a", "pcm_s16le",# WAV 무손실
+            wav_path
+        ]
+        proc = subprocess.run(cmd, capture_output=True, timeout=600)
+        if proc.returncode != 0:
+            err = proc.stderr.decode(errors="replace")[-500:]
+            raise Exception(f"ffmpeg 오디오 추출 실패: {err}")
+        if not os.path.exists(wav_path) or os.path.getsize(wav_path) < 1000:
+            raise Exception("ffmpeg가 오디오를 추출하지 못했습니다. 영상에 오디오 트랙이 있는지 확인하세요.")
+
+        try: os.remove(video_path)
+        except: pass
+
+        # ── Step 2: Whisper STT → 원어 자막 ───────────────────
+        self.update_state(state="PROGRESS", meta={
+            "progress": 15,
+            "step": 2,
+            "step_msg": "음성 인식 중...",
+            "message": "Step 2/3 — Whisper 음성 인식 시작..."
+        })
+
+        whisper = get_model()
+        lang = src_language.strip() or None
+        segments_gen, info = whisper.transcribe(
+            wav_path,
            language=lang,
            beam_size=BEAM_SIZE,
            initial_prompt=INITIAL_PROMPT,
@@ -252,131 +300,115 @@ def transcribe_task(
            word_timestamps=False,
        )

-        self.update_state(state="PROGRESS", meta={"progress":30,"message":"텍스트 변환 중..."})
-        segments, parts = [], []
+        segments = []
        duration = info.duration
-
-        for seg in segments_gen:
-            segments.append({"start":round(seg.start,3),"end":round(seg.end,3),"text":seg.text.strip()})
-            parts.append(seg.text.strip())
-            if duration > 0:
-                pct = 30 + int((seg.end/duration)*45)
-                self.update_state(state="PROGRESS",
-                    meta={"progress":min(pct,75),
-                          "message":f"변환 중... {seg.end:.0f}s / {duration:.0f}s"})
-
-        raw_text  = "\n".join(parts)
-        full_text = raw_text
        detected_lang = info.language

-        # ── 텍스트 후처리 (교정) ──────────────────────────────
-        if use_ollama and ollama_model and not subtitle_mode:
-            self.update_state(state="PROGRESS",
-                meta={"progress":80,"message":f"Ollama({ollama_model}) 교정 중..."})
-            full_text = _ollama_postprocess(raw_text, ollama_model)
+        for seg in segments_gen:
+            segments.append({
+                "start": round(seg.start, 3),
+                "end":   round(seg.end,   3),
+                "text":  seg.text.strip(),
+            })
+            if duration > 0:
+                pct = 15 + int((seg.end / duration) * 55)
+                self.update_state(state="PROGRESS", meta={
+                    "progress": min(pct, 70),
+                    "step": 2,
+                    "step_msg": f"{seg.end:.0f}s / {duration:.0f}s 인식 완료",
+                    "message": f"Step 2/3 — {seg.end:.0f}s / {duration:.0f}s",
+                })

-        elif use_openrouter and openrouter_model and openrouter_key and not subtitle_mode:
-            self.update_state(state="PROGRESS",
-                meta={"progress":80,"message":f"OpenRouter({openrouter_model}) 교정 중..."})
-            full_text = _openrouter_postprocess(raw_text, openrouter_model, openrouter_url, openrouter_key)
-
-        # ── 자막 모드: 번역 ──────────────────────────────────
-        translated_segments = segments
-        is_translated = False
-
-        if subtitle_mode and translate_to and translate_to != detected_lang:
-            t_model = translate_model or (ollama_model if translate_via=="ollama" else openrouter_model)
-            t_via_or = (translate_via == "openrouter" and bool(openrouter_key))
-            self.update_state(state="PROGRESS",
-                meta={"progress":82,
-                      "message":f"{_lang_name(translate_to)}로 번역 중..."})
-            translated_segments = _translate_segments(
-                segments, translate_to,
-                use_openrouter=t_via_or,
-                model=t_model,
-                openrouter_url=openrouter_url,
-                openrouter_key=openrouter_key,
-                task_self=self,
-            )
-            is_translated = True
-            # 번역된 전체 텍스트
-            full_text = "\n".join(s["text"] for s in translated_segments)
-
-        self.update_state(state="PROGRESS", meta={"progress":93,"message":"파일 저장 중..."})
-        os.makedirs(OUTPUT_DIR, exist_ok=True)
-
-        result_files = {}
-
-        # ── TXT 저장 ─────────────────────────────────────────
-        txt_filename = f"{file_id}.txt"
-        with open(os.path.join(OUTPUT_DIR, txt_filename), "w", encoding="utf-8") as f:
-            f.write(f"# 변환 결과\n")
-            f.write(f"# 언어: {detected_lang}  |  재생 시간: {duration:.1f}초\n")
-            if is_translated:
-                f.write(f"# 번역: {_lang_name(translate_to)}\n")
-            f.write(f"\n## 전체 텍스트\n\n{full_text}\n\n")
-            f.write(f"## 타임스탬프별 세그먼트\n\n")
-            for seg in (translated_segments if is_translated else segments):
-                f.write(f"[{_fmt_ts(seg['start'])} → {_fmt_ts(seg['end'])}]  {seg['text']}\n")
-        result_files["txt"] = txt_filename
-
-        # ── 자막 파일 저장 ────────────────────────────────────
-        if subtitle_mode:
-            sub_segs = translated_segments if is_translated else segments
-            lang_suffix = f".{translate_to}" if is_translated else f".{detected_lang}"
-
-            if subtitle_format in ("srt", "both"):
-                srt_fn = f"{file_id}{lang_suffix}.srt"
-                with open(os.path.join(OUTPUT_DIR, srt_fn), "w", encoding="utf-8") as f:
-                    f.write(_make_srt(sub_segs))
-                result_files["srt"] = srt_fn
-
-            if subtitle_format in ("vtt", "both"):
-                vtt_fn = f"{file_id}{lang_suffix}.vtt"
-                with open(os.path.join(OUTPUT_DIR, vtt_fn), "w", encoding="utf-8") as f:
-                    f.write(_make_vtt(sub_segs))
-                result_files["vtt"] = vtt_fn
-
-            # 원본 언어 SRT도 함께 (번역 시)
-            if is_translated and subtitle_format in ("srt","both"):
-                orig_fn = f"{file_id}.{detected_lang}.srt"
-                with open(os.path.join(OUTPUT_DIR, orig_fn), "w", encoding="utf-8") as f:
-                    f.write(_make_srt(segments))
-                result_files["srt_original"] = orig_fn
-
-        try: os.remove(audio_path)
+        try: os.remove(wav_path)
        except: pass

+        if not segments:
+            raise Exception("음성이 감지되지 않았습니다. 영상에 음성이 있는지 확인하세요.")
+
+        # 원어 자막 저장
+        lang_suffix = detected_lang
+        if subtitle_fmt in ("srt", "both"):
+            fn = f"{file_id}.{lang_suffix}.srt"
+            with open(os.path.join(OUTPUT_DIR, fn), "w", encoding="utf-8") as f:
+                f.write(make_srt(segments))
+            result_files["srt_orig"] = fn
+        if subtitle_fmt in ("vtt", "both"):
+            fn = f"{file_id}.{lang_suffix}.vtt"
+            with open(os.path.join(OUTPUT_DIR, fn), "w", encoding="utf-8") as f:
+                f.write(make_vtt(segments))
+            result_files["vtt_orig"] = fn
+
+        # ── Step 3: LLM 번역 (선택) ───────────────────────────
+        translated_segments = None
+
+        if translate_to and translate_to != detected_lang and trans_model:
+            target_name = _lang_name(translate_to)
+            use_or = (trans_via == "openrouter" and bool(openrouter_key))
+
+            total = len(segments)
+            CHUNK = 25  # 한 번에 25개씩 번역
+            translated_texts = []
+
+            for ci, start in enumerate(range(0, total, CHUNK)):
+                chunk = segments[start:start+CHUNK]
+                pct = 72 + int((ci * CHUNK / total) * 22)
+                self.update_state(state="PROGRESS", meta={
+                    "progress": min(pct, 94),
+                    "step": 3,
+                    "step_msg": f"{min(start+CHUNK, total)}/{total}개 번역 완료",
+                    "message": f"Step 3/3 — {target_name}로 번역 중... ({min(start+CHUNK,total)}/{total})",
+                })
+                batch_texts = [s["text"] for s in chunk]
+                translated = _translate_batch(
+                    batch_texts, translate_to,
+                    use_openrouter=use_or,
+                    model=trans_model,
+                    openrouter_url=openrouter_url,
+                    openrouter_key=openrouter_key,
+                )
+                translated_texts.extend(translated)
+
+            # 번역된 텍스트 → 세그먼트 조합 (타임스탬프 유지)
+            translated_segments = [
+                {**seg, "text": translated_texts[i] if i < len(translated_texts) else seg["text"]}
+                for i, seg in enumerate(segments)
+            ]
+
+            # 번역 자막 저장
+            trans_suffix = translate_to
+            if subtitle_fmt in ("srt", "both"):
+                fn = f"{file_id}.{trans_suffix}.srt"
+                with open(os.path.join(OUTPUT_DIR, fn), "w", encoding="utf-8") as f:
+                    f.write(make_srt(translated_segments))
+                result_files["srt_trans"] = fn
+            if subtitle_fmt in ("vtt", "both"):
+                fn = f"{file_id}.{trans_suffix}.vtt"
+                with open(os.path.join(OUTPUT_DIR, fn), "w", encoding="utf-8") as f:
+                    f.write(make_vtt(translated_segments))
+                result_files["vtt_trans"] = fn
+
+        self.update_state(state="PROGRESS", meta={
+            "progress": 98, "step": 3,
+            "step_msg": "완료", "message": "자막 파일 저장 완료"
+        })
+
        return {
-            # 기본 STT 결과
-            "text":              full_text,
-            "raw_text":          raw_text,
-            "segments":          translated_segments if is_translated else segments,
-            "orig_segments":     segments,
-            "language":          detected_lang,
-            "duration":          round(duration, 1),
-            # 후처리
-            "ollama_used":       use_ollama and bool(ollama_model) and not subtitle_mode,
-            "ollama_model":      ollama_model if (use_ollama and not subtitle_mode) else "",
-            "openrouter_used":   use_openrouter and bool(openrouter_model) and not subtitle_mode,
-            "openrouter_model":  openrouter_model if (use_openrouter and not subtitle_mode) else "",
-            # 자막
-            "subtitle_mode":     subtitle_mode,
-            "subtitle_format":   subtitle_format,
-            "translated":        is_translated,
-            "translate_to":      translate_to if is_translated else "",
-            "translate_model":   translate_model if is_translated else "",
+            "detected_language":  detected_lang,
+            "duration":           round(duration, 1),
+            "segment_count":      len(segments),
+            "translated":         bool(translated_segments),
+            "translate_to":       translate_to if translated_segments else "",
+            "subtitle_fmt":       subtitle_fmt,
            # 파일
-            "output_file":       result_files.get("txt",""),
-            "srt_file":          result_files.get("srt",""),
-            "vtt_file":          result_files.get("vtt",""),
-            "srt_original_file": result_files.get("srt_original",""),
+            "srt_orig":           result_files.get("srt_orig", ""),
+            "vtt_orig":           result_files.get("vtt_orig", ""),
+            "srt_trans":          result_files.get("srt_trans", ""),
+            "vtt_trans":          result_files.get("vtt_trans", ""),
        }

    except Exception as e:
-        raise Exception(f"변환 실패: {str(e)}")
-
-
-def _fmt_ts(s: float) -> str:
-    m, sec = divmod(int(s), 60)
-    return f"{m:02d}:{sec:02d}"
+        # 임시 파일 정리
+        for p in [video_path, wav_path]:
+            try: os.remove(p)
+            except: pass
+        raise Exception(f"자막 생성 실패: {str(e)}")