""" STT Celery Tasks - faster-whisper 변환 - Ollama / OpenRouter 후처리 (교정 또는 번역) - SRT / VTT / TXT 자막 파일 생성 """ import os, json import httpx from celery import Celery from ocr_tasks import ocr_task # noqa: F401 REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0") MODEL_SIZE = os.getenv("WHISPER_MODEL", "medium") DEVICE = os.getenv("WHISPER_DEVICE", "cpu") COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "int8") LANGUAGE = os.getenv("WHISPER_LANGUAGE", "ko") or None BEAM_SIZE = int(os.getenv("WHISPER_BEAM_SIZE", "5")) INITIAL_PROMPT = os.getenv("WHISPER_INITIAL_PROMPT", "") or None OUTPUT_DIR = os.getenv("OUTPUT_DIR", "/data/outputs") OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.126:11434") OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "600")) _cpu_threads_env = int(os.getenv("CPU_THREADS", "0")) CPU_THREADS = _cpu_threads_env if _cpu_threads_env > 0 else None celery_app = Celery("whisper_tasks", broker=REDIS_URL, backend=REDIS_URL) celery_app.conf.update( task_serializer="json", result_serializer="json", accept_content=["json"], task_track_started=True, result_expires=3600, ) _model = None def get_model(): global _model if _model is None: from faster_whisper import WhisperModel kwargs = dict(device=DEVICE, compute_type=COMPUTE_TYPE) if CPU_THREADS is not None: kwargs["cpu_threads"] = CPU_THREADS print(f"[Whisper] 로딩: {MODEL_SIZE}/{DEVICE}/{COMPUTE_TYPE}/threads={CPU_THREADS or 'auto'}") _model = WhisperModel(MODEL_SIZE, **kwargs) print("[Whisper] 로드 완료") return _model # ══════════════════════════════════════════════════════════════ # 언어 코드 매핑 # ══════════════════════════════════════════════════════════════ LANG_NAMES = { "ko":"한국어","en":"English","ja":"日本語","zh":"中文","fr":"Français", "de":"Deutsch","es":"Español","it":"Italiano","pt":"Português","ru":"Русский", "ar":"العربية","vi":"Tiếng Việt","th":"ไทย","id":"Bahasa Indonesia", "nl":"Nederlands","pl":"Polski","tr":"Türkçe","sv":"Svenska","uk":"Українська", } def _lang_name(code: str) -> str: return LANG_NAMES.get(code, code) # ══════════════════════════════════════════════════════════════ # 자막 포맷 생성 # ══════════════════════════════════════════════════════════════ def _fmt_srt_time(s: float) -> str: """초 → SRT 시간 포맷 00:00:00,000""" ms = int(round(s * 1000)) h, rem = divmod(ms, 3600000) m, rem = divmod(rem, 60000) sec, ms = divmod(rem, 1000) return f"{h:02d}:{m:02d}:{sec:02d},{ms:03d}" def _fmt_vtt_time(s: float) -> str: """초 → VTT 시간 포맷 00:00:00.000""" return _fmt_srt_time(s).replace(",", ".") def _make_srt(segments: list) -> str: lines = [] for i, seg in enumerate(segments, 1): lines.append(str(i)) lines.append(f"{_fmt_srt_time(seg['start'])} --> {_fmt_srt_time(seg['end'])}") lines.append(seg["text"].strip()) lines.append("") return "\n".join(lines) def _make_vtt(segments: list) -> str: lines = ["WEBVTT", ""] for i, seg in enumerate(segments, 1): lines.append(f"{i}") lines.append(f"{_fmt_vtt_time(seg['start'])} --> {_fmt_vtt_time(seg['end'])}") lines.append(seg["text"].strip()) lines.append("") return "\n".join(lines) # ══════════════════════════════════════════════════════════════ # 번역 (Ollama / OpenRouter) # ══════════════════════════════════════════════════════════════ def _translate_segments(segments: list, target_lang: str, use_openrouter: bool, model: str, openrouter_url: str, openrouter_key: str, task_self=None) -> list: """세그먼트 텍스트를 target_lang으로 번역해서 새 세그먼트 리스트 반환""" if not model or not target_lang: return segments lang_name = _lang_name(target_lang) translated = [] # 세그먼트를 청크로 묶어서 번역 (API 호출 최소화) # 최대 20개씩 묶음 CHUNK = 20 chunks = [segments[i:i+CHUNK] for i in range(0, len(segments), CHUNK)] for ci, chunk in enumerate(chunks): if task_self: pct = 85 + int((ci / len(chunks)) * 10) task_self.update_state(state="PROGRESS", meta={"progress": pct, "message": f"번역 중... ({ci*CHUNK+1}/{len(segments)})"}) # JSON 배열로 텍스트만 전달 texts = [seg["text"].strip() for seg in chunk] prompt = ( f"다음 문장들을 {lang_name}로 번역해줘.\n" f"JSON 배열 형식으로만 답해줘. 설명 없이 번역된 문장 배열만 출력해.\n" f"입력 배열과 동일한 개수, 동일한 순서로 출력해.\n\n" f"입력: {json.dumps(texts, ensure_ascii=False)}" ) try: if use_openrouter and openrouter_key: resp = httpx.post( f"{openrouter_url.rstrip('/')}/chat/completions", headers={"Authorization": f"Bearer {openrouter_key}", "HTTP-Referer": "https://voicescript.local", "Content-Type": "application/json"}, json={"model": model, "messages": [{"role":"user","content":prompt}], "temperature": 0.2}, timeout=float(OLLAMA_TIMEOUT), ) resp.raise_for_status() raw = resp.json()["choices"][0]["message"]["content"].strip() else: resp = httpx.post(f"{OLLAMA_URL}/api/chat", json={"model": model, "messages": [{"role":"user","content":prompt}], "stream": False, "options": {"temperature": 0.2}}, timeout=float(OLLAMA_TIMEOUT)) resp.raise_for_status() raw = resp.json().get("message",{}).get("content","").strip() # JSON 파싱 # 코드블록 제거 if "```" in raw: raw = raw.split("```")[1].lstrip("json").strip() trans_texts = json.loads(raw) if not isinstance(trans_texts, list): trans_texts = texts # 파싱 실패 시 원본 유지 except Exception as e: print(f"[번역 실패 chunk {ci}] {e}") trans_texts = texts # 실패 시 원본 유지 # 번역된 텍스트를 세그먼트에 결합 for seg, t_text in zip(chunk, trans_texts): translated.append({**seg, "text": t_text}) # 남은 세그먼트 (번역 누락) if len(trans_texts) < len(chunk): for seg in chunk[len(trans_texts):]: translated.append(seg) return translated # ══════════════════════════════════════════════════════════════ # Ollama 텍스트 후처리 (교정) # ══════════════════════════════════════════════════════════════ def _ollama_postprocess(text: str, model: str) -> str: if not model or not text.strip(): return text prompt = ( "다음은 음성 인식으로 추출된 텍스트입니다. " "내용은 절대 변경하지 말고, 문장 부호를 추가하고 자연스럽게 다듬어줘. " "결과 텍스트만 출력하고 설명은 하지 마.\n\n" + text ) try: resp = httpx.post(f"{OLLAMA_URL}/api/chat", json={"model":model,"messages":[{"role":"user","content":prompt}], "stream":False,"options":{"temperature":0.1}}, timeout=float(OLLAMA_TIMEOUT)) resp.raise_for_status() result = resp.json().get("message",{}).get("content","").strip() return result if result else text except Exception as e: print(f"[Ollama 후처리 실패] {e}"); return text def _openrouter_postprocess(text: str, model: str, base_url: str, api_key: str) -> str: if not model or not api_key or not text.strip(): return text prompt = ( "다음은 음성 인식으로 추출된 텍스트입니다. " "내용은 절대 변경하지 말고, 문장 부호를 추가하고 자연스럽게 다듬어줘. " "결과 텍스트만 출력하고 설명은 하지 마.\n\n" + text ) try: resp = httpx.post(f"{base_url.rstrip('/')}/chat/completions", headers={"Authorization":f"Bearer {api_key}","HTTP-Referer":"https://voicescript.local","Content-Type":"application/json"}, json={"model":model,"messages":[{"role":"user","content":prompt}],"temperature":0.1}, timeout=float(OLLAMA_TIMEOUT)) resp.raise_for_status() result = resp.json()["choices"][0]["message"]["content"].strip() return result if result else text except Exception as e: print(f"[OpenRouter 후처리 실패] {e}"); return text # ══════════════════════════════════════════════════════════════ # 메인 STT Task # ══════════════════════════════════════════════════════════════ @celery_app.task(bind=True, name="tasks.transcribe_task", queue="stt") def transcribe_task( self, file_id: str, audio_path: str, # 후처리 use_ollama: bool = False, ollama_model: str = "", use_openrouter: bool = False, openrouter_model: str = "", openrouter_url: str = "", openrouter_key: str = "", # 자막 subtitle_mode: bool = False, # True → 자막 파일 생성 subtitle_format: str = "srt", # srt | vtt | both translate_to: str = "", # 번역 대상 언어 코드 (없으면 원어 자막) translate_model: str = "", # 번역에 쓸 모델 translate_via: str = "ollama",# ollama | openrouter # 원본 언어 강제 지정 (없으면 auto) force_language: str = "", ): self.update_state(state="PROGRESS", meta={"progress":5,"message":"모델 준비 중..."}) try: model = get_model() self.update_state(state="PROGRESS", meta={"progress":15,"message":"오디오 분석 중..."}) lang = force_language.strip() or LANGUAGE segments_gen, info = model.transcribe( audio_path, language=lang, beam_size=BEAM_SIZE, initial_prompt=INITIAL_PROMPT, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500), word_timestamps=False, ) self.update_state(state="PROGRESS", meta={"progress":30,"message":"텍스트 변환 중..."}) segments, parts = [], [] duration = info.duration for seg in segments_gen: segments.append({"start":round(seg.start,3),"end":round(seg.end,3),"text":seg.text.strip()}) parts.append(seg.text.strip()) if duration > 0: pct = 30 + int((seg.end/duration)*45) self.update_state(state="PROGRESS", meta={"progress":min(pct,75), "message":f"변환 중... {seg.end:.0f}s / {duration:.0f}s"}) raw_text = "\n".join(parts) full_text = raw_text detected_lang = info.language # ── 텍스트 후처리 (교정) ────────────────────────────── if use_ollama and ollama_model and not subtitle_mode: self.update_state(state="PROGRESS", meta={"progress":80,"message":f"Ollama({ollama_model}) 교정 중..."}) full_text = _ollama_postprocess(raw_text, ollama_model) elif use_openrouter and openrouter_model and openrouter_key and not subtitle_mode: self.update_state(state="PROGRESS", meta={"progress":80,"message":f"OpenRouter({openrouter_model}) 교정 중..."}) full_text = _openrouter_postprocess(raw_text, openrouter_model, openrouter_url, openrouter_key) # ── 자막 모드: 번역 ────────────────────────────────── translated_segments = segments is_translated = False if subtitle_mode and translate_to and translate_to != detected_lang: t_model = translate_model or (ollama_model if translate_via=="ollama" else openrouter_model) t_via_or = (translate_via == "openrouter" and bool(openrouter_key)) self.update_state(state="PROGRESS", meta={"progress":82, "message":f"{_lang_name(translate_to)}로 번역 중..."}) translated_segments = _translate_segments( segments, translate_to, use_openrouter=t_via_or, model=t_model, openrouter_url=openrouter_url, openrouter_key=openrouter_key, task_self=self, ) is_translated = True # 번역된 전체 텍스트 full_text = "\n".join(s["text"] for s in translated_segments) self.update_state(state="PROGRESS", meta={"progress":93,"message":"파일 저장 중..."}) os.makedirs(OUTPUT_DIR, exist_ok=True) result_files = {} # ── TXT 저장 ───────────────────────────────────────── txt_filename = f"{file_id}.txt" with open(os.path.join(OUTPUT_DIR, txt_filename), "w", encoding="utf-8") as f: f.write(f"# 변환 결과\n") f.write(f"# 언어: {detected_lang} | 재생 시간: {duration:.1f}초\n") if is_translated: f.write(f"# 번역: {_lang_name(translate_to)}\n") f.write(f"\n## 전체 텍스트\n\n{full_text}\n\n") f.write(f"## 타임스탬프별 세그먼트\n\n") for seg in (translated_segments if is_translated else segments): f.write(f"[{_fmt_ts(seg['start'])} → {_fmt_ts(seg['end'])}] {seg['text']}\n") result_files["txt"] = txt_filename # ── 자막 파일 저장 ──────────────────────────────────── if subtitle_mode: sub_segs = translated_segments if is_translated else segments lang_suffix = f".{translate_to}" if is_translated else f".{detected_lang}" if subtitle_format in ("srt", "both"): srt_fn = f"{file_id}{lang_suffix}.srt" with open(os.path.join(OUTPUT_DIR, srt_fn), "w", encoding="utf-8") as f: f.write(_make_srt(sub_segs)) result_files["srt"] = srt_fn if subtitle_format in ("vtt", "both"): vtt_fn = f"{file_id}{lang_suffix}.vtt" with open(os.path.join(OUTPUT_DIR, vtt_fn), "w", encoding="utf-8") as f: f.write(_make_vtt(sub_segs)) result_files["vtt"] = vtt_fn # 원본 언어 SRT도 함께 (번역 시) if is_translated and subtitle_format in ("srt","both"): orig_fn = f"{file_id}.{detected_lang}.srt" with open(os.path.join(OUTPUT_DIR, orig_fn), "w", encoding="utf-8") as f: f.write(_make_srt(segments)) result_files["srt_original"] = orig_fn try: os.remove(audio_path) except: pass return { # 기본 STT 결과 "text": full_text, "raw_text": raw_text, "segments": translated_segments if is_translated else segments, "orig_segments": segments, "language": detected_lang, "duration": round(duration, 1), # 후처리 "ollama_used": use_ollama and bool(ollama_model) and not subtitle_mode, "ollama_model": ollama_model if (use_ollama and not subtitle_mode) else "", "openrouter_used": use_openrouter and bool(openrouter_model) and not subtitle_mode, "openrouter_model": openrouter_model if (use_openrouter and not subtitle_mode) else "", # 자막 "subtitle_mode": subtitle_mode, "subtitle_format": subtitle_format, "translated": is_translated, "translate_to": translate_to if is_translated else "", "translate_model": translate_model if is_translated else "", # 파일 "output_file": result_files.get("txt",""), "srt_file": result_files.get("srt",""), "vtt_file": result_files.get("vtt",""), "srt_original_file": result_files.get("srt_original",""), } except Exception as e: raise Exception(f"변환 실패: {str(e)}") def _fmt_ts(s: float) -> str: m, sec = divmod(int(s), 60) return f"{m:02d}:{sec:02d}"