feat: 자막 생성 탭 (ffmpeg+Whisper+LLM 3단계 파이프라인)

This commit is contained in:
root
2026-05-02 07:28:34 +09:00
parent 4fc3da1a2d
commit b3805c2b0b
3 changed files with 884 additions and 608 deletions

View File

@@ -10,7 +10,7 @@ from typing import List
from auth import (authenticate, create_access_token, init_users,
require_auth, require_admin, require_stt, require_ocr,
list_users, create_user, update_user, delete_user)
from tasks import celery_app, transcribe_task
from tasks import celery_app, transcribe_task, subtitle_pipeline_task
from ocr_tasks import ocr_task
app = FastAPI(title="VoiceScript API")
@@ -30,18 +30,9 @@ os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
AUDIO_EXT = {"mp3","mp4","wav","m4a","ogg","flac","aac","wma","webm",
"mkv","avi","mov","ts","mts","m2ts","wmv","flv","rmvb",
"h264","h265","hevc","264","265"}
"mkv","avi","mov","ts","mts","m2ts","wmv","flv","h264","h265","hevc","264","265"}
IMAGE_EXT = {"jpg","jpeg","png","bmp","tiff","tif","webp","gif"}
SUPPORTED_LANGS = {
"ko":"한국어","en":"English","ja":"日本語","zh":"中文(简体)",
"zh-tw":"中文(繁體)","fr":"Français","de":"Deutsch","es":"Español",
"it":"Italiano","pt":"Português","ru":"Русский","ar":"العربية",
"vi":"Tiếng Việt","th":"ไทย","id":"Bahasa Indonesia",
"nl":"Nederlands","pl":"Polski","tr":"Türkçe","sv":"Svenska",
"uk":"Українська","hi":"हिन्दी","bn":"বাংলা",
}
VIDEO_EXT = {"mp4","mkv","avi","mov","webm","ts","mts","m2ts","wmv","flv","h264","h265","hevc","264","265","m4v","3gp","rm","rmvb"}
_DEFAULT_SETTINGS = {
"stt_ollama_model":"","ocr_ollama_model":"granite3.2-vision:latest",
@@ -85,7 +76,7 @@ def append_history(record:dict):
history.insert(0,record); _write_history(history[:HISTORY_MAX])
except: pass
def _update_history_by_task(task_id:str,result:dict,success:bool,error_msg:str=""):
def _update_history_by_task(task_id:str, result:dict, success:bool, error_msg:str=""):
with _hist_lock:
if not HISTORY_FILE.exists(): return
try:
@@ -93,9 +84,10 @@ def _update_history_by_task(task_id:str,result:dict,success:bool,error_msg:str="
for h in history:
if h.get("task_id")!=task_id: continue
if h.get("status")!="processing": break
if not success: h["status"]="failed";h["output"]={"error":error_msg[:300]};break
h["status"]="success"
if h["type"]=="stt":
h["status"]="failed" if not success else "success"
if not success:
h["output"]={"error":error_msg[:300]}
elif h["type"]=="stt":
text=result.get("text","")
h["output"]={
"filename":result.get("output_file",""),
@@ -107,11 +99,18 @@ def _update_history_by_task(task_id:str,result:dict,success:bool,error_msg:str="
"ollama_model":result.get("ollama_model",""),
"openrouter_used":result.get("openrouter_used",False),
"openrouter_model":result.get("openrouter_model",""),
"subtitle_mode":result.get("subtitle_mode",False),
}
elif h["type"]=="subtitle":
h["output"]={
"detected_language":result.get("detected_language",""),
"duration_s":result.get("duration",0),
"segment_count":result.get("segment_count",0),
"translated":result.get("translated",False),
"translate_to":result.get("translate_to",""),
"srt_file":result.get("srt_file",""),
"vtt_file":result.get("vtt_file",""),
"srt_orig":result.get("srt_orig",""),
"vtt_orig":result.get("vtt_orig",""),
"srt_trans":result.get("srt_trans",""),
"vtt_trans":result.get("vtt_trans",""),
}
else:
ft=result.get("full_text","")
@@ -129,12 +128,12 @@ def _update_history_by_task(task_id:str,result:dict,success:bool,error_msg:str="
_write_history(history)
except: pass
def delete_history_item(hid:str)->bool:
    """Remove the history record whose ``id`` equals *hid*.

    The history file is read and rewritten while ``_hist_lock`` is held.

    Returns:
        True when a matching record was removed and the file rewritten;
        False when the history file does not exist, no record matches, or
        reading/writing the file fails.
    """
    with _hist_lock:
        if not HISTORY_FILE.exists():
            return False
        try:
            with open(HISTORY_FILE,"r",encoding="utf-8") as f:
                history=json.load(f)
            kept=[h for h in history if h.get("id")!=hid]
            if len(kept)==len(history):
                # Nothing matched, so leave the file untouched.
                return False
            _write_history(kept)
            return True
        except Exception:
            # Best-effort by design: a corrupt or unreadable history file is
            # reported as "not deleted". Unlike a bare ``except:``, this does
            # not swallow KeyboardInterrupt or SystemExit.
            return False
@@ -166,10 +165,6 @@ def me(user:dict=Depends(require_auth)):
return {"username":user["username"],"role":user.get("role","user"),
"permissions":user.get("permissions",{"stt":False,"ocr":False})}
@app.get("/api/languages")
def get_languages(user:dict=Depends(require_auth)):
return {"languages":SUPPORTED_LANGS}
# ════════════════════════════════════════════════════════════════
# 시스템 정보
@@ -189,115 +184,120 @@ def system_info(user:dict=Depends(require_auth)):
# ════════════════════════════════════════════════════════════════
# STT 공통 디스패치
# STT 단일 / 배치
# ════════════════════════════════════════════════════════════════
async def _dispatch_stt(request,files,use_ollama,ollama_model,use_openrouter,openrouter_model,user):
    """Save each upload, queue a transcription task for it, and log it to history.

    ``use_ollama`` / ``use_openrouter`` arrive as form strings and are treated
    as enabled only when they equal "true" (case-insensitive). When a backend
    is enabled without an explicit model name, the model is taken from the
    stored settings.

    Returns:
        A list with one dict per input file: either
        ``{"error", "filename"}`` for an unsupported extension, or
        ``{"task_id", "file_id", "filename"}`` for a queued task.
    """
    settings=_load_settings()
    ollama_on=use_ollama.lower()=="true"
    openrouter_on=use_openrouter.lower()=="true"
    if ollama_on and not ollama_model.strip():
        ollama_model=settings.get("stt_ollama_model","")
    if openrouter_on and not openrouter_model.strip():
        openrouter_model=settings.get("openrouter_stt_model","")

    dispatched=[]
    for upload in files:
        _check_size(request)
        ext=_ext(upload.filename)
        if ext not in AUDIO_EXT:
            # Unsupported files are reported per item instead of failing the whole request.
            dispatched.append({"error":f"{upload.filename}: 지원하지 않는 형식","filename":upload.filename})
            continue

        file_id=str(uuid.uuid4())
        save_path=os.path.join(UPLOAD_DIR,f"{file_id}.{ext}")
        await _save_upload(upload,save_path)
        file_size=os.path.getsize(save_path)

        task=transcribe_task.delay(
            file_id,save_path,
            ollama_on,ollama_model,
            openrouter_on,openrouter_model,
            settings.get("openrouter_url",""),settings.get("openrouter_api_key",""),
        )

        # Record the job as "processing"; "output" stays None at dispatch time.
        append_history({
            "id":file_id,
            "task_id":task.id,
            "type":"stt",
            "status":"processing",
            "timestamp":datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "username":user["username"],
            "input":{"filename":upload.filename,"size_bytes":file_size,"format":ext.upper()},
            "settings":{
                "model":os.getenv("WHISPER_MODEL","medium"),
                "language":os.getenv("WHISPER_LANGUAGE","ko"),
                "compute_type":os.getenv("WHISPER_COMPUTE_TYPE","int8"),
                "cpu_threads":settings.get("cpu_threads",0),
                "use_ollama":ollama_on,
                "ollama_model":ollama_model if ollama_on else "",
                "use_openrouter":openrouter_on,
                "openrouter_model":openrouter_model if openrouter_on else "",
            },
            "output":None,
        })
        dispatched.append({"task_id":task.id,"file_id":file_id,"filename":upload.filename})
    return dispatched
# ════════════════════════════════════════════════════════════════
# STT — 단일 / 배치
# ════════════════════════════════════════════════════════════════
@app.post("/api/transcribe")
async def transcribe(
    request:Request,
    file:UploadFile=File(...),
    use_ollama:str=Form("false"),
    ollama_model:str=Form(""),
    use_openrouter:str=Form("false"),
    openrouter_model:str=Form(""),
    user:dict=Depends(require_stt),
):
    """Queue transcription of a single uploaded file.

    Delegates to ``_dispatch_stt`` with a one-element file list and returns
    that file's result dict.
    """
    dispatched=await _dispatch_stt(
        request,[file],
        use_ollama,ollama_model,
        use_openrouter,openrouter_model,
        user,
    )
    return dispatched[0]
@app.post("/api/transcribe/batch")
async def transcribe_batch(
    request:Request,
    files:List[UploadFile]=File(...),
    use_ollama:str=Form("false"),
    ollama_model:str=Form(""),
    use_openrouter:str=Form("false"),
    openrouter_model:str=Form(""),
    user:dict=Depends(require_stt),
):
    """Queue transcription for up to 20 uploaded files.

    Raises:
        HTTPException: 400 when no files are supplied or more than 20 are sent.

    Returns:
        ``{"items": [...], "total": n}`` where ``items`` is the list produced
        by ``_dispatch_stt``.
    """
    if not files:
        raise HTTPException(400,"파일이 없습니다")
    if len(files)>20:
        raise HTTPException(400,"최대 20개까지")
    dispatched=await _dispatch_stt(
        request,files,
        use_ollama,ollama_model,
        use_openrouter,openrouter_model,
        user,
    )
    return {"items":dispatched,"total":len(dispatched)}
# ════════════════════════════════════════════════════════════════
# OCR 공통 디스패치
# 자막 파이프라인 (영상 → SRT/VTT)
# ════════════════════════════════════════════════════════════════
@app.post("/api/subtitle")
async def create_subtitle(
    request: Request,
    file: UploadFile = File(...),
    src_language: str = Form(""),        # source language (blank = auto)
    subtitle_fmt: str = Form("srt"),     # srt | vtt | both
    translate_to: str = Form(""),        # translation target language (blank = no translation)
    trans_model: str = Form(""),         # translation model
    trans_via: str = Form("ollama"),     # ollama | openrouter
    user: dict = Depends(require_stt),
):
    """Queue the subtitle pipeline for one uploaded video or audio file.

    The upload is stored under ``UPLOAD_DIR``, ``subtitle_pipeline_task`` is
    dispatched for it, and a "processing" record of type "subtitle" is added
    to the history.

    Raises:
        HTTPException: 400 when the file extension is in neither ``AUDIO_EXT``
            nor ``VIDEO_EXT``.

    Returns:
        ``{"task_id", "file_id", "filename"}`` for the queued task.
    """
    _check_size(request)
    ext = _ext(file.filename)
    # Both video and audio are accepted (audio alone is enough for subtitles).
    # VIDEO_EXT is checked as well because it lists containers that AUDIO_EXT
    # does not (m4v, 3gp, rm, rmvb); checking AUDIO_EXT alone rejected them.
    if ext not in AUDIO_EXT and ext not in VIDEO_EXT:
        raise HTTPException(400, "지원하지 않는 형식입니다. 영상/오디오 파일을 업로드하세요.")
    # Unknown format values silently fall back to SRT.
    if subtitle_fmt not in ("srt","vtt","both"):
        subtitle_fmt = "srt"

    s = _load_settings()
    # When no translation model is given, take the default from settings.
    if not trans_model.strip():
        if trans_via == "openrouter":
            trans_model = s.get("openrouter_stt_model","")
        else:
            trans_model = s.get("stt_ollama_model","")

    file_id = str(uuid.uuid4())
    save_path = os.path.join(UPLOAD_DIR, f"{file_id}.{ext}")
    await _save_upload(file, save_path)
    file_size = os.path.getsize(save_path)

    task = subtitle_pipeline_task.delay(
        file_id, save_path,
        src_language, subtitle_fmt,
        translate_to, trans_model, trans_via,
        s.get("openrouter_url",""), s.get("openrouter_api_key",""),
    )
    append_history({
        "id": file_id, "task_id": task.id, "type": "subtitle",
        "status": "processing",
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "username": user["username"],
        "input": {"filename": file.filename, "size_bytes": file_size, "format": ext.upper()},
        "settings": {
            "src_language": src_language or "auto",
            "subtitle_fmt": subtitle_fmt,
            "translate_to": translate_to,
            "trans_model": trans_model,
            "trans_via": trans_via,
        },
        "output": None,
    })
    return {"task_id": task.id, "file_id": file_id, "filename": file.filename}
# ════════════════════════════════════════════════════════════════
# OCR 단일 / 배치
# ════════════════════════════════════════════════════════════════
async def _dispatch_ocr(request,files,mode,backend,ollama_model,openrouter_model,custom_prompt,user):
if mode not in ("text","structure"): mode="text"
if backend not in ("paddle","ollama","openrouter"): backend="paddle"
s=_load_settings()
if backend=="ollama" and not ollama_model.strip(): ollama_model=s.get("ocr_ollama_model","granite3.2-vision:latest")
if backend=="openrouter" and not openrouter_model.strip(): openrouter_model=s.get("openrouter_ocr_model","")
if backend=="ollama" and not ollama_model.strip(): ollama_model=s.get("ocr_ollama_model","granite3.2-vision:latest")
if backend=="openrouter" and not openrouter_model.strip():openrouter_model=s.get("openrouter_ocr_model","")
results=[]
for file in files:
_check_size(request)
@@ -334,7 +334,7 @@ async def ocr_batch(request:Request,files:List[UploadFile]=File(...),
ollama_model:str=Form(""),openrouter_model:str=Form(""),custom_prompt:str=Form(""),
user:dict=Depends(require_ocr)):
if not files: raise HTTPException(400,"파일이 없습니다")
if len(files)>20: raise HTTPException(400,"한 번에 최대 20개까지")
if len(files)>20: raise HTTPException(400,"최대 20개까지")
items=await _dispatch_ocr(request,files,mode,backend,ollama_model,openrouter_model,custom_prompt,user)
return {"items":items,"total":len(items)}
@@ -346,7 +346,7 @@ async def ocr_batch(request:Request,files:List[UploadFile]=File(...),
def get_status(task_id:str,user:dict=Depends(require_auth)):
    """Report the state of a Celery task as a JSON-friendly dict.

    On SUCCESS or FAILURE the matching history record is also updated through
    ``_update_history_by_task``.

    Returns:
        A dict with at least ``state`` and ``progress``. PROGRESS adds
        ``step``, ``step_msg`` and ``message``; SUCCESS merges in the task
        result; FAILURE adds ``message`` with the stringified error info.
    """
    r=celery_app.AsyncResult(task_id)
    # Read the state once. Each ``r.state`` access can query the result
    # backend again, so a task finishing between comparisons could otherwise
    # fall through to the generic return below as "success" with no payload
    # and no history update.
    state=r.state
    if state=="PENDING":
        return {"state":"pending","progress":0,"message":"대기 중..."}
    if state=="PROGRESS":
        m=r.info or {}
        return {
            "state":"progress",
            "progress":m.get("progress",0),
            "step":m.get("step",0),
            "step_msg":m.get("step_msg",""),
            "message":m.get("message","처리 중..."),
        }
    if state=="SUCCESS":
        payload=r.result or {}
        _update_history_by_task(task_id,payload,True)
        return {"state":"success","progress":100,**payload}
    if state=="FAILURE":
        reason=str(r.info)
        _update_history_by_task(task_id,{},False,reason)
        return {"state":"failure","progress":0,"message":reason}
    return {"state":state.lower(),"progress":0}
@@ -355,7 +355,7 @@ def get_status(task_id:str,user:dict=Depends(require_auth)):
def get_history(page:int=1,per_page:int=15,type_:str="",user:dict=Depends(require_auth)):
    """Return one page of history records visible to the caller.

    Non-admin users only see records whose ``username`` matches their own.
    ``type_`` filters by record type when it is "stt", "ocr" or "subtitle";
    any other value applies no type filter.

    Returns:
        ``{"total", "page", "per_page", "items"}`` where ``total`` counts the
        records after filtering and ``items`` is the requested slice.
    """
    # Clamp paging inputs: zero or negative values would otherwise become
    # negative slice bounds (e.g. per_page=-1 returned nearly the whole list).
    page=max(page,1)
    per_page=max(per_page,1)

    history=_load_history()
    if user.get("role")!="admin":
        history=[h for h in history if h.get("username")==user["username"]]
    if type_ in ("stt","ocr","subtitle"):
        history=[h for h in history if h.get("type")==type_]
    total=len(history)
    start=(page-1)*per_page
    return {"total":total,"page":page,"per_page":per_page,"items":history[start:start+per_page]}
@@ -373,11 +373,10 @@ def download(filename:str,user:dict=Depends(require_auth)):
if ".." in filename or "/" in filename: raise HTTPException(400,"잘못된 파일명")
path=os.path.join(OUTPUT_DIR,filename)
if not os.path.exists(path): raise HTTPException(404,"파일을 찾을 수 없습니다")
if filename.endswith(".xlsx"):
media="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
elif filename.endswith(".srt"): media="text/plain"
elif filename.endswith(".vtt"): media="text/vtt"
else: media="text/plain"
if filename.endswith(".xlsx"): media="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
elif filename.endswith(".vtt"): media="text/vtt"
elif filename.endswith(".srt"): media="text/plain; charset=utf-8"
else: media="text/plain; charset=utf-8"
return FileResponse(path,media_type=media,filename=filename)
@app.get("/api/ollama/models")
@@ -392,12 +391,10 @@ def openrouter_models(user:dict=Depends(require_auth)):
s=_load_settings();api_key=s.get("openrouter_api_key","");base_url=s.get("openrouter_url","https://openrouter.ai/api/v1").rstrip("/")
if not api_key: return {"models":[],"connected":False,"error":"API 키가 설정되지 않았습니다"}
try:
resp=httpx.get(f"{base_url}/models",
headers={"Authorization":f"Bearer {api_key}","HTTP-Referer":"https://voicescript.local"},timeout=12.0)
resp=httpx.get(f"{base_url}/models",headers={"Authorization":f"Bearer {api_key}","HTTP-Referer":"https://voicescript.local"},timeout=12.0)
resp.raise_for_status()
all_models=resp.json().get("data",[])
vision=[m["id"] for m in all_models if any(k in m["id"].lower()
for k in ["vision","claude-3","gemini","gpt-4o","llava","pixtral","qwen-vl","deepseek-vl"])]
vision=[m["id"] for m in all_models if any(k in m["id"].lower() for k in ["vision","claude-3","gemini","gpt-4o","llava","pixtral","qwen-vl","deepseek-vl"])]
return {"models":[m["id"] for m in all_models],"vision_models":vision,"connected":True,"total":len(all_models)}
except httpx.HTTPStatusError as e: return {"models":[],"connected":False,"error":f"HTTP {e.response.status_code}"}
except Exception as e: return {"models":[],"connected":False,"error":str(e)}
@@ -405,10 +402,9 @@ def openrouter_models(user:dict=Depends(require_auth)):
@app.post("/api/openrouter/test")
def openrouter_test(api_key:str=Form(...),base_url:str=Form("https://openrouter.ai/api/v1"),user:dict=Depends(require_auth)):
    """Check an OpenRouter-style endpoint by listing its models with the given key.

    Returns:
        ``{"ok": True, "message": ...}`` with the model count on success, or
        ``{"ok": False, "message": ...}`` describing the HTTP status or the
        connection error. Nothing is raised to the caller.
    """
    try:
        models_url=f"{base_url.rstrip('/')}/models"
        auth_headers={
            "Authorization":f"Bearer {api_key}",
            "HTTP-Referer":"https://voicescript.local",
        }
        resp=httpx.get(models_url,headers=auth_headers,timeout=10.0)
        resp.raise_for_status()
        count=len(resp.json().get("data",[]))
    except httpx.HTTPStatusError as e:
        return {"ok":False,"message":f"인증 실패 (HTTP {e.response.status_code})"}
    except Exception as e:
        return {"ok":False,"message":f"연결 실패: {str(e)}"}
    else:
        return {"ok":True,"message":f"연결 성공 — {count}개 모델 사용 가능"}
@@ -466,7 +462,7 @@ def admin_update_user(username:str,perm_stt:str=Form("false"),perm_ocr:str=Form(
@app.delete("/api/admin/users/{username}")
def admin_delete_user(username:str,user:dict=Depends(require_admin)):
    """Delete a user account (admin only).

    Raises:
        HTTPException: 400 carrying the message from ``delete_user`` when the
            deletion is refused.
    """
    deleted,detail=delete_user(username)
    if deleted:
        return {"ok":True,"message":detail}
    raise HTTPException(400,detail)