diff --git a/1eee6fcc4d86 b/1eee6fcc4d86
new file mode 100644
index 0000000..e69de29
diff --git a/Running b/Running
new file mode 100644
index 0000000..e69de29
diff --git a/app/Dockerfile b/app/Dockerfile
index ea4ac1f..ee2cfda 100644
--- a/app/Dockerfile
+++ b/app/Dockerfile
@@ -13,6 +13,7 @@ RUN apt-get update && apt-get install -y \
libegl1 \
wget \
curl \
+ openjdk-21-jre-headless \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
diff --git a/app/main.py b/app/main.py
index e9fbe12..e0de3cd 100644
--- a/app/main.py
+++ b/app/main.py
@@ -12,6 +12,7 @@ from auth import (authenticate, create_access_token, init_users,
list_users, create_user, update_user, delete_user)
from tasks import celery_app, transcribe_task, subtitle_pipeline_task
from ocr_tasks import ocr_task
+from pdf_tasks import pdf_convert_task
app = FastAPI(title="VoiceScript API")
@@ -32,6 +33,7 @@ os.makedirs(OUTPUT_DIR, exist_ok=True)
AUDIO_EXT = {"mp3","mp4","wav","m4a","ogg","flac","aac","wma","webm",
"mkv","avi","mov","ts","mts","m2ts","wmv","flv","h264","h265","hevc","264","265","m4v"}
IMAGE_EXT = {"jpg","jpeg","png","bmp","tiff","tif","webp","gif"}
+# Target formats accepted by the PDF conversion endpoints; _dispatch_pdf falls back to "html" for any other value.
+PDF_FMT = {"html","docx","xlsx","pptx"}
_DEFAULT_SETTINGS = {
"stt_ollama_model":"","ocr_ollama_model":"granite3.2-vision:latest",
@@ -122,6 +124,13 @@ def _update_history_by_task(task_id:str, result:dict, success:bool, error_msg:st
"srt_trans":result.get("srt_trans",""),
"vtt_trans":result.get("vtt_trans",""),
}
+ elif h["type"]=="pdf":
+ h["output"]={
+ "output_file":result.get("output_file",""),
+ "target_fmt":result.get("target_fmt",""),
+ "file_size":result.get("file_size",0),
+ "pdf_name":result.get("pdf_name",""),
+ }
else:
ft=result.get("full_text","")
h["output"]={
@@ -307,6 +316,44 @@ async def transcribe_batch(request:Request,files:List[UploadFile]=File(...),
# ════════════════════════════════════════════════════════════════
# 자막
# ════════════════════════════════════════════════════════════════
+async def _dispatch_subtitle(request,files,src_language,subtitle_fmt,stt_engine,
+                             refine_model,refine_via,translate_to,trans_model,trans_via,user):
+    """Validate, store and queue each upload for the subtitle pipeline.
+
+    Shared by the single-file and batch subtitle endpoints.  For every file the
+    request size is checked, the extension is matched against ``AUDIO_EXT``,
+    the upload is saved under ``UPLOAD_DIR`` with a fresh UUID name, a
+    ``subtitle_pipeline_task`` is enqueued and a ``processing`` history entry
+    is appended.
+
+    A blank ``stt_engine``, ``refine_model`` or ``trans_model`` is filled in
+    from the stored settings; an unknown ``subtitle_fmt`` becomes ``"srt"``.
+
+    Returns:
+        One dict per input file, in input order: ``{"task_id", "file_id",
+        "filename"}`` for a queued file, or ``{"error", "filename"}`` when the
+        extension is not supported.
+    """
+    if subtitle_fmt not in ("srt","vtt","both"):
+        subtitle_fmt="srt"
+    cfg=_load_settings()
+    stt_engine=stt_engine or cfg.get("default_stt_engine","local")
+
+    def _model_or_default(explicit,via):
+        # A non-blank model name wins; otherwise use the model configured for the chosen backend.
+        if explicit.strip():
+            return explicit
+        if via=="openrouter":
+            return cfg.get("openrouter_stt_model","")
+        return cfg.get("stt_ollama_model","")
+
+    refine_name=_model_or_default(refine_model,refine_via)
+    trans_name=_model_or_default(trans_model,trans_via)
+    subtitle_timeout=int(cfg.get("subtitle_timeout",600))
+
+    dispatched=[]
+    for upload in files:
+        _check_size(request)
+        ext=_ext(upload.filename)
+        if ext not in AUDIO_EXT:
+            dispatched.append({"error":f"{upload.filename}: 지원하지 않는 형식","filename":upload.filename})
+            continue
+
+        file_id=str(uuid.uuid4())
+        save_path=os.path.join(UPLOAD_DIR,f"{file_id}.{ext}")
+        await _save_upload(upload,save_path)
+        size_on_disk=os.path.getsize(save_path)
+
+        task=subtitle_pipeline_task.delay(
+            file_id,save_path,src_language,subtitle_fmt,
+            stt_engine,cfg.get("groq_api_key",""),cfg.get("openai_api_key",""),
+            refine_name,refine_via,translate_to,trans_name,trans_via,
+            cfg.get("openrouter_url",""),cfg.get("openrouter_api_key",""),
+            subtitle_timeout,
+        )
+
+        # The history entry records the effective settings, i.e. after defaults were applied.
+        job_settings={
+            "src_language":src_language or "auto","subtitle_fmt":subtitle_fmt,
+            "stt_engine":stt_engine,"refine_model":refine_name,"refine_via":refine_via,
+            "translate_to":translate_to,"trans_model":trans_name,"trans_via":trans_via,
+            "subtitle_timeout":subtitle_timeout,
+        }
+        append_history({
+            "id":file_id,"task_id":task.id,"type":"subtitle","status":"processing",
+            "timestamp":datetime.now().strftime("%Y-%m-%d %H:%M:%S"),"username":user["username"],
+            "input":{"filename":upload.filename,"size_bytes":size_on_disk,"format":ext.upper()},
+            "settings":job_settings,
+            "output":None,
+        })
+        dispatched.append({"task_id":task.id,"file_id":file_id,"filename":upload.filename})
+    return dispatched
+
@app.post("/api/subtitle")
async def create_subtitle(
request:Request, file:UploadFile=File(...),
@@ -316,39 +363,25 @@ async def create_subtitle(
translate_to:str=Form(""),trans_model:str=Form(""),trans_via:str=Form("ollama"),
user:dict=Depends(require_subtitle),
):
- _check_size(request)
- ext=_ext(file.filename)
- if ext not in AUDIO_EXT: raise HTTPException(400,"지원하지 않는 형식입니다")
- if subtitle_fmt not in ("srt","vtt","both"): subtitle_fmt="srt"
- s=_load_settings()
- if not stt_engine: stt_engine=s.get("default_stt_engine","local")
- if not refine_model.strip():
- refine_model=(s.get("openrouter_stt_model","") if refine_via=="openrouter"
- else s.get("stt_ollama_model",""))
- if not trans_model.strip():
- trans_model=(s.get("openrouter_stt_model","") if trans_via=="openrouter"
- else s.get("stt_ollama_model",""))
- file_id=str(uuid.uuid4())
- save_path=os.path.join(UPLOAD_DIR,f"{file_id}.{ext}")
- await _save_upload(file,save_path)
- file_size=os.path.getsize(save_path)
- subtitle_timeout=int(s.get("subtitle_timeout",600))
- task=subtitle_pipeline_task.delay(
- file_id,save_path,src_language,subtitle_fmt,
- stt_engine,s.get("groq_api_key",""),s.get("openai_api_key",""),
- refine_model,refine_via,translate_to,trans_model,trans_via,
- s.get("openrouter_url",""),s.get("openrouter_api_key",""),
- subtitle_timeout,
- )
- append_history({"id":file_id,"task_id":task.id,"type":"subtitle","status":"processing",
- "timestamp":datetime.now().strftime("%Y-%m-%d %H:%M:%S"),"username":user["username"],
- "input":{"filename":file.filename,"size_bytes":file_size,"format":ext.upper()},
- "settings":{"src_language":src_language or "auto","subtitle_fmt":subtitle_fmt,
- "stt_engine":stt_engine,"refine_model":refine_model,"refine_via":refine_via,
- "translate_to":translate_to,"trans_model":trans_model,"trans_via":trans_via,
- "subtitle_timeout":subtitle_timeout},
- "output":None})
- return {"task_id":task.id,"file_id":file_id,"filename":file.filename}
+ items=await _dispatch_subtitle(request,[file],src_language,subtitle_fmt,stt_engine,
+ refine_model,refine_via,translate_to,trans_model,trans_via,user)
+ if "error" in items[0]: raise HTTPException(400,items[0]["error"])
+ return items[0]
+
+@app.post("/api/subtitle/batch")
+async def create_subtitle_batch(
+    request:Request, files:List[UploadFile]=File(...),
+    src_language:str=Form(""),subtitle_fmt:str=Form("srt"),
+    stt_engine:str=Form("local"),
+    refine_model:str=Form(""),refine_via:str=Form("ollama"),
+    translate_to:str=Form(""),trans_model:str=Form(""),trans_via:str=Form("ollama"),
+    user:dict=Depends(require_subtitle),
+):
+    """Queue subtitle generation for up to 10 uploaded files.
+
+    Responds with HTTP 400 when no file is sent or more than 10 are sent.
+    Otherwise returns ``{"items": [...], "total": n}`` where each item is the
+    per-file result of ``_dispatch_subtitle`` (a queued task or an error entry).
+    """
+    if not files:
+        raise HTTPException(400,"파일이 없습니다")
+    if len(files)>10:
+        raise HTTPException(400,"최대 10개까지")
+    outcomes=await _dispatch_subtitle(
+        request,files,src_language,subtitle_fmt,stt_engine,
+        refine_model,refine_via,translate_to,trans_model,trans_via,user,
+    )
+    return {"items":outcomes,"total":len(outcomes)}
# ════════════════════════════════════════════════════════════════
@@ -401,6 +434,44 @@ async def ocr_batch(request:Request,files:List[UploadFile]=File(...),
return {"items":items,"total":len(items)}
+# ════════════════════════════════════════════════════════════════
+# PDF 변환
+# ════════════════════════════════════════════════════════════════
+async def _dispatch_pdf(request, files, target_fmt, user):
+    """Validate, store and queue each uploaded PDF for conversion.
+
+    Shared by the single-file and batch PDF endpoints.  A ``target_fmt`` that
+    is not in ``PDF_FMT`` is replaced by ``"html"``.  Each accepted file is
+    saved under ``UPLOAD_DIR`` as ``<uuid>.pdf``, a ``pdf_convert_task`` is
+    enqueued and a ``processing`` history entry is appended.
+
+    Returns:
+        One dict per input file, in input order: ``{"task_id", "file_id",
+        "filename"}`` for a queued file, or ``{"error", "filename"}`` when the
+        upload is not named ``*.pdf`` (including uploads without a filename).
+    """
+    if target_fmt not in PDF_FMT:
+        target_fmt="html"
+    results=[]
+    for file in files:
+        _check_size(request)
+        # The filename of a multipart part can be missing (None); treat that as
+        # "not a PDF" instead of crashing on None.lower().
+        filename=file.filename or ""
+        if not filename.lower().endswith(".pdf"):
+            results.append({"error":f"{filename}: PDF 파일만 지원합니다","filename":filename})
+            continue
+        file_id=str(uuid.uuid4())
+        save_path=os.path.join(UPLOAD_DIR,f"{file_id}.pdf")
+        await _save_upload(file,save_path)
+        file_size=os.path.getsize(save_path)
+        task=pdf_convert_task.delay(file_id,save_path,target_fmt)
+        append_history({"id":file_id,"task_id":task.id,"type":"pdf","status":"processing",
+            "timestamp":datetime.now().strftime("%Y-%m-%d %H:%M:%S"),"username":user["username"],
+            "input":{"filename":filename,"size_bytes":file_size,"format":"PDF"},
+            "settings":{"target_fmt":target_fmt},
+            "output":None})
+        results.append({"task_id":task.id,"file_id":file_id,"filename":filename})
+    return results
+
+@app.post("/api/pdf/convert")
+async def pdf_convert(request:Request,file:UploadFile=File(...),
+                      target_fmt:str=Form("html"),user:dict=Depends(require_auth)):
+    """Queue conversion of a single uploaded PDF.
+
+    Returns the ``{"task_id", "file_id", "filename"}`` dict produced by
+    ``_dispatch_pdf``; a per-file error is turned into an HTTP 400 response.
+    """
+    outcome=(await _dispatch_pdf(request,[file],target_fmt,user))[0]
+    if "error" in outcome:
+        raise HTTPException(400,outcome["error"])
+    return outcome
+
+@app.post("/api/pdf/convert/batch")
+async def pdf_convert_batch(request:Request,files:List[UploadFile]=File(...),
+                            target_fmt:str=Form("html"),user:dict=Depends(require_auth)):
+    """Queue conversion of up to 10 uploaded PDFs.
+
+    Responds with HTTP 400 when no file is sent or more than 10 are sent.
+    Otherwise returns ``{"items": [...], "total": n}`` with one
+    ``_dispatch_pdf`` result (queued task or error entry) per file.
+    """
+    if not files:
+        raise HTTPException(400,"파일이 없습니다")
+    if len(files)>10:
+        raise HTTPException(400,"최대 10개까지")
+    outcomes=await _dispatch_pdf(request,files,target_fmt,user)
+    return {"items":outcomes,"total":len(outcomes)}
+
+
# ════════════════════════════════════════════════════════════════
# 이력
# ════════════════════════════════════════════════════════════════
@@ -431,6 +502,9 @@ def download(filename:str,user:dict=Depends(require_auth)):
path=os.path.join(OUTPUT_DIR,filename)
if not os.path.exists(path): raise HTTPException(404,"파일을 찾을 수 없습니다")
if filename.endswith(".xlsx"): media="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+ elif filename.endswith(".docx"): media="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+ elif filename.endswith(".pptx"): media="application/vnd.openxmlformats-officedocument.presentationml.presentation"
+ elif filename.endswith(".html"): media="text/html; charset=utf-8"
elif filename.endswith(".vtt"): media="text/vtt"
elif filename.endswith(".srt"): media="text/plain; charset=utf-8"
else: media="text/plain; charset=utf-8"
diff --git a/app/pdf_tasks.py b/app/pdf_tasks.py
new file mode 100644
index 0000000..b65485c
--- /dev/null
+++ b/app/pdf_tasks.py
@@ -0,0 +1,357 @@
+"""
+PDF 변환 Celery Tasks — opendataloader-pdf (Java) 기반
+지원 출력 포맷: html, docx, xlsx, pptx
+"""
+import os, tempfile
+from pathlib import Path
+from celery import Celery
+
+# Redis URL used below as both the Celery broker and the result backend;
+# falls back to the "redis" host when the REDIS_URL environment variable is unset.
+REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
+# NOTE(review): this module builds its own Celery app object named "whisper_tasks";
+# presumably that matches the app in tasks.py so one worker serves both — confirm.
+celery_app = Celery("whisper_tasks", broker=REDIS_URL, backend=REDIS_URL)
+# JSON-only task/result serialization; result_expires is in seconds (86400 s = 24 h).
+celery_app.conf.update(
+ task_serializer="json", result_serializer="json",
+ accept_content=["json"], task_track_started=True, result_expires=86400,
+)
+
+# Directory converted files are written to (pdf_convert_task creates it on demand).
+OUTPUT_DIR = os.getenv("OUTPUT_DIR", "/data/outputs")
+
+
+# ── Output format constants ───────────────────────────────────────
+# NOTE(review): main.py keeps its own PDF_FMT set with the same values — keep the two in sync.
+SUPPORTED_FORMATS = ("html", "docx", "xlsx", "pptx")
+
+
+# ── opendataloader-pdf 실행 ────────────────────────────────────────
+def _run_odpdf(pdf_path: str, out_dir: str, formats: list) -> dict:
+    """Convert a PDF with opendataloader-pdf and locate the files it produced.
+
+    Args:
+        pdf_path: Path of the source PDF.
+        out_dir: Directory the converter writes its output into.
+        formats: opendataloader-pdf native format names, e.g. ``["html", "json"]``.
+
+    Returns:
+        Mapping of format name to the path of the generated file.  A format
+        whose output file cannot be found is left out, so the dict may be empty.
+
+    Raises:
+        Exception: The package cannot be imported, the converter raised
+            ``FileNotFoundError`` (reported as a missing Java runtime), or the
+            conversion failed for any other reason.  The original error is
+            attached as ``__cause__``.
+    """
+    try:
+        from opendataloader_pdf import convert
+    except ImportError as exc:
+        raise Exception("opendataloader-pdf 패키지가 설치되지 않았습니다.") from exc
+
+    try:
+        convert(
+            input_path=pdf_path,
+            output_dir=out_dir,
+            format=formats,
+            quiet=True,
+        )
+    except FileNotFoundError as exc:
+        raise Exception("Java가 설치되지 않았습니다. Docker 이미지를 재빌드하세요.") from exc
+    except Exception as exc:
+        raise Exception(f"PDF 파싱 실패: {exc}") from exc
+
+    # Extensions to probe in addition to the format name itself.
+    # NOTE(review): the converter presumably writes "<stem>.txt" for "text" and
+    # "<stem>.md" for "markdown" — confirm against the installed version.
+    alt_ext = {"text": "txt", "markdown": "md"}
+
+    stem = Path(pdf_path).stem
+    result = {}
+    for fmt in formats:
+        candidates = [fmt]
+        if fmt in alt_ext:
+            candidates.append(alt_ext[fmt])
+        for ext in candidates:
+            candidate = os.path.join(out_dir, f"{stem}.{ext}")
+            if os.path.exists(candidate):
+                result[fmt] = candidate
+                break
+    return result
+
+
+# ── HTML 파일 정리 ─────────────────────────────────────────────────
+def _read_html(path: str) -> str:
+    """Return the contents of *path* decoded as UTF-8.
+
+    Undecodable bytes are replaced with U+FFFD instead of raising.
+    """
+    return Path(path).read_text(encoding="utf-8", errors="replace")
+
+
+# ── HTML → DOCX ───────────────────────────────────────────────────
+def _html_to_docx(html_content: str, output_path: str):
+    """Render an HTML document as a Word file.
+
+    Headings (``h1``-``h6``), paragraphs, tables and lists are converted in
+    document order.  Any other element (``div``, ``section``, ...) is treated
+    as a wrapper and searched for convertible children, so nested content
+    keeps its structure.  ``ul`` items use the "List Bullet" style and ``ol``
+    items the "List Number" style.
+
+    If nothing convertible is found, the whole text of the document is written
+    as a single paragraph.
+
+    Args:
+        html_content: HTML markup to convert.
+        output_path: Destination ``.docx`` path.
+    """
+    from docx import Document
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(html_content, "lxml")
+    doc = Document()
+
+    HEADING_MAP = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}
+    LIST_STYLE = {"ul": "List Bullet", "ol": "List Number"}
+
+    def _add_table(tag):
+        # Copy the cell texts into a grid sized by the widest row.
+        rows = tag.find_all("tr")
+        if not rows:
+            return
+        cols = max(len(r.find_all(["td", "th"])) for r in rows)
+        if cols == 0:
+            return
+        t = doc.add_table(rows=len(rows), cols=cols)
+        t.style = "Table Grid"
+        for ri, row in enumerate(rows):
+            for ci, cell in enumerate(row.find_all(["td", "th"])):
+                if ci < cols:
+                    t.cell(ri, ci).text = cell.get_text(strip=True)
+
+    def _walk(parent):
+        # Depth-first walk that converts known elements and descends into wrappers.
+        for el in parent.children:
+            name = getattr(el, "name", None)
+            if name is None:
+                continue  # bare text nodes and comments are not converted here
+            if name in HEADING_MAP:
+                text = el.get_text(strip=True)
+                if text:
+                    doc.add_heading(text, level=HEADING_MAP[name])
+            elif name == "p":
+                text = el.get_text(strip=True)
+                if text:
+                    doc.add_paragraph(text)
+            elif name == "table":
+                _add_table(el)
+            elif name in LIST_STYLE:
+                for li in el.find_all("li"):
+                    text = li.get_text(strip=True)
+                    if text:
+                        doc.add_paragraph(text, style=LIST_STYLE[name])
+            else:
+                _walk(el)
+
+    _walk(soup.find("body") or soup)
+
+    # Fallback for markup without any convertible tag: dump the plain text.
+    # Tables count as content (doc.paragraphs does not include table cells),
+    # so a table-only document is not duplicated as text.
+    if not doc.tables and not any(p.text.strip() for p in doc.paragraphs):
+        doc.add_paragraph(soup.get_text(separator="\n", strip=True))
+
+    doc.save(output_path)
+
+
+# ── HTML → XLSX ────────────────────────────────────────────────────
+def _html_to_xlsx(html_content: str, output_path: str, pdf_name: str):
+    """Render an HTML document as an Excel workbook.
+
+    When the HTML contains tables, the first sheet ("텍스트") lists the text
+    of every heading and paragraph, one per row, and each table gets its own
+    sheet ("표1", "표2", ...) with bold, shaded header cells for ``th``.
+    Without tables, the first sheet holds the document text, one non-empty
+    line per row.
+
+    Control characters that Excel cannot store are stripped from every value,
+    so text extracted from a PDF cannot make the save fail.
+
+    Args:
+        html_content: HTML markup to convert.
+        output_path: Destination ``.xlsx`` path.
+        pdf_name: Unused; kept so the signature matches the other converters.
+    """
+    import openpyxl
+    from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
+    from openpyxl.styles import Font, PatternFill, Alignment
+    from bs4 import BeautifulSoup
+
+    def _clean(text):
+        # openpyxl rejects these control characters with IllegalCharacterError.
+        return ILLEGAL_CHARACTERS_RE.sub("", text)
+
+    soup = BeautifulSoup(html_content, "lxml")
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "텍스트"
+
+    tables = soup.find_all("table")
+    if tables:
+        # First sheet: all heading and paragraph text.
+        row_idx = 1
+        for el in (soup.find("body") or soup).find_all(
+            ["h1", "h2", "h3", "h4", "h5", "h6", "p"], recursive=True
+        ):
+            text = el.get_text(strip=True)
+            if text:
+                ws.cell(row=row_idx, column=1, value=_clean(text))
+                row_idx += 1
+
+        # One additional sheet per table.
+        for ti, tbl in enumerate(tables, 1):
+            ws2 = wb.create_sheet(title=f"표{ti}")
+            for ri, row in enumerate(tbl.find_all("tr"), 1):
+                for ci, cell in enumerate(row.find_all(["td", "th"]), 1):
+                    c = ws2.cell(row=ri, column=ci, value=_clean(cell.get_text(strip=True)))
+                    if cell.name == "th":
+                        c.font = Font(bold=True)
+                        c.fill = PatternFill("solid", fgColor="D9E1F2")
+                    c.alignment = Alignment(wrap_text=True)
+            # A bare `ws2.column_dimensions[...]` lookup used to follow here; it
+            # changed nothing and its max() could fail on a table without rows.
+    else:
+        # No tables: split the whole text into rows.
+        lines = [l.strip() for l in soup.get_text(separator="\n").split("\n") if l.strip()]
+        for i, line in enumerate(lines, 1):
+            ws.cell(row=i, column=1, value=_clean(line))
+
+    wb.save(output_path)
+
+
+# ── HTML → PPTX ────────────────────────────────────────────────────
+def _html_to_pptx(html_content: str, output_path: str, pdf_name: str):
+    """Lay the text of an HTML document out on 13.33 x 7.5 inch slides.
+
+    The first slide carries the title: the text of ``<title>``, else of the
+    first ``<h1>``, else ``pdf_name``.  Headings, paragraphs and tables then
+    flow top to bottom on blank slides; a new slide is started once the
+    cursor gets close to the bottom edge.  Tables are rendered as one text
+    line per row (at most 20 rows), cells joined with " │ ".
+
+    If the markup contains no heading, paragraph or table with text, the
+    first 2000 characters of the plain text are placed on the second slide.
+
+    Args:
+        html_content: HTML markup to convert.
+        output_path: Destination ``.pptx`` path.
+        pdf_name: Title used when the HTML provides none.
+    """
+    from pptx import Presentation
+    from pptx.util import Inches, Pt
+    from pptx.dml.color import RGBColor
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(html_content, "lxml")
+    prs = Presentation()
+    prs.slide_width = Inches(13.33)
+    prs.slide_height = Inches(7.5)
+
+    blank_layout = prs.slide_layouts[6]  # blank
+
+    def _new_slide():
+        return prs.slides.add_slide(blank_layout)
+
+    def _add_textbox(slide, text, left, top, width, height, font_size=18, bold=False, color=None):
+        # Single-run, word-wrapped text box.
+        txBox = slide.shapes.add_textbox(left, top, width, height)
+        tf = txBox.text_frame
+        tf.word_wrap = True
+        p = tf.paragraphs[0]
+        run = p.add_run()
+        run.text = text
+        run.font.size = Pt(font_size)
+        run.font.bold = bold
+        if color:
+            run.font.color.rgb = RGBColor(*color)
+
+    HEADING_SIZES = {1: 36, 2: 28, 3: 22, 4: 18, 5: 16, 6: 14}
+    HEADING_MAP = {f"h{i}": i for i in range(1, 7)}
+
+    body = soup.find("body") or soup
+    margin_x = Inches(0.5)
+    slide_w = prs.slide_width - Inches(1.0)
+    slide_h = prs.slide_height
+
+    # Title slide.  A tag object is truthy even when it is empty, so the text
+    # itself is tested before falling back to the next candidate.
+    title_str = ""
+    for tag_name in ("title", "h1"):
+        tag = soup.find(tag_name)
+        if tag is not None:
+            title_str = tag.get_text(strip=True)
+            if title_str:
+                break
+    _add_textbox(_new_slide(), title_str or pdf_name, margin_x, Inches(3.0), slide_w, Inches(1.5),
+                 font_size=40, bold=True, color=(0x1F, 0x39, 0x7D))
+
+    slide = _new_slide()
+    y = Inches(0.3)
+    content_added = False  # becomes True once any body text box has been placed
+
+    for el in body.find_all(
+        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "table"], recursive=True
+    ):
+        name = el.name
+        if y > slide_h - Inches(0.8):
+            slide = _new_slide()
+            y = Inches(0.3)
+
+        if name in HEADING_MAP:
+            level = HEADING_MAP[name]
+            text = el.get_text(strip=True)
+            if not text:
+                continue
+            fs = HEADING_SIZES.get(level, 18)
+            h = Inches(0.6) if level <= 2 else Inches(0.45)
+            _add_textbox(slide, text, margin_x, y, slide_w, h,
+                         font_size=fs, bold=True,
+                         color=(0x1F, 0x39, 0x7D) if level == 1 else (0x2E, 0x74, 0xB5))
+            content_added = True
+            y += h + Inches(0.1)
+
+        elif name == "p":
+            text = el.get_text(strip=True)
+            if not text:
+                continue
+            # Rough height estimate: about 80 characters per line, capped at 8 lines.
+            lines = (len(text) // 80) + 1
+            h = Inches(0.28 * min(lines, 8))
+            _add_textbox(slide, text, margin_x, y, slide_w, h, font_size=14)
+            content_added = True
+            y += h + Inches(0.08)
+
+        elif name == "table":
+            rows = el.find_all("tr")
+            if not rows:
+                continue
+            # Table caption box.
+            _add_textbox(slide, "[ 표 ]", margin_x, y, slide_w, Inches(0.3),
+                         font_size=12, bold=True, color=(0x80, 0x80, 0x80))
+            content_added = True
+            y += Inches(0.32)
+            row_h = Inches(0.28)
+            for row in rows[:20]:
+                cells = row.find_all(["td", "th"])
+                line = " │ ".join(c.get_text(strip=True) for c in cells)
+                if not line.strip():
+                    continue
+                if y > slide_h - Inches(0.5):
+                    slide = _new_slide()
+                    y = Inches(0.3)
+                is_header = any(c.name == "th" for c in cells)
+                _add_textbox(slide, line, margin_x, y, slide_w, row_h,
+                             font_size=11, bold=is_header)
+                y += row_h
+            y += Inches(0.1)
+
+    # Plain-text fallback.  `slide` is still the empty second slide here,
+    # because a further slide is only started after content has been placed.
+    if not content_added:
+        fallback_text = soup.get_text(separator="\n", strip=True)[:2000]
+        if fallback_text:
+            _add_textbox(slide, fallback_text,
+                         margin_x, Inches(0.3), slide_w, Inches(6.5), font_size=14)
+
+    prs.save(output_path)
+
+
+# ══════════════════════════════════════════════════════════════════
+# Celery Task
+# ══════════════════════════════════════════════════════════════════
+@celery_app.task(bind=True, name="tasks.pdf_convert_task", queue="stt")
+def pdf_convert_task(self, file_id: str, pdf_path: str, target_fmt: str):
+ """
+ PDF를 target_fmt(html/docx/xlsx/pptx)로 변환.
+ """
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+ tmp_dir = tempfile.mkdtemp(prefix=f"odpdf_{file_id}_")
+ pdf_name = Path(pdf_path).stem
+
+ try:
+ self.update_state(state="PROGRESS", meta={"progress": 10, "message": "PDF 파싱 중 (Java)..."})
+
+ # opendataloader-pdf 실행 — HTML + JSON 동시 추출
+ odpdf_formats = ["html"]
+ if target_fmt in ("docx", "xlsx", "pptx"):
+ odpdf_formats.append("json")
+
+ converted = _run_odpdf(pdf_path, tmp_dir, odpdf_formats)
+
+ if "html" not in converted:
+ # fallback: text 포맷으로 재시도
+ converted = _run_odpdf(pdf_path, tmp_dir, ["text"])
+ if not converted:
+ raise Exception("PDF에서 내용을 추출할 수 없습니다.")
+
+ self.update_state(state="PROGRESS", meta={"progress": 50, "message": f"{target_fmt.upper()} 변환 중..."})
+
+ output_filename = f"{file_id}.{target_fmt}"
+ output_path = os.path.join(OUTPUT_DIR, output_filename)
+
+ if target_fmt == "html":
+ # HTML은 opendataloader-pdf 직접 출력 사용
+ html_src = converted.get("html")
+ if html_src:
+ import shutil
+ shutil.copy2(html_src, output_path)
+ else:
+ # text fallback
+ text_src = converted.get("text", "")
+ with open(output_path, "w", encoding="utf-8") as f:
+ f.write(f"
{text_src}
")
+
+ else:
+ html_src = converted.get("html")
+ if html_src:
+ html_content = _read_html(html_src)
+ else:
+ # HTML 없으면 텍스트로 최소 HTML 생성
+ text_src = converted.get("text")
+ if text_src:
+ with open(text_src, "r", encoding="utf-8", errors="replace") as f:
+ raw = f.read()
+ html_content = f"