54 lines
1.4 KiB
Python
54 lines
1.4 KiB
Python
import io
|
|
from dataclasses import dataclass
|
|
|
|
import fitz # pymupdf
|
|
from docx import Document
|
|
|
|
|
|
# File-type identifiers (lowercase, without a leading dot) accepted by extract().
SUPPORTED_TYPES = {
    "docx",
    "md",
    "pdf",
    "xlsx",
}
|
|
|
|
|
|
class UnsupportedFileType(Exception):
    """Signal that a requested file type is not in SUPPORTED_TYPES.

    Raised by extract() with the normalized file type as its only argument.
    """
|
|
|
|
|
|
@dataclass(frozen=True)
class ExtractedPage:
    """Immutable record pairing a page number with the text extracted for it."""

    # Page number; the extractors in this module number pages starting at 1.
    page: int
    # Text extracted for that page; may be an empty string.
    text: str
|
|
|
|
|
|
def extract(data: bytes, file_type: str, filename: str) -> list[ExtractedPage]:
    """Extract text pages from an in-memory document.

    Args:
        data: Raw bytes of the document.
        file_type: Type identifier such as "pdf" or ".PDF"; it is lowercased
            and leading dots are removed before it is checked.
        filename: Name of the file; only used to build the placeholder text
            returned for "xlsx" input.

    Returns:
        The extracted pages. For "xlsx" no content is read from ``data``;
        a single placeholder page naming the file is returned instead.

    Raises:
        UnsupportedFileType: If the normalized type is not in SUPPORTED_TYPES.
    """
    kind = file_type.lower().lstrip(".")

    # Guard clause: reject anything outside the supported set up front.
    if kind not in SUPPORTED_TYPES:
        raise UnsupportedFileType(kind)

    # Spreadsheets are the only type that needs the filename instead of the bytes.
    if kind == "xlsx":
        return [ExtractedPage(page=1, text=f"Tabelle: {filename}")]

    byte_extractors = {
        "pdf": _extract_pdf,
        "md": _extract_md,
        "docx": _extract_docx,
    }
    extractor = byte_extractors.get(kind)
    if extractor is None:
        # Only reachable if SUPPORTED_TYPES gains an entry without an extractor.
        raise UnsupportedFileType(kind)
    return extractor(data)
|
|
|
|
|
|
def _extract_pdf(data: bytes) -> list[ExtractedPage]:
    """Return one ExtractedPage per PDF page, numbered from 1 in document order.

    The PDF is opened from the in-memory ``data`` bytes. A page whose
    ``get_text()`` result is falsy is recorded with an empty string.
    """
    with fitz.open(stream=data, filetype="pdf") as pdf:
        return [
            ExtractedPage(page=number, text=pdf_page.get_text() or "")
            for number, pdf_page in enumerate(pdf, start=1)
        ]
|
|
|
|
|
|
def _extract_md(data: bytes) -> list[ExtractedPage]:
    """Decode Markdown bytes into a single page of text.

    Args:
        data: Raw bytes of the Markdown file, expected to be UTF-8.

    Returns:
        A one-element list holding page 1 with the decoded text. Bytes that
        are not valid UTF-8 are replaced with U+FFFD instead of raising.
    """
    # "utf-8-sig" decodes like "utf-8" but also drops a leading byte-order
    # mark, so files saved with a BOM do not start with a stray U+FEFF.
    text = data.decode("utf-8-sig", errors="replace")
    return [ExtractedPage(page=1, text=text)]
|
|
|
|
|
|
def _extract_docx(data: bytes) -> list[ExtractedPage]:
    """Return the paragraph text of a DOCX document as a single page.

    The document is loaded from the in-memory ``data`` bytes. Paragraphs
    with empty text are skipped and the rest are joined with blank lines.
    """
    document = Document(io.BytesIO(data))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    # filter(None, ...) drops the empty strings, keeping document order.
    body = "\n\n".join(filter(None, paragraph_texts))
    return [ExtractedPage(page=1, text=body)]
|