"""Turn uploaded document bytes into a list of per-page plain-text records."""

import io
from dataclasses import dataclass

import fitz  # pymupdf
from docx import Document

# File types accepted by extract(), written lower-case and without a leading dot.
SUPPORTED_TYPES = {"pdf", "md", "docx", "xlsx"}


class UnsupportedFileType(Exception):
    """Raised by extract() when the requested file type has no extractor."""


@dataclass(frozen=True)
class ExtractedPage:
    """Immutable record pairing a page number with the text found on it."""

    # Page number; every extractor in this module numbers pages from 1.
    page: int
    # Text extracted for that page (may be an empty string).
    text: str


def extract(data: bytes, file_type: str, filename: str) -> list[ExtractedPage]:
    """Extract text from *data* according to *file_type*.

    Args:
        data: Raw bytes of the document.
        file_type: Type or extension such as ``"pdf"`` or ``".PDF"``; it is
            lower-cased and leading dots are removed before matching.
        filename: Name of the document. Only used for the ``xlsx``
            placeholder text.

    Returns:
        One ``ExtractedPage`` per PDF page, or a single page numbered 1 for
        ``md``, ``docx`` and ``xlsx``.

    Raises:
        UnsupportedFileType: If the normalized type is not in
            ``SUPPORTED_TYPES`` or has no extractor branch below.
    """
    file_type = file_type.lower().lstrip(".")
    if file_type not in SUPPORTED_TYPES:
        raise UnsupportedFileType(file_type)

    if file_type == "pdf":
        return _extract_pdf(data)
    if file_type == "md":
        return _extract_md(data)
    if file_type == "docx":
        return _extract_docx(data)
    if file_type == "xlsx":
        # The spreadsheet bytes are not parsed here; a single placeholder
        # page carrying the file name is returned instead.
        return [ExtractedPage(page=1, text=f"Tabelle: {filename}")]

    # Defensive: only reached if SUPPORTED_TYPES lists a type that has no
    # branch above.
    raise UnsupportedFileType(file_type)


def _extract_pdf(data: bytes) -> list[ExtractedPage]:
    """Return one ExtractedPage per PDF page, numbered from 1."""
    # The context manager closes the document once all pages have been read.
    with fitz.open(stream=data, filetype="pdf") as doc:
        return [
            # Fall back to "" so the text field is always a string.
            ExtractedPage(page=number, text=page.get_text() or "")
            for number, page in enumerate(doc, start=1)
        ]


def _extract_md(data: bytes) -> list[ExtractedPage]:
    """Decode Markdown bytes as UTF-8 and return them as a single page."""
    # "utf-8-sig" drops a leading byte-order mark if one is present, so the
    # text does not start with a stray U+FEFF; without a BOM it decodes the
    # same as plain "utf-8". Undecodable bytes are replaced, not raised.
    text = data.decode("utf-8-sig", errors="replace")
    return [ExtractedPage(page=1, text=text)]


def _extract_docx(data: bytes) -> list[ExtractedPage]:
    """Join the non-empty paragraphs of a .docx file into a single page."""
    doc = Document(io.BytesIO(data))
    # NOTE(review): only doc.paragraphs is read; text stored elsewhere in the
    # document (e.g. table cells) is presumably not included - confirm.
    text = "\n\n".join(p.text for p in doc.paragraphs if p.text)
    return [ExtractedPage(page=1, text=text)]