feat: extractors fuer pdf/md/docx/xlsx mit dynamic fixtures
This commit is contained in:
53
app/ingest/extractors.py
Normal file
53
app/ingest/extractors.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
import io
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import fitz # pymupdf
|
||||||
|
from docx import Document
|
||||||
|
|
||||||
|
|
||||||
|
# File types (written without a leading dot) that extract() accepts.
SUPPORTED_TYPES = {"pdf", "md", "docx", "xlsx"}
|
||||||
|
|
||||||
|
|
||||||
|
class UnsupportedFileType(Exception):
    """Raised when a file type has no matching extractor."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ExtractedPage:
    """Immutable pairing of a page number with the text extracted from it."""

    # Page number as assigned by the extractor that produced this entry.
    page: int
    # Text content extracted for that page.
    text: str
|
||||||
|
|
||||||
|
|
||||||
|
def extract(data: bytes, file_type: str, filename: str) -> list[ExtractedPage]:
    """Dispatch *data* to the extractor that matches *file_type*.

    The type is lower-cased and stripped of leading dots before it is
    compared, so ``".PDF"`` and ``"pdf"`` select the same extractor.

    Args:
        data: Raw file content.
        file_type: Type name or extension, with or without a leading dot.
        filename: File name; only used to build the xlsx placeholder text.

    Returns:
        The pages produced by the selected extractor.

    Raises:
        UnsupportedFileType: If the normalized type is not in
            ``SUPPORTED_TYPES``.
    """
    kind = file_type.lower().lstrip(".")
    if kind not in SUPPORTED_TYPES:
        raise UnsupportedFileType(kind)

    if kind == "xlsx":
        # Spreadsheet bytes are not parsed; a single pseudo page carrying
        # the file name is returned instead.
        return [ExtractedPage(page=1, text=f"Tabelle: {filename}")]
    if kind == "docx":
        return _extract_docx(data)
    if kind == "md":
        return _extract_md(data)
    if kind == "pdf":
        return _extract_pdf(data)

    # Only reachable if SUPPORTED_TYPES lists a type without a branch above.
    raise UnsupportedFileType(kind)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_pdf(data: bytes) -> list[ExtractedPage]:
    """Return the text of each page of the PDF held in *data*.

    Pages are numbered from 1 in document order. A page whose text comes
    back falsy is stored with an empty string.
    """
    with fitz.open(stream=data, filetype="pdf") as pdf:
        return [
            ExtractedPage(page=number, text=pdf_page.get_text() or "")
            for number, pdf_page in enumerate(pdf, start=1)
        ]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_md(data: bytes) -> list[ExtractedPage]:
    """Decode Markdown bytes into a single page numbered 1.

    The bytes are decoded as UTF-8. A leading byte-order mark, as written
    by some editors, is dropped so it does not end up as an invisible
    U+FEFF character at the start of the text. Undecodable bytes are
    replaced with U+FFFD rather than raising.
    """
    # "utf-8-sig" behaves like "utf-8" but skips a BOM at the very start.
    text = data.decode("utf-8-sig", errors="replace")
    return [ExtractedPage(page=1, text=text)]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_docx(data: bytes) -> list[ExtractedPage]:
    """Return the paragraph text of a DOCX file as a single page numbered 1.

    Paragraphs with empty text are skipped; the remaining ones are joined
    with a blank line between them. Only ``doc.paragraphs`` is read.
    """
    document = Document(io.BytesIO(data))
    non_empty = [
        paragraph.text for paragraph in document.paragraphs if paragraph.text
    ]
    return [ExtractedPage(page=1, text="\n\n".join(non_empty))]
|
||||||
37
tests/conftest.py
Normal file
37
tests/conftest.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
import io
|
||||||
|
import pytest
|
||||||
|
import fitz # pymupdf
|
||||||
|
from docx import Document
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_pdf_bytes() -> bytes:
    """Build a two-page PDF in memory and return its serialized bytes.

    Each page carries one line of text inserted at point (72, 72). The
    document is opened as a context manager so it is closed even if page
    creation or serialization raises.
    """
    page_texts = (
        "Seite eins enthaelt Lorem Ipsum.",
        "Seite zwei enthaelt mehr Text.",
    )
    with fitz.open() as doc:
        for text in page_texts:
            doc.new_page().insert_text((72, 72), text)
        return doc.tobytes()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_docx_bytes() -> bytes:
    """Build a DOCX with two paragraphs in memory and return its bytes."""
    document = Document()
    for paragraph in ("Erster Absatz.", "Zweiter Absatz mit mehr Inhalt."):
        document.add_paragraph(paragraph)

    stream = io.BytesIO()
    document.save(stream)
    return stream.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_md_bytes() -> bytes:
    """Return a small UTF-8 encoded Markdown document (title, two paragraphs)."""
    markdown = "# Title\n\nFirst paragraph.\n\nSecond paragraph.\n"
    return markdown.encode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_xlsx_bytes() -> bytes:
    """Return placeholder bytes standing in for an xlsx file.

    This is not a valid workbook; the extractor does not read the content
    of xlsx input, so a minimal dummy payload is enough.
    """
    placeholder = b"PK\x03\x04dummy"
    return placeholder
|
||||||
38
tests/test_extractors.py
Normal file
38
tests/test_extractors.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import pytest
|
||||||
|
from app.ingest.extractors import extract, ExtractedPage, UnsupportedFileType
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_pdf_returns_pages(sample_pdf_bytes):
    """A two-page PDF yields two pages, numbered from 1, each with its own text."""
    result = extract(sample_pdf_bytes, "pdf", filename="x.pdf")

    assert len(result) == 2
    first, second = result
    assert first.page == 1
    assert "Seite eins" in first.text
    assert second.page == 2
    assert "Seite zwei" in second.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_docx_returns_single_page(sample_docx_bytes):
    """A DOCX file is collapsed into one page that contains both paragraphs."""
    result = extract(sample_docx_bytes, "docx", filename="x.docx")

    assert len(result) == 1
    only_page = result[0]
    assert only_page.page == 1
    assert "Erster Absatz." in only_page.text
    assert "Zweiter Absatz" in only_page.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_md_returns_single_page(sample_md_bytes):
    """A Markdown file is returned as one page holding the decoded text."""
    result = extract(sample_md_bytes, "md", filename="x.md")

    assert len(result) == 1
    only_page = result[0]
    assert only_page.page == 1
    assert "First paragraph." in only_page.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_xlsx_returns_filename_pseudo_text(sample_xlsx_bytes):
    """An xlsx file yields one pseudo page whose text is built from the file name."""
    result = extract(sample_xlsx_bytes, "xlsx", filename="my-sheet.xlsx")

    assert len(result) == 1
    only_page = result[0]
    assert only_page.page == 1
    assert only_page.text == "Tabelle: my-sheet.xlsx"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_unsupported_raises():
    """A file type outside the supported set raises UnsupportedFileType."""
    unsupported_type = "txt"
    with pytest.raises(UnsupportedFileType):
        extract(b"data", unsupported_type, filename="x.txt")
|
||||||
Reference in New Issue
Block a user