Files
rag-ingestor/tests/test_extractors.py
Jean-Luc Makiola 4b9280a972
All checks were successful
CI / ci (push) Successful in 37s
Release / release (push) Successful in 53s
test: ungenutzte imports entfernen (ruff F401)
2026-05-07 16:06:55 +02:00

39 lines
1.2 KiB
Python

import pytest
from app.ingest.extractors import extract, UnsupportedFileType
def test_extract_pdf_returns_pages(sample_pdf_bytes):
    """Check that the sample PDF is extracted as two numbered pages.

    Each extracted page must carry its 1-based page number and contain the
    text snippet expected for that page.
    """
    result = extract(sample_pdf_bytes, "pdf", filename="x.pdf")

    # (page number, snippet expected somewhere in that page's text)
    expected = ((1, "Seite eins"), (2, "Seite zwei"))

    assert len(result) == 2
    for index, (number, snippet) in enumerate(expected):
        current = result[index]
        assert current.page == number
        assert snippet in current.text
def test_extract_docx_returns_single_page(sample_docx_bytes):
    """Check that the sample DOCX is extracted as one page numbered 1.

    Both expected paragraph snippets must appear in that single page's text.
    """
    result = extract(sample_docx_bytes, "docx", filename="x.docx")

    assert len(result) == 1
    only_page = result[0]
    assert only_page.page == 1
    for snippet in ("Erster Absatz.", "Zweiter Absatz"):
        assert snippet in only_page.text
def test_extract_md_returns_single_page(sample_md_bytes):
    """Check that the sample Markdown is extracted as one page numbered 1.

    The page text must contain the expected first-paragraph snippet.
    """
    result = extract(sample_md_bytes, "md", filename="x.md")

    assert len(result) == 1
    only_page = result[0]
    assert only_page.page == 1
    assert "First paragraph." in only_page.text
def test_extract_xlsx_returns_filename_pseudo_text(sample_xlsx_bytes):
    """Check that an XLSX upload yields one page of filename-based pseudo-text.

    The single page must be numbered 1 and its text must equal exactly
    "Tabelle: " followed by the filename passed to ``extract``.
    """
    result = extract(sample_xlsx_bytes, "xlsx", filename="my-sheet.xlsx")

    assert len(result) == 1
    only_page = result[0]
    assert only_page.page == 1
    assert only_page.text == "Tabelle: my-sheet.xlsx"
def test_extract_unsupported_raises():
    """Check that extracting with file type "txt" raises UnsupportedFileType."""
    payload = b"data"
    file_type = "txt"

    with pytest.raises(UnsupportedFileType):
        extract(payload, file_type, filename="x.txt")