import pytest

from app.ingest.extractors import extract, ExtractedPage, UnsupportedFileType


def test_extract_pdf_returns_pages(sample_pdf_bytes):
    """Check that the PDF sample yields two pages numbered 1 and 2.

    The ``sample_pdf_bytes`` fixture is defined outside this file; it is
    presumably a two-page PDF carrying the German marker texts asserted
    below (confirm against the fixture definition).
    """
    result = extract(sample_pdf_bytes, "pdf", filename="x.pdf")

    assert len(result) == 2

    first_page = result[0]
    assert first_page.page == 1
    assert "Seite eins" in first_page.text

    second_page = result[1]
    assert second_page.page == 2
    assert "Seite zwei" in second_page.text


def test_extract_docx_returns_single_page(sample_docx_bytes):
    """Check that the DOCX sample yields one page containing both paragraphs."""
    result = extract(sample_docx_bytes, "docx", filename="x.docx")

    assert len(result) == 1

    only_page = result[0]
    assert only_page.page == 1
    # Both paragraph markers must be found in the text of the same page.
    assert "Erster Absatz." in only_page.text
    assert "Zweiter Absatz" in only_page.text


def test_extract_md_returns_single_page(sample_md_bytes):
    """Check that the Markdown sample yields one page with its paragraph text."""
    result = extract(sample_md_bytes, "md", filename="x.md")

    assert len(result) == 1

    only_page = result[0]
    assert only_page.page == 1
    assert "First paragraph." in only_page.text


def test_extract_xlsx_returns_filename_pseudo_text(sample_xlsx_bytes):
    """Check that the XLSX sample yields one page whose text names the file.

    The expected text is exactly ``"Tabelle: "`` followed by the filename
    passed to ``extract``.
    """
    result = extract(sample_xlsx_bytes, "xlsx", filename="my-sheet.xlsx")

    assert len(result) == 1

    only_page = result[0]
    assert only_page.page == 1
    assert only_page.text == "Tabelle: my-sheet.xlsx"


def test_extract_unsupported_raises():
    """Check that the ``"txt"`` file type raises ``UnsupportedFileType``."""
    payload = b"data"
    with pytest.raises(UnsupportedFileType):
        extract(payload, "txt", filename="x.txt")