39 lines
1.2 KiB
Python
39 lines
1.2 KiB
Python
import pytest
|
|
from app.ingest.extractors import extract, ExtractedPage, UnsupportedFileType
|
|
|
|
|
|
def test_extract_pdf_returns_pages(sample_pdf_bytes):
    """Extracting the sample PDF yields two pages, numbered 1 and 2, each with its text."""
    result = extract(sample_pdf_bytes, "pdf", filename="x.pdf")

    assert len(result) == 2

    # (expected page number, snippet expected in that page's text), in page order.
    expectations = ((1, "Seite eins"), (2, "Seite zwei"))
    for index, (number, snippet) in enumerate(expectations):
        assert result[index].page == number
        assert snippet in result[index].text
|
|
|
|
|
|
def test_extract_docx_returns_single_page(sample_docx_bytes):
    """Extracting the sample DOCX yields one page (number 1) containing both paragraphs."""
    result = extract(sample_docx_bytes, "docx", filename="x.docx")

    assert len(result) == 1

    only_page = result[0]
    assert only_page.page == 1
    for snippet in ("Erster Absatz.", "Zweiter Absatz"):
        assert snippet in only_page.text
|
|
|
|
|
|
def test_extract_md_returns_single_page(sample_md_bytes):
    """Extracting the sample Markdown yields one page (number 1) with the paragraph text."""
    result = extract(sample_md_bytes, "md", filename="x.md")

    assert len(result) == 1

    only_page = result[0]
    assert only_page.page == 1
    assert "First paragraph." in only_page.text
|
|
|
|
|
|
def test_extract_xlsx_returns_filename_pseudo_text(sample_xlsx_bytes):
    """Extracting an XLSX yields one page whose text is a label built from the filename."""
    sheet_filename = "my-sheet.xlsx"
    result = extract(sample_xlsx_bytes, "xlsx", filename=sheet_filename)

    assert len(result) == 1

    only_page = result[0]
    assert only_page.page == 1
    # The text must match the label exactly, not merely contain it.
    assert only_page.text == "Tabelle: my-sheet.xlsx"
|
|
|
|
|
|
def test_extract_unsupported_raises():
    """Calling extract with the file type "txt" raises UnsupportedFileType."""
    payload = b"data"
    with pytest.raises(UnsupportedFileType):
        extract(payload, "txt", filename="x.txt")
|