"""Pytest fixtures that supply small in-memory sample documents as bytes."""

import io

import fitz  # pymupdf
import pytest
from docx import Document


@pytest.fixture
def sample_pdf_bytes() -> bytes:
    """Return the bytes of a two-page PDF with one line of text per page."""
    doc = fitz.open()
    try:
        p1 = doc.new_page()
        p1.insert_text((72, 72), "Seite eins enthaelt Lorem Ipsum.")
        p2 = doc.new_page()
        p2.insert_text((72, 72), "Seite zwei enthaelt mehr Text.")
        return doc.tobytes()
    finally:
        # Close the document even when building or serializing it fails,
        # so a broken fixture does not leak the underlying handle.
        doc.close()


@pytest.fixture
def sample_docx_bytes() -> bytes:
    """Return the bytes of a DOCX document containing two paragraphs."""
    doc = Document()
    doc.add_paragraph("Erster Absatz.")
    doc.add_paragraph("Zweiter Absatz mit mehr Inhalt.")
    # Save into an in-memory buffer instead of touching the filesystem.
    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()


@pytest.fixture
def sample_md_bytes() -> bytes:
    """Return UTF-8 encoded Markdown: one heading followed by two paragraphs."""
    return "# Title\n\nFirst paragraph.\n\nSecond paragraph.\n".encode("utf-8")


@pytest.fixture
def sample_xlsx_bytes() -> bytes:
    """Return a placeholder XLSX payload (not a valid workbook)."""
    # Minimal placeholder; extractor doesn't read content for xlsx
    return b"PK\x03\x04dummy"