38 lines
895 B
Python
38 lines
895 B
Python
import io
|
|
import pytest
|
|
import fitz # pymupdf
|
|
from docx import Document
|
|
|
|
|
|
@pytest.fixture
def sample_pdf_bytes() -> bytes:
    """Return the serialized bytes of a two-page PDF built with PyMuPDF.

    Each page gets a single line of text inserted at point (72, 72).
    """
    page_texts = (
        "Seite eins enthaelt Lorem Ipsum.",
        "Seite zwei enthaelt mehr Text.",
    )
    doc = fitz.open()
    try:
        for text in page_texts:
            doc.new_page().insert_text((72, 72), text)
        return doc.tobytes()
    finally:
        # Close the document even if building or serializing a page fails,
        # so a broken fixture does not leak the underlying handle.
        doc.close()
|
|
|
|
|
|
@pytest.fixture
def sample_docx_bytes() -> bytes:
    """Return the bytes of a DOCX document holding two paragraphs.

    The document is saved into an in-memory buffer; nothing is written
    to disk.
    """
    document = Document()
    for paragraph_text in (
        "Erster Absatz.",
        "Zweiter Absatz mit mehr Inhalt.",
    ):
        document.add_paragraph(paragraph_text)

    stream = io.BytesIO()
    document.save(stream)
    return stream.getvalue()
|
|
|
|
|
|
@pytest.fixture
def sample_md_bytes() -> bytes:
    """Return a small Markdown document (heading plus two paragraphs) as UTF-8 bytes."""
    markdown_text = "# Title\n\nFirst paragraph.\n\nSecond paragraph.\n"
    return markdown_text.encode("utf-8")
|
|
|
|
|
|
@pytest.fixture
def sample_xlsx_bytes() -> bytes:
    """Return a tiny placeholder payload standing in for an XLSX file.

    The payload is only a ZIP-style ``PK\\x03\\x04`` prefix followed by
    filler bytes; it is not a complete workbook.
    """
    # Minimal placeholder; extractor doesn't read content for xlsx
    placeholder = b"PK\x03\x04dummy"
    return placeholder
|