# rag-ingestor/tests/conftest.py

import io
import pytest
import fitz  # pymupdf
from docx import Document


@pytest.fixture
def sample_pdf_bytes() -> bytes:
    """Build a two-page PDF in memory and return its serialized bytes.

    Each page carries a single line of German sample text inserted at
    position (72, 72).

    Returns:
        The document serialized via ``doc.tobytes()``.
    """
    page_texts = (
        "Seite eins enthaelt Lorem Ipsum.",
        "Seite zwei enthaelt mehr Text.",
    )
    doc = fitz.open()
    try:
        for text in page_texts:
            doc.new_page().insert_text((72, 72), text)
        return doc.tobytes()
    finally:
        # Release the document even when page creation or serialization
        # raises, so a failing fixture does not leak the open handle.
        doc.close()


@pytest.fixture
def sample_docx_bytes() -> bytes:
    """Create a DOCX document with two German paragraphs, held in memory.

    Returns:
        The bytes written by ``Document.save`` into an in-memory buffer.
    """
    paragraphs = (
        "Erster Absatz.",
        "Zweiter Absatz mit mehr Inhalt.",
    )
    document = Document()
    for text in paragraphs:
        document.add_paragraph(text)

    # Save into a BytesIO stream so nothing touches the filesystem.
    stream = io.BytesIO()
    document.save(stream)
    return stream.getvalue()


@pytest.fixture
def sample_md_bytes() -> bytes:
    """Return a small Markdown document (a heading and two paragraphs).

    Returns:
        The Markdown text encoded as UTF-8 bytes.
    """
    markdown_text = "# Title\n\nFirst paragraph.\n\nSecond paragraph.\n"
    return bytes(markdown_text, "utf-8")


@pytest.fixture
def sample_xlsx_bytes() -> bytes:
    """Return a tiny stand-in payload for an XLSX upload.

    The bytes start with ``PK\\x03\\x04`` followed by filler; this is not a
    valid workbook. Per the original author's note, the extractor does not
    read xlsx content, so a placeholder is enough (not verifiable from this
    file; confirm against the extractor).

    Returns:
        The fixed placeholder bytes.
    """
    placeholder = b"PK\x03\x04dummy"
    return placeholder