# rag-ingestor/tests/conftest.py

import io
import pytest
import fitz  # pymupdf
from docx import Document

from app.config import Settings


@pytest.fixture(autouse=True)
def _ignore_dotenv():
    """Disable ``.env`` loading on ``Settings`` for the duration of each test.

    A developer's local ``.env`` in the repo root must not leak into test
    runs. Settings is configured with env_file='.env'; this autouse fixture
    blanks that entry so a test only sees the environment it sets up itself
    (e.g. via monkeypatch), then puts the previous value back afterwards.
    """
    saved_env_file = Settings.model_config.get("env_file")
    Settings.model_config.update(env_file=None)

    yield

    # Teardown: reinstate whatever was configured before the test ran.
    Settings.model_config.update(env_file=saved_env_file)


@pytest.fixture
def sample_pdf_bytes() -> bytes:
    """Return the bytes of an in-memory two-page PDF built with PyMuPDF.

    Each page carries a single line of text inserted at point (72, 72):
    page one "Seite eins enthaelt Lorem Ipsum." and page two
    "Seite zwei enthaelt mehr Text.".
    """
    page_texts = (
        "Seite eins enthaelt Lorem Ipsum.",
        "Seite zwei enthaelt mehr Text.",
    )
    doc = fitz.open()
    try:
        for text in page_texts:
            page = doc.new_page()
            page.insert_text((72, 72), text)
        return doc.tobytes()
    finally:
        # Always release the document, even if page creation or
        # serialisation raises; otherwise the handle would leak.
        doc.close()


@pytest.fixture
def sample_docx_bytes() -> bytes:
    """Return the bytes of an in-memory DOCX file with two paragraphs.

    The document is saved into a ``BytesIO`` buffer rather than to disk, and
    the buffer's full contents are returned.
    """
    document = Document()
    for paragraph_text in ("Erster Absatz.", "Zweiter Absatz mit mehr Inhalt."):
        document.add_paragraph(paragraph_text)

    stream = io.BytesIO()
    document.save(stream)
    return stream.getvalue()


@pytest.fixture
def sample_md_bytes() -> bytes:
    """Return a small UTF-8 encoded Markdown document.

    The document has one level-1 heading followed by two paragraphs, each
    separated by a blank line.
    """
    markdown_text = "# Title\n\nFirst paragraph.\n\nSecond paragraph.\n"
    return markdown_text.encode("utf-8")


@pytest.fixture
def sample_xlsx_bytes() -> bytes:
    """Return a minimal placeholder payload standing in for an XLSX file.

    The payload is only the ZIP local-file-header signature followed by
    filler bytes, not a real workbook. Per the original note, the extractor
    doesn't read content for xlsx, so this is presumably sufficient for the
    tests that use it -- verify against the extractor if that changes.
    """
    placeholder = b"PK\x03\x04dummy"
    return placeholder