Files
rag-ingestor/tests/conftest.py

38 lines
895 B
Python

import io
import pytest
import fitz # pymupdf
from docx import Document
@pytest.fixture
def sample_pdf_bytes() -> bytes:
    """Return the raw bytes of a two-page PDF built in memory with PyMuPDF.

    Each page carries a single line of German sample text inserted at
    point (72, 72).
    """
    doc = fitz.open()  # called without arguments, as in the original
    try:
        p1 = doc.new_page()
        p1.insert_text((72, 72), "Seite eins enthaelt Lorem Ipsum.")
        p2 = doc.new_page()
        p2.insert_text((72, 72), "Seite zwei enthaelt mehr Text.")
        return doc.tobytes()
    finally:
        # Always release the document, even if building or serializing it
        # raises; the `finally` clause runs after the return value is computed.
        doc.close()
@pytest.fixture
def sample_docx_bytes() -> bytes:
    """Return the raw bytes of a DOCX document holding two short paragraphs.

    The document is created with python-docx and saved into an in-memory
    buffer, so nothing is written to disk.
    """
    document = Document()
    for text in ("Erster Absatz.", "Zweiter Absatz mit mehr Inhalt."):
        document.add_paragraph(text)

    stream = io.BytesIO()
    document.save(stream)
    return stream.getvalue()
@pytest.fixture
def sample_md_bytes() -> bytes:
    """Return a small Markdown document (one heading, two paragraphs) as UTF-8 bytes."""
    markdown = "# Title\n\nFirst paragraph.\n\nSecond paragraph.\n"
    return markdown.encode("utf-8")
@pytest.fixture
def sample_xlsx_bytes() -> bytes:
    """Return a tiny placeholder payload standing in for an XLSX file.

    The payload is just the bytes ``PK\\x03\\x04`` followed by ``dummy``; it is
    not a real workbook.
    """
    # Minimal placeholder: per the original note, the extractor does not read
    # xlsx content -- NOTE(review): confirm against the extractor implementation.
    placeholder = b"PK\x03\x04dummy"
    return placeholder