feat: pfad-metadata-parser mit semester/fach/typ
This commit is contained in:
44
app/ingest/metadata.py
Normal file
44
app/ingest/metadata.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
|
||||
# Semester folder names consist of one or more digits followed by the literal
# suffix ".Semester", e.g. "2.Semester".
SEMESTER_RE = re.compile(r"^\d+\.Semester$")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class PathMetadata:
|
||||
semester: str
|
||||
fach: str
|
||||
typ: str | None
|
||||
|
||||
|
||||
def parse_path(file_path: str, ingest_root: str) -> PathMetadata | None:
    """Parse a Nextcloud file path into structured metadata.

    The expected layout is
    ``<root>/<N>.Semester/<Fach>/[<typ>/...]/<file>``. Leading slashes on
    ``file_path`` and leading/trailing slashes on ``ingest_root`` are ignored.

    Args:
        file_path: Path of the file, with or without a leading slash.
        ingest_root: Folder below which the semester folders live. An empty
            root (``""`` or ``"/"``) means every path is inside the root.

    Returns:
        A ``PathMetadata`` holding the semester folder name, the Fach folder
        name and the first sub-folder below the Fach as ``typ`` (``None``
        when the file lies directly in the Fach folder). ``None`` when the
        path is outside the ingest root, has fewer than three segments below
        the root, contains a ``..`` segment, or its first segment is not a
        valid semester folder name.
    """
    norm_path = file_path.lstrip("/")
    norm_root = ingest_root.strip("/")

    if norm_root:
        prefix = norm_root + "/"
        if not norm_path.startswith(prefix):
            return None
        relative = norm_path[len(prefix):]
    else:
        # An empty root contains every path. Checking for a "/" prefix here
        # could never succeed because norm_path has no leading slash.
        relative = norm_path

    parts = PurePosixPath(relative).parts

    # Need at least: semester / fach / file.ext -> 3 parts
    if len(parts) < 3:
        return None

    # PurePosixPath does not collapse "..", so such a segment could otherwise
    # end up as fach or typ.
    if ".." in parts:
        return None

    semester, fach = parts[0], parts[1]

    # fullmatch rather than match: "$" alone would also accept a folder name
    # that ends in a newline.
    if not SEMESTER_RE.fullmatch(semester):
        return None

    # parts[-1] is the filename. Anything between fach and filename is
    # "deeper"; the first deeper segment becomes typ, None if the file lives
    # directly in the fach folder.
    typ = parts[2] if len(parts) > 3 else None

    return PathMetadata(semester=semester, fach=fach, typ=typ)
|
||||
46
tests/test_metadata.py
Normal file
46
tests/test_metadata.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from app.ingest.metadata import parse_path, PathMetadata
|
||||
|
||||
|
||||
# Ingest root passed to parse_path by the tests in this module.
ROOT = "Documents/THB/Studium"
|
||||
|
||||
|
||||
def test_parse_path_with_typ():
    """A file one folder below the fach reports that folder as typ."""
    path = "Documents/THB/Studium/2.Semester/Databases/Vorlesungen/DBS1.pdf"
    expected = PathMetadata(semester="2.Semester", fach="Databases", typ="Vorlesungen")
    assert parse_path(path, ROOT) == expected
|
||||
|
||||
|
||||
def test_parse_path_without_typ():
    """A file directly inside the fach folder has typ set to None."""
    path = "Documents/THB/Studium/2.Semester/Databases/DBS1.pdf"
    expected = PathMetadata(semester="2.Semester", fach="Databases", typ=None)
    assert parse_path(path, ROOT) == expected
|
||||
|
||||
|
||||
def test_parse_path_deep_nested_keeps_first_subdir_as_typ():
    """With several folders below the fach, only the first one becomes typ."""
    path = "Documents/THB/Studium/2.Semester/Databases/Uebungen/01/sheet.pdf"
    expected = PathMetadata(semester="2.Semester", fach="Databases", typ="Uebungen")
    assert parse_path(path, ROOT) == expected
|
||||
|
||||
|
||||
def test_parse_path_outside_root_returns_none():
    """A path that does not start with the ingest root yields None."""
    result = parse_path("Documents/Other/file.pdf", ROOT)
    assert result is None
|
||||
|
||||
|
||||
def test_parse_path_loose_file_in_thb_returns_none():
    """A file in the parent folder of the ingest root yields None."""
    result = parse_path("Documents/THB/Studienbescheinigung.pdf", ROOT)
    assert result is None
|
||||
|
||||
|
||||
def test_parse_path_loose_file_under_root_returns_none():
    """A file directly under the ingest root, with no semester folder, yields None."""
    result = parse_path("Documents/THB/Studium/readme.txt", ROOT)
    assert result is None
|
||||
|
||||
|
||||
def test_parse_path_invalid_semester_pattern_returns_none():
    """A first folder that is not named `<N>.Semester` yields None."""
    result = parse_path("Documents/THB/Studium/Sommersemester/Databases/x.pdf", ROOT)
    assert result is None
|
||||
|
||||
|
||||
def test_parse_path_no_fach_returns_none():
    """A file directly in the semester folder, with no fach subdir, yields None."""
    result = parse_path("Documents/THB/Studium/2.Semester/loose.pdf", ROOT)
    assert result is None
|
||||
|
||||
|
||||
def test_parse_path_with_leading_slash_normalizes():
    """A leading slash on the file path does not change the parsed result."""
    path = "/Documents/THB/Studium/2.Semester/Databases/x.pdf"
    expected = PathMetadata(semester="2.Semester", fach="Databases", typ=None)
    assert parse_path(path, ROOT) == expected
|
||||
Reference in New Issue
Block a user