From 0224581587af36cb0e713a2310d75ef115be80a6 Mon Sep 17 00:00:00 2001 From: Jean-Luc Makiola Date: Mon, 4 May 2026 22:09:41 +0200 Subject: [PATCH] refactor: klarere typ-extraktion, fullmatch, root-prefix-test --- app/ingest/metadata.py | 13 ++++++++----- tests/test_metadata.py | 4 ++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/app/ingest/metadata.py b/app/ingest/metadata.py index c49fb55..7a49012 100644 --- a/app/ingest/metadata.py +++ b/app/ingest/metadata.py @@ -18,6 +18,10 @@ def parse_path(file_path: str, ingest_root: str) -> PathMetadata | None: Returns None when the path is outside the ingest root or does not match the expected `/.Semester//[/...]/` pattern. + + Caller must pass a file path. Directory paths (with or without trailing + slash) produce undefined results — `PurePosixPath` strips trailing slashes, + so a directory ending in `/foo/` is indistinguishable from a file `/foo`. """ norm_path = file_path.lstrip("/") norm_root = ingest_root.strip("/") @@ -28,17 +32,16 @@ def parse_path(file_path: str, ingest_root: str) -> PathMetadata | None: relative = norm_path[len(norm_root) + 1:] parts = PurePosixPath(relative).parts - # Need at least: semester / fach / file.ext → 3 parts + # Layout: [semester, fach, *inner, filename] — minimum 3 parts if len(parts) < 3: return None semester, fach = parts[0], parts[1] - if not SEMESTER_RE.match(semester): + if not SEMESTER_RE.fullmatch(semester): return None - # parts[-1] is the filename. Anything between fach and filename is "deeper". - # The first deeper segment becomes `typ`. None if file lives directly in fach. - typ = parts[2] if len(parts) > 3 else None + inner = parts[2:-1] # everything between fach and filename + typ = inner[0] if inner else None return PathMetadata(semester=semester, fach=fach, typ=typ) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 93e62b6..50ac72c 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -44,3 +44,7 @@ def test_parse_path_no_fach_returns_none(): def test_parse_path_with_leading_slash_normalizes(): md = parse_path("/Documents/THB/Studium/2.Semester/Databases/x.pdf", ROOT) assert md == PathMetadata(semester="2.Semester", fach="Databases", typ=None) + + +def test_parse_path_root_prefix_collision_returns_none(): + assert parse_path("Documents/THB/StudiumExam/2.Semester/Foo/x.pdf", ROOT) is None