Initial commit: CSV row filler for satellite metadata

2026-02-11 14:24:33 +01:00
commit 9faa02ee9f
2 changed files with 95 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,21 @@
 # fill_csv.py
 Füllt fehlende ROW-Nummern in einer Satellitenmetadaten-CSV auf.
 ## Nutzung
 ```bash
 python fill_csv.py eingabe.csv
 ```
 Die Ausgabe landet automatisch als `eingabe_filled.csv` im selben Ordner.
 Optional kann ein eigener Ausgabepfad angegeben werden:
 ```bash
 python fill_csv.py eingabe.csv ausgabe.csv
 ```
 ## Was macht das Skript?
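 Für jede Kombination aus `PATH` und `DATE` ermittelt das Skript, welche ROW-Nummern vorhanden sind, und füllt die Lücken zwischen Minimum und Maximum mit Platzhalterzeilen auf. Die neuen Zeilen übernehmen die Grunddaten (MISSION, SENSOR, PATH, DATE usw.) aus der ersten Zeile der jeweiligen Gruppe; die ROW-Nummer und das Label (`PATH/ROW`) werden neu gesetzt. Alles ab Spaltenindex 10 (nullbasiert gezählt, also ab der elften Spalte) bleibt leer.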
--- a/fill_csv.py
+++ b/fill_csv.py
@@ -0,0 +1,74 @@
 #!/usr/bin/env python3
 """Fill missing ROW integers within each (PATH, DATE) group of a satellite imagery CSV."""
 import csv
 import sys
 from collections import defaultdict
 from pathlib import Path
 def fill_csv(input_path: Path, output_path: Path) -> None:
    """Fill gaps in the ROW sequence of every (PATH, DATE) group of a CSV.

    The first record of the input is treated as the header. Data rows are
    grouped by column 3 (PATH) and column 9 (DATE). For each group, every
    integer between the smallest and largest ROW value (column 4) that is
    not present gets a synthetic row built from the group's first row:
    columns 0-3 and 5-7 are copied, column 4 is the missing ROW number,
    column 8 is "<PATH>/<ROW>", column 9 is the DATE, and all remaining
    columns (up to the header width) are empty strings.

    Original and synthetic rows are written together, sorted by
    (PATH, DATE, ROW), with every field quoted. A short summary is printed
    to stdout.

    Blank lines in the input are ignored, and an empty input file produces
    an empty output file instead of raising.

    Args:
        input_path: CSV file to read (UTF-8).
        output_path: Destination CSV file; overwritten if it exists.

    Raises:
        IndexError: If a non-blank data row has fewer than 10 columns.
        ValueError: If a ROW value cannot be parsed as an integer.
        OSError: If the input cannot be read or the output cannot be written.
    """
    with open(input_path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        # An empty file has no header record; fall back to None rather than
        # letting StopIteration escape from next().
        header = next(reader, None)
        # csv.reader yields [] for blank lines (e.g. a trailing empty line).
        # They carry no data and would break the column lookups below.
        rows = [row for row in reader if row]
    num_cols = len(header) if header is not None else 0
    # Group data rows by (PATH, DATE) — columns 3 and 9
    groups: dict[tuple, list] = defaultdict(list)
    for row in rows:
        groups[(row[3], row[9])].append(row)
    synthetic_rows: list[list[str]] = []
    for group in groups.values():
        row_numbers = {int(r[4]) for r in group}
        missing = set(range(min(row_numbers), max(row_numbers) + 1)) - row_numbers
        # The first row of the group (in input order) supplies the copied fields.
        template = group[0]
        for r in sorted(missing):
            new_row = (
                template[:4]                       # cols 0–3: RECORD_TYP, MISSION, SENSOR, PATH
                + [str(r)]                         # col 4: ROW
                + template[5:8]                    # cols 5–7: SCENE, SUB, SHIFT
                + [f"{template[3]}/{r}"]           # col 8: LABEL
                + [template[9]]                    # col 9: DATE
                + [""] * (num_cols - 10)           # cols 10–end: blank
            )
            synthetic_rows.append(new_row)
    all_rows = rows + synthetic_rows
    # PATH and DATE are compared as strings, ROW numerically. The sort is
    # stable, so rows with identical keys keep their relative order.
    all_rows.sort(key=lambda r: (r[3], r[9], int(r[4])))
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        if header is not None:
            writer.writerow(header)
        writer.writerows(all_rows)
    print(f"Original rows : {len(rows)}")
    print(f"Synthetic rows: {len(synthetic_rows)}")
    print(f"Total rows    : {len(all_rows)}")
    print(f"Output written: {output_path}")
 def main() -> None:
    """Command-line entry point.

    Expects the input CSV path as the first argument and an optional output
    path as the second. Without an output path, the result is written next
    to the input with "_filled" appended to the file stem. When no argument
    is given, a usage line is printed to stderr and the process exits with
    status 1.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print(f"Usage: {sys.argv[0]} <input.csv> [output.csv]", file=sys.stderr)
        sys.exit(1)
    input_path = Path(cli_args[0])
    output_path = (
        Path(cli_args[1])
        if len(cli_args) > 1
        else input_path.with_stem(f"{input_path.stem}_filled")
    )
    fill_csv(input_path, output_path)
 # Run the command-line entry point only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == "__main__":
    main()