From 9b01145f072669a29b230f419e5e722d58b4d172 Mon Sep 17 00:00:00 2001
From: Matthias Jacob <dev@matthiasjacob.net>
Date: Tue, 5 May 2026 03:54:21 +0200
Subject: [PATCH] add index endpoint

---
 README.md  |  44 +++++++++++++++
 api/app.py | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 196 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index bcef44d..7510880 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,39 @@ Response:
 }
 ```
 
+Index request (all discovered stations):
+
+```
+GET /api/pegel
+```
+
+Index request with limit:
+
+```
+GET /api/pegel?limit=5
+```
+
+Index response shape:
+
+```json
+{
+  "count": 5,
+  "total_available": 312,
+  "items": [
+    {
+      "station_id": "560021",
+      "station_title": "Golzern 1 / Vereinigte Mulde",
+      "source": "https://www.umwelt.sachsen.de/umwelt/infosysteme/hwims/portal/web/feed/wasserstand-pegel-560021"
+    }
+  ],
+  "source": "https://www.umwelt.sachsen.de/umwelt/infosysteme/hwims/portal/web/wasserstand-uebersicht",
+  "fetched_at": "2026-05-05T01:00:00+00:00",
+  "warnings": [
+    "optional, only present for partial failures"
+  ]
+}
+```
+
 ---
 
 ## 🚀 Local Development
@@ -62,6 +95,7 @@ uvicorn api.app:app --reload
 Open:
 
 ```
+http://127.0.0.1:8000/api/pegel
 http://127.0.0.1:8000/api/pegel/560021
 ```
 
@@ -109,6 +143,16 @@ Pegel ID is passed via URL:
 /api/pegel/<id>
 ```
 
+Index endpoint:
+
+```
+/api/pegel
+```
+
+Query params for index:
+
+ * `limit` (positive integer, optional; zero or negative values return HTTP 400, non-integer values are rejected by FastAPI's request validation with HTTP 422)
+
 ---
 
 ## 📡 Data Source
diff --git a/api/app.py b/api/app.py
index e7158a0..c55f879 100644
--- a/api/app.py
+++ b/api/app.py
@@ -2,12 +2,31 @@ from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
 import feedparser
 import re
+import html
 import datetime as dt
+from urllib.request import urlopen, Request
+from urllib.error import URLError, HTTPError
 from email.utils import parsedate_to_datetime
+from typing import TypedDict, Any
 
 app = FastAPI()
 
 BASE = "https://www.umwelt.sachsen.de/umwelt/infosysteme/hwims/portal/web/feed/wasserstand-pegel-{}"
+DISCOVERY_URL = "https://www.umwelt.sachsen.de/umwelt/infosysteme/hwims/portal/web/wasserstand-uebersicht"  # HTML overview page scraped for station IDs
+INDEX_CACHE_TTL_SECONDS = 900  # station index is reused for 15 minutes before re-discovery
+LATEST_CACHE_TTL_SECONDS = 120  # a station's latest reading is reused for 2 minutes
+
+class IndexCache(TypedDict):  # shape of the module-level station-index cache
+    items: list[dict[str, Any]] | None  # station records from the last successful discovery
+    fetched_at: dt.datetime | None  # when those records were stored (UTC); None if never
+
+
+_index_cache: IndexCache = {  # empty until the first successful discovery
+    "items": None,
+    "fetched_at": None,
+}
+
+_latest_cache: dict[str, dict[str, Any]] = {}  # station_id -> {"latest": ..., "fetched_at": ...}
 
 def parse_number(pattern, text):
     m = re.search(pattern, text, re.I)
@@ -22,13 +41,79 @@ def clean_text(text):
     text = re.sub(r"<[^>]+>", " ", text)
     return text
 
-def extract_station_name(title: str):
+def extract_station_name(title: str | None):
     if not title:
         return None
 
     m = re.search(r"Pegel\s+(.+)$", title)
     return m.group(1).strip() if m else title
 
+def _now_utc() -> dt.datetime:
+    return dt.datetime.now(tz=dt.UTC)  # timezone-aware, never naive
+
+def _is_cache_fresh(fetched_at, ttl_seconds: int):
+    """Return True when *fetched_at* is set and less than *ttl_seconds* old."""
+    age = (_now_utc() - fetched_at).total_seconds() if fetched_at else None
+    return age is not None and age < ttl_seconds
+
+def _set_index_cache(items):
+    """Replace the cached station list and stamp it with the current UTC time."""
+    _index_cache.update({"items": items, "fetched_at": _now_utc()})
+
+def _extract_station_title_from_id(pegel_id: str):
+    """Fetch a station's feed and derive its name from the feed or first entry title."""
+    parsed = feedparser.parse(BASE.format(pegel_id))
+    channel = getattr(parsed, "feed", None)
+    channel_title = channel.get("title") if isinstance(channel, dict) else None
+    # Prefer the feed-level title; it must be a non-empty string to count.
+    if isinstance(channel_title, str) and channel_title:
+        return extract_station_name(channel_title)
+    # Otherwise fall back to the title of the first entry, if there is one.
+    entry_title = parsed.entries[0].get("title") if parsed.entries else None
+    return extract_station_name(entry_title) if isinstance(entry_title, str) else None
+
+def _parse_stations_from_overview(body: str):
+    """Map each station ID found in the overview HTML to its title (None if absent)."""
+    # Empty station link followed by its pop-up, whose bold span holds the name.
+    link_with_title = re.compile(
+        r'<a\s+href="wasserstand-pegel-(\d{6,})"[^>]*></a>\s*'
+        r'<div\s+class="popUp\s+popUpMs"[^>]*>.*?'
+        r'<div\s+class="popUpTitle">\s*<span\s+class="popUpTitleBold">(.*?)</span>',
+        re.I | re.S,
+    )
+    found: dict[str, str | None] = {}
+    for match in link_with_title.finditer(body):
+        station_id, markup = match.groups()
+        collapsed = re.sub(r"\s+", " ", html.unescape(markup)).strip()
+        found[station_id] = collapsed or None
+    # IDs mentioned anywhere else on the page are kept, just without a title.
+    for station_id in re.findall(r"wasserstand-pegel-(\d{6,})", body):
+        found.setdefault(station_id, None)
+    return found
+
+def discover_stations():
+    """Scrape the overview page and return one record per station, sorted by ID.
+
+    Raises ValueError when the page contains no station IDs; errors raised by
+    urlopen propagate to the caller.
+    """
+    request = Request(DISCOVERY_URL, headers={"User-Agent": "sachsen-pegel-proxy/1.0"})
+    with urlopen(request, timeout=12) as resp:
+        page = resp.read().decode("utf-8", errors="ignore")
+
+    titles_by_id = _parse_stations_from_overview(page)
+    if not titles_by_id:
+        raise ValueError("Keine Pegel-IDs in Discovery-Quelle gefunden")
+    # Stations without a scraped title cost one extra feed request each.
+    return [
+        {
+            "station_id": station_id,
+            "station_title": title or _extract_station_title_from_id(station_id),
+            "source": BASE.format(station_id),
+        }
+        for station_id, title in sorted(titles_by_id.items())
+    ]
+
 def parse_timestamp_from_title(title: str):
     if not title:
         return None
@@ -72,6 +157,68 @@ def parse_entry(entry):
         ),
     }
 
+def _get_latest_for_station(station_id: str):
+    """Return the parsed first feed entry for a station, cached for a short TTL.
+
+    Raises ValueError when the station feed has no entries.
+    """
+    cached = _latest_cache.get(station_id)
+    if cached and _is_cache_fresh(cached.get("fetched_at"), LATEST_CACHE_TTL_SECONDS):
+        return cached.get("latest")
+
+    entries = feedparser.parse(BASE.format(station_id)).entries
+    if not entries:
+        raise ValueError("Keine Daten gefunden")
+    reading = parse_entry(entries[0])
+    _latest_cache[station_id] = {"latest": reading, "fetched_at": _now_utc()}
+    return reading
+
+@app.get("/api/pegel")
+def list_pegel(limit: int | None = None):
+    """List discovered stations, optionally truncated to the first *limit*.
+
+    Returns HTTP 400 for a non-positive limit and 502 when discovery fails with no cached list.
+    """
+    if limit is not None and limit <= 0:
+        raise HTTPException(400, "Ungültiger limit-Parameter")
+
+    warnings = []
+    source_used = DISCOVERY_URL
+    cache_fallback = False
+
+    if _is_cache_fresh(_index_cache.get("fetched_at"), INDEX_CACHE_TTL_SECONDS):
+        discovered_items = _index_cache.get("items")
+    else:
+        try:
+            discovered_items = discover_stations()
+            _set_index_cache(discovered_items)
+        except (URLError, HTTPError, TimeoutError, ValueError, OSError) as e:
+            # Prefer stale data over an error whenever an older index exists.
+            discovered_items = _index_cache.get("items")
+            if not discovered_items:
+                raise HTTPException(502, "Upstream-Discovery nicht verfügbar")
+            source_used = "cache"
+            cache_fallback = True
+            warnings.append(f"Discovery fehlgeschlagen, nutze Cache: {str(e)}")
+
+    if discovered_items is None:
+        raise HTTPException(502, "Upstream-Discovery nicht verfügbar")
+
+    items = list(discovered_items) if limit is None else discovered_items[:limit]
+    response = {
+        "count": len(items),
+        "total_available": len(discovered_items),
+        "items": items,
+        "source": source_used,
+        "fetched_at": _now_utc().isoformat(),
+    }
+    if warnings:
+        response["warnings"] = warnings
+    stale_since = _index_cache.get("fetched_at")
+    if cache_fallback and stale_since:
+        response["cache_fetched_at"] = stale_since.isoformat()
+    return JSONResponse(response)
+
 @app.get("/api/pegel/{pegel_id}")
 def get_pegel(pegel_id: str):
     if not re.fullmatch(r"\d{6,}", pegel_id):
@@ -85,9 +232,12 @@ def get_pegel(pegel_id: str):
 
     entries = [parse_entry(entry) for entry in feed.entries]
 
+    feed_obj = getattr(feed, "feed", None)
+    feed_title = feed_obj.get("title") if isinstance(feed_obj, dict) else None
+
     return JSONResponse({
         "station_id": pegel_id,
-        "station_title": extract_station_name(feed.feed.get("title")),
+        "station_title": extract_station_name(feed_title),
         "count": len(entries),
         "latest": entries[0],
         "entries": entries,