Add index endpoint
This commit is contained in:
@@ -46,6 +46,39 @@ Response:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Index request (all discovered stations):
|
||||||
|
|
||||||
|
```
|
||||||
|
GET /api/pegel
|
||||||
|
```
|
||||||
|
|
||||||
|
Index request with limit:
|
||||||
|
|
||||||
|
```
|
||||||
|
GET /api/pegel?limit=5
|
||||||
|
```
|
||||||
|
|
||||||
|
Index response shape:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"count": 5,
|
||||||
|
"total_available": 312,
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"station_id": "560021",
|
||||||
|
"station_title": "Golzern 1 / Vereinigte Mulde",
|
||||||
|
"source": "https://www.umwelt.sachsen.de/umwelt/infosysteme/hwims/portal/web/feed/wasserstand-pegel-560021"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": "https://www.umwelt.sachsen.de/umwelt/infosysteme/hwims/portal/web/wasserstand-uebersicht",
|
||||||
|
"fetched_at": "2026-05-05T01:00:00+00:00",
|
||||||
|
"warnings": [
|
||||||
|
"optional, only present for partial failures"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🚀 Local Development
|
## 🚀 Local Development
|
||||||
@@ -62,6 +95,7 @@ uvicorn api.app:app --reload
|
|||||||
Open:
|
Open:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
http://127.0.0.1:8000/api/pegel
|
||||||
http://127.0.0.1:8000/api/pegel/560021
|
http://127.0.0.1:8000/api/pegel/560021
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -109,6 +143,16 @@ Pegel ID is passed via URL:
|
|||||||
/api/pegel/<id>
|
/api/pegel/<id>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Index endpoint:
|
||||||
|
|
||||||
|
```
|
||||||
|
/api/pegel
|
||||||
|
```
|
||||||
|
|
||||||
|
Query params for index:
|
||||||
|
|
||||||
|
* `limit` (positive integer, optional; invalid values return HTTP 400)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 📡 Data Source
|
## 📡 Data Source
|
||||||
|
|||||||
+152
-2
@@ -2,12 +2,31 @@ from fastapi import FastAPI, HTTPException
|
|||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
import feedparser
|
import feedparser
|
||||||
import re
|
import re
|
||||||
|
import html
|
||||||
import datetime as dt
|
import datetime as dt
|
||||||
|
from urllib.request import urlopen, Request
|
||||||
|
from urllib.error import URLError, HTTPError
|
||||||
from email.utils import parsedate_to_datetime
|
from email.utils import parsedate_to_datetime
|
||||||
|
from typing import TypedDict, Any
|
||||||
|
|
||||||
app = FastAPI()


# Per-station feed URL template; "{}" is filled with the station id (6+ digits).
BASE = "https://www.umwelt.sachsen.de/umwelt/infosysteme/hwims/portal/web/feed/wasserstand-pegel-{}"
# Overview page scraped by discover_stations() to find all station ids/titles.
DISCOVERY_URL = "https://www.umwelt.sachsen.de/umwelt/infosysteme/hwims/portal/web/wasserstand-uebersicht"
# Seconds a discovered station index is reused before re-scraping the overview.
INDEX_CACHE_TTL_SECONDS = 900
# Seconds a station's latest reading is reused before re-fetching its feed.
LATEST_CACHE_TTL_SECONDS = 120


class IndexCache(TypedDict):
    """Shape of the module-level station-index cache."""

    # Discovered station dicts, or None before the first successful discovery.
    items: list[dict[str, Any]] | None
    # UTC timestamp of the last successful discovery, or None if never fetched.
    fetched_at: dt.datetime | None


# Shared station-index cache; written only via _set_index_cache().
_index_cache: IndexCache = {
    "items": None,
    "fetched_at": None,
}

# Per-station cache: station_id -> {"latest": parsed entry, "fetched_at": datetime}.
_latest_cache: dict[str, dict[str, Any]] = {}
|
||||||
|
|
||||||
def parse_number(pattern, text):
|
def parse_number(pattern, text):
|
||||||
m = re.search(pattern, text, re.I)
|
m = re.search(pattern, text, re.I)
|
||||||
@@ -22,13 +41,79 @@ def clean_text(text):
|
|||||||
text = re.sub(r"<[^>]+>", " ", text)
|
text = re.sub(r"<[^>]+>", " ", text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def extract_station_name(title: str):
|
def extract_station_name(title: str | None):
|
||||||
if not title:
|
if not title:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
m = re.search(r"Pegel\s+(.+)$", title)
|
m = re.search(r"Pegel\s+(.+)$", title)
|
||||||
return m.group(1).strip() if m else title
|
return m.group(1).strip() if m else title
|
||||||
|
|
||||||
|
def _now_utc():
|
||||||
|
return dt.datetime.now(dt.UTC)
|
||||||
|
|
||||||
|
def _is_cache_fresh(fetched_at, ttl_seconds: int):
    """Whether a cached timestamp exists and is still within its TTL."""
    if not fetched_at:
        return False
    elapsed = _now_utc() - fetched_at
    return elapsed.total_seconds() < ttl_seconds
|
||||||
|
|
||||||
|
def _set_index_cache(items):
    """Store freshly discovered station items and stamp the fetch time."""
    _index_cache["fetched_at"] = _now_utc()
    _index_cache["items"] = items
|
||||||
|
|
||||||
|
def _extract_station_title_from_id(pegel_id: str):
    """Best-effort station title, obtained by fetching the station's feed.

    Tries the feed-level title first, then the first entry's title;
    returns None when neither yields a usable string.
    """
    feed = feedparser.parse(BASE.format(pegel_id))

    feed_meta = getattr(feed, "feed", None)
    title = feed_meta.get("title") if isinstance(feed_meta, dict) else None
    if isinstance(title, str) and title:
        return extract_station_name(title)

    if feed.entries:
        entry_title = feed.entries[0].get("title")
        if isinstance(entry_title, str):
            return extract_station_name(entry_title)

    return None
|
||||||
|
|
||||||
|
def _parse_stations_from_overview(body: str):
|
||||||
|
stations: dict[str, str | None] = {}
|
||||||
|
|
||||||
|
paired_pattern = re.compile(
|
||||||
|
r'<a\s+href="wasserstand-pegel-(\d{6,})"[^>]*></a>\s*'
|
||||||
|
r'<div\s+class="popUp\s+popUpMs"[^>]*>.*?'
|
||||||
|
r'<div\s+class="popUpTitle">\s*<span\s+class="popUpTitleBold">(.*?)</span>',
|
||||||
|
re.I | re.S,
|
||||||
|
)
|
||||||
|
|
||||||
|
for pegel_id, raw_title in paired_pattern.findall(body):
|
||||||
|
title = re.sub(r"\s+", " ", html.unescape(raw_title)).strip() or None
|
||||||
|
stations[pegel_id] = title
|
||||||
|
|
||||||
|
for pegel_id in re.findall(r"wasserstand-pegel-(\d{6,})", body):
|
||||||
|
stations.setdefault(pegel_id, None)
|
||||||
|
|
||||||
|
return stations
|
||||||
|
|
||||||
|
def discover_stations():
    """Scrape the overview page and build the station index.

    Returns a list of dicts with station_id, station_title and source
    (the station's feed URL), sorted by station id.

    Raises ValueError when the page yields no station ids, and lets
    urllib errors propagate to the caller.
    """
    request = Request(
        DISCOVERY_URL,
        headers={"User-Agent": "sachsen-pegel-proxy/1.0"},
    )
    with urlopen(request, timeout=12) as response:
        body = response.read().decode("utf-8", errors="ignore")

    stations = _parse_stations_from_overview(body)
    if not stations:
        raise ValueError("Keine Pegel-IDs in Discovery-Quelle gefunden")

    # Fall back to fetching the station's own feed when the overview
    # did not provide a title for the id.
    return [
        {
            "station_id": station_id,
            "station_title": stations[station_id] or _extract_station_title_from_id(station_id),
            "source": BASE.format(station_id),
        }
        for station_id in sorted(stations)
    ]
|
||||||
|
|
||||||
def parse_timestamp_from_title(title: str):
|
def parse_timestamp_from_title(title: str):
|
||||||
if not title:
|
if not title:
|
||||||
return None
|
return None
|
||||||
@@ -72,6 +157,68 @@ def parse_entry(entry):
|
|||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _get_latest_for_station(station_id: str):
    """Latest parsed feed entry for one station, served via a short TTL cache.

    Raises ValueError when the station feed has no entries.
    """
    cached = _latest_cache.get(station_id)
    if cached and _is_cache_fresh(cached.get("fetched_at"), LATEST_CACHE_TTL_SECONDS):
        return cached.get("latest")

    feed = feedparser.parse(BASE.format(station_id))
    if not feed.entries:
        raise ValueError("Keine Daten gefunden")

    newest = parse_entry(feed.entries[0])
    _latest_cache[station_id] = {
        "latest": newest,
        "fetched_at": _now_utc(),
    }
    return newest
|
||||||
|
|
||||||
|
@app.get("/api/pegel")
|
||||||
|
def list_pegel(limit: int | None = None):
|
||||||
|
if limit is not None and limit <= 0:
|
||||||
|
raise HTTPException(400, "Ungültiger limit-Parameter")
|
||||||
|
|
||||||
|
warnings = []
|
||||||
|
discovered_items: list[dict[str, Any]] | None = None
|
||||||
|
source_used = DISCOVERY_URL
|
||||||
|
cache_fallback = False
|
||||||
|
|
||||||
|
if _is_cache_fresh(_index_cache.get("fetched_at"), INDEX_CACHE_TTL_SECONDS):
|
||||||
|
discovered_items = _index_cache.get("items")
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
discovered_items = discover_stations()
|
||||||
|
_set_index_cache(discovered_items)
|
||||||
|
except (URLError, HTTPError, TimeoutError, ValueError, OSError) as e:
|
||||||
|
if _index_cache.get("items"):
|
||||||
|
discovered_items = _index_cache.get("items")
|
||||||
|
source_used = "cache"
|
||||||
|
cache_fallback = True
|
||||||
|
warnings.append(f"Discovery fehlgeschlagen, nutze Cache: {str(e)}")
|
||||||
|
else:
|
||||||
|
raise HTTPException(502, "Upstream-Discovery nicht verfügbar")
|
||||||
|
|
||||||
|
if discovered_items is None:
|
||||||
|
raise HTTPException(502, "Upstream-Discovery nicht verfügbar")
|
||||||
|
|
||||||
|
total_available = len(discovered_items)
|
||||||
|
items = discovered_items[:limit] if limit is not None else list(discovered_items)
|
||||||
|
|
||||||
|
response = {
|
||||||
|
"count": len(items),
|
||||||
|
"total_available": total_available,
|
||||||
|
"items": items,
|
||||||
|
"source": source_used,
|
||||||
|
"fetched_at": _now_utc().isoformat(),
|
||||||
|
}
|
||||||
|
if warnings:
|
||||||
|
response["warnings"] = warnings
|
||||||
|
cache_fetched_at = _index_cache.get("fetched_at")
|
||||||
|
if cache_fallback and cache_fetched_at:
|
||||||
|
response["cache_fetched_at"] = cache_fetched_at.isoformat()
|
||||||
|
|
||||||
|
return JSONResponse(response)
|
||||||
|
|
||||||
@app.get("/api/pegel/{pegel_id}")
|
@app.get("/api/pegel/{pegel_id}")
|
||||||
def get_pegel(pegel_id: str):
|
def get_pegel(pegel_id: str):
|
||||||
if not re.fullmatch(r"\d{6,}", pegel_id):
|
if not re.fullmatch(r"\d{6,}", pegel_id):
|
||||||
@@ -85,9 +232,12 @@ def get_pegel(pegel_id: str):
|
|||||||
|
|
||||||
entries = [parse_entry(entry) for entry in feed.entries]
|
entries = [parse_entry(entry) for entry in feed.entries]
|
||||||
|
|
||||||
|
feed_obj = getattr(feed, "feed", None)
|
||||||
|
feed_title = feed_obj.get("title") if isinstance(feed_obj, dict) else None
|
||||||
|
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
"station_id": pegel_id,
|
"station_id": pegel_id,
|
||||||
"station_title": extract_station_name(feed.feed.get("title")),
|
"station_title": extract_station_name(feed_title),
|
||||||
"count": len(entries),
|
"count": len(entries),
|
||||||
"latest": entries[0],
|
"latest": entries[0],
|
||||||
"entries": entries,
|
"entries": entries,
|
||||||
|
|||||||
Reference in New Issue
Block a user