refactor: normalize status text and improve whitespace handling

2026-02-05 02:02:40 +01:00
parent 977eccab24
commit 5daf62a51f
1 changed files with 16 additions and 1 deletions
@@ -3,6 +3,7 @@ from string import punctuation
 import time
 from datetime import datetime
 from urllib.parse import urlparse
+import re

 import requests
 from selenium import webdriver
@@ -25,6 +26,13 @@ start_url = f"https://olmera.verwalt-berlin.de/std/olav/antrag/passpa/1?d={DOCUM
 # start_url = "http://127.0.0.1:8000/sample_1.html"
 download_dir = Path("./audio_captchas").resolve()

+# Known status sentences keyed by short label; normalize_status_text() reads the "in Produktion" entry, the others are reference only.
+statuses = {
+    "in Produktion": "Das Dokument ist noch in Produktion.",
+    "abholbereit (Reisepass)": "Ihr Reisepass liegt zur Abholung bereit.",
+    "abholbereit (Personalausweis)": "Ihr Personalausweis ist in der Ausweisbehörde eingetroffen. Das Dokument kann abgeholt werden.",
+}
+

 def _extract_mp3_filename(url):
    """Extrahiert den Dateinamen (z.B. '1770057062035.mp3') aus einer URL."""
@@ -96,7 +104,14 @@ def process_captcha_page_with_audio_captcha(driver):


 def normalize_status_text(raw_text: str):
-    return raw_text.replace("Das Dokument ist", "").strip(punctuation).strip()
+    # Collapse each run of whitespace (spaces, tabs, newlines) into one space and trim both ends
+    text = re.sub(r"\s+", " ", raw_text).strip()
+
+    # Shorten the one known full sentence to its label; any other text is returned as-is
+    if text == statuses["in Produktion"]:
+        text = "noch in Produktion"
+    
+    return text


 # I tried the OCR way first, but was not successful.