refactor: normalize status text and improve whitespace handling
Some checks failed
Python tests / tests (push) Failing after 8s

This commit is contained in:
2026-02-05 02:02:40 +01:00
parent 977eccab24
commit 5daf62a51f

17
main.py
View File

@@ -3,6 +3,7 @@ from string import punctuation
import time
from datetime import datetime
from urllib.parse import urlparse
import re
import requests
from selenium import webdriver
@@ -25,6 +26,13 @@ start_url = f"https://olmera.verwalt-berlin.de/std/olav/antrag/passpa/1?d={DOCUM
# start_url = "http://127.0.0.1:8000/sample_1.html"
download_dir = Path("./audio_captchas").resolve()

# Reference table: short status label -> full status sentence.
# Kept mainly for reference; not every entry is looked up by the code yet.
statuses = {
    "in Produktion": "Das Dokument ist noch in Produktion.",
    "abholbereit (Reisepass)": "Ihr Reisepass liegt zur Abholung bereit.",
    "abholbereit (Personalausweis)": "Ihr Personalausweis ist in der Ausweisbehörde eingetroffen. Das Dokument kann abgeholt werden.",
}
def _extract_mp3_filename(url):
"""Extrahiert den Dateinamen (z.B. '1770057062035.mp3') aus einer URL."""
@@ -96,7 +104,14 @@ def process_captcha_page_with_audio_captcha(driver):
def normalize_status_text(raw_text: str) -> str:
    """Return *raw_text* in a compact, canonical form.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a
    single space and the result is trimmed at both ends. If the cleaned
    text equals the "in Produktion" sentence stored in ``statuses``, the
    short form ``"noch in Produktion"`` is returned instead. Any other
    text is returned cleaned but otherwise unchanged.

    Args:
        raw_text: The status text to normalize.

    Returns:
        The normalized status text.
    """
    # Collapse whitespace first so the comparison below does not depend on
    # how the input happens to be wrapped or padded.
    text = re.sub(r"\s+", " ", raw_text).strip()

    # Map the one known long-form sentence to its short label.
    if text == statuses["in Produktion"]:
        return "noch in Produktion"
    return text
# I tried the OCR way first, but was not successful.