refactor: normalize status text and improve whitespace handling
Some checks failed
Python tests / tests (push) Failing after 8s

This commit is contained in:
2026-02-05 02:02:40 +01:00
parent 977eccab24
commit 5daf62a51f

17
main.py
View File

@@ -3,6 +3,7 @@ from string import punctuation
import time import time
from datetime import datetime from datetime import datetime
from urllib.parse import urlparse from urllib.parse import urlparse
import re
import requests import requests
from selenium import webdriver from selenium import webdriver
@@ -25,6 +26,13 @@ start_url = f"https://olmera.verwalt-berlin.de/std/olav/antrag/passpa/1?d={DOCUM
# start_url = "http://127.0.0.1:8000/sample_1.html" # start_url = "http://127.0.0.1:8000/sample_1.html"
download_dir = Path("./audio_captchas").resolve() download_dir = Path("./audio_captchas").resolve()
# Known status messages keyed by a short label. Only the "in Produktion" entry
# is used so far (by normalize_status_text); the others are kept for reference.
statuses = {
"in Produktion": "Das Dokument ist noch in Produktion.",
"abholbereit (Reisepass)": "Ihr Reisepass liegt zur Abholung bereit.",
"abholbereit (Personalausweis)": "Ihr Personalausweis ist in der Ausweisbehörde eingetroffen. Das Dokument kann abgeholt werden.",
}
def _extract_mp3_filename(url): def _extract_mp3_filename(url):
"""Extrahiert den Dateinamen (z.B. '1770057062035.mp3') aus einer URL.""" """Extrahiert den Dateinamen (z.B. '1770057062035.mp3') aus einer URL."""
@@ -96,7 +104,14 @@ def process_captcha_page_with_audio_captcha(driver):
def normalize_status_text(raw_text: str): def normalize_status_text(raw_text: str):
return raw_text.replace("Das Dokument ist", "").strip(punctuation).strip() # Collapse each run of whitespace into a single space and trim both ends
text = re.sub(r"\s+", " ", raw_text).strip()
# Shorten the full "in Produktion" sentence; any other text is returned unchanged
if text == statuses["in Produktion"]:
text = "noch in Produktion"
return text
# I tried the OCR way first, but was not successful. # I tried the OCR way first, but was not successful.