refactor: normalize status text and improve whitespace handling
Some checks failed
Python tests / tests (push) Failing after 8s
Some checks failed
Python tests / tests (push) Failing after 8s
This commit is contained in:
17
main.py
17
main.py
@@ -3,6 +3,7 @@ from string import punctuation
|
||||
import time
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse
|
||||
import re
|
||||
|
||||
import requests
|
||||
from selenium import webdriver
|
||||
@@ -25,6 +26,13 @@ start_url = f"https://olmera.verwalt-berlin.de/std/olav/antrag/passpa/1?d={DOCUM
|
||||
# start_url = "http://127.0.0.1:8000/sample_1.html"
|
||||
download_dir = Path("./audio_captchas").resolve()
|
||||
|
||||
# Reference table of known status messages; so far only the "in Produktion" entry is used (in normalize_status_text)
|
||||
statuses = {
|
||||
"in Produktion": "Das Dokument ist noch in Produktion.",
|
||||
"abholbereit (Reisepass)": "Ihr Reisepass liegt zur Abholung bereit.",
|
||||
"abholbereit (Personalausweis)": "Ihr Personalausweis ist in der Ausweisbehörde eingetroffen. Das Dokument kann abgeholt werden.",
|
||||
}
|
||||
|
||||
|
||||
def _extract_mp3_filename(url):
|
||||
"""Extrahiert den Dateinamen (z.B. '1770057062035.mp3') aus einer URL."""
|
||||
@@ -96,7 +104,14 @@ def process_captcha_page_with_audio_captcha(driver):
|
||||
|
||||
|
||||
def normalize_status_text(raw_text: str):
|
||||
return raw_text.replace("Das Dokument ist", "").strip(punctuation).strip()
|
||||
# Remove extra whitespace
|
||||
text = re.sub(r"\s+", " ", raw_text).strip()
|
||||
|
||||
# Normalize specific status texts
|
||||
if text == statuses["in Produktion"]:
|
||||
text = "noch in Produktion"
|
||||
|
||||
return text
|
||||
|
||||
|
||||
# I tried an OCR-based approach first, but it was not successful.
|
||||
|
||||
Reference in New Issue
Block a user