refactor: normalize status text and improve whitespace handling
Some checks failed
Python tests / tests (push) Failing after 8s
Some checks failed
Python tests / tests (push) Failing after 8s
This commit is contained in:
17
main.py
17
main.py
@@ -3,6 +3,7 @@ from string import punctuation
|
|||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
import re
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
@@ -25,6 +26,13 @@ start_url = f"https://olmera.verwalt-berlin.de/std/olav/antrag/passpa/1?d={DOCUM
|
|||||||
# start_url = "http://127.0.0.1:8000/sample_1.html"
|
# start_url = "http://127.0.0.1:8000/sample_1.html"
|
||||||
download_dir = Path("./audio_captchas").resolve()
|
download_dir = Path("./audio_captchas").resolve()
|
||||||
|
|
||||||
|
# Not yet really used, just for reference
|
||||||
|
statuses = {
|
||||||
|
"in Produktion": "Das Dokument ist noch in Produktion.",
|
||||||
|
"abholbereit (Reisepass)": "Ihr Reisepass liegt zur Abholung bereit.",
|
||||||
|
"abholbereit (Personalausweis)": "Ihr Personalausweis ist in der Ausweisbehörde eingetroffen. Das Dokument kann abgeholt werden.",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _extract_mp3_filename(url):
|
def _extract_mp3_filename(url):
|
||||||
"""Extrahiert den Dateinamen (z.B. '1770057062035.mp3') aus einer URL."""
|
"""Extrahiert den Dateinamen (z.B. '1770057062035.mp3') aus einer URL."""
|
||||||
@@ -96,7 +104,14 @@ def process_captcha_page_with_audio_captcha(driver):
|
|||||||
|
|
||||||
|
|
||||||
def normalize_status_text(raw_text: str):
|
def normalize_status_text(raw_text: str):
|
||||||
return raw_text.replace("Das Dokument ist", "").strip(punctuation).strip()
|
# Remove extra whitespace
|
||||||
|
text = re.sub(r"\s+", " ", raw_text).strip()
|
||||||
|
|
||||||
|
# Normalize specific status texts
|
||||||
|
if text == statuses["in Produktion"]:
|
||||||
|
text = "noch in Produktion"
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
# I tried the OCR way first, but was not successful.
|
# I tried the OCR way first, but was not successful.
|
||||||
|
|||||||
Reference in New Issue
Block a user