add working implementation

This commit is contained in:
2026-02-03 00:40:06 +01:00
parent 884f53e301
commit afd1dac916
5 changed files with 358 additions and 0 deletions

1
.gitignore vendored
View File

@@ -188,3 +188,4 @@ cython_debug/
# Built Visual Studio Code Extensions
*.vsix
settings.py

261
main.py Normal file
View File

@@ -0,0 +1,261 @@
from pathlib import Path
from string import punctuation
import time
from datetime import datetime
from urllib.parse import urlparse
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from settings import DOCUMENT_ID, WEBHOOK_URL
from transcription import transcribe_audio_with_whisper
# Upper bound on the number of captcha attempts made by solve_captcha_flow().
MAX_CAPTCHA_ATTEMPTS = 3
# When True, _get_driver() passes "--headless" to Firefox.
USE_HEADLESS_MODE = False
# Page the browser is sent to; DOCUMENT_ID from settings.py is passed as the "d" query parameter.
start_url = f"https://olmera.verwalt-berlin.de/std/olav/antrag/passpa/1?d={DOCUMENT_ID}"
# Alternative start URL on localhost (presumably a locally served sample page for testing).
# start_url = "http://127.0.0.1:8000/sample_1.html"
# Absolute path of the folder Firefox is configured to download the audio captchas into.
download_dir = Path("./audio_captchas").resolve()
def _extract_mp3_filename(url):
"""Extrahiert den Dateinamen (z.B. '1770057062035.mp3') aus einer URL."""
parsed_url = urlparse(url)
filename = parsed_url.path.split("/")[-1]
return filename
def _get_driver():
    """Create a Firefox WebDriver configured for German pages and local downloads.

    Downloads are directed to ``download_dir``; the browser runs headless
    when ``USE_HEADLESS_MODE`` is set.
    """
    firefox_options = webdriver.FirefoxOptions()
    # Preferences are applied in this order; folderList=2 selects the custom
    # directory given in browser.download.dir (Firefox convention).
    preferences = (
        ("intl.accept_languages", "de-DE, de"),
        ("browser.download.useDownloadDir", True),
        ("browser.download.folderList", 2),
        ("browser.download.dir", str(download_dir)),
    )
    for pref_name, pref_value in preferences:
        firefox_options.set_preference(pref_name, pref_value)
    if USE_HEADLESS_MODE:
        firefox_options.add_argument("--headless")
    return webdriver.Firefox(options=firefox_options)
# Try the audio_captcha way
def _wait_for_download(file_path, timeout=15.0, poll_interval=0.25):
    """Wait until *file_path* looks completely downloaded.

    A download counts as finished when the target file exists, is not empty
    and no sibling '<name>.part' file is present (Firefox typically keeps such
    a file next to the target while a download is still in progress).

    Returns:
        True if the file is ready, False if *timeout* seconds passed first.
    """
    partial_path = file_path.with_name(file_path.name + ".part")
    deadline = time.monotonic() + timeout
    while True:
        try:
            finished = (
                file_path.is_file()
                and file_path.stat().st_size > 0
                and not partial_path.exists()
            )
        except OSError:
            # The file may be moved/renamed between the checks; just retry.
            finished = False
        if finished:
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)


def process_captcha_page_with_audio_captcha(driver):
    """Solve the captcha on the current page via its audio version and submit it.

    Saves a screenshot of the captcha image to 'captcha.png', downloads the
    audio captcha by clicking its link, transcribes the MP3 and, if the
    transcription has exactly 4 characters, types it into the form and
    submits it.

    Args:
        driver: Selenium WebDriver currently showing the captcha page.

    Returns:
        True if a 4-character code was entered and the form was submitted,
        False if the audio could not be obtained or the transcription did not
        have the expected length (the caller may then retry).
    """
    # Look the captcha container up once instead of once per access.
    captcha_box = driver.find_element(By.ID, "captcha")
    captcha_box.find_element(By.TAG_NAME, "img").screenshot("captcha.png")

    audio_link = captcha_box.find_element(
        By.CLASS_NAME, "audioCaptcha"
    ).find_element(By.TAG_NAME, "a")
    audio_captcha = audio_link.get_attribute("href")
    print(audio_captcha)
    if not audio_captcha:
        print("Kein Link zum Audio-Captcha gefunden. Abbruch.")
        return False

    # Clicking the link triggers the download into download_dir.
    audio_link.click()
    print(f"Download dir: {str(download_dir)}")
    mp3_filename = _extract_mp3_filename(audio_captcha)
    print(f"Extracted MP3 filename: {mp3_filename}")
    mp3_path = download_dir / mp3_filename

    # Poll for the finished file instead of sleeping a fixed time: a fixed
    # delay either wastes time or hands an incomplete file to the transcriber.
    print("Warte auf den Download der Audiodatei...")
    if not _wait_for_download(mp3_path):
        print("Audiodatei wurde nicht rechtzeitig heruntergeladen. Abbruch.")
        return False
    print(f"Lokaler Pfad der Audiodatei: {mp3_path}")

    # Perform transcription.
    transcribed_text = transcribe_audio_with_whisper(mp3_path)
    print(f"Transkribierter Text: {transcribed_text}")
    if len(transcribed_text) != 4:
        print("Transkription hat nicht die erwartete Länge von 4 Zeichen. Abbruch.")
        return False

    # Fill in the form.
    driver.find_element(By.CLASS_NAME, "formtable").find_element(
        By.ID, "code"
    ).send_keys(transcribed_text)
    # Submit the form.
    driver.find_element(By.CLASS_NAME, "actionbuttons").find_element(
        By.CLASS_NAME, "button"
    ).click()
    return True
def normalize_status_text(raw_text: str) -> str:
    """Reduce a status sentence to the bare status wording.

    Removes the lead-in "Das Dokument ist" and trims surrounding whitespace
    and punctuation.

    Args:
        raw_text: Status sentence as shown on the status page.

    Returns:
        The remaining status text without leading/trailing whitespace or
        punctuation.
    """
    without_prefix = raw_text.replace("Das Dokument ist", "")
    # Strip whitespace first: trailing whitespace would otherwise shield the
    # final punctuation mark from strip(punctuation).
    return without_prefix.strip().strip(punctuation).strip()
# I tried the OCR way first, but was not successful.
def screenshot_captcha(save_dir="./captchas", prefix="captcha"):
    """Open the start page in a fresh browser and save a captcha screenshot.

    The image is stored as '<save_dir>/<prefix>_<YYYYmmdd_HHMMSS>.png'.

    Args:
        save_dir: Folder the screenshot is written to; created if missing.
        prefix: File name prefix placed before the timestamp.
    """
    # Make sure the target folder exists before the screenshot is written.
    Path(save_dir).mkdir(parents=True, exist_ok=True)

    driver = _get_driver()
    try:
        driver.get(start_url)
        # Build a unique file name from the current timestamp.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{save_dir}/{prefix}_{timestamp}.png"
        driver.find_element(By.ID, "captcha").find_element(
            By.TAG_NAME, "img"
        ).screenshot(filename)
    finally:
        # Always close the browser, also when loading or locating fails.
        driver.quit()
    print(f"Captcha gespeichert als {filename}")
def test_transcription():
    """Transcribe every MP3 in ``download_dir`` and report the 4-character rate.

    Each file is transcribed with Whisper and the result is written to
    'transcription_results.csv' (columns: 'filename', 'transcription'; the
    file is created or overwritten). Afterwards the share of transcriptions
    with exactly 4 characters is printed. Note that this only checks the
    length, not whether the characters are actually correct.

    Returns:
        None
    """
    accuracy = 0
    total = 0
    # Create or overwrite the CSV file for the transcription results.
    with open("transcription_results.csv", "w", encoding="utf-8") as csvfile:
        csvfile.write("filename,transcription\n")
        # Sorted so that the CSV row order is reproducible between runs.
        for mp3_path in sorted(download_dir.glob("*.mp3")):
            print(f"Verarbeite Datei: {mp3_path}")
            transcription = transcribe_audio_with_whisper(mp3_path)
            print(f"Transkription: {transcription}")
            csvfile.write(f"{mp3_path.name},{transcription}\n")
            if transcription and len(transcription) == 4:
                accuracy += 1
            total += 1
    if total == 0:
        # Avoid dividing by zero when the folder holds no recordings.
        print(f"Keine MP3-Dateien in {download_dir} gefunden.")
        return
    print(f"Genauigkeit: {accuracy}/{total} = {accuracy/total*100:.2f}%")
def is_captcha_page(html):
    """Return True if *html* contains a <fieldset> with aria-label "CAPTCHA"."""
    document = BeautifulSoup(html, "html.parser")
    captcha_fieldset = document.find("fieldset", attrs={"aria-label": "CAPTCHA"})
    if captcha_fieldset is None:
        return False
    return True
def is_status_page(html):
    """Return True if *html* contains the status result <fieldset>.

    The fieldset is identified by aria-label "Statusabfrage Suchergebnis".
    """
    document = BeautifulSoup(html, "html.parser")
    result_fieldset = document.find(
        "fieldset", attrs={"aria-label": "Statusabfrage Suchergebnis"}
    )
    if result_fieldset is None:
        return False
    return True
def parse_status_page(html):
    """Extract status text and "Stand" timestamp from a status page.

    Args:
        html: Markup of the status page.

    Returns:
        Tuple ``(status_text, timestamp_dt)``: the text of the first
        ``.warn p`` element inside the result fieldset and the parsed
        datetime taken from the next ``<p>`` after it.

    Raises:
        ValueError: If the fieldset, the status paragraph or the timestamp
            paragraph is missing, or the timestamp does not match
            'dd.mm.YYYY HH:MM:SS'.
    """
    document = BeautifulSoup(html, "html.parser")

    result_box = document.find(
        "fieldset", attrs={"aria-label": "Statusabfrage Suchergebnis"}
    )
    if result_box is None:
        raise ValueError("Statusbereich nicht gefunden (fehlendes Fieldset).")

    status_paragraph = result_box.select_one(".warn p")
    if status_paragraph is None:
        raise ValueError("Statusabschnitt (.warn p) nicht gefunden.")

    # The timestamp is read from the first <p> following the status paragraph.
    stand_paragraph = status_paragraph.find_next("p")
    if stand_paragraph is None:
        raise ValueError("Zeitstempelabschnitt (nächstes <p>) nicht gefunden.")

    status_text = status_paragraph.get_text(strip=True)
    stand_text = stand_paragraph.get_text(strip=True).replace("Stand: ", "")
    timestamp_dt = datetime.strptime(stand_text, "%d.%m.%Y %H:%M:%S")

    print("Status:", status_text)
    print("Stand :", timestamp_dt.isoformat())
    return status_text, timestamp_dt
def test_parse_status_page():
    """Run the status-page helpers against './sample_html/status.html'.

    Prints whether the sample is recognised as a status page, then the
    normalized status and the parsed timestamp.
    """
    sample_html = Path("./sample_html/status.html").read_text()
    print(f"is_status_page: {is_status_page(sample_html)}")
    raw_status, timestamp = parse_status_page(sample_html)
    status = normalize_status_text(raw_status)
    print(f"Status: {status}, Timestamp: {timestamp}")
def notify_webhook(status, last_updated, webhook_url, timeout=10):
    """POST the status as JSON to a webhook.

    The payload is ``{"status": ..., "last_updated": <ISO 8601 string>}``.

    Args:
        status: Status text to report.
        last_updated: datetime of the status; sent via ``isoformat()``.
        webhook_url: Target URL of the webhook.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the server answered with a non-error status code, False on an
        error status or when the request itself failed (connection problem,
        timeout, invalid URL).
    """
    data = {"status": status, "last_updated": last_updated.isoformat()}
    try:
        # Without a timeout a non-responding server would block forever.
        response = requests.post(webhook_url, json=data, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Fehler beim Senden der Daten: {exc}")
        return False
    # Webhooks commonly answer 201/202/204 as well, so accept every
    # non-error status rather than only 200.
    success = response.ok
    if success:
        print("Daten erfolgreich gesendet.")
    else:
        print(f"Fehler beim Senden der Daten: {response.status_code}, {response.text}")
    return success
def solve_captcha_flow(driver):
    """Try at most MAX_CAPTCHA_ATTEMPTS times to get past the captcha.

    Args:
        driver: Selenium WebDriver that already shows the captcha page.

    Returns:
        Tuple ``(status, last_updated)`` with the normalized status text and
        the timestamp parsed from the status page.

    Raises:
        RuntimeError: If no status page was reached within the allowed
            attempts, or a page appears that is neither status nor captcha.
    """
    attempt = 0
    while attempt < MAX_CAPTCHA_ATTEMPTS:
        attempt += 1
        print(f"[Attempt {attempt}/{MAX_CAPTCHA_ATTEMPTS}] Löse Audio-Captcha …")

        if not process_captcha_page_with_audio_captcha(driver):
            # Nothing was submitted; fetch a new captcha and try again.
            print("Transkription fehlgeschlagen, lade Captcha-Seite neu …")
            driver.get(start_url)
            continue

        page_html = driver.page_source
        if is_status_page(page_html):
            raw_status, stand = parse_status_page(page_html)
            cleaned_status = normalize_status_text(raw_status)
            print(f"Status ermittelt: {cleaned_status} (Stand: {stand})")
            return cleaned_status, stand

        # The attempt limit is checked before the page-type check, so on the
        # last attempt this message wins even for an unknown page.
        if attempt == MAX_CAPTCHA_ATTEMPTS:
            raise RuntimeError(
                "Maximale Anzahl an Captcha-Versuchen erreicht, ohne Statusseite zu erhalten."
            )
        if not is_captcha_page(page_html):
            raise RuntimeError("Weder Status- noch Captcha-Seite erkannt. Abbruch.")

        print("Captcha nicht gelöst, versuche es erneut …")
        driver.get(start_url)

    # Reached when the final attempt ended with a failed transcription.
    raise RuntimeError("Status konnte nicht ermittelt werden.")
def main():
    """Query the document status in a browser and report it to the webhook.

    Raises:
        RuntimeError: If the status could not be determined or the webhook
            notification was not successful.
    """
    browser = _get_driver()
    try:
        browser.get(start_url)
        final_status, stand = solve_captcha_flow(browser)
        print(f"Final Status: {final_status}, Timestamp: {stand}")
        delivered = notify_webhook(final_status, stand, WEBHOOK_URL)
        if not delivered:
            raise RuntimeError("Webhook konnte nicht benachrichtigt werden.")
    finally:
        # Close the browser no matter how the run ended.
        browser.quit()
# Script entry point: runs the full status query when executed directly.
if __name__ == "__main__":
    # Manual checks that can be enabled instead of (or before) the full run:
    #test_transcription()
    # test_parse_status_page()
    main()

19
requirements.txt Normal file
View File

@@ -0,0 +1,19 @@
# data acquisition and web scraping
requests
selenium
beautifulsoup4
# audio processing
openai-whisper
# traditional OCR
pillow
pytesseract
opencv-python
tqdm
streamlit
python-Levenshtein
# ocr with keras/tensorflow
tensorflow
keras
matplotlib

5
settings.example.py Normal file
View File

@@ -0,0 +1,5 @@
# required: ID of the document; main.py inserts it into the start URL ("d" query parameter)
DOCUMENT_ID = ""
# required: webhook URL the status is POSTed to. main.py imports WEBHOOK_URL
# unconditionally, so the name must be defined here.
# Example: "https://example.com/webhook/your_webhook_id"
WEBHOOK_URL = ""

72
transcription.py Normal file
View File

@@ -0,0 +1,72 @@
import re
# Mapping von gesprochenen Buchstaben (deutsch) auf Zeichen
SPOKEN_TO_CHAR = {
"a": "a", "ah": "a",
"be": "b", "bee": "b", "bei": "b",
"ce": "c", "see": "c", "ze": "c", "cheet": "c", "ci": "c",
"de": "d", "dee": "d",
"e": "e", "eh": "e",
"ef": "f", "eff": "f",
"ge": "g", "geh": "g",
"ha": "h", "hah": "h",
"i": "i", "ih": "i",
"jot": "j", "jay": "j", "yacht": "j", "jöt": "j",
"ka": "k", "kah": "k", "kar": "k", "car": "k",
"el": "l", "ell": "l",
"em": "m", "emm": "m",
"en": "n", "enn": "n",
"o": "o", "oh": "o",
"pe": "p", "peh": "p", "pi": "p", "pee": "p",
"ku": "q", "kuh": "q", "queue": "q", "coup": "q",
"er": "r", "err": "r",
"es": "s", "ess": "s",
"te": "t", "teh": "t", "ti": "t",
"u": "u", "uh": "u",
"vau": "v", "fau": "v", "faul": "v",
"we": "w", "weh": "w",
"ix": "x", "iks": "x",
"ypsilon": "y", "üpsilon": "y",
"zet": "z", "zett": "z", "set": "z", "fett": "z", "sedt": "z",
# Zahlen
"null": "0", "zero": "0",
"eins": "1", "one": "1",
"zwei": "2", "two": "2", "zwo": "2", "svi": "2", "svay": "2", "swei": "2",
"drei": "3", "three": "3",
"vier": "4", "four": "4", "fia": "4","fiar": "4", "sier": "4", "fier": "4",
"fünf": "5", "five": "5", "fönz": "5", "fünfs": "5", "fins": "5",
"sechs": "6", "six": "6",
"sieben": "7", "seven": "7", "zieben": "7", "riben": "7",
"acht": "8", "eight": "8",
"neun": "9", "nine": "9", "noin": "9",
}
def _normalize_transcription(raw_text):
"""Wandelt eine Whisper-Transkription in die tatsächlichen Captcha-Zeichen um."""
# Entferne Satzzeichen und splitte in Tokens
tokens = re.split(r'[,.\s]+', raw_text.lower().strip())
result = []
for token in tokens:
if not token:
continue
# Prüfe, ob das Token ein bekanntes gesprochenes Wort ist
if token in SPOKEN_TO_CHAR:
result.append(SPOKEN_TO_CHAR[token])
# Falls es ein einzelnes Zeichen ist (a-z, 0-9), direkt übernehmen
elif len(token) == 1 and token.isalnum():
result.append(token)
# Sonst ignorieren oder loggen
else:
print(f"Unbekanntes Token: '{token}'")
return ''.join(result)
# Whisper models that were already loaded, keyed by model name, so that
# repeated transcriptions in one process do not reload the model each time.
_WHISPER_MODEL_CACHE = {}


def transcribe_audio_with_whisper(mp3_path, model_name="small"):
    """Transcribe an audio captcha and reduce it to the captcha characters.

    Args:
        mp3_path: Path of the audio file (anything ``str()`` turns into a path).
        model_name: Name of the Whisper model to load; defaults to "small".

    Returns:
        The normalized transcription produced by ``_normalize_transcription``.
    """
    # Imported inside the function, so importing this module does not load
    # the whisper package.
    import whisper

    model = _WHISPER_MODEL_CACHE.get(model_name)
    if model is None:
        model = whisper.load_model(model_name)
        _WHISPER_MODEL_CACHE[model_name] = model

    result = model.transcribe(str(mp3_path), language="de")
    raw_text = result["text"]
    print("Raw transcription:", raw_text)
    cleaned = _normalize_transcription(raw_text)
    print("Cleaned transcription:", cleaned)
    return cleaned