add working implementation

2026-02-03 00:40:06 +01:00
parent 884f53e301
commit afd1dac916
5 changed files with 358 additions and 0 deletions
@@ -188,3 +188,4 @@ cython_debug/
 # Built Visual Studio Code Extensions
 *.vsix
 settings.py
@@ -0,0 +1,261 @@
 from pathlib import Path
 from string import punctuation
 import time
 from datetime import datetime
 from urllib.parse import urlparse
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from bs4 import BeautifulSoup
 from settings import DOCUMENT_ID, WEBHOOK_URL
 from transcription import transcribe_audio_with_whisper
 # Upper bound on captcha-solving attempts made by solve_captcha_flow().
 MAX_CAPTCHA_ATTEMPTS = 3
 # When True, _get_driver() starts Firefox with the "--headless" argument.
 USE_HEADLESS_MODE = False
 # Entry page of the status query; DOCUMENT_ID is read from settings.py.
 start_url = f"https://olmera.verwalt-berlin.de/std/olav/antrag/passpa/1?d={DOCUMENT_ID}"
 # Alternative start URL pointing at localhost (presumably a locally served
 # sample page for offline testing -- confirm before relying on it).
 # start_url = "http://127.0.0.1:8000/sample_1.html"
 # Absolute path used both as the Firefox download directory and as the place
 # where downloaded audio captchas are looked up for transcription.
 download_dir = Path("./audio_captchas").resolve()
 def _extract_mp3_filename(url):
    """Return the last path segment of *url* (e.g. '1770057062035.mp3').

    Only the path component of the parsed URL is inspected, so a query
    string or fragment never ends up in the returned name. A URL whose path
    is empty or ends in '/' yields an empty string.
    """
    _, _, last_segment = urlparse(url).path.rpartition("/")
    return last_segment
 def _get_driver():
    """Build and return a Firefox WebDriver for the scraping session.

    The browser is configured to request German content and to save
    downloads into ``download_dir`` (``browser.download.folderList`` = 2
    selects the custom directory given in ``browser.download.dir``).
    Headless mode is enabled when ``USE_HEADLESS_MODE`` is truthy.
    The caller is responsible for calling ``quit()`` on the returned driver.
    """
    firefox_options = webdriver.FirefoxOptions()
    preferences = (
        ("intl.accept_languages", "de-DE, de"),
        ("browser.download.useDownloadDir", True),
        ("browser.download.folderList", 2),
        ("browser.download.dir", str(download_dir)),
    )
    for pref_name, pref_value in preferences:
        firefox_options.set_preference(pref_name, pref_value)
    if USE_HEADLESS_MODE:
        firefox_options.add_argument("--headless")
    return webdriver.Firefox(options=firefox_options)
 # Solve the captcha through its audio variant: download, transcribe, submit.
 def process_captcha_page_with_audio_captcha(driver, download_timeout=15.0):
    """Solve the captcha on the current page using the audio captcha.

    Steps: save a screenshot of the image captcha to ``captcha.png``, click
    the audio captcha link so Firefox downloads the MP3 into
    ``download_dir``, wait for the download to finish, transcribe the file
    and, if the transcription has exactly 4 characters, type it into the
    ``code`` field and submit the form.

    Args:
        driver: Selenium WebDriver currently showing the captcha page.
        download_timeout: Maximum number of seconds to wait for the MP3 to
            appear completely in ``download_dir``.

    Returns:
        True if a 4-character code was entered and the form was submitted.
        False if the audio file did not arrive in time or the transcription
        did not have the expected length (the form is not submitted then).
    """
    captcha_box = driver.find_element(By.ID, "captcha")
    # Keep a screenshot of the image captcha next to the script.
    captcha_box.find_element(By.TAG_NAME, "img").screenshot("captcha.png")

    audio_link = captcha_box.find_element(
        By.CLASS_NAME, "audioCaptcha"
    ).find_element(By.TAG_NAME, "a")
    audio_captcha = audio_link.get_attribute("href")
    print(audio_captcha)
    # Clicking the link triggers the browser download of the MP3.
    audio_link.click()
    print(f"Download dir: {str(download_dir)}")
    mp3_filename = _extract_mp3_filename(audio_captcha)
    print(f"Extracted MP3 filename: {mp3_filename}")
    mp3_path = download_dir / mp3_filename

    # Poll for the finished file instead of sleeping a fixed time: a fixed
    # delay is either too short (file missing/partial) or needlessly long.
    # Firefox normally keeps a "<name>.part" file while the download runs.
    print("Warte auf den Download der Audiodatei...")
    partial_path = mp3_path.with_name(mp3_path.name + ".part")
    deadline = time.monotonic() + download_timeout
    while True:
        download_complete = (
            mp3_path.is_file()
            and mp3_path.stat().st_size > 0
            and not partial_path.exists()
        )
        if download_complete:
            break
        if time.monotonic() >= deadline:
            print(
                f"Audiodatei wurde nicht innerhalb von {download_timeout} "
                "Sekunden heruntergeladen. Abbruch."
            )
            return False
        time.sleep(0.2)
    print(f"Lokaler Pfad der Audiodatei: {mp3_path}")

    # Perform transcription.
    transcribed_text = transcribe_audio_with_whisper(mp3_path)
    print(f"Transkribierter Text: {transcribed_text}")
    if len(transcribed_text) != 4:
        print("Transkription hat nicht die erwartete Länge von 4 Zeichen. Abbruch.")
        return False

    # Fill in the form.
    driver.find_element(By.CLASS_NAME, "formtable").find_element(
        By.ID, "code"
    ).send_keys(transcribed_text)
    # Submit the form.
    driver.find_element(By.CLASS_NAME, "actionbuttons").find_element(
        By.CLASS_NAME, "button"
    ).click()
    return True
 def normalize_status_text(raw_text: str) -> str:
    """Reduce a raw status sentence to the bare status phrase.

    Removes the fixed prefix "Das Dokument ist" and then trims surrounding
    whitespace and punctuation, e.g. "Das Dokument ist in Bearbeitung."
    becomes "in Bearbeitung".

    Args:
        raw_text: Status sentence as extracted from the status page.

    Returns:
        The status phrase without prefix, surrounding whitespace or
        leading/trailing punctuation.
    """
    without_prefix = raw_text.replace("Das Dokument ist", "")
    # Strip whitespace first: otherwise trailing whitespace would shield a
    # final period from strip(punctuation) and it would survive.
    return without_prefix.strip().strip(punctuation).strip()
 # Leftover from the OCR approach, which was tried first but was not successful.
 def screenshot_captcha(save_dir="./captchas", prefix="captcha"):
    """Open the start page and save a screenshot of the captcha image.

    The file is written as ``<save_dir>/<prefix>_<YYYYmmdd_HHMMSS>.png``.
    ``save_dir`` is created if it does not exist yet, and the browser is
    always closed, even if loading the page or taking the screenshot fails.

    Args:
        save_dir: Directory in which the screenshot is stored.
        prefix: File name prefix placed before the timestamp.
    """
    Path(save_dir).mkdir(parents=True, exist_ok=True)
    # Build a unique file name from the current timestamp.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{save_dir}/{prefix}_{timestamp}.png"

    driver = _get_driver()
    try:
        driver.get(start_url)
        driver.find_element(By.ID, "captcha").find_element(
            By.TAG_NAME, "img"
        ).screenshot(filename)
    finally:
        # Never leave a stray Firefox process behind.
        driver.quit()
    print(f"Captcha gespeichert als {filename}")
 def test_transcription():
    """Transcribe every MP3 in ``download_dir`` and report a success ratio.

    Each ``*.mp3`` file is transcribed with Whisper and the result is written
    to ``transcription_results.csv`` (columns: ``filename``,
    ``transcription``; the file is created or overwritten). A transcription
    counts as a hit when it has exactly 4 characters; this is a length
    check only, not a comparison against the true captcha text.

    If no MP3 files are found, a message is printed and no ratio is computed.

    Returns:
        None
    """
    import csv  # local import: only this manual helper needs it

    four_char_hits = 0
    total = 0
    with open(
        "transcription_results.csv", "w", encoding="utf-8", newline=""
    ) as csvfile:
        # lineterminator="\n" keeps the same line endings as before while the
        # csv module takes care of quoting.
        writer = csv.writer(csvfile, lineterminator="\n")
        writer.writerow(["filename", "transcription"])
        for mp3_path in sorted(download_dir.glob("*.mp3")):
            print(f"Verarbeite Datei: {mp3_path}")
            transcription = transcribe_audio_with_whisper(mp3_path)
            print(f"Transkription: {transcription}")
            writer.writerow([mp3_path.name, transcription])
            if transcription and len(transcription) == 4:
                four_char_hits += 1
            total += 1

    if total == 0:
        # Avoid a ZeroDivisionError when the directory holds no MP3 files.
        print(f"Keine MP3-Dateien in {download_dir} gefunden.")
        return
    print(
        f"Genauigkeit: {four_char_hits}/{total} = {four_char_hits/total*100:.2f}%"
    )
 def is_captcha_page(html):
    """Return True if *html* contains a ``<fieldset aria-label="CAPTCHA">``."""
    document = BeautifulSoup(html, "html.parser")
    return document.find("fieldset", {"aria-label": "CAPTCHA"}) is not None
 def is_status_page(html):
    """Return True if *html* contains the status result fieldset.

    The page is recognised by a ``<fieldset>`` whose ``aria-label`` is
    "Statusabfrage Suchergebnis".
    """
    wanted_attrs = {"aria-label": "Statusabfrage Suchergebnis"}
    result_box = BeautifulSoup(html, "html.parser").find("fieldset", attrs=wanted_attrs)
    if result_box is None:
        return False
    return True
 def parse_status_page(html):
    """Extract the status sentence and its timestamp from a status page.

    The status is the text of the first ``.warn p`` element inside the
    "Statusabfrage Suchergebnis" fieldset; the timestamp is read from the
    next ``<p>`` after it, with the "Stand: " label removed and parsed as
    ``%d.%m.%Y %H:%M:%S``. Both values are also printed.

    Args:
        html: HTML source of the status page.

    Returns:
        Tuple ``(status_text, timestamp)`` of the raw status string and a
        naive ``datetime``.

    Raises:
        ValueError: If the fieldset, the status paragraph or the timestamp
            paragraph is missing, or if the timestamp has an unexpected
            format.
    """
    result_box = BeautifulSoup(html, "html.parser").find(
        "fieldset", attrs={"aria-label": "Statusabfrage Suchergebnis"}
    )
    if result_box is None:
        raise ValueError("Statusbereich nicht gefunden (fehlendes Fieldset).")

    status_paragraph = result_box.select_one(".warn p")
    if status_paragraph is None:
        raise ValueError("Statusabschnitt (.warn p) nicht gefunden.")

    stand_paragraph = status_paragraph.find_next("p")
    if stand_paragraph is None:
        raise ValueError("Zeitstempelabschnitt (nächstes <p>) nicht gefunden.")

    status_text = status_paragraph.get_text(strip=True)
    stand_text = stand_paragraph.get_text(strip=True)
    timestamp_dt = datetime.strptime(
        stand_text.replace("Stand: ", ""), "%d.%m.%Y %H:%M:%S"
    )

    print("Status:", status_text)
    print("Stand :", timestamp_dt.isoformat())
    return status_text, timestamp_dt
 def test_parse_status_page():
    """Manually exercise the status-page helpers on a saved sample page.

    Reads ``./sample_html/status.html``, prints whether it is recognised as
    a status page, then parses and normalizes the status and prints the
    result. Exceptions raised by ``parse_status_page`` are not caught.
    """
    # Decode explicitly instead of using the platform default, which would
    # garble German umlauts on non-UTF-8 locales.
    # NOTE(review): assumes the sample file was saved as UTF-8 -- confirm.
    html = Path("./sample_html/status.html").read_text(encoding="utf-8")
    is_status = is_status_page(html)
    print(f"is_status_page: {is_status}")
    status, timestamp = parse_status_page(html)
    status = normalize_status_text(status)
    print(f"Status: {status}, Timestamp: {timestamp}")
 def notify_webhook(status, last_updated, webhook_url, timeout=10):
    """POST the status and its timestamp as JSON to a webhook.

    The payload is ``{"status": ..., "last_updated": <ISO 8601 string>}``.

    Args:
        status: Normalized status text.
        last_updated: ``datetime`` of the status; sent via ``isoformat()``.
        webhook_url: Target URL of the webhook.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        True if the server answered with a 2xx status code, otherwise False
        (including connection errors, timeouts and invalid URLs).
    """
    data = {"status": status, "last_updated": last_updated.isoformat()}
    try:
        response = requests.post(webhook_url, json=data, timeout=timeout)
    except requests.RequestException as exc:
        # Network problems are reported like any other failed delivery so
        # the caller only has to check the boolean result.
        print(f"Fehler beim Senden der Daten: {exc}")
        return False

    # Any 2xx answer is a success; many webhooks reply 202 or 204, not 200.
    success = 200 <= response.status_code < 300
    if success:
        print("Daten erfolgreich gesendet.")
    else:
        print(f"Fehler beim Senden der Daten: {response.status_code}, {response.text}")
    return success
 def solve_captcha_flow(driver):
    """Try up to MAX_CAPTCHA_ATTEMPTS times to get past the captcha.

    Each round solves the audio captcha on the current page. If the form
    could not be submitted, or the captcha page is shown again afterwards,
    the start page is reloaded and the next round begins.

    Args:
        driver: Selenium WebDriver already showing the captcha page.

    Returns:
        Tuple ``(status, last_updated)`` with the normalized status text and
        the timestamp parsed from the status page.

    Raises:
        RuntimeError: If the last attempt submitted a code without reaching
            the status page, if an unrecognised page shows up, or if all
            attempts are used up without a result.
    """
    attempt = 0
    while attempt < MAX_CAPTCHA_ATTEMPTS:
        attempt += 1
        print(f"[Attempt {attempt}/{MAX_CAPTCHA_ATTEMPTS}] Löse Audio-Captcha …")

        if process_captcha_page_with_audio_captcha(driver):
            html = driver.page_source

            if is_status_page(html):
                status_raw, last_updated = parse_status_page(html)
                status = normalize_status_text(status_raw)
                print(f"Status ermittelt: {status} (Stand: {last_updated})")
                return status, last_updated

            if attempt == MAX_CAPTCHA_ATTEMPTS:
                raise RuntimeError(
                    "Maximale Anzahl an Captcha-Versuchen erreicht, ohne Statusseite zu erhalten."
                )

            if not is_captcha_page(html):
                raise RuntimeError("Weder Status- noch Captcha-Seite erkannt. Abbruch.")

            print("Captcha nicht gelöst, versuche es erneut …")
        else:
            print("Transkription fehlgeschlagen, lade Captcha-Seite neu …")

        # Both retry paths start over from a fresh captcha page.
        driver.get(start_url)

    raise RuntimeError("Status konnte nicht ermittelt werden.")
 def main():
    """Fetch the document status and forward it to the configured webhook.

    Opens the start page in a fresh browser, solves the captcha, prints the
    resulting status and timestamp, and posts them to ``WEBHOOK_URL``. The
    browser is always closed, and it is closed before the webhook call
    because it is no longer needed at that point. When ``WEBHOOK_URL`` is
    empty the notification is skipped.

    Raises:
        RuntimeError: If the status cannot be determined or a configured
            webhook could not be notified.
    """
    driver = _get_driver()
    try:
        driver.get(start_url)
        status, last_updated = solve_captcha_flow(driver)
    finally:
        driver.quit()

    print(f"Final Status: {status}, Timestamp: {last_updated}")
    if not WEBHOOK_URL:
        # An empty URL means "no webhook configured", not a delivery failure.
        print("Keine WEBHOOK_URL konfiguriert, Benachrichtigung wird übersprungen.")
        return
    if not notify_webhook(status, last_updated, WEBHOOK_URL):
        raise RuntimeError("Webhook konnte nicht benachrichtigt werden.")
 if __name__ == "__main__":
    # Script entry point. The two commented-out calls below are manual
    # helpers defined in this file; enable one of them for ad-hoc checks.
    # test_transcription()
    # test_parse_status_page()
    main()
@@ -0,0 +1,19 @@
 # data acquisition and web scraping
 selenium
 beautifulsoup4
 requests
 # audio processing
 openai-whisper
 # traditional OCR
 pillow
 pytesseract
 opencv-python
 tqdm
 streamlit
 python-Levenshtein
 # ocr with keras/tensorflow
 tensorflow
 keras
 matplotlib
@@ -0,0 +1,5 @@
 # Required: document ID; it is inserted into the start URL as the "d" query
 # parameter.
 DOCUMENT_ID = ""
 # Webhook URL to send notifications to.
 # NOTE: the scraper does `from settings import DOCUMENT_ID, WEBHOOK_URL`, so
 # the name WEBHOOK_URL must exist in settings.py -- uncomment the line below
 # and set it, otherwise the import fails.
 #WEBHOOK_URL = "https://example.com/webhook/your_webhook_id"
@@ -0,0 +1,72 @@
 import re
 # Mapping from spoken letter/digit names (as they appear, lower-cased, in the
 # Whisper transcription) to the single captcha character they stand for.
 # Besides the regular German names the table contains English words and
 # unusual spellings (e.g. "yacht", "faul", "fett"), presumably collected from
 # observed mis-transcriptions -- verify before pruning any entry.
 SPOKEN_TO_CHAR = {
    "a": "a", "ah": "a",
    "be": "b", "bee": "b", "bei": "b",
    "ce": "c", "see": "c", "ze": "c", "cheet": "c", "ci": "c",
    "de": "d", "dee": "d",
    "e": "e", "eh": "e",
    "ef": "f", "eff": "f",
    "ge": "g", "geh": "g",
    "ha": "h", "hah": "h",
    "i": "i", "ih": "i",
    "jot": "j", "jay": "j", "yacht": "j", "jöt": "j",
    "ka": "k", "kah": "k", "kar": "k", "car": "k",
    "el": "l", "ell": "l",
    "em": "m", "emm": "m",
    "en": "n", "enn": "n",
    "o": "o", "oh": "o",
    "pe": "p", "peh": "p", "pi": "p", "pee": "p",
    "ku": "q", "kuh": "q", "queue": "q", "coup": "q",
    "er": "r", "err": "r",
    "es": "s", "ess": "s",
    "te": "t", "teh": "t", "ti": "t",
    "u": "u", "uh": "u",
    "vau": "v", "fau": "v", "faul": "v",
    "we": "w", "weh": "w",
    "ix": "x", "iks": "x",
    "ypsilon": "y", "üpsilon": "y",
    "zet": "z", "zett": "z", "set": "z", "fett": "z", "sedt": "z",
    # Digits
    "null": "0", "zero": "0",
    "eins": "1", "one": "1",
    "zwei": "2", "two": "2", "zwo": "2", "svi": "2", "svay": "2", "swei": "2",
    "drei": "3", "three": "3",
    "vier": "4", "four": "4", "fia": "4","fiar": "4", "sier": "4", "fier": "4",
    "fünf": "5", "five": "5", "fönz": "5", "fünfs": "5", "fins": "5",
    "sechs": "6", "six": "6",
    "sieben": "7", "seven": "7", "zieben": "7", "riben": "7",
    "acht": "8", "eight": "8",
    "neun": "9", "nine": "9", "noin": "9",
 }
 def _normalize_transcription(raw_text):
    """Convert a raw Whisper transcription into the actual captcha characters.

    The text is lower-cased and split into tokens on whitespace and common
    punctuation. Each token is then resolved as follows:

    * a known spoken name in ``SPOKEN_TO_CHAR`` becomes its mapped character;
    * a run of ASCII digits (e.g. "34") contributes every digit, since
      consecutive spoken digits may be transcribed as a single number;
    * a single alphanumeric character is taken over unchanged;
    * anything else is reported on stdout and ignored.

    Args:
        raw_text: Text as returned by Whisper.

    Returns:
        The concatenated captcha characters (possibly an empty string).
    """
    # Besides comma, period and whitespace, also split on ; : ! ? and hyphen
    # so that tokens such as "vier!" or "a-b" are still recognised.
    tokens = re.split(r"[\s,.;:!?\-]+", raw_text.lower().strip())
    result = []
    for token in tokens:
        if not token:
            continue
        if token in SPOKEN_TO_CHAR:
            # Known spoken word.
            result.append(SPOKEN_TO_CHAR[token])
        elif token.isascii() and token.isdigit():
            # One or more plain digits written together.
            result.extend(token)
        elif len(token) == 1 and token.isalnum():
            # A single character is taken over directly.
            result.append(token)
        else:
            print(f"Unbekanntes Token: '{token}'")
    return "".join(result)
 # Loaded Whisper models, keyed by model name, so repeated transcriptions in
 # one process do not reload the model from disk every time.
 _WHISPER_MODELS = {}


 def _get_whisper_model(model_name):
    """Return the Whisper model *model_name*, loading it on first use."""
    if model_name not in _WHISPER_MODELS:
        # Imported lazily so that importing this module stays cheap.
        import whisper

        _WHISPER_MODELS[model_name] = whisper.load_model(model_name)
    return _WHISPER_MODELS[model_name]


 def transcribe_audio_with_whisper(mp3_path, model_name="small"):
    """Transcribe an audio captcha and return the normalized captcha text.

    The audio is transcribed with Whisper using German as the language; the
    raw text is then converted to captcha characters by
    ``_normalize_transcription``. Raw and cleaned text are printed.

    Args:
        mp3_path: Path of the audio file (``str`` or path-like).
        model_name: Name of the Whisper model to use; the model is loaded
            once per process and reused on later calls.

    Returns:
        The cleaned transcription as a string.
    """
    model = _get_whisper_model(model_name)
    result = model.transcribe(str(mp3_path), language='de')
    raw_text = result["text"]
    print("Raw transcription:", raw_text)
    cleaned = _normalize_transcription(raw_text)
    print("Cleaned transcription:", cleaned)
    return cleaned