import re # Mapping von gesprochenen Buchstaben (deutsch) auf Zeichen SPOKEN_TO_CHAR = { "a": "a", "ah": "a", "be": "b", "bee": "b", "bei": "b", "ce": "c", "see": "c", "ze": "c", "cheet": "c", "ci": "c", "de": "d", "dee": "d", "e": "e", "eh": "e", "ef": "f", "eff": "f", "ge": "g", "geh": "g", "ha": "h", "hah": "h", "i": "i", "ih": "i", "jot": "j", "jay": "j", "yacht": "j", "jöt": "j", "ka": "k", "kah": "k", "kar": "k", "car": "k", "el": "l", "ell": "l", "em": "m", "emm": "m", "en": "n", "enn": "n", "o": "o", "oh": "o", "pe": "p", "peh": "p", "pi": "p", "pee": "p", "ku": "q", "kuh": "q", "queue": "q", "coup": "q", "er": "r", "err": "r", "es": "s", "ess": "s", "te": "t", "teh": "t", "ti": "t", "u": "u", "uh": "u", "vau": "v", "fau": "v", "faul": "v", "we": "w", "weh": "w", "ix": "x", "iks": "x", "ypsilon": "y", "üpsilon": "y", "zet": "z", "zett": "z", "set": "z", "fett": "z", "sedt": "z", # Zahlen "null": "0", "zero": "0", "eins": "1", "one": "1", "zwei": "2", "two": "2", "zwo": "2", "svi": "2", "svay": "2", "swei": "2", "drei": "3", "three": "3", "vier": "4", "four": "4", "fia": "4","fiar": "4", "sier": "4", "fier": "4", "fünf": "5", "five": "5", "fönz": "5", "fünfs": "5", "fins": "5", "sechs": "6", "six": "6", "sieben": "7", "seven": "7", "zieben": "7", "riben": "7", "acht": "8", "eight": "8", "neun": "9", "nine": "9", "noin": "9", } def _normalize_transcription(raw_text): """Wandelt eine Whisper-Transkription in die tatsächlichen Captcha-Zeichen um.""" # Entferne Satzzeichen und splitte in Tokens tokens = re.split(r'[,.\s]+', raw_text.lower().strip()) result = [] for token in tokens: if not token: continue # Prüfe, ob das Token ein bekanntes gesprochenes Wort ist if token in SPOKEN_TO_CHAR: result.append(SPOKEN_TO_CHAR[token]) # Falls es ein einzelnes Zeichen ist (a-z, 0-9), direkt übernehmen elif len(token) == 1 and token.isalnum(): result.append(token) # Sonst ignorieren oder loggen else: print(f"Unbekanntes Token: '{token}'") return ''.join(result) def transcribe_audio_with_whisper(mp3_path): import whisper model = whisper.load_model("small") result = model.transcribe(str(mp3_path), language='de') raw_text = result["text"] print("Raw transcription:", raw_text) cleaned = _normalize_transcription(raw_text) print("Cleaned transcription:", cleaned) return cleaned