fix: update tokenization regex to handle hyphens in transcription

2026-02-05 02:35:29 +01:00
parent 0cb53b1822
commit 1943042a42
1 changed file with 1 addition and 1 deletion
@@ -50,7 +50,7 @@ SPOKEN_TO_CHAR = {
 def _normalize_transcription(raw_text):
    """Wandelt eine Whisper-Transkription in die tatsächlichen Captcha-Zeichen um."""
    # Entferne Satzzeichen und splitte in Tokens
-    tokens = re.split(r'[,.\s]+', raw_text.lower().strip())
+    tokens = re.split(r'[-,\.\s]+', raw_text.lower().strip())
    result = []
    for token in tokens:
        if not token: