From 1943042a424b44fe5aa23f329a61a4776bb029c5 Mon Sep 17 00:00:00 2001 From: cyroxx Date: Thu, 5 Feb 2026 02:35:29 +0100 Subject: [PATCH] fix: update tokenization regex to handle hyphens in transcription --- transcription.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transcription.py b/transcription.py index 038ea08..e113095 100644 --- a/transcription.py +++ b/transcription.py @@ -50,7 +50,7 @@ SPOKEN_TO_CHAR = { def _normalize_transcription(raw_text): """Wandelt eine Whisper-Transkription in die tatsächlichen Captcha-Zeichen um.""" # Entferne Satzzeichen und splitte in Tokens - tokens = re.split(r'[,.\s]+', raw_text.lower().strip()) + tokens = re.split(r'[-,\.\s]+', raw_text.lower().strip()) result = [] for token in tokens: if not token: