From 1943042a424b44fe5aa23f329a61a4776bb029c5 Mon Sep 17 00:00:00 2001
From: cyroxx <cyroxx@ccc-p.org>
Date: Thu, 5 Feb 2026 02:35:29 +0100
Subject: [PATCH] fix: update tokenization regex to handle hyphens in
 transcription

---
 transcription.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transcription.py b/transcription.py
index 038ea08..e113095 100644
--- a/transcription.py
+++ b/transcription.py
@@ -50,7 +50,7 @@ SPOKEN_TO_CHAR = {
 def _normalize_transcription(raw_text):
     """Wandelt eine Whisper-Transkription in die tatsächlichen Captcha-Zeichen um."""
     # Entferne Satzzeichen und splitte in Tokens
-    tokens = re.split(r'[,.\s]+', raw_text.lower().strip())
+    tokens = re.split(r'[-,\.\s]+', raw_text.lower().strip())
     result = []
     for token in tokens:
         if not token: