fix: update tokenization regex to handle hyphens in transcription
Some checks failed
Python tests / tests (push) Failing after 2m5s
This commit is contained in:
@@ -50,7 +50,7 @@ SPOKEN_TO_CHAR = {
|
||||
def _normalize_transcription(raw_text):
|
||||
"""Wandelt eine Whisper-Transkription in die tatsächlichen Captcha-Zeichen um."""
|
||||
# Entferne Satzzeichen und splitte in Tokens
|
||||
tokens = re.split(r'[,.\s]+', raw_text.lower().strip())
|
||||
tokens = re.split(r'[-,\.\s]+', raw_text.lower().strip())
|
||||
result = []
|
||||
for token in tokens:
|
||||
if not token:
|
||||
|
||||
Reference in New Issue
Block a user