fix: update tokenization regex to handle hyphens in transcription
Some checks failed
Python tests / tests (push) Failing after 2m5s
Some checks failed
Python tests / tests (push) Failing after 2m5s
This commit is contained in:
@@ -50,7 +50,7 @@ SPOKEN_TO_CHAR = {
|
|||||||
def _normalize_transcription(raw_text):
|
def _normalize_transcription(raw_text):
|
||||||
"""Wandelt eine Whisper-Transkription in die tatsächlichen Captcha-Zeichen um."""
|
"""Wandelt eine Whisper-Transkription in die tatsächlichen Captcha-Zeichen um."""
|
||||||
# Entferne Satzzeichen und splitte in Tokens
|
# Entferne Satzzeichen und splitte in Tokens
|
||||||
tokens = re.split(r'[,.\s]+', raw_text.lower().strip())
|
tokens = re.split(r'[-,\.\s]+', raw_text.lower().strip())
|
||||||
result = []
|
result = []
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
if not token:
|
if not token:
|
||||||
|
|||||||
Reference in New Issue
Block a user