add ocr code

2026-02-03 00:52:49 +01:00
parent 33cd5346bf
commit 7367226210
6 changed files with 3151 additions and 0 deletions
@@ -0,0 +1,5 @@
+My first attempt was to use OCR,
+but I was unable to get satisfactory results quickly.
+
+However, I am keeping the code here for future reference,
+in case someone (including myself) finds it useful.
@@ -0,0 +1,56 @@
+import streamlit as st
+import os
+import csv
+from PIL import Image
+
+"""Einfaches Streamlit-Tool zum manuellen Labeln von Captcha-Bildern."""
+
+# Directory that is scanned for captcha images (*.png)
+CAPTCHA_DIR = "./captchas"
+# CSV file that receives one "filename,label" row per labelled captcha
+CSV_FILE = "captcha_labels.csv"
+
+def load_labeled():
+    """Return the set of file names that already have a row in CSV_FILE.
+
+    Only the first column of each row is read. If the CSV file does not
+    exist yet, an empty set is returned.
+    """
+    try:
+        with open(CSV_FILE, newline='') as f:
+            # csv.reader yields [] for blank lines; skip them so that
+            # row[0] cannot raise IndexError.
+            return {row[0] for row in csv.reader(f) if row}
+    except FileNotFoundError:
+        return set()
+
+def save_label(filename, label):
+    """Append a single ``filename,label`` row to CSV_FILE."""
+    with open(CSV_FILE, "a", newline='') as csv_out:
+        csv.writer(csv_out).writerow([filename, label])
+
+def get_unlabeled_files():
+    """Return the ``.png`` file names in CAPTCHA_DIR that have no label yet.
+
+    The result is sorted: os.listdir returns entries in arbitrary order, and
+    the script below addresses this list by position across reruns.
+    """
+    labeled = load_labeled()
+    return sorted(
+        f for f in os.listdir(CAPTCHA_DIR)
+        if f.endswith(".png") and f not in labeled
+    )
+
+st.title("Captcha Labeler")
+
+# Streamlit re-executes this script on every interaction, so the list of
+# unlabeled files is rebuilt each time and shrinks as labels are saved.
+files = get_unlabeled_files()
+if not files:
+    st.success("Alle Captchas sind gelabelt!")
+else:
+    if "idx" not in st.session_state:
+        st.session_state.idx = 0
+    if "flash" not in st.session_state:
+        st.session_state.flash = ""
+    if st.session_state.flash:
+        st.success(st.session_state.flash)
+        st.session_state.flash = ""  # reset after it has been shown once
+
+    # Skip the current captcha without labelling it.
+    if st.button("Nächstes Captcha"):
+        st.session_state.idx += 1
+        st.rerun()
+    # Skipped captchas are still unlabeled: wrap around to them instead of
+    # reporting that everything is labelled while `files` is non-empty.
+    if st.session_state.idx >= len(files):
+        st.session_state.idx = 0
+
+    fname = files[st.session_state.idx]
+    img = Image.open(os.path.join(CAPTCHA_DIR, fname))
+    st.image(img, caption=fname)
+    with st.form(key=f"form_{fname}"):
+        label = st.text_input("Lösung eingeben", key=f"label_{fname}")
+        submitted = st.form_submit_button("Speichern")
+        if submitted and label.strip():
+            save_label(fname, label.strip())
+            st.session_state.flash = f"Gespeichert: {fname} -> {label.strip()}"
+            # Do not advance idx here: the saved file drops out of `files`
+            # on the rerun, so the same index already points at the next
+            # captcha. Incrementing as well would skip every second image.
+            st.rerun()
@@ -0,0 +1,34 @@
+from PIL import Image
+import pytesseract
+import sys
+
+"""OCR für Captcha-Bilder mit Tesseract only"""
+
+# TEST config
+# The whitelist covers a-z and all ten digits (the digit 7 used to be missing).
+custom_config = r'--psm 7 -c tessedit_char_whitelist="abcdefghijklmnopqrstuvwxyz1234567890"'
+
+def recognize_captcha(img_path):
+    """Run Tesseract on a binarized version of the image at *img_path*.
+
+    The image is converted to grayscale, mapped through a 256-entry lookup
+    table into a mode "1" image and handed to pytesseract together with
+    ``custom_config``. The raw string returned by
+    ``pytesseract.image_to_string`` is returned unchanged.
+    """
+    cutoff = 150
+    # 1. threshold the image: gray levels below the cutoff map to 0, the rest to 1
+    lut = [0 if level < cutoff else 1 for level in range(256)]
+    gray = Image.open(img_path).convert("L")
+    binary = gray.point(lut, '1')
+    #binary.show()
+    # 2. recognize with tesseract
+    return pytesseract.image_to_string(binary, config=custom_config)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print("Usage: python recognize.py <image_filename>")
+        # Stop here: without exactly one argument, sys.argv[1] below is
+        # either missing (IndexError) or not the only intended input.
+        sys.exit(1)
+    res = recognize_captcha(sys.argv[1])
+    # The OCR result may span several lines; only the first one is printed.
+    # str.split always returns at least one element, so no length check is needed.
+    print(res.split("\n")[0])
@@ -0,0 +1,50 @@
+import cv2
+import pytesseract
+
+"""OCR für Captcha-Bilder mit OpenCV und Tesseract"""
+
+#img_path = "captcha.png"
+#img_path = "samples/sample_1-Dateien/1769810847305.jpg"
+#img_path = "samples/sample_2-Dateien/1769811067589.jpg"
+#img_path = "samples/sample_3_files/1769812197128.jpg"
+img_path = "samples/sample_4-Dateien/1769818949905.jpg"
+
+# Load the image
+img = cv2.imread(img_path)
+if img is None:
+    # cv2.imread does not raise for a missing or unreadable file; it returns
+    # None, which would otherwise surface as an opaque error in cvtColor.
+    raise FileNotFoundError(f"Could not read image: {img_path}")
+
+# Grayscale
+gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+# Slight blur to reduce the grid noise
+gray = cv2.GaussianBlur(gray, (3, 3), 0)
+
+# Threshold (Otsu)
+_, thresh = cv2.threshold(
+    gray, 0, 255,
+    cv2.THRESH_BINARY + cv2.THRESH_OTSU
+)
+
+###
+# Invert (if necessary)
+#thresh = cv2.bitwise_not(thresh)
+
+# Morphology, to close gaps in the characters
+#kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
+#thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
+
+###
+
+# save the processed image for debugging
+cv2.imwrite("processed_captcha.png", thresh)
+
+# Tesseract config
+custom_config = r"""
+--oem 3
+--psm 7
+-c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz0123456789
+"""
+
+# Recognize the text
+text = pytesseract.image_to_string(thresh, config=custom_config)
+
+print(text.strip())
@@ -0,0 +1,179 @@
+import csv
+import os
+from PIL import Image, ImageOps, ImageFilter
+import pytesseract
+import cv2
+import numpy as np
+import itertools
+from tqdm import tqdm
+import Levenshtein
+
+"""
+OCR für Captcha-Bilder mit verschiedenen Vorverarbeitungen und Tesseract-Konfigurationen
+
+Es werden verschiedene Kombinationen von Bildvorverarbeitungen (teils mit OpenCV)
+und Tesseract-Konfigurationen getestet, um die beste Erkennungsgenauigkeit zu ermitteln.
+"""
+
+# Directory containing the captcha images named in the label file
+CAPTCHA_DIR = "./captchas"
+# CSV file whose rows are read as (image file name, expected text)
+CSV_FILE = "captcha_labels.csv"
+
+# --- Base preprocessing steps ---
+def to_grayscale(img):
+    """Return *img* converted to mode "L"."""
+    gray = img.convert("L")
+    return gray
+
+def to_bw(img, threshold=140):
+    """Return a mode "1" version of *img*, binarized at *threshold*.
+
+    Gray levels below *threshold* map to 0, all others to 255.
+    """
+    def binarize(level):
+        return 255 if level >= threshold else 0
+
+    gray = img.convert("L")
+    return gray.point(binarize, '1')
+
+def invert(img):
+    """Return the inverted mode "L" version of *img*."""
+    gray = img.convert("L")
+    return ImageOps.invert(gray)
+
+def sharpen(img):
+    """Apply ImageFilter.SHARPEN to *img*.
+
+    Images that are neither "L" nor "RGB" are converted to "L" first.
+    """
+    filterable = img if img.mode in ("L", "RGB") else img.convert("L")
+    return filterable.filter(ImageFilter.SHARPEN)
+
+def blur(img):
+    """Apply a Gaussian blur of radius 1 to *img*.
+
+    Images that are neither "L" nor "RGB" are converted to "L" first.
+    """
+    filterable = img if img.mode in ("L", "RGB") else img.convert("L")
+    return filterable.filter(ImageFilter.GaussianBlur(1))
+
+def resize2x(img):
+    """Return *img* scaled to twice its width and height using Image.LANCZOS."""
+    doubled_size = (img.width * 2, img.height * 2)
+    return img.resize(doubled_size, Image.LANCZOS)
+
+def crop_left_2_3(img):
+    """Return the left two thirds of *img* at full height."""
+    width, height = img.size
+    right_edge = int(width * 2 / 3)
+    return img.crop((0, 0, right_edge, height))
+
+def opencv_adaptive_thresh(img):
+    """Binarize *img* with cv2.adaptiveThreshold and return a PIL image.
+
+    The image is converted to mode "L" and then to a NumPy array first.
+    """
+    gray = np.array(img.convert("L"))
+    binary = cv2.adaptiveThreshold(
+        gray,
+        255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY,
+        11,
+        2,
+    )
+    return Image.fromarray(binary)
+
+def opencv_otsu(img):
+    """Binarize *img* with cv2.threshold using the Otsu flag and return a PIL image."""
+    gray = np.array(img.convert("L"))
+    otsu_result = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    # cv2.threshold returns a pair; only the second element (the image) is used
+    binary = otsu_result[1]
+    return Image.fromarray(binary)
+
+def opencv_erode(img):
+    """Erode the mode "L" version of *img* once with a 2x2 kernel of ones."""
+    gray = np.array(img.convert("L"))
+    structuring = np.ones((2, 2), np.uint8)
+    return Image.fromarray(cv2.erode(gray, structuring, iterations=1))
+
+def opencv_dilate(img):
+    """Dilate the mode "L" version of *img* once with a 2x2 kernel of ones."""
+    gray = np.array(img.convert("L"))
+    structuring = np.ones((2, 2), np.uint8)
+    return Image.fromarray(cv2.dilate(gray, structuring, iterations=1))
+
+def opencv_contrast(img):
+    """Apply CLAHE (clipLimit=2.0, tileGridSize=(8,8)) to the mode "L" version of *img*."""
+    gray = np.array(img.convert("L"))
+    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+    return Image.fromarray(equalizer.apply(gray))
+
+# --- Compose preprocessing steps ---
+def compose(*funcs):
+    """Return a function that applies *funcs* one after another, left to right.
+
+    With no functions given, the returned function hands its argument back
+    unchanged.
+    """
+    def composed(img):
+        result = img
+        for step in funcs:
+            result = step(result)
+        return result
+    return composed
+
+# --- Define base steps and generate combinations ---
+# Maps a short step name to the callable that implements it. The names are
+# used below to build the names of the combined preprocessing variants.
+base_steps = {
+    "none": lambda img: img,  # identity: returns the image unchanged
+    "grayscale": to_grayscale,
+    "bw": to_bw,
+    "invert": invert,
+    "sharpen": sharpen,
+    "blur": blur,
+    "resize2x": resize2x,
+    "crop_left_2_3": crop_left_2_3,
+    "opencv_adaptive_thresh": opencv_adaptive_thresh,
+    "opencv_otsu": opencv_otsu,
+    "opencv_erode": opencv_erode,
+    "opencv_dilate": opencv_dilate,
+    "opencv_contrast": opencv_contrast,
+}
+
+# Generate all 1-step variants and all ordered 2-step combinations of distinct
+# steps. "none" is never part of a combination: as the identity step it would
+# only duplicate the single-step variant ("none+bw" behaves exactly like "bw"),
+# costing extra OCR runs and producing duplicate rows in the results.
+preprocessings = dict(base_steps)
+
+for (name1, func1), (name2, func2) in itertools.permutations(base_steps.items(), 2):
+    if "none" in (name1, name2):
+        continue
+    preprocessings[f"{name1}+{name2}"] = compose(func1, func2)
+
+# Characters present in your labels:
+whitelist = "23456789abcdefghijklmnopqrstuvwxyz"
+
+# Building blocks for the configuration strings below
+_whitelist_opt = f"-c tessedit_char_whitelist={whitelist}"
+_psm_modes = (7, 8, 6, 13)
+
+# Tesseract configurations to evaluate; the order is the evaluation order.
+tess_configs = [
+    # --psm only
+    *(f"--psm {psm}" for psm in _psm_modes),
+    # --psm 7 with an explicit --oem
+    "--psm 7 --oem 1",
+    "--psm 7 --oem 3",
+    # --psm plus character whitelist
+    *(f"--psm {psm} {_whitelist_opt}" for psm in _psm_modes),
+    # --psm and --oem plus character whitelist
+    *(
+        f"--psm {psm} --oem {oem} {_whitelist_opt}"
+        for psm in _psm_modes
+        for oem in (1, 3)
+    ),
+]
+
+# Read the label file; only the first 10 rows (images) are used
+with open(CSV_FILE, newline='') as f:
+    reader = csv.reader(f)
+    data = list(itertools.islice(reader, 10))
+
+# One (preprocessing name, tesseract config, average fuzzy score) tuple per combination
+results = []
+
+for pname, pfunc in tqdm(preprocessings.items(), desc="Preprocessing variants"):
+    # The preprocessing result does not depend on the Tesseract config, so
+    # each image is loaded and preprocessed once per variant instead of
+    # once per (variant, config) pair.
+    samples = []
+    for row in data:
+        # Blank or malformed CSV rows cannot be unpacked into (file, label); skip them.
+        if len(row) != 2:
+            continue
+        fname, label = row
+        img_path = os.path.join(CAPTCHA_DIR, fname)
+        if not os.path.exists(img_path):
+            continue
+        samples.append((label, pfunc(Image.open(img_path))))
+
+    for config in tess_configs:
+        fuzzy_score_sum = 0
+        for label, img in samples:
+            pred = pytesseract.image_to_string(img, config=config).strip().lower().replace(" ", "")
+            # Fuzzy score: 1.0 = perfect, 0.0 = completely wrong
+            longest = max(len(label), len(pred))
+            if longest > 0:
+                fuzzy_score = 1 - Levenshtein.distance(label, pred) / longest
+            else:
+                fuzzy_score = 0
+            fuzzy_score_sum += fuzzy_score
+        total = len(samples)
+        # Without any usable image the average is reported as 0
+        avg_fuzzy_score = fuzzy_score_sum / total if total else 0
+        results.append((pname, config, avg_fuzzy_score))
+        print(f"Preprocessing: {pname}, Config: {config}, Avg fuzzy score: {avg_fuzzy_score:.3f}")
+
+# Rank the combinations by their average score, best first
+def _score_of(entry):
+    return entry[2]
+
+results_sorted = sorted(results, key=_score_of, reverse=True)
+
+print("\nTop 5 combinations:")
+for i, best in enumerate(results_sorted[:5], 1):
+    pname, config, acc = best
+    print(f"{i}. Preprocessing: {pname}, Config: {config}, Accuracy: {acc:.3f}")
+
+# Write every result to a CSV file, in the same descending order
+output_csv = "recognize3_results.csv"
+with open(output_csv, "w", newline='') as f:
+    writer = csv.writer(f)
+    writer.writerow(["preprocessing", "tesseract_config", "accuracy"])
+    for entry in results_sorted:
+        writer.writerow(list(entry))
+
+print(f"\nFull results written to {output_csv}")