# check_pa/ocr/recognize3.py

import csv
import os
from PIL import Image, ImageOps, ImageFilter
import pytesseract
import cv2
import numpy as np
import itertools
from tqdm import tqdm
import Levenshtein

"""
OCR für Captcha-Bilder mit verschiedenen Vorverarbeitungen und Tesseract-Konfigurationen

Es werden verschiedene Kombinationen von Bildvorverarbeitungen (teils mit OpenCV)
und Tesseract-Konfigurationen getestet, um die beste Erkennungsgenauigkeit zu ermitteln.
"""

CAPTCHA_DIR = "./captchas"
CSV_FILE = "captcha_labels.csv"

# --- Base preprocessing steps ---
def to_grayscale(img):
    """Return ``img`` converted to mode ``"L"`` (single-channel grayscale)."""
    gray = img.convert("L")
    return gray

def to_bw(img, threshold=140):
    """Binarize ``img`` into a mode ``"1"`` image.

    The image is converted to grayscale first. Pixel values below
    ``threshold`` are mapped to 0, all others to 255.
    """
    def binarize(value):
        if value < threshold:
            return 0
        return 255

    gray = img.convert("L")
    return gray.point(binarize, '1')

def invert(img):
    """Return the inverted grayscale version of ``img``."""
    gray = img.convert("L")
    return ImageOps.invert(gray)

def sharpen(img):
    """Apply Pillow's SHARPEN filter to ``img``.

    Images whose mode is neither ``"L"`` nor ``"RGB"`` are converted to
    grayscale before filtering.
    """
    filterable = img.mode in ("L", "RGB")
    source = img if filterable else img.convert("L")
    return source.filter(ImageFilter.SHARPEN)

def blur(img):
    """Apply a Gaussian blur (``GaussianBlur(1)``) to ``img``.

    Images whose mode is neither ``"L"`` nor ``"RGB"`` are converted to
    grayscale before filtering.
    """
    if img.mode in ("L", "RGB"):
        source = img
    else:
        source = img.convert("L")
    gaussian = ImageFilter.GaussianBlur(1)
    return source.filter(gaussian)

def resize2x(img):
    """Return ``img`` scaled to twice its width and height using LANCZOS."""
    target_size = (img.width * 2, img.height * 2)
    return img.resize(target_size, Image.LANCZOS)

def crop_left_2_3(img):
    """Return the left two thirds of ``img`` at full height.

    The right edge is ``int(width * 2 / 3)``, i.e. truncated toward zero.
    """
    width, height = img.size
    right_edge = int(width * 2 / 3)
    box = (0, 0, right_edge, height)
    return img.crop(box)

def opencv_adaptive_thresh(img):
    """Binarize ``img`` with OpenCV's Gaussian adaptive threshold.

    The image is converted to grayscale, thresholded with a maximum value
    of 255 (binary mode, arguments 11 and 2 for block size and constant),
    and returned as a PIL image.
    """
    gray_pixels = np.array(img.convert("L"))
    binary = cv2.adaptiveThreshold(
        gray_pixels,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )
    return Image.fromarray(binary)

def opencv_otsu(img):
    """Binarize ``img`` with OpenCV's Otsu threshold and return a PIL image."""
    gray_pixels = np.array(img.convert("L"))
    flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    outcome = cv2.threshold(gray_pixels, 0, 255, flags)
    return Image.fromarray(outcome[1])

def opencv_erode(img):
    """Erode the grayscale version of ``img`` once with a 2x2 kernel of ones."""
    gray_pixels = np.array(img.convert("L"))
    structuring_element = np.ones((2, 2), np.uint8)
    return Image.fromarray(
        cv2.erode(gray_pixels, structuring_element, iterations=1)
    )

def opencv_dilate(img):
    """Dilate the grayscale version of ``img`` once with a 2x2 kernel of ones."""
    structuring_element = np.ones((2, 2), np.uint8)
    gray_pixels = np.array(img.convert("L"))
    grown = cv2.dilate(gray_pixels, structuring_element, iterations=1)
    return Image.fromarray(grown)

def opencv_contrast(img):
    """Enhance the contrast of ``img`` with OpenCV's CLAHE.

    The image is converted to grayscale, processed by a CLAHE object
    created with ``clipLimit=2.0`` and ``tileGridSize=(8, 8)``, and
    returned as a PIL image.
    """
    gray_pixels = np.array(img.convert("L"))
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return Image.fromarray(equalizer.apply(gray_pixels))

# --- Compose preprocessing steps ---
def compose(*funcs):
    """Chain ``funcs`` into a single callable.

    The returned function feeds its argument through every function in
    ``funcs`` from left to right and returns the final result. With no
    functions given, the argument is returned unchanged.
    """
    pipeline = funcs

    def composed(img):
        current = img
        for stage in pipeline:
            current = stage(current)
        return current

    return composed

# --- Define base steps and generate combinations ---
# Registry of the single preprocessing steps, keyed by the name used in the
# reports. "none" is the identity step (the image is passed through unchanged).
base_steps = {
    "none": lambda img: img,
    "grayscale": to_grayscale,
    "bw": to_bw,
    "invert": invert,
    "sharpen": sharpen,
    "blur": blur,
    "resize2x": resize2x,
    "crop_left_2_3": crop_left_2_3,
    "opencv_adaptive_thresh": opencv_adaptive_thresh,
    "opencv_otsu": opencv_otsu,
    "opencv_erode": opencv_erode,
    "opencv_dilate": opencv_dilate,
    "opencv_contrast": opencv_contrast,
}

# Every single step is a variant on its own.
preprocessings = dict(base_steps)

# Add every ordered pair of two distinct steps, named "first+second".
# Pairs that contain "none" on either side are skipped: composing a step with
# the identity reproduces a single-step variant that is already registered,
# so it would only duplicate result rows and multiply the OCR work.
for (name1, func1), (name2, func2) in itertools.permutations(base_steps.items(), 2):
    if "none" in (name1, name2):
        continue
    preprocessings[f"{name1}+{name2}"] = compose(func1, func2)

# Characters that occur in the labels; used for Tesseract's character whitelist.
whitelist = "23456789abcdefghijklmnopqrstuvwxyz"

# Building blocks for the Tesseract command-line configurations below.
_whitelist_option = f"-c tessedit_char_whitelist={whitelist}"
_psm_order = (7, 8, 6, 13)

# Plain page-segmentation modes, without a whitelist.
tess_configs = [f"--psm {psm}" for psm in _psm_order]
# Without a whitelist, only psm 7 is tried with explicit engine modes.
tess_configs += ["--psm 7 --oem 1", "--psm 7 --oem 3"]
# The same page-segmentation modes, restricted to the whitelist.
tess_configs += [f"--psm {psm} {_whitelist_option}" for psm in _psm_order]
# Whitelisted variants with explicit engine modes 1 and 3 for every psm.
tess_configs += [
    f"--psm {psm} --oem {oem} {_whitelist_option}"
    for psm in _psm_order
    for oem in (1, 3)
]

# Only the first MAX_SAMPLES labelled images are used.
MAX_SAMPLES = 10

with open(CSV_FILE, newline='') as f:
    reader = csv.reader(f)
    # Stop reading after MAX_SAMPLES rows instead of loading the whole file.
    # Blank lines (which csv.reader yields as empty lists) are skipped, since
    # each row is later unpacked into exactly two values.
    data = list(itertools.islice((row for row in reader if row), MAX_SAMPLES))

def _fuzzy_score(label, pred):
    """Return the similarity of ``label`` and ``pred`` between 0 and 1.

    The score is ``1 - levenshtein_distance / length_of_longer_string``:
    1.0 means a perfect match, 0.0 means completely wrong. When both
    strings are empty the score is 0.
    """
    longest = max(len(label), len(pred))
    if longest == 0:
        return 0
    return 1 - Levenshtein.distance(label, pred) / longest


# Resolve the image paths once. Rows whose image file does not exist are
# skipped for every variant, so the check does not need to be repeated
# inside the nested loops.
samples = []
for fname, label in data:
    img_path = os.path.join(CAPTCHA_DIR, fname)
    if not os.path.exists(img_path):
        continue
    # Predictions are stripped and lower-cased before scoring; normalize the
    # label the same way so stray whitespace or case does not cost points.
    samples.append((img_path, label.strip().lower()))

results = []

for pname, pfunc in tqdm(preprocessings.items(), desc="Preprocessing variants"):
    # Preprocessing does not depend on the Tesseract config, so it is done
    # once per variant instead of once per (variant, config) pair.
    prepared = []
    for img_path, label in samples:
        with Image.open(img_path) as img:
            # Read the pixel data while the file is open; the image object
            # stays usable after the file handle has been closed.
            img.load()
            prepared.append((pfunc(img), label))

    total = len(prepared)
    for config in tess_configs:
        fuzzy_score_sum = 0
        for img, label in prepared:
            raw_text = pytesseract.image_to_string(img, config=config)
            pred = raw_text.strip().lower().replace(" ", "")
            fuzzy_score_sum += _fuzzy_score(label, pred)
        # With no usable images the average is reported as 0.
        avg_fuzzy_score = fuzzy_score_sum / total if total else 0
        results.append((pname, config, avg_fuzzy_score))
        print(f"Preprocessing: {pname}, Config: {config}, Avg fuzzy score: {avg_fuzzy_score:.3f}")

# Rank every (preprocessing, config) pair by its average score, best first.
results_sorted = list(results)
results_sorted.sort(key=lambda row: row[2], reverse=True)

print("\nTop 5 combinations:")
for rank, (step_name, tess_config, score) in enumerate(results_sorted[:5], start=1):
    print(f"{rank}. Preprocessing: {step_name}, Config: {tess_config}, Accuracy: {score:.3f}")

# Write the complete ranking, in the same descending order, to a CSV file.
output_csv = "recognize3_results.csv"
with open(output_csv, "w", newline='') as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(["preprocessing", "tesseract_config", "accuracy"])
    csv_writer.writerows(results_sorted)

print(f"\nFull results written to {output_csv}")