Files
check_pa/ocr/recognize3.py
2026-02-03 00:59:15 +01:00

179 lines
5.9 KiB
Python

import csv
import os
from PIL import Image, ImageOps, ImageFilter
import pytesseract
import cv2
import numpy as np
import itertools
from tqdm import tqdm
import Levenshtein
"""
OCR for captcha images using various preprocessing steps and Tesseract configurations.
Different combinations of image preprocessing steps (some of them using OpenCV)
and Tesseract configurations are tested to determine the best recognition accuracy.
"""
CAPTCHA_DIR = "./captchas"
CSV_FILE = "captcha_labels.csv"
# --- Base preprocessing steps ---
def to_grayscale(img):
    """Return ``img`` converted to mode "L" (8-bit grayscale)."""
    grey = img.convert("L")
    return grey
def to_bw(img, threshold=140):
    """Binarize ``img`` with a fixed cut-off.

    The image is first converted to grayscale. Pixel values below
    ``threshold`` map to 0, all other values map to 255, and the result is
    produced in mode "1".
    """
    def binarize(value):
        if value < threshold:
            return 0
        return 255

    grey = img.convert("L")
    return grey.point(binarize, '1')
def invert(img):
    """Return the negative of ``img`` after converting it to grayscale."""
    grey = img.convert("L")
    return ImageOps.invert(grey)
def sharpen(img):
    """Apply Pillow's SHARPEN filter to ``img``.

    Images whose mode is neither "L" nor "RGB" are converted to "L" before
    the filter is applied.
    """
    source = img if img.mode in ("L", "RGB") else img.convert("L")
    return source.filter(ImageFilter.SHARPEN)
def blur(img):
    """Apply a Gaussian blur of radius 1 to ``img``.

    Images whose mode is neither "L" nor "RGB" are converted to "L" before
    the filter is applied.
    """
    source = img if img.mode in ("L", "RGB") else img.convert("L")
    gaussian = ImageFilter.GaussianBlur(1)
    return source.filter(gaussian)
def resize2x(img):
    """Return ``img`` scaled to twice its width and height (Lanczos resampling)."""
    doubled_size = (img.width * 2, img.height * 2)
    return img.resize(doubled_size, Image.LANCZOS)
def crop_left_2_3(img):
    """Return the left two thirds of ``img`` at full height.

    The right edge of the crop box is ``width * 2 / 3`` truncated to an int.
    """
    width, height = img.size
    right_edge = int(width * 2 / 3)
    box = (0, 0, right_edge, height)
    return img.crop(box)
def opencv_adaptive_thresh(img):
    """Binarize ``img`` with OpenCV's Gaussian adaptive threshold.

    The image is converted to grayscale, thresholded, and returned as a new
    PIL image.
    """
    gray = np.array(img.convert("L"))
    binary = cv2.adaptiveThreshold(
        gray,
        255,  # value assigned to pixels that pass the threshold
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,  # block size of the neighbourhood
        2,  # constant subtracted from the weighted mean
    )
    return Image.fromarray(binary)
def opencv_otsu(img):
    """Binarize ``img`` with OpenCV's Otsu threshold and return a PIL image."""
    gray = np.array(img.convert("L"))
    # cv2.threshold returns (threshold used, thresholded image); only the
    # image is needed here.
    outcome = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    binary = outcome[1]
    return Image.fromarray(binary)
def opencv_erode(img):
    """Erode the grayscale version of ``img`` once with a 2x2 kernel."""
    gray = np.array(img.convert("L"))
    structuring_element = np.ones((2, 2), dtype=np.uint8)
    result = cv2.erode(gray, structuring_element, iterations=1)
    return Image.fromarray(result)
def opencv_dilate(img):
    """Dilate the grayscale version of ``img`` once with a 2x2 kernel."""
    gray = np.array(img.convert("L"))
    structuring_element = np.ones((2, 2), dtype=np.uint8)
    result = cv2.dilate(gray, structuring_element, iterations=1)
    return Image.fromarray(result)
def opencv_contrast(img):
    """Apply OpenCV CLAHE (clip limit 2.0, 8x8 tile grid) to the grayscale image."""
    gray = np.array(img.convert("L"))
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = equalizer.apply(gray)
    return Image.fromarray(enhanced)
# --- Compose preprocessing steps ---
def compose(*funcs):
    """Chain ``funcs`` into a single callable.

    The returned callable feeds its argument through every function from left
    to right, passing each result on to the next one. With no functions it
    returns its argument unchanged.
    """
    def composed(img):
        current = img
        for step in funcs:
            current = step(current)
        return current

    return composed
# --- Define base steps and generate combinations ---
# Maps a short step name to the callable implementing it. The insertion order
# fixes the order in which variants are generated and evaluated.
base_steps = dict(
    none=lambda img: img,  # identity: hand the image through untouched
    grayscale=to_grayscale,
    bw=to_bw,
    invert=invert,
    sharpen=sharpen,
    blur=blur,
    resize2x=resize2x,
    crop_left_2_3=crop_left_2_3,
    opencv_adaptive_thresh=opencv_adaptive_thresh,
    opencv_otsu=opencv_otsu,
    opencv_erode=opencv_erode,
    opencv_dilate=opencv_dilate,
    opencv_contrast=opencv_contrast,
)
# Generate all 1-step variants plus every ordered 2-step combination of two
# distinct steps. "none" is excluded from both positions of a pair: it is the
# identity, so "none+X" (like "X+none") would only re-run the 1-step variant
# "X" under a second name, wasting OCR runs and producing duplicate rows in
# the ranking.
preprocessings = dict(base_steps)
for (name1, func1), (name2, func2) in itertools.permutations(base_steps.items(), 2):
    if "none" in (name1, name2):
        continue
    preprocessings[f"{name1}+{name2}"] = compose(func1, func2)
# Characters present in your labels:
whitelist = "23456789abcdefghijklmnopqrstuvwxyz"

# Building blocks for the Tesseract command-line configurations below.
_psm_modes = (7, 8, 6, 13)
_oem_modes = (1, 3)
_whitelist_opt = f"-c tessedit_char_whitelist={whitelist}"

# Configurations to evaluate, in evaluation order:
#   1. every page segmentation mode on its own,
#   2. psm 7 with each OCR engine mode,
#   3. every page segmentation mode restricted to the whitelist,
#   4. every (psm, oem) pair restricted to the whitelist.
tess_configs = (
    [f"--psm {psm}" for psm in _psm_modes]
    + [f"--psm 7 --oem {oem}" for oem in _oem_modes]
    + [f"--psm {psm} {_whitelist_opt}" for psm in _psm_modes]
    + [
        f"--psm {psm} --oem {oem} {_whitelist_opt}"
        for psm in _psm_modes
        for oem in _oem_modes
    ]
)
# Load the labelled samples. Every CSV row is unpacked below as
# (image filename, expected text).
with open(CSV_FILE, newline='') as f:
    reader = csv.reader(f)
    # Only use the first 10 images; islice stops reading after 10 rows
    # instead of loading the whole file first.
    data = list(itertools.islice(reader, 10))

# One (preprocessing name, tesseract config, average fuzzy score) tuple per
# evaluated combination.
results = []
for pname, pfunc in tqdm(preprocessings.items(), desc="Preprocessing variants"):
    # The preprocessed image does not depend on the Tesseract config, so it is
    # built once per variant rather than once per (variant, config) pair.
    samples = []
    for fname, label in data:
        img_path = os.path.join(CAPTCHA_DIR, fname)
        if not os.path.exists(img_path):
            continue  # images missing on disk are left out of the average
        # The context manager closes the file handle; load() reads the pixel
        # data first so the image stays usable after the file is closed.
        with Image.open(img_path) as img:
            img.load()
            samples.append((label, pfunc(img)))

    for config in tess_configs:
        fuzzy_score_sum = 0
        total = 0
        for label, img in samples:
            raw_text = pytesseract.image_to_string(img, config=config)
            # Remove every whitespace character, not only spaces: multi-line
            # output would otherwise keep its inner newlines and be penalised
            # by the edit distance.
            pred = "".join(raw_text.split()).lower()
            # Fuzzy score: 1.0 = perfect, 0.0 = completely wrong
            longest = max(len(label), len(pred))
            if longest > 0:
                fuzzy_score = 1 - Levenshtein.distance(label, pred) / longest
            else:
                fuzzy_score = 0
            fuzzy_score_sum += fuzzy_score
            total += 1
        avg_fuzzy_score = fuzzy_score_sum / total if total else 0
        results.append((pname, config, avg_fuzzy_score))
        print(f"Preprocessing: {pname}, Config: {config}, Avg fuzzy score: {avg_fuzzy_score:.3f}")
# Rank the results by score, best first. Sorting a copy leaves `results` in
# evaluation order.
results_sorted = list(results)
results_sorted.sort(key=lambda row: row[2], reverse=True)

print("\nTop 5 combinations:")
for rank, (pname, config, acc) in enumerate(results_sorted[:5], start=1):
    print(f"{rank}. Preprocessing: {pname}, Config: {config}, Accuracy: {acc:.3f}")

# Write every result to a CSV file, in the same descending order.
output_csv = "recognize3_results.csv"
with open(output_csv, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["preprocessing", "tesseract_config", "accuracy"])
    writer.writerows(results_sorted)
print(f"\nFull results written to {output_csv}")