add ocr code

2026-02-03 00:52:49 +01:00
parent 33cd5346bf
commit 7367226210
6 changed files with 3151 additions and 0 deletions
--- a/ocr/recognize.py
+++ b/ocr/recognize.py
@@ -0,0 +1,34 @@
+from PIL import Image
+import pytesseract
+import sys
+
+"""OCR for captcha images using Tesseract only."""
+
+# Tesseract options (test configuration):
+#   --psm 7                  treat the image as a single line of text
+#   tessedit_char_whitelist  only these characters may be recognized
+# The whitelist must list all ten digits; a character that is missing from it
+# can never appear in the OCR result.
+custom_config = r'--psm 7 -c tessedit_char_whitelist="abcdefghijklmnopqrstuvwxyz0123456789"'
+
+def recognize_captcha(img_path, threshold=150):
+    """Binarize a captcha image and run Tesseract on it.
+
+    Args:
+        img_path: Location of the image; handed to ``Image.open`` as-is.
+        threshold: Grayscale cut-off. Levels below it map to 0 (black) and
+            levels at or above it map to 1 (white). Defaults to 150.
+
+    Returns:
+        The string produced by ``pytesseract.image_to_string``, returned
+        unmodified (no stripping is done here).
+    """
+    # Release the underlying file as soon as the grayscale copy exists.
+    with Image.open(img_path) as src:
+        gray = src.convert("L")
+
+    # 1. Threshold the image with a 256-entry lookup table, one entry per
+    #    possible grayscale level.
+    table = [0 if level < threshold else 1 for level in range(256)]
+    binary = gray.point(table, '1')
+
+    # 2. Recognize with Tesseract using the module-level configuration.
+    return pytesseract.image_to_string(binary, config=custom_config)
+
+
+if __name__ == '__main__':
+    # Exactly one command-line argument (the image file) is expected.
+    if len(sys.argv) != 2:
+        print("Usage: python recognize.py <image_filename>")
+        # Stop here: continuing would either index a missing argv[1] or
+        # silently ignore the extra arguments.
+        sys.exit(1)
+    res = recognize_captcha(sys.argv[1])
+    # Print only the first line of the OCR output. str.split always yields
+    # at least one element, so indexing [0] is safe even for empty output.
+    print(res.split("\n", 1)[0])