From d3b8bce6df57b72d32408ccb23a2b0acd6f62cdf Mon Sep 17 00:00:00 2001 From: rootless Date: Sat, 14 Oct 2023 10:53:56 +0300 Subject: [PATCH] improve page number recognition --- pagenum/image.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pagenum/image.py b/pagenum/image.py index 04f106b..8031b8b 100644 --- a/pagenum/image.py +++ b/pagenum/image.py @@ -117,8 +117,8 @@ def zonecrop(png_path, z): def img2pagenum(img_file, maxlen): s = pytesseract.image_to_string(img_file, - lang='rus', - config='--psm 11') + lang='eng', + config='--psm 13 -c tessedit_char_whitelist=0123456789') for r in '_-.,—': s = s.replace(r, '') s = s.strip()