From b7c4e402f9cde87e22a8cf2d08479450f428a724 Mon Sep 17 00:00:00 2001 From: rootless Date: Sat, 14 Oct 2023 11:09:06 +0300 Subject: [PATCH] use reasonable timeout for tesseract calls --- pagenum/image.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pagenum/image.py b/pagenum/image.py index 8031b8b..6bdf60b 100644 --- a/pagenum/image.py +++ b/pagenum/image.py @@ -1,10 +1,11 @@ import pytesseract import subprocess +import logging from .system import randomtempname from PIL import Image - +_logger = logging.getLogger(__name__) ZONES = ('topleft', 'topright', 'bottomleft', 'bottomright', 'topcenter', 'bottomcenter') @@ -54,8 +55,7 @@ class Zone: self.margin_top, self.margin_right, self.margin_bottom, - self.margin_left - ) + self.margin_left) @staticmethod def from_string(s): @@ -92,7 +92,6 @@ def zonecrop(png_path, z): elif z.isright(): x1 = iw-z.width elif z.iscenter(): - # not tested x1 = int(iw/2-z.width/2) if z.istop(): @@ -116,9 +115,15 @@ def zonecrop(png_path, z): def img2pagenum(img_file, maxlen): - s = pytesseract.image_to_string(img_file, - lang='eng', - config='--psm 13 -c tessedit_char_whitelist=0123456789') + try: + s = pytesseract.image_to_string(img_file, + timeout=10, + lang='eng', + config='--psm 13 -c tessedit_char_whitelist=0123456789') + except RuntimeError as timeout_error: + _logger.exception(timeout_error) + return None + for r in '_-.,—': s = s.replace(r, '') s = s.strip()