import logging
import subprocess

import pytesseract
from PIL import Image

from .system import randomtempname

_logger = logging.getLogger(__name__)

# Recognised zone names: a vertical part ("top"/"bottom") followed by a
# horizontal part ("left"/"right"/"center").
ZONES = ('topleft', 'topright', 'bottomleft', 'bottomright',
         'topcenter', 'bottomcenter')

# Characters removed from the OCR output before it is checked for digits.
_STRIP_TABLE = str.maketrans('', '', '_-.,—')


class Zone:
    """A rectangular region of a page, anchored to one of ``ZONES``.

    The textual form (see ``__repr__`` and ``from_string``) is
    ``zone,width,height,margin_top,margin_right,margin_bottom,margin_left``.
    """

    zone: str
    width: int
    height: int
    margin_top: int
    margin_bottom: int
    margin_left: int
    margin_right: int

    def __init__(self, z, w, h, mt, mr, mb, ml):
        """Build a zone from its name, size and margins.

        Args:
            z: zone name; must be one of ``ZONES``.
            w: zone width (anything accepted by ``int()``).
            h: zone height (anything accepted by ``int()``).
            mt: top margin.
            mr: right margin.
            mb: bottom margin.
            ml: left margin.

        Raises:
            ValueError: if ``z`` is not a known zone name, or a numeric
                field cannot be converted by ``int()``.
        """
        if z not in ZONES:
            # The exception must be raised: returning it from __init__ would
            # surface as an unrelated TypeError instead.
            raise ValueError(f'invalid zone spec: zone "{z}" is invalid')
        self.zone = z
        self.width = int(w)
        self.height = int(h)
        self.margin_top = int(mt)
        self.margin_bottom = int(mb)
        self.margin_left = int(ml)
        self.margin_right = int(mr)

    def isright(self):
        """Return True if the zone name ends with 'right'."""
        return self.zone.endswith('right')

    def isleft(self):
        """Return True if the zone name ends with 'left'."""
        return self.zone.endswith('left')

    def iscenter(self):
        """Return True if the zone name ends with 'center'."""
        return self.zone.endswith('center')

    def istop(self):
        """Return True if the zone name starts with 'top'."""
        return self.zone.startswith('top')

    def isbottom(self):
        """Return True if the zone name starts with 'bottom'."""
        return self.zone.startswith('bottom')

    def __repr__(self):
        """Return the comma-separated spec accepted by ``from_string``."""
        return '%s,%d,%d,%d,%d,%d,%d' % (
            self.zone, self.width, self.height,
            self.margin_top, self.margin_right,
            self.margin_bottom, self.margin_left)

    @staticmethod
    def from_string(s):
        """Parse a ``zone,w,h,mt,mr,mb,ml`` string into a ``Zone``.

        Raises:
            ValueError: if ``s`` does not have exactly seven comma-separated
                fields, or the fields are rejected by ``Zone.__init__``.
        """
        parts = s.split(',')
        if len(parts) != 7:
            raise ValueError(f'invalid zone spec: {s}')
        return Zone(*parts)

    @staticmethod
    def getzones():
        """Return the tuple of valid zone names."""
        return ZONES


def pdf2png(pdf_path, page=1):
    """Render one page of a PDF to a PNG file using ``pdftoppm``.

    Args:
        pdf_path: path of the PDF to render.
        page: page number passed to ``pdftoppm -f``.

    Returns:
        The path of the generated image: the name obtained from
        ``randomtempname()`` with ``.png`` appended.

    Raises:
        RuntimeError: if ``pdftoppm`` exits with a non-zero status.
    """
    # Used as the output prefix for pdftoppm; presumably a fresh temporary
    # path -- randomtempname is defined in .system.
    prefix = randomtempname()
    p = subprocess.run(["pdftoppm", pdf_path, prefix,
                        '-png',             # generate PNG instead of PPM
                        '-f', str(page),    # page number
                        '-r', '300',        # PPI
                        '-singlefile'])
    if p.returncode != 0:
        raise RuntimeError(f'pdftoppm returned {p.returncode}')
    return prefix + '.png'


def zonecrop(png_path, z):
    """Crop the region described by zone ``z`` out of an image file.

    The crop box has the zone's width and height. Its origin is placed at the
    edge or centre named by the zone, then shifted by the margins: left and
    top margins push it right/down, right and bottom margins push it left/up.

    Args:
        png_path: path of the image to crop.
        z: a ``Zone`` instance.

    Returns:
        The path of the cropped image, obtained from
        ``randomtempname(suffix='.png')``.
    """
    # Open via a context manager so the underlying file handle is released
    # once the cropped copy has been written.
    with Image.open(png_path) as image:
        iw, ih = image.size

        x1 = y1 = None
        if z.isleft():
            x1 = 0
        elif z.isright():
            x1 = iw - z.width
        elif z.iscenter():
            x1 = int(iw / 2 - z.width / 2)

        if z.istop():
            y1 = 0
        elif z.isbottom():
            y1 = ih - z.height

        x1 += z.margin_left
        x1 -= z.margin_right
        y1 += z.margin_top
        y1 -= z.margin_bottom
        x2 = x1 + z.width
        y2 = y1 + z.height

        filename = randomtempname(suffix='.png')
        cropped = image.crop((x1, y1, x2, y2))
        cropped.save(filename)
    return filename


def img2pagenum(img_file, maxlen):
    """OCR an image and return the page number it shows, if any.

    Tesseract is run with a digits-only whitelist and ``--psm 13``. The
    characters ``_-.,—`` are then removed and surrounding whitespace stripped.

    Args:
        img_file: image passed to ``pytesseract.image_to_string``.
        maxlen: maximum accepted length of the recognised number.

    Returns:
        The recognised digits as a string, or ``None`` if OCR raised
        ``RuntimeError`` (logged), the text is not numeric, or it is longer
        than ``maxlen``.
    """
    try:
        s = pytesseract.image_to_string(
            img_file, timeout=10, lang='eng',
            config='--psm 13 -c tessedit_char_whitelist=0123456789')
    except RuntimeError as timeout_error:
        _logger.exception(timeout_error)
        return None

    # Single pass removing the same characters the replace() loop did.
    s = s.translate(_STRIP_TABLE).strip()
    return s if s.isnumeric() and len(s) <= maxlen else None