2023-10-14 11:35:01 +03:00

133 lines
3.2 KiB
Python

import pytesseract
import subprocess
import logging
from .system import randomtempname
from PIL import Image
# Module-level logger, named after this module so records can be filtered
# per module by the logging configuration.
_logger = logging.getLogger(__name__)
# Anchor names accepted by Zone: a vertical part ('top' / 'bottom') followed
# by a horizontal part ('left' / 'right' / 'center').
ZONES = ('topleft', 'topright',
         'bottomleft', 'bottomright',
         'topcenter', 'bottomcenter')


class Zone:
    """A width x height region anchored to one of the ZONES positions.

    The textual form, produced by ``__repr__`` and parsed by
    ``from_string``, is seven comma-separated fields:

        zone,width,height,margin_top,margin_right,margin_bottom,margin_left
    """

    zone: str
    width: int
    height: int
    margin_top: int
    margin_bottom: int
    margin_left: int
    margin_right: int

    def __init__(self, z, w, h, mt, mr, mb, ml):
        """Build a zone from its anchor name, size and four margins.

        Args:
            z: anchor name; must be one of ZONES.
            w, h: width and height (anything ``int()`` accepts).
            mt, mr, mb, ml: top, right, bottom and left margins, in that
                order (anything ``int()`` accepts).

        Raises:
            ValueError: if ``z`` is not in ZONES, or a numeric field cannot
                be converted by ``int()``.
        """
        if z not in ZONES:
            # The error has to be raised: returning a value from __init__
            # only produces an unrelated TypeError at instantiation time.
            raise ValueError(f'invalid zone spec: zone "{z}" is invalid')
        self.zone = z
        self.width = int(w)
        self.height = int(h)
        self.margin_top = int(mt)
        self.margin_bottom = int(mb)
        self.margin_left = int(ml)
        self.margin_right = int(mr)

    def isright(self):
        """Return True if the zone is anchored to the right edge."""
        return self.zone.endswith('right')

    def isleft(self):
        """Return True if the zone is anchored to the left edge."""
        return self.zone.endswith('left')

    def iscenter(self):
        """Return True if the zone is horizontally centered."""
        return self.zone.endswith('center')

    def istop(self):
        """Return True if the zone is anchored to the top edge."""
        return self.zone.startswith('top')

    def isbottom(self):
        """Return True if the zone is anchored to the bottom edge."""
        return self.zone.startswith('bottom')

    def __repr__(self):
        """Return the comma-separated spec that ``from_string`` parses."""
        return '%s,%d,%d,%d,%d,%d,%d' % (
            self.zone,
            self.width,
            self.height,
            self.margin_top,
            self.margin_right,
            self.margin_bottom,
            self.margin_left)

    @staticmethod
    def from_string(s):
        """Parse a ``zone,w,h,mt,mr,mb,ml`` spec string into a Zone.

        Raises:
            ValueError: if the spec does not have exactly seven fields, names
                an unknown zone, or has a non-integer numeric field.
        """
        p = s.split(',')
        if len(p) != 7:
            raise ValueError(f'invalid zone spec: {s}')
        return Zone(*p)

    @staticmethod
    def getzones():
        """Return the tuple of valid anchor names (ZONES)."""
        return ZONES
def pdf2png(pdf_path, page=1):
    """Render a single page of a PDF to a PNG file using ``pdftoppm``.

    Args:
        pdf_path: path of the PDF passed to ``pdftoppm``.
        page: page number passed to ``-f`` (defaults to 1).

    Returns:
        The output name handed to ``pdftoppm`` with ``'.png'`` appended.

    Raises:
        RuntimeError: if ``pdftoppm`` exits with a non-zero status.
    """
    # NOTE(review): randomtempname() presumably yields a fresh temp path
    # without an extension, used here as the output root - confirm in .system.
    out_root = randomtempname()
    command = [
        "pdftoppm", pdf_path, out_root,
        '-png',           # generate PNG instead of PPM
        '-f', str(page),  # page number
        '-r', '300',      # PPI
        '-singlefile',
    ]
    completed = subprocess.run(command)
    status = completed.returncode
    if status == 0:
        return out_root + '.png'
    raise RuntimeError(f'pdftoppm returned {status}')
def zonecrop(png_path, z):
    """Crop the region described by zone ``z`` out of an image file.

    The crop box is ``z.width`` x ``z.height``. It is first placed against
    the image edge (or horizontal centre) named by the zone, then shifted by
    the margins: ``margin_left`` moves it right, ``margin_right`` moves it
    left, ``margin_top`` moves it down and ``margin_bottom`` moves it up.

    Args:
        png_path: path of the image to crop.
        z: object exposing ``width``, ``height``, the four ``margin_*``
            attributes and the ``isleft/isright/iscenter/istop/isbottom``
            predicates (a ``Zone``).

    Returns:
        Path of a newly written PNG containing the cropped region.

    Raises:
        ValueError: if ``z`` reports no horizontal or no vertical anchor.
    """
    # Image.open keeps the underlying file open; the context manager makes
    # sure it is released even if cropping or saving fails.
    with Image.open(png_path) as image:
        iw, ih = image.size

        if z.isleft():
            x1 = 0
        elif z.isright():
            x1 = iw - z.width
        elif z.iscenter():
            x1 = int(iw/2 - z.width/2)
        else:
            raise ValueError(f'zone has no horizontal anchor: {z!r}')

        if z.istop():
            y1 = 0
        elif z.isbottom():
            y1 = ih - z.height
        else:
            raise ValueError(f'zone has no vertical anchor: {z!r}')

        # Opposite margins pull the box in opposite directions.
        x1 += z.margin_left
        x1 -= z.margin_right
        y1 += z.margin_top
        y1 -= z.margin_bottom

        x2 = x1 + z.width
        y2 = y1 + z.height

        filename = randomtempname(suffix='.png')
        cropped = image.crop((x1, y1, x2, y2))
        cropped.save(filename)
    return filename
def img2pagenum(img_file, maxlen, psm=13):
    """OCR an image and return the page number it shows, if any.

    Tesseract is run with a digits-only whitelist and a 10 second timeout.
    The characters ``_ - . , —`` are dropped from the result and surrounding
    whitespace is stripped before validation.

    Args:
        img_file: image passed to ``pytesseract.image_to_string``.
        maxlen: maximum accepted length of the recognised number.
        psm: Tesseract page segmentation mode (``--psm``), default 13.

    Returns:
        The cleaned string when it is numeric and at most ``maxlen``
        characters long; otherwise ``None``. ``None`` is also returned when
        pytesseract raises ``RuntimeError``, which is logged.
    """
    tess_config = f'--psm {psm} -c tessedit_char_whitelist=0123456789'
    try:
        raw = pytesseract.image_to_string(
            img_file,
            timeout=10,
            lang='eng',
            config=tess_config,
        )
    except RuntimeError as timeout_error:
        _logger.exception(timeout_error)
        return None

    _logger.debug(f'raw string: {raw}')

    # Delete separator/noise characters in one pass, then trim whitespace.
    noise = str.maketrans('', '', '_-.,—')
    digits = raw.translate(noise).strip()

    if not digits.isnumeric():
        return None
    if len(digits) > maxlen:
        return None
    return digits