133 lines
3.2 KiB
Python
133 lines
3.2 KiB
Python
import pytesseract
|
|
import subprocess
|
|
import logging
|
|
|
|
from .system import randomtempname
|
|
from PIL import Image
|
|
|
|
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

# Recognised zone names: the vertical side ('top' / 'bottom') followed by
# the horizontal position ('left' / 'right' / 'center').
ZONES = (
    'topleft',
    'topright',
    'bottomleft',
    'bottomright',
    'topcenter',
    'bottomcenter',
)
|
|
|
|
|
|
class Zone:
    """Rectangular region of a page, anchored to a corner or an edge centre.

    The textual form of a zone (see ``from_string`` / ``__repr__``) is seven
    comma-separated fields in this order::

        zone,width,height,margin_top,margin_right,margin_bottom,margin_left

    ``zone`` must be one of the names in ``ZONES``; every other field is
    converted with ``int()``.
    """

    zone: str
    width: int
    height: int
    margin_top: int
    margin_bottom: int
    margin_left: int
    margin_right: int

    def __init__(self, z, w, h, mt, mr, mb, ml):
        """Build a zone.

        Args:
            z: Zone name; must be a member of ``ZONES``.
            w: Width of the zone (anything ``int()`` accepts).
            h: Height of the zone.
            mt: Top margin.
            mr: Right margin.
            mb: Bottom margin.
            ml: Left margin.

        Raises:
            ValueError: If ``z`` is not a known zone name, or a numeric
                field cannot be converted by ``int()``.
        """
        if z not in ZONES:
            # Must be raised, not returned: returning a value from __init__
            # only produces an unrelated TypeError.
            raise ValueError(f'invalid zone spec: zone "{z}" is invalid')
        self.zone = z
        self.width = int(w)
        self.height = int(h)
        self.margin_top = int(mt)
        self.margin_bottom = int(mb)
        self.margin_left = int(ml)
        self.margin_right = int(mr)

    def isright(self):
        """Return True if the zone name ends with 'right'."""
        return self.zone.endswith('right')

    def isleft(self):
        """Return True if the zone name ends with 'left'."""
        return self.zone.endswith('left')

    def iscenter(self):
        """Return True if the zone name ends with 'center'."""
        return self.zone.endswith('center')

    def istop(self):
        """Return True if the zone name starts with 'top'."""
        return self.zone.startswith('top')

    def isbottom(self):
        """Return True if the zone name starts with 'bottom'."""
        return self.zone.startswith('bottom')

    def __repr__(self):
        """Return the comma-separated spec that ``from_string`` parses."""
        return '%s,%d,%d,%d,%d,%d,%d' % (
            self.zone,
            self.width,
            self.height,
            self.margin_top,
            self.margin_right,
            self.margin_bottom,
            self.margin_left,
        )

    @staticmethod
    def from_string(s):
        """Parse a ``zone,width,height,mt,mr,mb,ml`` spec into a ``Zone``.

        Whitespace around each field is ignored.

        Raises:
            ValueError: If the spec does not have exactly seven fields, names
                an unknown zone, or contains a non-integer number.
        """
        parts = [field.strip() for field in s.split(',')]
        if len(parts) != 7:
            raise ValueError(f'invalid zone spec: {s}')
        return Zone(*parts)

    @staticmethod
    def getzones():
        """Return the tuple of valid zone names (``ZONES``)."""
        return ZONES
|
|
|
|
|
|
def pdf2png(pdf_path, page=1):
    """Render a single page of a PDF to a PNG by running ``pdftoppm``.

    Args:
        pdf_path: Path of the PDF passed to ``pdftoppm``.
        page: Page number passed to ``-f`` (defaults to 1).

    Returns:
        The output root obtained from ``randomtempname()`` with ``'.png'``
        appended. With ``-singlefile`` this is presumably the file
        ``pdftoppm`` writes; its existence is not checked here.

    Raises:
        RuntimeError: If ``pdftoppm`` exits with a non-zero status.
    """
    out_root = randomtempname()
    command = [
        "pdftoppm", pdf_path, out_root,
        '-png',             # generate PNG instead of PPM
        '-f', str(page),    # page number
        '-r', '300',        # PPI
        '-singlefile',
    ]
    status = subprocess.run(command).returncode
    if status == 0:
        return out_root + '.png'
    raise RuntimeError(f'pdftoppm returned {status}')
|
|
|
|
|
|
def zonecrop(png_path, z):
    """Crop the region described by zone ``z`` out of an image and save it.

    The box is ``z.width`` by ``z.height``, anchored to the image edge or
    centre named by the zone, then shifted by the margins: ``margin_left``
    and ``margin_top`` move it right/down, ``margin_right`` and
    ``margin_bottom`` move it left/up.

    Args:
        png_path: Path of the image to open.
        z: A ``Zone``-like object providing ``isleft``/``isright``/
            ``iscenter``/``istop``/``isbottom``, ``width``, ``height`` and
            the four ``margin_*`` attributes.

    Returns:
        The file name, obtained from ``randomtempname(suffix='.png')``, that
        the cropped image was saved to.

    Raises:
        ValueError: If ``z`` reports no recognised horizontal or vertical
            anchor.
    """
    # The context manager closes the underlying file once the crop is saved.
    with Image.open(png_path) as image:
        iw, ih = image.size

        # Horizontal anchor: left edge of the crop box before margins.
        if z.isleft():
            x1 = 0
        elif z.isright():
            x1 = iw - z.width
        elif z.iscenter():
            x1 = int(iw / 2 - z.width / 2)
        else:
            raise ValueError(f'zone {z!r} has no horizontal anchor')

        # Vertical anchor: top edge of the crop box before margins.
        if z.istop():
            y1 = 0
        elif z.isbottom():
            y1 = ih - z.height
        else:
            raise ValueError(f'zone {z!r} has no vertical anchor')

        # Margins translate the box; they do not resize it.
        x1 += z.margin_left
        x1 -= z.margin_right
        y1 += z.margin_top
        y1 -= z.margin_bottom

        x2 = x1 + z.width
        y2 = y1 + z.height

        filename = randomtempname(suffix='.png')
        # Crop and save while the source image is still open.
        cropped = image.crop((x1, y1, x2, y2))
        cropped.save(filename)

    return filename
|
|
|
|
|
|
def img2pagenum(img_file, maxlen, psm=13):
    """OCR an image that should contain a page number and return its digits.

    Args:
        img_file: Image handed to ``pytesseract.image_to_string``.
        maxlen: Maximum number of digits accepted in the result.
        psm: Value passed to Tesseract as ``--psm`` (defaults to 13).

    Returns:
        The recognised digits as a string, or ``None`` when OCR raises
        ``RuntimeError`` (e.g. the 10-second timeout, per the handler's
        variable name), when the cleaned text is not purely decimal digits,
        or when it is longer than ``maxlen``.
    """
    try:
        s = pytesseract.image_to_string(
            img_file,
            timeout=10,
            lang='eng',
            config=f'--psm {psm} -c tessedit_char_whitelist=0123456789')
    except RuntimeError as timeout_error:
        _logger.exception(timeout_error)
        return None

    _logger.debug('raw string: %s', s)

    # Drop separator-like characters in one pass, then surrounding whitespace.
    s = s.translate(str.maketrans('', '', '_-.,—')).strip()

    # isdecimal() rather than isnumeric(): the latter also accepts characters
    # such as '½' or '²' that int() cannot parse.
    return s if s.isdecimal() and len(s) <= maxlen else None
|