Support specifying Tesseract's page segmentation mode (--psm)

This commit is contained in:
rootless 2023-10-14 11:35:01 +03:00
parent b7c4e402f9
commit 85dbfc5b0c
3 changed files with 22 additions and 4 deletions

View File

@ -20,6 +20,7 @@ _max_page_num_length = None
_rename_lock = Lock()
_outdir: str
_indir: str
_psm: Optional[int] = None
def safe_copyfile(oldname, newname):
@ -59,7 +60,10 @@ class PagenumWorker(Thread):
num = None
for z in _zones:
cropped = zonecrop(png_file, z)
num = img2pagenum(cropped, args.max_page_num_length)
i2pn_kw = {}
if _psm:
i2pn_kw['psm'] = _psm
num = img2pagenum(cropped, args.max_page_num_length, **i2pn_kw)
os.unlink(cropped)
if num is not None:
break
@ -86,6 +90,7 @@ if __name__ == '__main__':
parser.add_argument('-p', '--pretend', action='store_true',
help="Don't save files but print info to stdout")
parser.add_argument('--max-page-num-length', type=int, default=3)
parser.add_argument('--tesseract-psm', type=int)
parser.add_argument('-V', '--verbose', action='store_true')
args = parser.parse_args()
@ -96,6 +101,8 @@ if __name__ == '__main__':
_max_page_num_length = args.max_page_num_length
_outdir = args.output_directory
_indir = args.input_directory
if args.tesseract_psm:
_psm = args.tesseract_psm
if not os.path.isdir(_indir):
raise OSError(f'{_indir}: no such directory')

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3
import os
import logging
from argparse import ArgumentParser
from pagenum.system import ensure_dependencies, desktop_open_image
@ -26,8 +27,12 @@ if __name__ == '__main__':
parser.add_argument('--max-page-num-length', type=int, default=3)
parser.add_argument('--preview', action='store_true',
help="open cropped image part in image viewer")
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--tesseract-psm', type=int)
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if not os.path.exists(args.input):
raise OSError(f'{args.input}: no such file')
@ -40,7 +45,11 @@ if __name__ == '__main__':
# desktop_open_image(png_file)
if args.preview:
desktop_open_image(cropped_file)
num = img2pagenum(cropped_file, args.max_page_num_length)
i2pn_kw = {}
if args.tesseract_psm:
i2pn_kw['psm'] = args.tesseract_psm
num = img2pagenum(cropped_file, args.max_page_num_length, **i2pn_kw)
print('num:', num)
os.unlink(png_file)
os.unlink(cropped_file)

View File

@ -114,16 +114,18 @@ def zonecrop(png_path, z):
return filename
def img2pagenum(img_file, maxlen):
def img2pagenum(img_file, maxlen, psm=13):
try:
s = pytesseract.image_to_string(img_file,
timeout=10,
lang='eng',
config='--psm 13 -c tessedit_char_whitelist=0123456789')
config=f'--psm {psm} -c tessedit_char_whitelist=0123456789')
except RuntimeError as timeout_error:
_logger.exception(timeout_error)
return None
_logger.debug(f'raw string: {s}')
for r in '_-.,—':
s = s.replace(r, '')
s = s.strip()