Add a configurable Tesseract page segmentation mode (PSM).

- pagenum/image.py: img2pagenum() takes a new `psm` keyword (default 13, the
  value that used to be hard-coded in the Tesseract config string) and logs
  the raw OCR output at DEBUG level.
- pagenum-mass.py, pagenum-probe.py: new --tesseract-psm option; when given,
  it is forwarded to img2pagenum() as `psm`.
- pagenum-probe.py: new --verbose flag selects DEBUG instead of INFO logging.

Review notes:
- The option is tested with `is not None` rather than by truthiness, so an
  explicit `--tesseract-psm 0` is forwarded instead of silently falling back
  to the default of 13.
- `_psm: Optional[int]` needs `Optional` from `typing` in pagenum-mass.py;
  that import is not visible in this patch -- confirm it already exists.
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; verify the context-line indentation against the target
  files before applying. The `index` hashes predate the `is not None` change.

diff --git a/pagenum-mass.py b/pagenum-mass.py
index 08895e8..2932df7 100755
--- a/pagenum-mass.py
+++ b/pagenum-mass.py
@@ -20,6 +20,7 @@ _max_page_num_length = None
 _rename_lock = Lock()
 _outdir: str
 _indir: str
+_psm: Optional[int] = None
 
 
 def safe_copyfile(oldname, newname):
@@ -59,7 +60,10 @@ class PagenumWorker(Thread):
             num = None
             for z in _zones:
                 cropped = zonecrop(png_file, z)
-                num = img2pagenum(cropped, args.max_page_num_length)
+                i2pn_kw = {}
+                if _psm is not None:
+                    i2pn_kw['psm'] = _psm
+                num = img2pagenum(cropped, args.max_page_num_length, **i2pn_kw)
                 os.unlink(cropped)
                 if num is not None:
                     break
@@ -86,6 +90,7 @@ if __name__ == '__main__':
     parser.add_argument('-p', '--pretend', action='store_true',
                         help="Don't save files but print info to stdout")
     parser.add_argument('--max-page-num-length', type=int, default=3)
+    parser.add_argument('--tesseract-psm', type=int)
     parser.add_argument('-V', '--verbose', action='store_true')
 
     args = parser.parse_args()
@@ -96,6 +101,8 @@ if __name__ == '__main__':
     _max_page_num_length = args.max_page_num_length
     _outdir = args.output_directory
     _indir = args.input_directory
+    if args.tesseract_psm is not None:
+        _psm = args.tesseract_psm
 
     if not os.path.isdir(_indir):
         raise OSError(f'{_indir}: no such directory')
diff --git a/pagenum-probe.py b/pagenum-probe.py
index f38b399..93dc866 100755
--- a/pagenum-probe.py
+++ b/pagenum-probe.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 import os
+import logging
 from argparse import ArgumentParser
 from pagenum.system import ensure_dependencies, desktop_open_image
 
@@ -26,8 +27,12 @@ if __name__ == '__main__':
     parser.add_argument('--max-page-num-length', type=int, default=3)
     parser.add_argument('--preview', action='store_true',
                         help="open cropped image part in image viewer")
+    parser.add_argument('--verbose', action='store_true')
+    parser.add_argument('--tesseract-psm', type=int)
     args = parser.parse_args()
 
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
     if not os.path.exists(args.input):
         raise OSError(f'{args.input}: no such file')
 
@@ -40,7 +45,11 @@ if __name__ == '__main__':
     # desktop_open_image(png_file)
     if args.preview:
         desktop_open_image(cropped_file)
-    num = img2pagenum(cropped_file, args.max_page_num_length)
+
+    i2pn_kw = {}
+    if args.tesseract_psm is not None:
+        i2pn_kw['psm'] = args.tesseract_psm
+    num = img2pagenum(cropped_file, args.max_page_num_length, **i2pn_kw)
     print('num:', num)
     os.unlink(png_file)
     os.unlink(cropped_file)
diff --git a/pagenum/image.py b/pagenum/image.py
index 6bdf60b..538bc2c 100644
--- a/pagenum/image.py
+++ b/pagenum/image.py
@@ -114,16 +114,18 @@ def zonecrop(png_path, z):
     return filename
 
 
-def img2pagenum(img_file, maxlen):
+def img2pagenum(img_file, maxlen, psm=13):
     try:
         s = pytesseract.image_to_string(img_file,
                                         timeout=10,
                                         lang='eng',
-                                        config='--psm 13 -c tessedit_char_whitelist=0123456789')
+                                        config=f'--psm {psm} -c tessedit_char_whitelist=0123456789')
     except RuntimeError as timeout_error:
         _logger.exception(timeout_error)
         return None
 
+    _logger.debug(f'raw string: {s}')
+
     for r in '_-.,—':
         s = s.replace(r, '')
     s = s.strip()