support specifying tesseract's --psm
This commit is contained in:
parent
b7c4e402f9
commit
85dbfc5b0c
@ -20,6 +20,7 @@ _max_page_num_length = None
|
||||
_rename_lock = Lock()
|
||||
_outdir: str
|
||||
_indir: str
|
||||
_psm: Optional[int] = None
|
||||
|
||||
|
||||
def safe_copyfile(oldname, newname):
|
||||
@ -59,7 +60,10 @@ class PagenumWorker(Thread):
|
||||
num = None
|
||||
for z in _zones:
|
||||
cropped = zonecrop(png_file, z)
|
||||
num = img2pagenum(cropped, args.max_page_num_length)
|
||||
i2pn_kw = {}
|
||||
if _psm:
|
||||
i2pn_kw['psm'] = _psm
|
||||
num = img2pagenum(cropped, args.max_page_num_length, **i2pn_kw)
|
||||
os.unlink(cropped)
|
||||
if num is not None:
|
||||
break
|
||||
@ -86,6 +90,7 @@ if __name__ == '__main__':
|
||||
parser.add_argument('-p', '--pretend', action='store_true',
|
||||
help="Don't save files but print info to stdout")
|
||||
parser.add_argument('--max-page-num-length', type=int, default=3)
|
||||
parser.add_argument('--tesseract-psm', type=int)
|
||||
parser.add_argument('-V', '--verbose', action='store_true')
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -96,6 +101,8 @@ if __name__ == '__main__':
|
||||
_max_page_num_length = args.max_page_num_length
|
||||
_outdir = args.output_directory
|
||||
_indir = args.input_directory
|
||||
if args.tesseract_psm:
|
||||
_psm = args.tesseract_psm
|
||||
|
||||
if not os.path.isdir(_indir):
|
||||
raise OSError(f'{_indir}: no such directory')
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import logging
|
||||
|
||||
from argparse import ArgumentParser
|
||||
from pagenum.system import ensure_dependencies, desktop_open_image
|
||||
@ -26,8 +27,12 @@ if __name__ == '__main__':
|
||||
parser.add_argument('--max-page-num-length', type=int, default=3)
|
||||
parser.add_argument('--preview', action='store_true',
|
||||
help="open cropped image part in image viewer")
|
||||
parser.add_argument('--verbose', action='store_true')
|
||||
parser.add_argument('--tesseract-psm', type=int)
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
if not os.path.exists(args.input):
|
||||
raise OSError(f'{args.input}: no such file')
|
||||
|
||||
@ -40,7 +45,11 @@ if __name__ == '__main__':
|
||||
# desktop_open_image(png_file)
|
||||
if args.preview:
|
||||
desktop_open_image(cropped_file)
|
||||
num = img2pagenum(cropped_file, args.max_page_num_length)
|
||||
|
||||
i2pn_kw = {}
|
||||
if args.tesseract_psm:
|
||||
i2pn_kw['psm'] = args.tesseract_psm
|
||||
num = img2pagenum(cropped_file, args.max_page_num_length, **i2pn_kw)
|
||||
print('num:', num)
|
||||
os.unlink(png_file)
|
||||
os.unlink(cropped_file)
|
||||
|
@ -114,16 +114,18 @@ def zonecrop(png_path, z):
|
||||
return filename
|
||||
|
||||
|
||||
def img2pagenum(img_file, maxlen):
|
||||
def img2pagenum(img_file, maxlen, psm=13):
|
||||
try:
|
||||
s = pytesseract.image_to_string(img_file,
|
||||
timeout=10,
|
||||
lang='eng',
|
||||
config='--psm 13 -c tessedit_char_whitelist=0123456789')
|
||||
config=f'--psm {psm} -c tessedit_char_whitelist=0123456789')
|
||||
except RuntimeError as timeout_error:
|
||||
_logger.exception(timeout_error)
|
||||
return None
|
||||
|
||||
_logger.debug(f'raw string: {s}')
|
||||
|
||||
for r in '_-.,—':
|
||||
s = s.replace(r, '')
|
||||
s = s.strip()
|
||||
|
Loading…
x
Reference in New Issue
Block a user