Support specifying Tesseract's --psm (page segmentation mode) option
This commit is contained in:
parent
b7c4e402f9
commit
85dbfc5b0c
@ -20,6 +20,7 @@ _max_page_num_length = None
|
|||||||
_rename_lock = Lock()
|
_rename_lock = Lock()
|
||||||
_outdir: str
|
_outdir: str
|
||||||
_indir: str
|
_indir: str
|
||||||
|
_psm: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
def safe_copyfile(oldname, newname):
|
def safe_copyfile(oldname, newname):
|
||||||
@ -59,7 +60,10 @@ class PagenumWorker(Thread):
|
|||||||
num = None
|
num = None
|
||||||
for z in _zones:
|
for z in _zones:
|
||||||
cropped = zonecrop(png_file, z)
|
cropped = zonecrop(png_file, z)
|
||||||
num = img2pagenum(cropped, args.max_page_num_length)
|
i2pn_kw = {}
|
||||||
|
if _psm:
|
||||||
|
i2pn_kw['psm'] = _psm
|
||||||
|
num = img2pagenum(cropped, args.max_page_num_length, **i2pn_kw)
|
||||||
os.unlink(cropped)
|
os.unlink(cropped)
|
||||||
if num is not None:
|
if num is not None:
|
||||||
break
|
break
|
||||||
@ -86,6 +90,7 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument('-p', '--pretend', action='store_true',
|
parser.add_argument('-p', '--pretend', action='store_true',
|
||||||
help="Don't save files but print info to stdout")
|
help="Don't save files but print info to stdout")
|
||||||
parser.add_argument('--max-page-num-length', type=int, default=3)
|
parser.add_argument('--max-page-num-length', type=int, default=3)
|
||||||
|
parser.add_argument('--tesseract-psm', type=int)
|
||||||
parser.add_argument('-V', '--verbose', action='store_true')
|
parser.add_argument('-V', '--verbose', action='store_true')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@ -96,6 +101,8 @@ if __name__ == '__main__':
|
|||||||
_max_page_num_length = args.max_page_num_length
|
_max_page_num_length = args.max_page_num_length
|
||||||
_outdir = args.output_directory
|
_outdir = args.output_directory
|
||||||
_indir = args.input_directory
|
_indir = args.input_directory
|
||||||
|
if args.tesseract_psm:
|
||||||
|
_psm = args.tesseract_psm
|
||||||
|
|
||||||
if not os.path.isdir(_indir):
|
if not os.path.isdir(_indir):
|
||||||
raise OSError(f'{_indir}: no such directory')
|
raise OSError(f'{_indir}: no such directory')
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from pagenum.system import ensure_dependencies, desktop_open_image
|
from pagenum.system import ensure_dependencies, desktop_open_image
|
||||||
@ -26,8 +27,12 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument('--max-page-num-length', type=int, default=3)
|
parser.add_argument('--max-page-num-length', type=int, default=3)
|
||||||
parser.add_argument('--preview', action='store_true',
|
parser.add_argument('--preview', action='store_true',
|
||||||
help="open cropped image part in image viewer")
|
help="open cropped image part in image viewer")
|
||||||
|
parser.add_argument('--verbose', action='store_true')
|
||||||
|
parser.add_argument('--tesseract-psm', type=int)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||||
|
|
||||||
if not os.path.exists(args.input):
|
if not os.path.exists(args.input):
|
||||||
raise OSError(f'{args.input}: no such file')
|
raise OSError(f'{args.input}: no such file')
|
||||||
|
|
||||||
@ -40,7 +45,11 @@ if __name__ == '__main__':
|
|||||||
# desktop_open_image(png_file)
|
# desktop_open_image(png_file)
|
||||||
if args.preview:
|
if args.preview:
|
||||||
desktop_open_image(cropped_file)
|
desktop_open_image(cropped_file)
|
||||||
num = img2pagenum(cropped_file, args.max_page_num_length)
|
|
||||||
|
i2pn_kw = {}
|
||||||
|
if args.tesseract_psm:
|
||||||
|
i2pn_kw['psm'] = args.tesseract_psm
|
||||||
|
num = img2pagenum(cropped_file, args.max_page_num_length, **i2pn_kw)
|
||||||
print('num:', num)
|
print('num:', num)
|
||||||
os.unlink(png_file)
|
os.unlink(png_file)
|
||||||
os.unlink(cropped_file)
|
os.unlink(cropped_file)
|
||||||
|
@ -114,16 +114,18 @@ def zonecrop(png_path, z):
|
|||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
def img2pagenum(img_file, maxlen):
|
def img2pagenum(img_file, maxlen, psm=13):
|
||||||
try:
|
try:
|
||||||
s = pytesseract.image_to_string(img_file,
|
s = pytesseract.image_to_string(img_file,
|
||||||
timeout=10,
|
timeout=10,
|
||||||
lang='eng',
|
lang='eng',
|
||||||
config='--psm 13 -c tessedit_char_whitelist=0123456789')
|
config=f'--psm {psm} -c tessedit_char_whitelist=0123456789')
|
||||||
except RuntimeError as timeout_error:
|
except RuntimeError as timeout_error:
|
||||||
_logger.exception(timeout_error)
|
_logger.exception(timeout_error)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
_logger.debug(f'raw string: {s}')
|
||||||
|
|
||||||
for r in '_-.,—':
|
for r in '_-.,—':
|
||||||
s = s.replace(r, '')
|
s = s.replace(r, '')
|
||||||
s = s.strip()
|
s = s.strip()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user