commit e5169586a06bbeb23e6cb7603446665b79f06bea Author: rootless Date: Tue Oct 10 02:36:57 2023 +0300 initial diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a57e8ab --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +*.png +/venv +/.ropeproject diff --git a/NOTES b/NOTES new file mode 100644 index 0000000..34f9e78 --- /dev/null +++ b/NOTES @@ -0,0 +1,24 @@ +Unite PDFs: + + pdfunite IMG* union.pdf + + +Rotate 90 degrees clockwise: + + pdftk union.pdf cat 1-endeast output 1.pdf + + +Split vertically (use -x for horizontal split): + + mutool poster -y 2 1.pdf + + +Split into separate files, one page per file: + + pdfseparate out.pdf %d.pdf + + +Check DjVu metadata: + + djvused -e print-meta book.djvu + diff --git a/README b/README new file mode 100644 index 0000000..b09989e --- /dev/null +++ b/README @@ -0,0 +1,39 @@ +SUMMARY + + This is a collection of tools that helps me digitize books. + + In particular, it helps assemble a bunch of random page scans into a book + with correct page order, mainly by using OCR and text (number) recognition. + + I use it to prepare my book releases on torrents. + + +SYSTEM REQUIREMENTS + + Theoretically should work on any system that supports Python 3.9+ and has + required dependencies, but might need some minor modifications in the code. + + Tested only on FreeBSD 13.
def missing_pages(filenames, page_count):
    """Return the page numbers in 1..page_count that have no ``<n>.pdf`` file.

    Only names ending in ``.pdf`` whose part before the first dot consists
    solely of decimal digits are counted as pages; everything else is
    ignored.

    :param filenames: iterable of file names (not paths) to inspect
    :param page_count: expected number of pages; pages are numbered from 1
    :return: sorted list of page numbers that were not found
    """
    present = set()
    for filename in filenames:
        if not filename.endswith('.pdf'):
            continue
        stem = filename[:filename.index('.')]
        # isdecimal() rather than isnumeric(): the latter also accepts
        # characters such as superscripts or fractions that int() rejects.
        if stem.isdecimal():
            present.add(int(stem))
    # page_count is a count, so the last page number is page_count itself
    return [n for n in range(1, page_count + 1) if n not in present]


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Input directory')
    parser.add_argument('-p', '--pages', type=int, required=True,
                        help='Expected count of pages')
    args = parser.parse_args()

    # the option is --input, so the parsed attribute is args.input
    for page in missing_pages(os.listdir(args.input), args.pages):
        print(f'missing {page}')
class PagenumWorker(Thread):
    """Worker thread that drains the shared file queue.

    Each worker repeatedly takes a PDF file name from the module-level
    ``_queue`` and runs :meth:`process` on it until the queue is empty.
    """

    def __init__(self, name):
        """Create a worker thread called *name*."""
        super().__init__(name=name)

    def run(self):
        """Process queued files until the queue is empty."""
        while True:
            # Keep the try body minimal: only the non-blocking get() is
            # expected to raise Empty.
            try:
                file = _queue.get(block=False)
            except Empty:
                break
            try:
                self.process(file)
            except Exception:
                # A single bad PDF must not take the whole worker down and
                # leave the rest of the queue to fewer threads.
                _logger.exception(f'{file}: processing failed')

    def process(self, file):
        """Detect the page number of *file* and copy it under that number.

        The PDF is rendered to a PNG, every configured zone is cropped and
        passed through OCR in order, and the first zone that yields a number
        wins. In pretend mode the result is only logged; otherwise the file
        is copied to the output directory as ``<num>.pdf``, or
        ``unknown.pdf`` when no number was recognised.

        :param file: file name relative to the input directory
        """
        png_file = pdf2png(os.path.join(_indir, file))
        num = None
        try:
            for zone in _zones:
                cropped = zonecrop(png_file, zone)
                try:
                    # Use the module-level setting, not the script-local
                    # ``args`` namespace.
                    num = img2pagenum(cropped, _max_page_num_length)
                finally:
                    os.unlink(cropped)
                if num is not None:
                    break
        finally:
            # remove the temporary render even if cropping or OCR failed
            os.unlink(png_file)

        if _pretend:
            _logger.info(f'{file}: {num}')
        else:
            newname = f'{num}.pdf' if num is not None else 'unknown.pdf'
            safe_copyfile(file, newname)
        _logger.debug(f'{file} done ({num})')


if __name__ == '__main__':
    ensure_dependencies()

    parser = ArgumentParser()
    parser.add_argument('-i', '--input-directory', type=str, required=True)
    parser.add_argument('-o', '--output-directory', type=str, required=True)
    parser.add_argument('-z', '--zones', type=str, nargs='+', required=True,
                        help=("One or more zones to search for page number. Format: zone,w,h,mt,mr,mb,ml. "
                              "See pagenum-probe.py for more info"))
    parser.add_argument('-t', '--threads', type=int, default=4,
                        help="Number of threads")
    parser.add_argument('-p', '--pretend', action='store_true',
                        help="Don't save files but print info to stdout")
    parser.add_argument('--max-page-num-length', type=int, default=3)
    parser.add_argument('-V', '--verbose', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    _zones = [Zone.from_string(spec) for spec in args.zones]
    _pretend = args.pretend
    _max_page_num_length = args.max_page_num_length
    _outdir = args.output_directory
    _indir = args.input_directory

    if not os.path.isdir(_indir):
        raise OSError(f'{_indir}: no such directory')

    # Compare resolved paths: a plain string comparison lets "in" and "in/"
    # (or a relative and an absolute spelling) through, and the rmtree()
    # below would then erase the input directory.
    if os.path.realpath(_indir) == os.path.realpath(_outdir):
        parser.error('--input-directory must be different than --output-directory')

    # Pretend mode writes nothing, so it must not touch the output directory.
    if not _pretend:
        if os.path.isdir(_outdir):
            try:
                input(f'Directory {_outdir} already exists. Press ENTER to erase it and continue or Ctrl+C to exit.')
            except (KeyboardInterrupt, EOFError):
                # no confirmation received: leave the directory untouched
                sys.exit(0)
            shutil.rmtree(_outdir)
        os.makedirs(_outdir)

    # queue every regular *.pdf file (case-insensitive) of the input directory
    for file in os.listdir(_indir):
        if not os.path.isfile(os.path.join(_indir, file)):
            continue
        if os.path.splitext(file)[1].lower() != '.pdf':
            continue
        _queue.put(file)

    threads = [PagenumWorker(f'thread-{i}') for i in range(args.threads)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
# Names of the page regions a Zone may refer to: a vertical part ("top" or
# "bottom") followed by a horizontal part ("left", "right" or "center").
ZONES = ('topleft', 'topright',
         'bottomleft', 'bottomright',
         'topcenter', 'bottomcenter')


class Zone:
    """Region of a page image in which to look for the page number.

    A zone is described by the string form ``zone,w,h,mt,mr,mb,ml``: the
    region name (one of ``ZONES``), the width and height of the box, and the
    top, right, bottom and left margins. All numeric fields are stored as
    ``int``.
    """

    zone: str
    width: int
    height: int
    margin_top: int
    margin_bottom: int
    margin_left: int
    margin_right: int

    def __init__(self, z, w, h, mt, mr, mb, ml):
        """Build a zone from its seven components.

        Numeric arguments may be ints or numeric strings; they are passed
        through ``int()``.

        :param z: region name, must be one of ``ZONES``
        :param w: box width
        :param h: box height
        :param mt: top margin
        :param mr: right margin
        :param mb: bottom margin
        :param ml: left margin
        :raises ValueError: if *z* is not a known region name or a numeric
            field cannot be converted to ``int``
        """
        if z not in ZONES:
            # must be raised, not returned: returning a value from __init__
            # produces an unrelated TypeError instead of this message
            raise ValueError(f'invalid zone spec: zone "{z}" is invalid')
        self.zone = z
        self.width = int(w)
        self.height = int(h)
        self.margin_top = int(mt)
        self.margin_bottom = int(mb)
        self.margin_left = int(ml)
        self.margin_right = int(mr)

    def isright(self):
        """Return True if the zone is on the right side of the page."""
        return self.zone.endswith('right')

    def isleft(self):
        """Return True if the zone is on the left side of the page."""
        return self.zone.endswith('left')

    def iscenter(self):
        """Return True if the zone is horizontally centered."""
        return self.zone.endswith('center')

    def istop(self):
        """Return True if the zone is at the top of the page."""
        return self.zone.startswith('top')

    def isbottom(self):
        """Return True if the zone is at the bottom of the page."""
        return self.zone.startswith('bottom')

    def __repr__(self):
        """Return the ``zone,w,h,mt,mr,mb,ml`` spec accepted by from_string()."""
        return (f'{self.zone},{self.width:d},{self.height:d},'
                f'{self.margin_top:d},{self.margin_right:d},'
                f'{self.margin_bottom:d},{self.margin_left:d}')

    @staticmethod
    def from_string(s):
        """Parse a ``zone,w,h,mt,mr,mb,ml`` spec into a Zone.

        :raises ValueError: if the spec does not have exactly seven
            comma-separated fields, or a field is invalid
        """
        parts = s.split(',')
        if len(parts) != 7:
            raise ValueError(f'invalid zone spec: {s}')
        return Zone(*parts)

    @staticmethod
    def getzones():
        """Return the tuple of valid region names."""
        return ZONES
# Target page number (1-based, relative to the start of the group) for each
# position inside a group of input files: the file at index i of a group is
# written as page ``offset + mapping[i]``.
# NOTE(review): presumably the page order of a scanned 16-page signature;
# confirm against the scanning workflow.
mapping = (
    1, 16,
    15, 2,

    3, 14,
    13, 4,

    5, 12,
    11, 6,

    7, 10,
    9, 8,
)


def chunks(lst, n):
    """Yield consecutive slices of *lst* of length *n*.

    The final slice is shorter when ``len(lst)`` is not a multiple of *n*.
    """
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def plan_renames(files):
    """Yield ``(source_name, target_name)`` pairs for the ordered *files*.

    The list is consumed in groups of ``len(mapping)`` files. Inside each
    complete group the file at position ``i`` is assigned the name
    ``<offset + mapping[i]>.pdf``, where ``offset`` grows by the group size
    for every group. Planning stops at the first incomplete group, so
    trailing files that do not fill a group are not renamed.

    :param files: file names, already in the desired (natural) order
    """
    group_size = len(mapping)
    offset = 0
    for group in chunks(files, group_size):
        if len(group) < group_size:
            break
        for file, page in zip(group, mapping):
            yield file, f'{offset + page}.pdf'
        offset += group_size


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--indir', type=str, required=True,
                        help='Input directory')
    parser.add_argument('--outdir', type=str, required=True,
                        help='Output directory')
    args = parser.parse_args()

    # exist_ok avoids the check-then-create race and creates missing parents
    os.makedirs(args.outdir, exist_ok=True)

    files = natsort.natsorted(os.listdir(args.indir))

    for file, new_name in plan_renames(files):
        shutil.copyfile(
            join(args.indir, file),
            join(args.outdir, new_name)
        )
        print(f'{file} => {new_name}')