This commit is contained in:
rootless 2023-10-10 02:36:57 +03:00
commit e5169586a0
10 changed files with 486 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
__pycache__
*.png
/venv
/.ropeproject

24
NOTES Normal file
View File

@ -0,0 +1,24 @@
Unite PDFs:
pdfunite IMG* union.pdf
Rotate 90 degrees clockwise:
pdftk union.pdf cat 1-endeast output 1.pdf
Split vertically (use -x for horizontal split):
mutool poster -y 2 1.pdf
Split into separate files, one page per file:
pdfseparate out.pdf %d.pdf
Check Djvu metadata:
djvused -e print-meta book.djvu

39
README Normal file
View File

@ -0,0 +1,39 @@
SUMMARY
This is a collection of tools that helps me digitize books.
In particular, it helps assemble a bunch of random page scans into a book
with correct page order, mainly by using OCR and text (number) recognition.
I use it to prepare my book releases on torrents.
SYSTEM REQUIREMENTS
In theory it should work on any system that supports Python 3.9+ and has the
required dependencies, but it might need some minor modifications in the code.
Tested only on FreeBSD 13.
DEPENDENCIES
System utilities:
- tesseract
- pdftoppm
Python packages:
- pytesseract
- Pillow
- natsort (used by rename16.py)
AUTHORS
rootless (c) 2023
LICENSE
BSD-2-Clause

31
gaps.py Normal file
View File

@ -0,0 +1,31 @@
#!/usr/bin/env python3
import os
from argparse import ArgumentParser
def find_missing_pages(filenames, pages):
    """Return the page numbers in 1..pages that have no matching PDF file.

    A file counts as page N when its name ends with '.pdf' and the part
    before the first dot is the decimal number N (e.g. '12.pdf').  Any
    other file name is ignored.

    :param filenames: iterable of file names (not paths)
    :param pages: expected count of pages; pages are numbered from 1
    :return: sorted list of missing page numbers
    """
    present = set()
    for filename in filenames:
        if not filename.endswith('.pdf'):
            continue
        stem = filename[:filename.index('.')]
        # isdecimal() only accepts characters that int() can parse;
        # isnumeric() also accepts e.g. fractions, which make int() raise.
        if stem.isdecimal():
            present.add(int(stem))
    # The upper bound is inclusive: the last expected page must be checked too.
    return [page for page in range(1, pages + 1) if page not in present]


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Input directory')
    parser.add_argument('-p', '--pages', type=int, required=True,
                        help='Expected count of pages')
    args = parser.parse_args()

    # The option is declared as --input, so argparse stores it as args.input.
    for page in find_missing_pages(os.listdir(args.input), args.pages):
        print(f'missing {page}')

132
pagenum-mass.py Executable file
View File

@ -0,0 +1,132 @@
#!/usr/bin/env python3
import os
import logging
import shutil
import sys
from queue import Queue, Empty
from threading import Thread, Lock
from argparse import ArgumentParser, ArgumentError
from typing import Optional
from pagenum.system import ensure_dependencies
from pagenum.image import pdf2png, zonecrop, img2pagenum, Zone
# Module-level logger for this script.
_logger = logging.getLogger(__name__)
# Work queue of input file names; PagenumWorker threads take items from it
# until it is empty.
_queue = Queue()
# Zones that are tried, in order, when looking for a page number.
# Assigned in the main block from the --zones option.
_zones: Optional[list[Zone]]
# When true, results are only logged and no files are copied (--pretend).
_pretend = False
# Assigned in the main block from the --max-page-num-length option.
_max_page_num_length = None
# Held by safe_copyfile() while it picks a destination name.
_rename_lock = Lock()
# Output and input directories; both are assigned in the main block.
_outdir: str
_indir: str
def safe_copyfile(oldname, newname):
    """Copy `oldname` from the input directory into the output directory.

    If `newname` is already taken in the output directory, a free name of
    the form '<stem>-v<N><ext>' is used instead, with N counting up from 1.
    The name lookup and the copy both happen while `_rename_lock` is held.

    :param oldname: file name inside the input directory
    :param newname: desired file name inside the output directory
    """
    with _rename_lock:
        destination = os.path.join(_outdir, newname)
        if os.path.exists(destination):
            stem, extension = os.path.splitext(newname)
            version = 1
            destination = os.path.join(_outdir, f'{stem}-v{version}{extension}')
            # Keep bumping the version until an unused name is found.
            while os.path.exists(destination):
                version += 1
                destination = os.path.join(_outdir, f'{stem}-v{version}{extension}')
        source = os.path.join(_indir, oldname)
        shutil.copyfile(source, destination)
class PagenumWorker(Thread):
    """Worker thread that recognizes page numbers of queued PDF files.

    Each worker takes file names from the module-level `_queue` until the
    queue is empty.  For every file it renders a PNG, tries each zone in
    `_zones` until a page number is recognized, and then either logs the
    result (pretend mode) or copies the file to the output directory as
    '<num>.pdf', or 'unknown.pdf' when no number was found.
    """

    def __init__(self, name):
        """Create a worker thread called `name`."""
        super().__init__(name=name)

    def run(self):
        """Process queued files until the queue is empty."""
        while True:
            try:
                file = _queue.get(block=False)
            except Empty:
                break
            try:
                self.process(file)
            except Exception:
                # A single broken file must not terminate the worker and
                # leave the rest of the queue unprocessed.
                _logger.exception(f'{file}: failed to process')

    def process(self, file):
        """Recognize the page number of `file` and copy or report it.

        :param file: file name inside the input directory
        :raises RuntimeError: propagated from pdf2png() when rendering fails
        """
        file_path = os.path.join(_indir, file)
        png_file = pdf2png(file_path)
        num = None
        try:
            for z in _zones:
                cropped = zonecrop(png_file, z)
                try:
                    # Use the module-level setting rather than the `args`
                    # name, which exists only when the script body has run.
                    num = img2pagenum(cropped, _max_page_num_length)
                finally:
                    os.unlink(cropped)
                if num is not None:
                    break
        finally:
            # Remove the rendered page even if cropping or OCR failed.
            os.unlink(png_file)

        if _pretend:
            _logger.info(f'{file}: {num}')
        else:
            newname = f'{num}.pdf' if num is not None else 'unknown.pdf'
            safe_copyfile(file, newname)
            _logger.debug(f'{file} done ({num})')
if __name__ == '__main__':
    # Entry point: parse options, prepare the output directory, queue every
    # PDF from the input directory and run the worker threads.
    ensure_dependencies()

    parser = ArgumentParser()
    parser.add_argument('-i', '--input-directory', type=str, required=True)
    parser.add_argument('-o', '--output-directory', type=str, required=True)
    parser.add_argument('-z', '--zones', type=str, nargs='+', required=True,
                        help=("One or more zones to search for page number. Format: zone,w,h,mt,mr,mb,ml. "
                              "See pagenum-probe.py for more info"))
    parser.add_argument('-t', '--threads', type=int, default=4,
                        help="Number of threads")
    parser.add_argument('-p', '--pretend', action='store_true',
                        help="Don't save files but print info to stdout")
    parser.add_argument('--max-page-num-length', type=int, default=3)
    parser.add_argument('-V', '--verbose', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    _zones = [Zone.from_string(zs) for zs in args.zones]
    _pretend = args.pretend
    _max_page_num_length = args.max_page_num_length
    _outdir = args.output_directory
    _indir = args.input_directory

    if args.threads < 1:
        parser.error('--threads must be at least 1')

    if not os.path.isdir(_indir):
        raise OSError(f'{_indir}: no such directory')

    # The output directory may be erased below, so compare resolved paths:
    # a plain string comparison misses 'dir' vs 'dir/' and also misses an
    # output directory that contains the input directory.
    real_indir = os.path.realpath(_indir)
    real_outdir = os.path.realpath(_outdir)
    if real_indir == real_outdir:
        parser.error('--input-directory must be different than --output-directory')
    if real_indir.startswith(real_outdir.rstrip(os.sep) + os.sep):
        parser.error('--input-directory must not be located inside --output-directory')

    if not os.path.isdir(_outdir):
        os.makedirs(_outdir)
    else:
        try:
            input(f'Directory {_outdir} already exists. Press ENTER to erase it and continue or Ctrl+C to exit.')
        except (KeyboardInterrupt, EOFError):
            # No confirmation was given, so nothing is erased.
            sys.exit(0)
        shutil.rmtree(_outdir)
        os.makedirs(_outdir)

    for file in os.listdir(_indir):
        if not os.path.isfile(os.path.join(_indir, file)):
            continue
        filename, fileext = os.path.splitext(file)
        if fileext.lower() != '.pdf':
            continue
        _queue.put(file)

    threads = [PagenumWorker(f'thread-{i}') for i in range(args.threads)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

47
pagenum-probe.py Executable file
View File

@ -0,0 +1,47 @@
#!/usr/bin/env python3
import os
from argparse import ArgumentParser
from pagenum.system import ensure_dependencies, desktop_open_image
from pagenum.image import pdf2png, zonecrop, img2pagenum, Zone
if __name__ == '__main__':
    # Entry point: crop one zone out of a single PDF page, run page-number
    # recognition on it and print the result together with the zone spec.
    ensure_dependencies()

    parser = ArgumentParser()
    # Without required=True a missing --input reaches os.path.exists(None),
    # which fails with a TypeError instead of a usage message.
    parser.add_argument('--input', required=True,
                        help="input pdf file")
    parser.add_argument('--input-page', type=int, default=1,
                        help="page in pdf file")
    parser.add_argument('--zone', required=True,
                        choices=Zone.getzones(),
                        help="where to look for page number")
    parser.add_argument('--height', type=int, required=True)
    parser.add_argument('--width', type=int, required=True)
    parser.add_argument('--margin-top', type=int, default=0)
    parser.add_argument('--margin-right', type=int, default=0)
    parser.add_argument('--margin-bottom', type=int, default=0)
    parser.add_argument('--margin-left', type=int, default=0)
    parser.add_argument('--max-page-num-length', type=int, default=3)
    parser.add_argument('--preview', action='store_true',
                        help="open cropped image part in image viewer")
    args = parser.parse_args()

    if not os.path.exists(args.input):
        raise OSError(f'{args.input}: no such file')

    z = Zone(args.zone, args.width, args.height,
             args.margin_top, args.margin_right,
             args.margin_bottom, args.margin_left)

    png_file = pdf2png(args.input, args.input_page)
    try:
        cropped_file = zonecrop(png_file, z)
        try:
            if args.preview:
                desktop_open_image(cropped_file)
            num = img2pagenum(cropped_file, args.max_page_num_length)
        finally:
            os.unlink(cropped_file)
    finally:
        # Temporary images are removed even when cropping or OCR fails.
        os.unlink(png_file)

    print('num:', num)
    print(f'zone: {z}')

0
pagenum/__init__.py Normal file
View File

125
pagenum/image.py Normal file
View File

@ -0,0 +1,125 @@
import pytesseract
import subprocess
from .system import randomtempname
from PIL import Image
ZONES = ('topleft', 'topright',
         'bottomleft', 'bottomright',
         'topcenter', 'bottomcenter')


class Zone:
    """Rectangle of a page image in which a page number is searched.

    A zone is described by an anchor name from ZONES, the width and height
    of the rectangle, and four margins.  The textual form, accepted by
    from_string() and produced by repr(), is 'zone,w,h,mt,mr,mb,ml'.
    """

    zone: str
    width: int
    height: int
    margin_top: int
    margin_bottom: int
    margin_left: int
    margin_right: int

    def __init__(self, z, w, h, mt, mr, mb, ml):
        """Create a zone.

        :param z: anchor name, one of ZONES
        :param w: rectangle width (anything int() accepts)
        :param h: rectangle height
        :param mt: top margin
        :param mr: right margin
        :param mb: bottom margin
        :param ml: left margin
        :raises ValueError: if `z` is not a known zone name or a numeric
            field cannot be converted with int()
        """
        if z not in ZONES:
            # Must be raised: returning a value from __init__ is a TypeError.
            raise ValueError(f'invalid zone spec: zone "{z}" is invalid')
        self.zone = z
        self.width = int(w)
        self.height = int(h)
        self.margin_top = int(mt)
        self.margin_bottom = int(mb)
        self.margin_left = int(ml)
        self.margin_right = int(mr)

    def isright(self):
        """Return True if the zone is anchored to the right edge."""
        return self.zone.endswith('right')

    def isleft(self):
        """Return True if the zone is anchored to the left edge."""
        return self.zone.endswith('left')

    def iscenter(self):
        """Return True if the zone is horizontally centered."""
        return self.zone.endswith('center')

    def istop(self):
        """Return True if the zone is anchored to the top edge."""
        return self.zone.startswith('top')

    def isbottom(self):
        """Return True if the zone is anchored to the bottom edge."""
        return self.zone.startswith('bottom')

    def __repr__(self):
        """Return the zone in the 'zone,w,h,mt,mr,mb,ml' format."""
        return '%s,%d,%d,%d,%d,%d,%d' % (
            self.zone,
            self.width,
            self.height,
            self.margin_top,
            self.margin_right,
            self.margin_bottom,
            self.margin_left
        )

    @classmethod
    def from_string(cls, s):
        """Parse a 'zone,w,h,mt,mr,mb,ml' string into a Zone.

        :raises ValueError: if `s` does not have exactly seven
            comma-separated fields, or a field is invalid
        """
        p = s.split(',')
        if len(p) != 7:
            raise ValueError(f'invalid zone spec: {s}')
        return cls(*p)

    @staticmethod
    def getzones():
        """Return the tuple of valid zone names."""
        return ZONES
def pdf2png(pdf_path, page=1):
    """Render one page of a PDF to a PNG file by running pdftoppm.

    :param pdf_path: path of the PDF file
    :param page: page number passed to pdftoppm with -f
    :return: name of the generated PNG file
    :raises RuntimeError: if pdftoppm exits with a non-zero status
    """
    output_prefix = randomtempname()
    command = [
        "pdftoppm", pdf_path, output_prefix,
        '-png',            # generate PNG instead of PPM
        '-f', str(page),   # page number
        '-r', '300',       # PPI
        '-singlefile',
    ]
    result = subprocess.run(command)
    if result.returncode == 0:
        return output_prefix + '.png'
    raise RuntimeError(f'pdftoppm returned {result.returncode}')
def zonecrop(png_path, z):
    """Crop the rectangle described by zone `z` out of a PNG image.

    The rectangle is first placed according to the zone's anchor (left,
    right or horizontally centered; top or bottom) and then shifted:
    right/down by the left/top margins and left/up by the right/bottom
    margins.

    :param png_path: path of the source image
    :param z: Zone describing the rectangle
    :return: name of a newly written PNG file with the cropped part
    :raises ValueError: if the zone has no known horizontal or vertical anchor
    """
    # The context manager closes the underlying file once cropping is done.
    with Image.open(png_path) as image:
        iw, ih = image.size

        if z.isleft():
            x1 = 0
        elif z.isright():
            x1 = iw - z.width
        elif z.iscenter():
            # not tested
            x1 = int(iw / 2 - z.width / 2)
        else:
            raise ValueError(f'unsupported zone: {z.zone}')

        if z.istop():
            y1 = 0
        elif z.isbottom():
            y1 = ih - z.height
        else:
            raise ValueError(f'unsupported zone: {z.zone}')

        x1 += z.margin_left - z.margin_right
        y1 += z.margin_top - z.margin_bottom

        x2 = x1 + z.width
        y2 = y1 + z.height

        filename = randomtempname(suffix='.png')
        cropped = image.crop((x1, y1, x2, y2))
        cropped.save(filename)
    return filename
def img2pagenum(img_file, maxlen, lang='rus'):
    """Recognize a page number in an image with tesseract.

    The recognized text is cleaned of the characters '_', '-', '.', ','
    and '—' and of surrounding whitespace.  It is accepted only if what
    remains consists solely of decimal digits and is at most `maxlen`
    characters long.

    :param img_file: image passed to pytesseract.image_to_string
    :param maxlen: maximum accepted length of the page number
    :param lang: tesseract language passed as `lang`; defaults to 'rus'
    :return: the page number as a string, or None if none was recognized
    """
    text = pytesseract.image_to_string(img_file,
                                       lang=lang,
                                       config='--psm 11')
    # Drop the stray punctuation in a single pass.
    text = text.translate(str.maketrans('', '', '_-.,—')).strip()
    # isdecimal() rejects numerics such as fractions or Roman numeral
    # characters, which isnumeric() accepts but which are not page numbers.
    if text.isdecimal() and len(text) <= maxlen:
        return text
    return None

24
pagenum/system.py Normal file
View File

@ -0,0 +1,24 @@
import subprocess
import tempfile
from shutil import which
# External programs that must be available on PATH.
DEPENDENCIES = ('pdftoppm',)
# Program used by desktop_open_image() to display an image.
IMAGE_OPENER = 'ristretto'


def ensure_dependencies(deps=None):
    """Check that the required external programs can be found on PATH.

    :param deps: iterable of program names to check; defaults to DEPENDENCIES
    :raises RuntimeError: for the first program that cannot be found
    """
    for s in (DEPENDENCIES if deps is None else deps):
        if which(s) is None:
            # Must be raised: a returned exception object is silently
            # discarded by callers and the check never fails.
            raise RuntimeError(f'required dependency not found: {s}')
def randomtempname(suffix=None):
    """Return a random file name, optionally followed by `suffix`.

    Only a name is generated: no file is created and no directory is
    prepended.

    :param suffix: string appended to the random name, e.g. '.png'
    :return: the generated name
    """
    # uuid4 is a public API, unlike tempfile._get_candidate_names(), which
    # is a private helper that may change or disappear between releases.
    import uuid

    name = uuid.uuid4().hex
    if suffix is not None:
        name += suffix
    return name
def desktop_open_image(f):
    """Run the IMAGE_OPENER program with image file `f` as its argument."""
    viewer_command = [IMAGE_OPENER, f]
    subprocess.run(viewer_command)

60
rename16.py Normal file
View File

@ -0,0 +1,60 @@
#!/usr/bin/env python3
import os
import shutil
import natsort
from os.path import join
from argparse import ArgumentParser
# Output page number (1-based, relative to the current chunk of 16 files)
# for each input file of a chunk: the i-th file becomes page mapping[i].
# NOTE(review): the order looks like the page sequence of a folded 16-page
# signature scanned one sheet side after another -- confirm with the
# scanning workflow.
mapping = (
    1, 16,
    15, 2,
    3, 14,
    13, 4,
    5, 12,
    11, 6,
    7, 10,
    9, 8,
)
def chunks(lst, n):
    """Yield consecutive slices of `lst`, each at most `n` items long.

    The last slice is shorter when len(lst) is not a multiple of `n`.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))
if __name__ == '__main__':
    # Entry point: copy the files of the input directory, taken in natural
    # sort order and in chunks of len(mapping) files, to the output
    # directory under page-number names derived from `mapping`.
    import sys

    parser = ArgumentParser()
    parser.add_argument('--indir', type=str, required=True,
                        help='Input directory')
    parser.add_argument('--outdir', type=str, required=True,
                        help='Output directory')
    args = parser.parse_args()

    # makedirs also creates missing parent directories.
    os.makedirs(args.outdir, exist_ok=True)

    files = natsort.natsorted(os.listdir(args.indir))

    chunk_size = len(mapping)
    offset = 0
    for pages in chunks(files, chunk_size):
        if len(pages) < chunk_size:
            # Only the final chunk can be short.  It cannot be mapped, so
            # report it instead of dropping the files silently.
            print(f'warning: skipping {len(pages)} trailing file(s) '
                  f'that do not form a full chunk of {chunk_size}',
                  file=sys.stderr)
            break

        for file, n in zip(pages, mapping):
            new_name = str(offset + n) + '.pdf'
            shutil.copyfile(
                join(args.indir, file),
                join(args.outdir, new_name)
            )
            print(f'{file} => {new_name}')
        offset += chunk_size