From 4d9cbde822f60b92d86a1319f5b2a9971c5af5d0 Mon Sep 17 00:00:00 2001 From: Dan Lowe Date: Wed, 31 Dec 2025 00:40:22 -0500 Subject: [PATCH 1/3] find ranges of page numbers ports a feature from PPtools: find page numbers in various formats and display them (roman first, arabic next). understands different formats like Page_1, page_1, page1 (in id attribute) and attempts to parse out span class=pagenum formats like p. 1, [Pg 1] and so forth. --- pphtml.py | 114 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 2 files changed, 115 insertions(+) diff --git a/pphtml.py b/pphtml.py index 5685048..f449f00 100644 --- a/pphtml.py +++ b/pphtml.py @@ -11,6 +11,8 @@ import sys import os import argparse +import itertools +import roman from time import strftime from html.parser import HTMLParser import regex as re # for unicode support (pip install regex) @@ -73,6 +75,8 @@ class initialization self.udefcss = {} # user defined CSS self.usedcss = {} # CSS used by user self.errormessage = "" # for unwrap failure + self.ranges_arabic = [] + self.ranges_roman = [] def crash(self): self.saveReport() @@ -462,6 +466,115 @@ def cleanExt(self): r.append("[pass] external links check") self.apl(r) + def documentInfo(self): + """ + Section to contain general document information + """ + self.ap("") + t = "document info" + self.ap("----- {} ".format(t) + "-" * (73 - len(t))) + self.findPageRanges() + + + def findPageRanges(self): + """ + Find ranges of page locations for later reporting + """ + r = [] + pages_arabic = [] + pages_roman = [] + + # Look for <a>, <div>, <span>
, with 'id' attribute; \3 is the match + pat1 = re.compile(r"""<(a|div|span)\s+[^>]*\bid=["'](page|pg)_?([\divxlcdm]+)["']""", + re.IGNORECASE) + # Alternately: look for span class=pagenum; \1 is the match + pat2 = re.compile(r"""]*\bclass=["']pagenum['"].*>([^<]+)=7.1.0 regex>=2019.4.12 +roman>=5.2 From 88f15b8abfc21ee46b5e1f0ab3b568e8fa69592e Mon Sep 17 00:00:00 2001 From: Dan Lowe Date: Wed, 31 Dec 2025 15:17:54 -0500 Subject: [PATCH 2/3] follow import sorting/grouping convention --- pphtml.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pphtml.py b/pphtml.py index f449f00..fe1f0e5 100644 --- a/pphtml.py +++ b/pphtml.py @@ -8,15 +8,16 @@ # pylint: disable=C0103, R0912, R0915 # pylint: disable=too-many-instance-attributes, too-many-locals, no-self-use -import sys -import os import argparse import itertools -import roman -from time import strftime +import os +import sys from html.parser import HTMLParser -import regex as re # for unicode support (pip install regex) +from time import strftime + from PIL import Image # from pip install pillow +import regex as re # for unicode support (pip install regex) +import roman # for pphtml class MyHTMLParser(HTMLParser): From 3eb45f7f510e0b094a87e63236a012be7e8933de Mon Sep 17 00:00:00 2001 From: Dan Lowe Date: Thu, 1 Jan 2026 02:40:53 -0500 Subject: [PATCH 3/3] =?UTF-8?q?Show=20'3',=20not=20'3=E2=80=933'=20when=20?= =?UTF-8?q?page=20range=20is=201=20page=20long?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pphtml.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/pphtml.py b/pphtml.py index fe1f0e5..b18359f 100644 --- a/pphtml.py +++ b/pphtml.py @@ -567,12 +567,23 @@ def findPageRanges(self): last = roman.fromRoman(page_h.upper()) - r.append("[info] page numbers ( roman): " + - ", ".join([f"{r[0]}–{r[1]}" for r in self.ranges_roman]) - ) - r.append("[info] page numbers (arabic): " + - ", 
".join([f"{r[0]}–{r[1]}" for r in self.ranges_arabic]) - ) + # render found page ranges list (roman) + _rbuf = [] + for _r in self.ranges_roman: + if _r[0] == _r[1]: + _rbuf.append(_r[0]) + else: + _rbuf.append(f"{_r[0]}–{_r[1]}") + r.append(f"[info] page numbers ( roman): {', '.join(_rbuf)}") + + # render found page ranges list (arabic) + _abuf = [] + for _a in self.ranges_arabic: + if _a[0] == _a[1]: + _abuf.append(str(_a[0])) + else: + _abuf.append(f"{_a[0]}–{_a[1]}") + r.append(f"[info] page numbers (arabic): {', '.join(_abuf)}") self.apl(r)