diff --git a/pphtml.py b/pphtml.py index 5685048..b18359f 100644 --- a/pphtml.py +++ b/pphtml.py @@ -8,13 +8,16 @@ # pylint: disable=C0103, R0912, R0915 # pylint: disable=too-many-instance-attributes, too-many-locals, no-self-use -import sys -import os import argparse -from time import strftime +import itertools +import os +import sys from html.parser import HTMLParser -import regex as re # for unicode support (pip install regex) +from time import strftime + from PIL import Image # from pip install pillow +import regex as re # for unicode support (pip install regex) +import roman # for pphtml class MyHTMLParser(HTMLParser): @@ -73,6 +76,8 @@ class initialization self.udefcss = {} # user defined CSS self.usedcss = {} # CSS used by user self.errormessage = "" # for unwrap failure + self.ranges_arabic = [] + self.ranges_roman = [] def crash(self): self.saveReport() @@ -462,6 +467,126 @@ def cleanExt(self): r.append("[pass] external links check") self.apl(r) + def documentInfo(self): + """ + Section to contain general document information + """ + self.ap("") + t = "document info" + self.ap("----- {} ".format(t) + "-" * (73 - len(t))) + self.findPageRanges() + + + def findPageRanges(self): + """ + Find ranges of page locations for later reporting + """ + r = [] + pages_arabic = [] + pages_roman = [] + + # Look for ,
, with 'id' attribute; \3 is the match + pat1 = re.compile(r"""<(a|div|span)\s+[^>]*\bid=["'](page|pg)_?([\divxlcdm]+)["']""", + re.IGNORECASE) + # Alternately: look for span class=pagenum; \1 is the match + pat2 = re.compile(r"""]*\bclass=["']pagenum['"].*>([^<]+)=7.1.0 regex>=2019.4.12 +roman>=5.2