From 735ae29c065f9ed119f5e659a728784e87abc943 Mon Sep 17 00:00:00 2001 From: andreasntr Date: Fri, 3 Apr 2026 18:54:17 +0200 Subject: [PATCH 1/2] add duplicate images handling --- src/__init__.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/__init__.py b/src/__init__.py index b06babea0..e10a1670b 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -29,6 +29,7 @@ import warnings import weakref import zipfile +from operator import itemgetter from . import extra @@ -2921,6 +2922,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 self.is_encrypted = False self.is_encrypted = False self.metadata = None + self.has_duplicate_images = False + self.images_xrefs_by_page = None self.FontInfos = [] self.Graftmaps = {} self.ShownPages = {} @@ -3045,6 +3048,26 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 self.page_count2 = extra.page_count_pdf else: self.page_count2 = extra.page_count_fz + + if len(self.page_count) > 1: + self.has_duplicate_images = True + first_page_n_images = len(self.get_page_images(0)) + for page in self.pages(start=1): + # we need at least one page with a different number of images + # to exclude full document duplication + if len(page.get_images()) != first_page_n_images: + self.has_duplicate_images = False + break + + if self.has_duplicate_images: + self.images_xrefs_by_page = [] + for page in self.pages(): + # store only images referenced by page + page_xrefs = list(map( + itemgetter("xref"), + page.get_image_info(xrefs=True) + )) + self.images_xrefs_by_page = page_xrefs finally: JM_mupdf_show_errors = JM_mupdf_show_errors_old @@ -5076,7 +5099,14 @@ def get_page_images(self, pno: int, full: bool =False) -> list: return () val = self._getPageInfo(pno, 2) if not full: - return [v[:-1] for v in val] + val = [v[:-1] for v in val] + if self.has_duplicate_images: + deduplicated_val = [] + for v in val: + # v[0] is "xref" + if v[0] in self.images_xrefs_by_page[pno]: + deduplicated_val.append(v) + return deduplicated_val return val def get_page_labels(self): From d3dff58408eb6c270ad4378c7b7b641d6827ad40 Mon Sep 17 00:00:00 2001 From: andreasntr Date: Fri, 3 Apr 2026 19:10:11 +0200 Subject: [PATCH 2/2] resolve circular dependency on first images duplication check --- src/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index e10a1670b..c3ecd9ae9 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -3050,14 +3050,15 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 self.page_count2 = extra.page_count_fz if len(self.page_count) > 1: - self.has_duplicate_images = True + has_duplicate_images = True first_page_n_images = len(self.get_page_images(0)) for page in self.pages(start=1): # we need at least one page with a different number of images # to exclude full document duplication if len(page.get_images()) != first_page_n_images: - self.has_duplicate_images = False + has_duplicate_images = False break + self.has_duplicate_images = has_duplicate_images if self.has_duplicate_images: self.images_xrefs_by_page = []