From 69912b3cb3f62e236a30b5eb5e903d565d66ac6b Mon Sep 17 00:00:00 2001 From: Gerit Wagner Date: Fri, 14 Feb 2025 08:08:57 +0100 Subject: [PATCH 1/5] add code pointer --- colrev/packages/files_dir/src/files_dir.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/colrev/packages/files_dir/src/files_dir.py b/colrev/packages/files_dir/src/files_dir.py index 2be696e3d..c65870d36 100644 --- a/colrev/packages/files_dir/src/files_dir.py +++ b/colrev/packages/files_dir/src/files_dir.py @@ -547,6 +547,9 @@ def _index_pdf( self.review_manager.logger.info(f" extract metadata from {file_path}") try: if not self.review_manager.settings.is_curated_masterdata_repo(): + # TODO : skip cover page + # extract _get_coverpages in colrev/packages/remove_coverpage/src/remove_cover_page.py + # retrieve_based_on_colrev_pdf_id colrev_pdf_id = colrev.record.record.Record.get_colrev_pdf_id( pdf_path=file_path_abs From 30a8e6c1b53f5873e293687600cacc98fd83f162 Mon Sep 17 00:00:00 2001 From: Gerit Wagner Date: Fri, 14 Feb 2025 08:12:08 +0100 Subject: [PATCH 2/5] notes --- colrev/packages/files_dir/src/files_dir.py | 1 + colrev/record/record_identifier.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/colrev/packages/files_dir/src/files_dir.py b/colrev/packages/files_dir/src/files_dir.py index c65870d36..7f1da9a69 100644 --- a/colrev/packages/files_dir/src/files_dir.py +++ b/colrev/packages/files_dir/src/files_dir.py @@ -549,6 +549,7 @@ def _index_pdf( if not self.review_manager.settings.is_curated_masterdata_repo(): # TODO : skip cover page # extract _get_coverpages in colrev/packages/remove_coverpage/src/remove_cover_page.py + # use get_colrev_pdf_id() with custom_page = 1 (for coverpages) # retrieve_based_on_colrev_pdf_id colrev_pdf_id = colrev.record.record.Record.get_colrev_pdf_id( diff --git a/colrev/record/record_identifier.py b/colrev/record/record_identifier.py index bec49b00a..c58382540 100644 --- a/colrev/record/record_identifier.py +++ b/colrev/record/record_identifier.py @@ -192,11 +192,12 @@ def get_colrev_id(record: colrev.record.record.Record, *, assume_complete: bool) return srep -def _get_colrev_pdf_id_cpid2(pdf_path: Path) -> str: +def _get_colrev_pdf_id_cpid2(pdf_path: Path, custom_page: int) -> str: with tempfile.NamedTemporaryFile(suffix=".png") as temp_file: file_name = temp_file.name try: doc: pymupdf.Document = pymupdf.open(pdf_path) + # TODO : select custom_page page = next(iter(doc)) # get the first page pix = page.get_pixmap(dpi=200) pix.save(file_name) # store image as a PNG @@ -214,7 +215,7 @@ def _get_colrev_pdf_id_cpid2(pdf_path: Path) -> str: raise colrev_exceptions.PDFHashError(path=pdf_path) from exc -def get_colrev_pdf_id(pdf_path: Path, *, cpid_version: str = "cpid2") -> str: +def get_colrev_pdf_id(pdf_path: Path, *, cpid_version: str = "cpid2", custom_page: int = 0) -> str: """Get the PDF hash""" pdf_path = pdf_path.resolve() @@ -223,7 +224,7 @@ def get_colrev_pdf_id(pdf_path: Path, *, cpid_version: str = "cpid2") -> str: raise colrev_exceptions.InvalidPDFException(path=pdf_path) if cpid_version == "cpid2": - return _get_colrev_pdf_id_cpid2(pdf_path) + return _get_colrev_pdf_id_cpid2(pdf_path, custom_page=custom_page) raise NotImplementedError From 1ab261795d8a6ec4794ce2adaedd040463ae3152 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 07:12:42 +0000 Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- colrev/record/record_identifier.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/colrev/record/record_identifier.py b/colrev/record/record_identifier.py index c58382540..c5d2d93ba 100644 --- a/colrev/record/record_identifier.py +++ b/colrev/record/record_identifier.py @@ -215,7 +215,9 @@ def _get_colrev_pdf_id_cpid2(pdf_path: Path, custom_page: int) -> str: raise colrev_exceptions.PDFHashError(path=pdf_path) from exc -def get_colrev_pdf_id(pdf_path: Path, *, cpid_version: str = "cpid2", custom_page: int = 0) -> str: +def get_colrev_pdf_id( + pdf_path: Path, *, cpid_version: str = "cpid2", custom_page: int = 0 +) -> str: """Get the PDF hash""" pdf_path = pdf_path.resolve() From e3d88cd4983462c38a8644063535f1134d4a5e1b Mon Sep 17 00:00:00 2001 From: Gerit Wagner Date: Fri, 14 Feb 2025 08:13:32 +0100 Subject: [PATCH 4/5] update record.get_colrev_pdf_id() --- colrev/record/record.py | 3 ++- colrev/record/record_identifier.py | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/colrev/record/record.py b/colrev/record/record.py index f31eab750..f4a2cee69 100644 --- a/colrev/record/record.py +++ b/colrev/record/record.py @@ -616,10 +616,11 @@ def get_colrev_id( def get_colrev_pdf_id( cls, pdf_path: Path, + custom_page: int = 0, ) -> str: # pragma: no cover """Generate the colrev_pdf_id""" - return colrev.record.record_identifier.get_colrev_pdf_id(pdf_path) + return colrev.record.record_identifier.get_colrev_pdf_id(pdf_path, custom_page=custom_page) def get_toc_key(self) -> str: """Get the record's toc-key""" diff --git a/colrev/record/record_identifier.py b/colrev/record/record_identifier.py index c5d2d93ba..c78deb71c 100644 --- a/colrev/record/record_identifier.py +++ b/colrev/record/record_identifier.py @@ -215,9 +215,7 @@ def _get_colrev_pdf_id_cpid2(pdf_path: Path, custom_page: int) -> str: raise colrev_exceptions.PDFHashError(path=pdf_path) from exc -def get_colrev_pdf_id( - pdf_path: Path, *, cpid_version: str = "cpid2", custom_page: int = 0 -) -> str: +def get_colrev_pdf_id(pdf_path: Path, *, cpid_version: str = "cpid2", custom_page: int) -> str: """Get the PDF hash""" pdf_path = pdf_path.resolve() From 66c51331f18def0862102fce645eef2c29982b71 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 07:15:00 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- colrev/record/record.py | 4 +++- colrev/record/record_identifier.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/colrev/record/record.py b/colrev/record/record.py index f4a2cee69..fbb805798 100644 --- a/colrev/record/record.py +++ b/colrev/record/record.py @@ -620,7 +620,9 @@ def get_colrev_pdf_id( ) -> str: # pragma: no cover """Generate the colrev_pdf_id""" - return colrev.record.record_identifier.get_colrev_pdf_id(pdf_path, custom_page=custom_page) + return colrev.record.record_identifier.get_colrev_pdf_id( + pdf_path, custom_page=custom_page + ) def get_toc_key(self) -> str: """Get the record's toc-key""" diff --git a/colrev/record/record_identifier.py b/colrev/record/record_identifier.py index c78deb71c..f0ba0e278 100644 --- a/colrev/record/record_identifier.py +++ b/colrev/record/record_identifier.py @@ -215,7 +215,9 @@ def _get_colrev_pdf_id_cpid2(pdf_path: Path, custom_page: int) -> str: raise colrev_exceptions.PDFHashError(path=pdf_path) from exc -def get_colrev_pdf_id(pdf_path: Path, *, cpid_version: str = "cpid2", custom_page: int) -> str: +def get_colrev_pdf_id( + pdf_path: Path, *, cpid_version: str = "cpid2", custom_page: int +) -> str: """Get the PDF hash""" pdf_path = pdf_path.resolve()